Compare commits
8 Commits
v0.7.6
...
fix/docker
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1a22fb4d4f | ||
|
|
81b5312629 | ||
|
|
73a5a7b0f5 | ||
|
|
05921811b8 | ||
|
|
25507adb5b | ||
|
|
aba4036ab6 | ||
|
|
e2af031b09 | ||
|
|
b97eaeea4c |
81
.github/workflows/docker-release.yml
vendored
81
.github/workflows/docker-release.yml
vendored
@@ -1,81 +0,0 @@
|
|||||||
name: Docker Release
|
|
||||||
on:
|
|
||||||
release:
|
|
||||||
types: [published]
|
|
||||||
push:
|
|
||||||
tags:
|
|
||||||
- 'docker-rebuild-v*' # Allow manual Docker rebuilds via tags
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
docker:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Extract version from release or tag
|
|
||||||
id: get_version
|
|
||||||
run: |
|
|
||||||
if [ "${{ github.event_name }}" == "release" ]; then
|
|
||||||
# Triggered by release event
|
|
||||||
VERSION="${{ github.event.release.tag_name }}"
|
|
||||||
VERSION=${VERSION#v} # Remove 'v' prefix
|
|
||||||
else
|
|
||||||
# Triggered by docker-rebuild-v* tag
|
|
||||||
VERSION=${GITHUB_REF#refs/tags/docker-rebuild-v}
|
|
||||||
fi
|
|
||||||
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
|
||||||
echo "Building Docker images for version: $VERSION"
|
|
||||||
|
|
||||||
- name: Extract major and minor versions
|
|
||||||
id: versions
|
|
||||||
run: |
|
|
||||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
|
||||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
|
||||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
|
||||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
|
||||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
|
||||||
echo "Semantic versions - Major: $MAJOR, Minor: $MINOR"
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
|
||||||
password: ${{ secrets.DOCKER_TOKEN }}
|
|
||||||
|
|
||||||
- name: Build and push Docker images
|
|
||||||
uses: docker/build-push-action@v5
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: true
|
|
||||||
tags: |
|
|
||||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
|
||||||
unclecode/crawl4ai:latest
|
|
||||||
platforms: linux/amd64,linux/arm64
|
|
||||||
cache-from: type=gha
|
|
||||||
cache-to: type=gha,mode=max
|
|
||||||
|
|
||||||
- name: Summary
|
|
||||||
run: |
|
|
||||||
echo "## 🐳 Docker Release Complete!" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### Published Images" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### Platforms" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- linux/amd64" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- linux/arm64" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 🚀 Pull Command" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
917
.github/workflows/docs/ARCHITECTURE.md
vendored
917
.github/workflows/docs/ARCHITECTURE.md
vendored
@@ -1,917 +0,0 @@
|
|||||||
# Workflow Architecture Documentation
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
This document describes the technical architecture of the split release pipeline for Crawl4AI.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Architecture Diagram
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────────────────────────────┐
|
|
||||||
│ Developer │
|
|
||||||
│ │ │
|
|
||||||
│ ▼ │
|
|
||||||
│ git tag v1.2.3 │
|
|
||||||
│ git push --tags │
|
|
||||||
└──────────────────────────────┬──────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────────────────────────────────┐
|
|
||||||
│ GitHub Repository │
|
|
||||||
│ │
|
|
||||||
│ ┌────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ Tag Event: v1.2.3 │ │
|
|
||||||
│ └────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │ │
|
|
||||||
│ ▼ │
|
|
||||||
│ ┌────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ release.yml (Release Pipeline) │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 1. Extract Version │ │ │
|
|
||||||
│ │ │ v1.2.3 → 1.2.3 │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 2. Validate Version │ │ │
|
|
||||||
│ │ │ Tag == __version__.py │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 3. Build Python Package │ │ │
|
|
||||||
│ │ │ - Source dist (.tar.gz) │ │ │
|
|
||||||
│ │ │ - Wheel (.whl) │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 4. Upload to PyPI │ │ │
|
|
||||||
│ │ │ - Authenticate with token │ │ │
|
|
||||||
│ │ │ - Upload dist/* │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 5. Create GitHub Release │ │ │
|
|
||||||
│ │ │ - Tag: v1.2.3 │ │ │
|
|
||||||
│ │ │ - Body: Install instructions │ │ │
|
|
||||||
│ │ │ - Status: Published │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ └────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │ │
|
|
||||||
│ ▼ │
|
|
||||||
│ ┌────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ Release Event: published (v1.2.3) │ │
|
|
||||||
│ └────────────────────────────────────────────────────────┘ │
|
|
||||||
│ │ │
|
|
||||||
│ ▼ │
|
|
||||||
│ ┌────────────────────────────────────────────────────────┐ │
|
|
||||||
│ │ docker-release.yml (Docker Pipeline) │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 1. Extract Version from Release │ │ │
|
|
||||||
│ │ │ github.event.release.tag_name → 1.2.3 │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 2. Parse Semantic Versions │ │ │
|
|
||||||
│ │ │ 1.2.3 → Major: 1, Minor: 1.2 │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 3. Setup Multi-Arch Build │ │ │
|
|
||||||
│ │ │ - Docker Buildx │ │ │
|
|
||||||
│ │ │ - QEMU emulation │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 4. Authenticate Docker Hub │ │ │
|
|
||||||
│ │ │ - Username: DOCKER_USERNAME │ │ │
|
|
||||||
│ │ │ - Token: DOCKER_TOKEN │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 5. Build Multi-Arch Images │ │ │
|
|
||||||
│ │ │ ┌────────────────┬────────────────┐ │ │ │
|
|
||||||
│ │ │ │ linux/amd64 │ linux/arm64 │ │ │ │
|
|
||||||
│ │ │ └────────────────┴────────────────┘ │ │ │
|
|
||||||
│ │ │ Cache: GitHub Actions (type=gha) │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
|
||||||
│ │ │ 6. Push to Docker Hub │ │ │
|
|
||||||
│ │ │ Tags: │ │ │
|
|
||||||
│ │ │ - unclecode/crawl4ai:1.2.3 │ │ │
|
|
||||||
│ │ │ - unclecode/crawl4ai:1.2 │ │ │
|
|
||||||
│ │ │ - unclecode/crawl4ai:1 │ │ │
|
|
||||||
│ │ │ - unclecode/crawl4ai:latest │ │ │
|
|
||||||
│ │ └──────────────────────────────────────────────┘ │ │
|
|
||||||
│ └────────────────────────────────────────────────────────┘ │
|
|
||||||
└─────────────────────────────────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌─────────────────────────────────────────────────────────────────┐
|
|
||||||
│ External Services │
|
|
||||||
│ │
|
|
||||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
|
||||||
│ │ PyPI │ │ Docker Hub │ │ GitHub │ │
|
|
||||||
│ │ │ │ │ │ │ │
|
|
||||||
│ │ crawl4ai │ │ unclecode/ │ │ Releases │ │
|
|
||||||
│ │ 1.2.3 │ │ crawl4ai │ │ v1.2.3 │ │
|
|
||||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
|
||||||
└─────────────────────────────────────────────────────────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Component Details
|
|
||||||
|
|
||||||
### 1. Release Pipeline (release.yml)
|
|
||||||
|
|
||||||
#### Purpose
|
|
||||||
Fast publication of the Python package and the GitHub release.
|
|
||||||
|
|
||||||
#### Input
|
|
||||||
- **Trigger**: Git tag matching `v*` (excluding `test-v*`)
|
|
||||||
- **Example**: `v1.2.3`
|
|
||||||
|
|
||||||
#### Processing Stages
|
|
||||||
|
|
||||||
##### Stage 1: Version Extraction
|
|
||||||
```bash
|
|
||||||
Input: refs/tags/v1.2.3
|
|
||||||
Output: VERSION=1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```bash
|
|
||||||
TAG_VERSION=${GITHUB_REF#refs/tags/v} # Remove 'refs/tags/v' prefix
|
|
||||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 2: Version Validation
|
|
||||||
```bash
|
|
||||||
Input: TAG_VERSION=1.2.3
|
|
||||||
Check: crawl4ai/__version__.py contains __version__ = "1.2.3"
|
|
||||||
Output: Pass/Fail
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```bash
|
|
||||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
|
||||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 3: Package Build
|
|
||||||
```bash
|
|
||||||
Input: Source code + pyproject.toml
|
|
||||||
Output: dist/crawl4ai-1.2.3.tar.gz
|
|
||||||
dist/crawl4ai-1.2.3-py3-none-any.whl
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```bash
|
|
||||||
python -m build
|
|
||||||
# Uses build backend defined in pyproject.toml
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 4: PyPI Upload
|
|
||||||
```bash
|
|
||||||
Input: dist/*.{tar.gz,whl}
|
|
||||||
Auth: PYPI_TOKEN
|
|
||||||
Output: Package published to PyPI
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```bash
|
|
||||||
twine upload dist/*
|
|
||||||
# Environment:
|
|
||||||
# TWINE_USERNAME: __token__
|
|
||||||
# TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 5: GitHub Release Creation
|
|
||||||
```bash
|
|
||||||
Input: Tag: v1.2.3
|
|
||||||
Body: Markdown content
|
|
||||||
Output: Published GitHub release
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```yaml
|
|
||||||
uses: softprops/action-gh-release@v2
|
|
||||||
with:
|
|
||||||
tag_name: v1.2.3
|
|
||||||
name: Release v1.2.3
|
|
||||||
body: |
|
|
||||||
Installation instructions and changelog
|
|
||||||
draft: false
|
|
||||||
prerelease: false
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Output
|
|
||||||
- **PyPI Package**: https://pypi.org/project/crawl4ai/1.2.3/
|
|
||||||
- **GitHub Release**: Published release on repository
|
|
||||||
- **Event**: `release.published` (triggers Docker workflow)
|
|
||||||
|
|
||||||
#### Timeline
|
|
||||||
```
|
|
||||||
0:00 - Tag pushed
|
|
||||||
0:01 - Checkout + Python setup
|
|
||||||
0:02 - Version validation
|
|
||||||
0:03 - Package build
|
|
||||||
0:04 - PyPI upload starts
|
|
||||||
0:06 - PyPI upload complete
|
|
||||||
0:07 - GitHub release created
|
|
||||||
0:08 - Workflow complete
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### 2. Docker Release Pipeline (docker-release.yml)
|
|
||||||
|
|
||||||
#### Purpose
|
|
||||||
Build and publish multi-architecture Docker images.
|
|
||||||
|
|
||||||
#### Inputs
|
|
||||||
|
|
||||||
##### Input 1: Release Event (Automatic)
|
|
||||||
```yaml
|
|
||||||
Event: release.published
|
|
||||||
Data: github.event.release.tag_name = "v1.2.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Input 2: Docker Rebuild Tag (Manual)
|
|
||||||
```yaml
|
|
||||||
Tag: docker-rebuild-v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Processing Stages
|
|
||||||
|
|
||||||
##### Stage 1: Version Detection
|
|
||||||
```bash
|
|
||||||
# From release event:
|
|
||||||
VERSION = github.event.release.tag_name.strip("v")
|
|
||||||
# Result: "1.2.3"
|
|
||||||
|
|
||||||
# From rebuild tag:
|
|
||||||
VERSION = GITHUB_REF.replace("refs/tags/docker-rebuild-v", "")
|
|
||||||
# Result: "1.2.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 2: Semantic Version Parsing
|
|
||||||
```bash
|
|
||||||
Input: VERSION=1.2.3
|
|
||||||
Output: MAJOR=1
|
|
||||||
MINOR=1.2
|
|
||||||
PATCH=3 (implicit)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Implementation**:
|
|
||||||
```bash
|
|
||||||
MAJOR=$(echo $VERSION | cut -d. -f1) # Extract first component
|
|
||||||
MINOR=$(echo $VERSION | cut -d. -f1-2) # Extract first two components
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 3: Multi-Architecture Setup
|
|
||||||
```yaml
|
|
||||||
Setup:
|
|
||||||
- Docker Buildx (multi-platform builder)
|
|
||||||
- QEMU (ARM emulation on x86)
|
|
||||||
|
|
||||||
Platforms:
|
|
||||||
- linux/amd64 (x86_64)
|
|
||||||
- linux/arm64 (aarch64)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Architecture**:
|
|
||||||
```
|
|
||||||
GitHub Runner (linux/amd64)
|
|
||||||
├─ Buildx Builder
|
|
||||||
│ ├─ Native: Build linux/amd64 image
|
|
||||||
│ └─ QEMU: Emulate ARM to build linux/arm64 image
|
|
||||||
└─ Generate manifest list (points to both images)
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 4: Docker Hub Authentication
|
|
||||||
```bash
|
|
||||||
Input: DOCKER_USERNAME
|
|
||||||
DOCKER_TOKEN
|
|
||||||
Output: Authenticated Docker client
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 5: Build with Cache
|
|
||||||
```yaml
|
|
||||||
Cache Configuration:
|
|
||||||
cache-from: type=gha # Read from GitHub Actions cache
|
|
||||||
cache-to: type=gha,mode=max # Write all layers
|
|
||||||
|
|
||||||
Cache Key Components:
|
|
||||||
- Workflow file path
|
|
||||||
- Branch name
|
|
||||||
- Architecture (amd64/arm64)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Cache Hierarchy**:
|
|
||||||
```
|
|
||||||
Cache Entry: main/docker-release.yml/linux-amd64
|
|
||||||
├─ Layer: sha256:abc123... (FROM python:3.12)
|
|
||||||
├─ Layer: sha256:def456... (RUN apt-get update)
|
|
||||||
├─ Layer: sha256:ghi789... (COPY requirements.txt)
|
|
||||||
├─ Layer: sha256:jkl012... (RUN pip install)
|
|
||||||
└─ Layer: sha256:mno345... (COPY . /app)
|
|
||||||
|
|
||||||
Cache Hit/Miss Logic:
|
|
||||||
- If layer input unchanged → cache hit → skip build
|
|
||||||
- If layer input changed → cache miss → rebuild that layer and all subsequent layers
|
|
||||||
```
|
|
||||||
|
|
||||||
##### Stage 6: Tag Generation
|
|
||||||
```bash
|
|
||||||
Input: VERSION=1.2.3, MAJOR=1, MINOR=1.2
|
|
||||||
|
|
||||||
Output Tags:
|
|
||||||
- unclecode/crawl4ai:1.2.3 (exact version)
|
|
||||||
- unclecode/crawl4ai:1.2 (minor version)
|
|
||||||
- unclecode/crawl4ai:1 (major version)
|
|
||||||
- unclecode/crawl4ai:latest (latest stable)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Tag Strategy**:
|
|
||||||
- All tags point to same image SHA
|
|
||||||
- Users can pin to desired stability level
|
|
||||||
- Pushing a new version updates `1`, `1.2`, and `latest` automatically
|
|
||||||
|
|
||||||
##### Stage 7: Push to Registry
|
|
||||||
```bash
|
|
||||||
For each tag:
|
|
||||||
For each platform (amd64, arm64):
|
|
||||||
Push image to Docker Hub
|
|
||||||
|
|
||||||
Create manifest list:
|
|
||||||
Manifest: unclecode/crawl4ai:1.2.3
|
|
||||||
├─ linux/amd64: sha256:abc...
|
|
||||||
└─ linux/arm64: sha256:def...
|
|
||||||
|
|
||||||
Docker CLI automatically selects correct platform on pull
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Output
|
|
||||||
- **Docker Images**: 2 platform images (linux/amd64, linux/arm64) behind one manifest list, published under 4 tags
|
|
||||||
- **Docker Hub**: https://hub.docker.com/r/unclecode/crawl4ai/tags
|
|
||||||
|
|
||||||
#### Timeline
|
|
||||||
|
|
||||||
**Cold Cache (First Build)**:
|
|
||||||
```
|
|
||||||
0:00 - Release event received
|
|
||||||
0:01 - Checkout + Buildx setup
|
|
||||||
0:02 - Docker Hub auth
|
|
||||||
0:03 - Start build (amd64)
|
|
||||||
0:08 - Complete amd64 build
|
|
||||||
0:09 - Start build (arm64)
|
|
||||||
0:14 - Complete arm64 build
|
|
||||||
0:15 - Generate manifests
|
|
||||||
0:16 - Push all tags
|
|
||||||
0:17 - Workflow complete
|
|
||||||
```
|
|
||||||
|
|
||||||
**Warm Cache (Code Change Only)**:
|
|
||||||
```
|
|
||||||
0:00 - Release event received
|
|
||||||
0:01 - Checkout + Buildx setup
|
|
||||||
0:02 - Docker Hub auth
|
|
||||||
0:03 - Start build (amd64) - cache hit for layers 1-4
|
|
||||||
0:04 - Complete amd64 build (only layer 5 rebuilt)
|
|
||||||
0:05 - Start build (arm64) - cache hit for layers 1-4
|
|
||||||
0:06 - Complete arm64 build (only layer 5 rebuilt)
|
|
||||||
0:07 - Generate manifests
|
|
||||||
0:08 - Push all tags
|
|
||||||
0:09 - Workflow complete
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
|
|
||||||
### Version Information Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
Developer
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
crawl4ai/__version__.py
|
|
||||||
__version__ = "1.2.3"
|
|
||||||
│
|
|
||||||
├─► Git Tag
|
|
||||||
│ v1.2.3
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ release.yml
|
|
||||||
│ │
|
|
||||||
│ ├─► Validation
|
|
||||||
│ │ ✓ Match
|
|
||||||
│ │
|
|
||||||
│ ├─► PyPI Package
|
|
||||||
│ │ crawl4ai==1.2.3
|
|
||||||
│ │
|
|
||||||
│ └─► GitHub Release
|
|
||||||
│ v1.2.3
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ docker-release.yml
|
|
||||||
│ │
|
|
||||||
│ └─► Docker Tags
|
|
||||||
│ 1.2.3, 1.2, 1, latest
|
|
||||||
│
|
|
||||||
└─► Package Metadata
|
|
||||||
pyproject.toml
|
|
||||||
version = "1.2.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Secrets Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
GitHub Secrets (Encrypted at Rest)
|
|
||||||
│
|
|
||||||
├─► PYPI_TOKEN
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ release.yml
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ TWINE_PASSWORD env var (masked in logs)
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ PyPI API (HTTPS)
|
|
||||||
│
|
|
||||||
├─► DOCKER_USERNAME
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ docker-release.yml
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ docker/login-action (masked in logs)
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ Docker Hub API (HTTPS)
|
|
||||||
│
|
|
||||||
└─► DOCKER_TOKEN
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
docker-release.yml
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
docker/login-action (masked in logs)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
Docker Hub API (HTTPS)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Artifact Flow
|
|
||||||
|
|
||||||
```
|
|
||||||
Source Code
|
|
||||||
│
|
|
||||||
├─► release.yml
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ python -m build
|
|
||||||
│ │
|
|
||||||
│ ├─► crawl4ai-1.2.3.tar.gz
|
|
||||||
│ │ │
|
|
||||||
│ │ ▼
|
|
||||||
│ │ PyPI Storage
|
|
||||||
│ │ │
|
|
||||||
│ │ ▼
|
|
||||||
│ │ pip install crawl4ai
|
|
||||||
│ │
|
|
||||||
│ └─► crawl4ai-1.2.3-py3-none-any.whl
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ PyPI Storage
|
|
||||||
│ │
|
|
||||||
│ ▼
|
|
||||||
│ pip install crawl4ai
|
|
||||||
│
|
|
||||||
└─► docker-release.yml
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
docker build
|
|
||||||
│
|
|
||||||
├─► Image: linux/amd64
|
|
||||||
│ │
|
|
||||||
│ └─► Docker Hub
|
|
||||||
│ unclecode/crawl4ai:1.2.3 (linux/amd64 entry in the manifest list)
|
|
||||||
│
|
|
||||||
└─► Image: linux/arm64
|
|
||||||
│
|
|
||||||
└─► Docker Hub
|
|
||||||
unclecode/crawl4ai:1.2.3 (linux/arm64 entry in the manifest list)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## State Machines
|
|
||||||
|
|
||||||
### Release Pipeline State Machine
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────┐
|
|
||||||
│ START │
|
|
||||||
└────┬────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Extract │
|
|
||||||
│ Version │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐ ┌─────────┐
|
|
||||||
│ Validate │─────►│ FAILED │
|
|
||||||
│ Version │ No │ (Exit 1)│
|
|
||||||
└──────┬───────┘ └─────────┘
|
|
||||||
│ Yes
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Build │
|
|
||||||
│ Package │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐ ┌─────────┐
|
|
||||||
│ Upload │─────►│ FAILED │
|
|
||||||
│ to PyPI │ Error│ (Exit 1)│
|
|
||||||
└──────┬───────┘ └─────────┘
|
|
||||||
│ Success
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Create │
|
|
||||||
│ GH Release │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ SUCCESS │
|
|
||||||
│ (Emit Event) │
|
|
||||||
└──────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
### Docker Pipeline State Machine
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────┐
|
|
||||||
│ START │
|
|
||||||
│ (Event) │
|
|
||||||
└────┬────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Detect │
|
|
||||||
│ Version │
|
|
||||||
│ Source │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Parse │
|
|
||||||
│ Semantic │
|
|
||||||
│ Versions │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐ ┌─────────┐
|
|
||||||
│ Authenticate │─────►│ FAILED │
|
|
||||||
│ Docker Hub │ Error│ (Exit 1)│
|
|
||||||
└──────┬───────┘ └─────────┘
|
|
||||||
│ Success
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Build │
|
|
||||||
│ amd64 │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐ ┌─────────┐
|
|
||||||
│ Build │─────►│ FAILED │
|
|
||||||
│ arm64 │ Error│ (Exit 1)│
|
|
||||||
└──────┬───────┘ └─────────┘
|
|
||||||
│ Success
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ Push All │
|
|
||||||
│ Tags │
|
|
||||||
└──────┬───────┘
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
┌──────────────┐
|
|
||||||
│ SUCCESS │
|
|
||||||
└──────────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Security Architecture
|
|
||||||
|
|
||||||
### Threat Model
|
|
||||||
|
|
||||||
#### Threats Mitigated
|
|
||||||
|
|
||||||
1. **Secret Exposure**
|
|
||||||
- Mitigation: GitHub Actions secret masking
|
|
||||||
- Evidence: Secrets never appear in logs
|
|
||||||
|
|
||||||
2. **Unauthorized Package Upload**
|
|
||||||
- Mitigation: Scoped PyPI tokens
|
|
||||||
- Evidence: Token limited to `crawl4ai` project
|
|
||||||
|
|
||||||
3. **Man-in-the-Middle**
|
|
||||||
- Mitigation: HTTPS for all API calls
|
|
||||||
- Evidence: PyPI, Docker Hub, GitHub all use TLS
|
|
||||||
|
|
||||||
4. **Supply Chain Tampering**
|
|
||||||
- Mitigation: Immutable artifacts, content checksums
|
|
||||||
- Evidence: PyPI stores SHA256, Docker uses content-addressable storage
|
|
||||||
|
|
||||||
#### Trust Boundaries
|
|
||||||
|
|
||||||
```
|
|
||||||
┌─────────────────────────────────────────┐
|
|
||||||
│ Trusted Zone │
|
|
||||||
│ ┌────────────────────────────────┐ │
|
|
||||||
│ │ GitHub Actions Runner │ │
|
|
||||||
│ │ - Ephemeral VM │ │
|
|
||||||
│ │ - Isolated environment │ │
|
|
||||||
│ │ - Access to secrets │ │
|
|
||||||
│ └────────────────────────────────┘ │
|
|
||||||
│ │ │
|
|
||||||
│ │ HTTPS (TLS 1.2+) │
|
|
||||||
│ ▼ │
|
|
||||||
└─────────────────────────────────────────┘
|
|
||||||
│
|
|
||||||
┌────────────┼────────────┐
|
|
||||||
│ │ │
|
|
||||||
▼ ▼ ▼
|
|
||||||
┌────────┐ ┌─────────┐ ┌──────────┐
|
|
||||||
│ PyPI │ │ Docker │ │ GitHub │
|
|
||||||
│ API │ │ Hub │ │ API │
|
|
||||||
└────────┘ └─────────┘ └──────────┘
|
|
||||||
External External External
|
|
||||||
Service Service Service
|
|
||||||
```
|
|
||||||
|
|
||||||
### Secret Management
|
|
||||||
|
|
||||||
#### Secret Lifecycle
|
|
||||||
|
|
||||||
```
|
|
||||||
Creation (Developer)
|
|
||||||
│
|
|
||||||
├─► PyPI: Create API token (scoped to project)
|
|
||||||
├─► Docker Hub: Create access token (read/write)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
Storage (GitHub)
|
|
||||||
│
|
|
||||||
├─► Encrypted at rest (AES-256)
|
|
||||||
├─► Access controlled (repo-scoped)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
Usage (Workflow)
|
|
||||||
│
|
|
||||||
├─► Injected as env vars
|
|
||||||
├─► Masked in logs (GitHub redacts on output)
|
|
||||||
├─► Not retained after the job (ephemeral runner; Docker login credentials live only in the runner's Docker config)
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
Transmission (API Call)
|
|
||||||
│
|
|
||||||
├─► HTTPS only
|
|
||||||
├─► TLS 1.2+ with strong ciphers
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
Rotation (Manual)
|
|
||||||
│
|
|
||||||
└─► Regenerate on PyPI/Docker Hub
|
|
||||||
Update GitHub secret
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Performance Characteristics
|
|
||||||
|
|
||||||
### Release Pipeline Performance
|
|
||||||
|
|
||||||
| Metric | Value | Notes |
|
|
||||||
|--------|-------|-------|
|
|
||||||
| Cold start | ~2-3 min | First run on new runner |
|
|
||||||
| Warm start | ~2-3 min | Minimal caching benefit |
|
|
||||||
| PyPI upload | ~30-60 sec | Network-bound |
|
|
||||||
| Package build | ~30 sec | CPU-bound |
|
|
||||||
| Parallelization | None | Sequential by design |
|
|
||||||
|
|
||||||
### Docker Pipeline Performance
|
|
||||||
|
|
||||||
| Metric | Cold Cache | Warm Cache (code) | Warm Cache (deps) |
|
|
||||||
|--------|-----------|-------------------|-------------------|
|
|
||||||
| Total time | 10-15 min | 1-2 min | 3-5 min |
|
|
||||||
| amd64 build | 5-7 min | 30-60 sec | 1-2 min |
|
|
||||||
| arm64 build | 5-7 min | 30-60 sec | 1-2 min |
|
|
||||||
| Push time | 1-2 min | 30 sec | 30 sec |
|
|
||||||
| Cache hit rate | 0% | 85% | 60% |
|
|
||||||
|
|
||||||
### Cache Performance Model
|
|
||||||
|
|
||||||
```python
|
|
||||||
def estimate_build_time(changes):
|
|
||||||
base_time = 60 # seconds (setup + push)
|
|
||||||
|
|
||||||
if "Dockerfile" in changes:
|
|
||||||
return base_time + (10 * 60) # Full rebuild: ~11 min
|
|
||||||
elif "requirements.txt" in changes:
|
|
||||||
return base_time + (3 * 60) # Deps rebuild: ~4 min
|
|
||||||
elif any(f.endswith(".py") for f in changes):
|
|
||||||
return base_time + 60 # Code only: ~2 min
|
|
||||||
else:
|
|
||||||
return base_time # No changes: ~1 min
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Scalability Considerations
|
|
||||||
|
|
||||||
### Current Limits
|
|
||||||
|
|
||||||
| Resource | Limit | Impact |
|
|
||||||
|----------|-------|--------|
|
|
||||||
| Workflow concurrency | 20 (default) | Max 20 releases in parallel |
|
|
||||||
| Artifact storage | 500 MB/artifact | PyPI packages small (<10 MB) |
|
|
||||||
| Cache storage | 10 GB/repo | Docker layers fit comfortably |
|
|
||||||
| Job execution time | 6 hours per job | Plenty of headroom |
|
|
||||||
|
|
||||||
### Scaling Strategies
|
|
||||||
|
|
||||||
#### Horizontal Scaling (Multiple Repos)
|
|
||||||
```
|
|
||||||
crawl4ai (main)
|
|
||||||
├─ release.yml
|
|
||||||
└─ docker-release.yml
|
|
||||||
|
|
||||||
crawl4ai-plugins (separate)
|
|
||||||
├─ release.yml
|
|
||||||
└─ docker-release.yml
|
|
||||||
|
|
||||||
Each repo has independent:
|
|
||||||
- Secrets
|
|
||||||
- Cache (10 GB each)
|
|
||||||
- Concurrency limits (20 each)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Vertical Scaling (Larger Runners)
|
|
||||||
```yaml
|
|
||||||
jobs:
|
|
||||||
docker:
|
|
||||||
runs-on: ubuntu-latest-8-cores # GitHub-hosted larger runner
|
|
||||||
# 4x faster builds for CPU-bound layers
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Disaster Recovery
|
|
||||||
|
|
||||||
### Failure Scenarios
|
|
||||||
|
|
||||||
#### Scenario 1: Release Pipeline Fails
|
|
||||||
|
|
||||||
**Failure Point**: PyPI upload fails (network error)
|
|
||||||
|
|
||||||
**State**:
|
|
||||||
- ✓ Version validated
|
|
||||||
- ✓ Package built
|
|
||||||
- ✗ PyPI upload
|
|
||||||
- ✗ GitHub release
|
|
||||||
|
|
||||||
**Recovery**:
|
|
||||||
```bash
|
|
||||||
# Manual upload
|
|
||||||
twine upload dist/*
|
|
||||||
|
|
||||||
# Retry workflow (re-run from GitHub Actions UI)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Prevention**: Add retry logic to PyPI upload
|
|
||||||
|
|
||||||
#### Scenario 2: Docker Pipeline Fails
|
|
||||||
|
|
||||||
**Failure Point**: ARM build fails (dependency issue)
|
|
||||||
|
|
||||||
**State**:
|
|
||||||
- ✓ PyPI published
|
|
||||||
- ✓ GitHub release created
|
|
||||||
- ✓ amd64 image built
|
|
||||||
- ✗ arm64 image build
|
|
||||||
|
|
||||||
**Recovery**:
|
|
||||||
```bash
|
|
||||||
# Fix Dockerfile
|
|
||||||
git commit -am "fix: ARM build dependency"
|
|
||||||
|
|
||||||
# Trigger rebuild
|
|
||||||
git tag docker-rebuild-v1.2.3
|
|
||||||
git push origin docker-rebuild-v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
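**Impact**: PyPI package available; no Docker tags for this version are pushed until the rebuild succeeds, because the push runs only after both the amd64 and arm64 builds complete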
|
|
||||||
|
|
||||||
#### Scenario 3: Partial Release
|
|
||||||
|
|
||||||
**Failure Point**: GitHub release creation fails
|
|
||||||
|
|
||||||
**State**:
|
|
||||||
- ✓ PyPI published
|
|
||||||
- ✗ GitHub release
|
|
||||||
- ✗ Docker images
|
|
||||||
|
|
||||||
**Recovery**:
|
|
||||||
```bash
|
|
||||||
# Create release manually
|
|
||||||
gh release create v1.2.3 \
|
|
||||||
--title "Release v1.2.3" \
|
|
||||||
--notes "..."
|
|
||||||
|
|
||||||
# This triggers docker-release.yml automatically
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Monitoring and Observability
|
|
||||||
|
|
||||||
### Metrics to Track
|
|
||||||
|
|
||||||
#### Release Pipeline
|
|
||||||
- Success rate (target: >99%)
|
|
||||||
- Duration (target: <3 min)
|
|
||||||
- PyPI upload time (target: <60 sec)
|
|
||||||
|
|
||||||
#### Docker Pipeline
|
|
||||||
- Success rate (target: >95%)
|
|
||||||
- Duration (target: <15 min cold, <2 min warm)
|
|
||||||
- Cache hit rate (target: >80% for code changes)
|
|
||||||
|
|
||||||
### Alerting
|
|
||||||
|
|
||||||
**Critical Alerts**:
|
|
||||||
- Release pipeline failure (blocks release)
|
|
||||||
- PyPI authentication failure (expired token)
|
|
||||||
|
|
||||||
**Warning Alerts**:
|
|
||||||
- Docker build >15 min (performance degradation)
|
|
||||||
- Cache hit rate <50% (cache issue)
|
|
||||||
|
|
||||||
### Logging
|
|
||||||
|
|
||||||
**GitHub Actions Logs**:
|
|
||||||
- Retention: 90 days
|
|
||||||
- Downloadable: Yes
|
|
||||||
- Searchable: Limited
|
|
||||||
|
|
||||||
**Recommended External Logging**:
|
|
||||||
```yaml
|
|
||||||
- name: Send logs to external service
|
|
||||||
if: failure()
|
|
||||||
run: |
|
|
||||||
curl -X POST https://logs.example.com/api/v1/logs \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d "{\"workflow\": \"${{ github.workflow }}\", \"status\": \"failed\"}"
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Future Enhancements
|
|
||||||
|
|
||||||
### Planned Improvements
|
|
||||||
|
|
||||||
1. **Automated Changelog Generation**
|
|
||||||
- Use conventional commits
|
|
||||||
- Generate CHANGELOG.md automatically
|
|
||||||
|
|
||||||
2. **Pre-release Testing**
|
|
||||||
- Test builds on `test-v*` tags
|
|
||||||
- Upload to TestPyPI
|
|
||||||
|
|
||||||
3. **Notification System**
|
|
||||||
- Slack/Discord notifications on release
|
|
||||||
- Email on failure
|
|
||||||
|
|
||||||
4. **Performance Optimization**
|
|
||||||
- Parallel Docker builds (amd64 + arm64 simultaneously)
|
|
||||||
- Persistent runners for better caching
|
|
||||||
|
|
||||||
5. **Enhanced Validation**
|
|
||||||
- Smoke tests after PyPI upload
|
|
||||||
- Container security scanning
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## References
|
|
||||||
|
|
||||||
- [GitHub Actions Architecture](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions)
|
|
||||||
- [Docker Build Cache](https://docs.docker.com/build/cache/)
|
|
||||||
- [PyPI API Documentation](https://warehouse.pypa.io/api-reference/)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Last Updated**: 2025-01-21
|
|
||||||
**Version**: 2.0
|
|
||||||
1029
.github/workflows/docs/README.md
vendored
1029
.github/workflows/docs/README.md
vendored
File diff suppressed because it is too large
Load Diff
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
@@ -1,287 +0,0 @@
|
|||||||
# Workflow Quick Reference
|
|
||||||
|
|
||||||
## Quick Commands
|
|
||||||
|
|
||||||
### Standard Release
|
|
||||||
```bash
|
|
||||||
# 1. Update version
|
|
||||||
vim crawl4ai/__version__.py # Set to "1.2.3"
|
|
||||||
|
|
||||||
# 2. Commit and tag
|
|
||||||
git add crawl4ai/__version__.py
|
|
||||||
git commit -m "chore: bump version to 1.2.3"
|
|
||||||
git tag v1.2.3
|
|
||||||
git push origin main
|
|
||||||
git push origin v1.2.3
|
|
||||||
|
|
||||||
# 3. Monitor
|
|
||||||
# - PyPI: ~2-3 minutes
|
|
||||||
# - Docker: ~1-15 minutes
|
|
||||||
```
|
|
||||||
|
|
||||||
### Docker Rebuild Only
|
|
||||||
```bash
|
|
||||||
git tag docker-rebuild-v1.2.3
|
|
||||||
git push origin docker-rebuild-v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
### Delete Tag (Undo Release)
|
|
||||||
```bash
|
|
||||||
# Local
|
|
||||||
git tag -d v1.2.3
|
|
||||||
|
|
||||||
# Remote
|
|
||||||
git push --delete origin v1.2.3
|
|
||||||
|
|
||||||
# GitHub Release
|
|
||||||
gh release delete v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Workflow Triggers
|
|
||||||
|
|
||||||
### release.yml
|
|
||||||
| Event | Pattern | Example |
|
|
||||||
|-------|---------|---------|
|
|
||||||
| Tag push | `v*` | `v1.2.3` |
|
|
||||||
| Excludes | `test-v*` | `test-v1.2.3` |
|
|
||||||
|
|
||||||
### docker-release.yml
|
|
||||||
| Event | Pattern | Example |
|
|
||||||
|-------|---------|---------|
|
|
||||||
| Release published | `release.published` | Automatic |
|
|
||||||
| Tag push | `docker-rebuild-v*` | `docker-rebuild-v1.2.3` |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
### release.yml
|
|
||||||
| Variable | Source | Example |
|
|
||||||
|----------|--------|---------|
|
|
||||||
| `VERSION` | Git tag | `1.2.3` |
|
|
||||||
| `TWINE_USERNAME` | Static | `__token__` |
|
|
||||||
| `TWINE_PASSWORD` | Secret | `pypi-Ag...` |
|
|
||||||
| `GITHUB_TOKEN` | Auto | `ghs_...` |
|
|
||||||
|
|
||||||
### docker-release.yml
|
|
||||||
| Variable | Source | Example |
|
|
||||||
|----------|--------|---------|
|
|
||||||
| `VERSION` | Release/Tag | `1.2.3` |
|
|
||||||
| `MAJOR` | Computed | `1` |
|
|
||||||
| `MINOR` | Computed | `1.2` |
|
|
||||||
| `DOCKER_USERNAME` | Secret | `unclecode` |
|
|
||||||
| `DOCKER_TOKEN` | Secret | `dckr_pat_...` |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Docker Tags Generated
|
|
||||||
|
|
||||||
| Version | Tags Created |
|
|
||||||
|---------|-------------|
|
|
||||||
| v1.0.0 | `1.0.0`, `1.0`, `1`, `latest` |
|
|
||||||
| v1.1.0 | `1.1.0`, `1.1`, `1`, `latest` |
|
|
||||||
| v1.2.3 | `1.2.3`, `1.2`, `1`, `latest` |
|
|
||||||
| v2.0.0 | `2.0.0`, `2.0`, `2`, `latest` |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Workflow Outputs
|
|
||||||
|
|
||||||
### release.yml
|
|
||||||
| Output | Location | Time |
|
|
||||||
|--------|----------|------|
|
|
||||||
| PyPI Package | https://pypi.org/project/crawl4ai/ | ~2-3 min |
|
|
||||||
| GitHub Release | Repository → Releases | ~2-3 min |
|
|
||||||
| Workflow Summary | Actions → Run → Summary | Immediate |
|
|
||||||
|
|
||||||
### docker-release.yml
|
|
||||||
| Output | Location | Time |
|
|
||||||
|--------|----------|------|
|
|
||||||
| Docker Images | https://hub.docker.com/r/unclecode/crawl4ai | ~1-15 min |
|
|
||||||
| Workflow Summary | Actions → Run → Summary | Immediate |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Common Issues
|
|
||||||
|
|
||||||
| Issue | Solution |
|
|
||||||
|-------|----------|
|
|
||||||
| Version mismatch | Update `crawl4ai/__version__.py` to match tag |
|
|
||||||
| PyPI 403 Forbidden | Check `PYPI_TOKEN` secret |
|
|
||||||
| PyPI 400 File exists | Version already published, increment version |
|
|
||||||
| Docker auth failed | Regenerate `DOCKER_TOKEN` |
|
|
||||||
| Docker build timeout | Check Dockerfile, review build logs |
|
|
||||||
| Cache not working | First build on branch always cold |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Secrets Checklist
|
|
||||||
|
|
||||||
- [ ] `PYPI_TOKEN` - PyPI API token (project or account scope)
|
|
||||||
- [ ] `DOCKER_USERNAME` - Docker Hub username
|
|
||||||
- [ ] `DOCKER_TOKEN` - Docker Hub access token (read/write)
|
|
||||||
- [ ] `GITHUB_TOKEN` - Auto-provided (no action needed)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Workflow Dependencies
|
|
||||||
|
|
||||||
### release.yml Dependencies
|
|
||||||
```yaml
|
|
||||||
Python: 3.12
|
|
||||||
Actions:
|
|
||||||
- actions/checkout@v4
|
|
||||||
- actions/setup-python@v5
|
|
||||||
- softprops/action-gh-release@v2
|
|
||||||
PyPI Packages:
|
|
||||||
- build
|
|
||||||
- twine
|
|
||||||
```
|
|
||||||
|
|
||||||
### docker-release.yml Dependencies
|
|
||||||
```yaml
|
|
||||||
Actions:
|
|
||||||
- actions/checkout@v4
|
|
||||||
- docker/setup-buildx-action@v3
|
|
||||||
- docker/login-action@v3
|
|
||||||
- docker/build-push-action@v5
|
|
||||||
Docker:
|
|
||||||
- Buildx
|
|
||||||
- QEMU (for multi-arch)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Cache Information
|
|
||||||
|
|
||||||
### Type
|
|
||||||
- GitHub Actions Cache (`type=gha`)
|
|
||||||
|
|
||||||
### Storage
|
|
||||||
- **Limit**: 10GB per repository
|
|
||||||
- **Retention**: 7 days for unused entries
|
|
||||||
- **Cleanup**: Automatic LRU eviction
|
|
||||||
|
|
||||||
### Performance
|
|
||||||
| Scenario | Cache Hit | Build Time |
|
|
||||||
|----------|-----------|------------|
|
|
||||||
| First build | 0% | 10-15 min |
|
|
||||||
| Code change only | 85% | 1-2 min |
|
|
||||||
| Dependency update | 60% | 3-5 min |
|
|
||||||
| No changes | 100% | 30-60 sec |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Build Platforms
|
|
||||||
|
|
||||||
| Platform | Architecture | Devices |
|
|
||||||
|----------|--------------|---------|
|
|
||||||
| linux/amd64 | x86_64 | Intel/AMD servers, AWS EC2, GCP |
|
|
||||||
| linux/arm64 | aarch64 | Apple Silicon, AWS Graviton, Raspberry Pi |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Version Validation
|
|
||||||
|
|
||||||
### Pre-Tag Checklist
|
|
||||||
```bash
|
|
||||||
# Check current version
|
|
||||||
python -c "from crawl4ai.__version__ import __version__; print(__version__)"
|
|
||||||
|
|
||||||
# Verify it matches intended tag
|
|
||||||
# If tag is v1.2.3, version should be "1.2.3"
|
|
||||||
```
|
|
||||||
|
|
||||||
### Post-Release Verification
|
|
||||||
```bash
|
|
||||||
# PyPI
|
|
||||||
pip install crawl4ai==1.2.3
|
|
||||||
python -c "import crawl4ai; print(crawl4ai.__version__)"
|
|
||||||
|
|
||||||
# Docker
|
|
||||||
docker pull unclecode/crawl4ai:1.2.3
|
|
||||||
docker run unclecode/crawl4ai:1.2.3 python -c "import crawl4ai; print(crawl4ai.__version__)"
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Monitoring URLs
|
|
||||||
|
|
||||||
| Service | URL |
|
|
||||||
|---------|-----|
|
|
||||||
| GitHub Actions | `https://github.com/{owner}/{repo}/actions` |
|
|
||||||
| PyPI Project | `https://pypi.org/project/crawl4ai/` |
|
|
||||||
| Docker Hub | `https://hub.docker.com/r/unclecode/crawl4ai` |
|
|
||||||
| GitHub Releases | `https://github.com/{owner}/{repo}/releases` |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Rollback Strategy
|
|
||||||
|
|
||||||
### PyPI (Cannot Delete)
|
|
||||||
```bash
|
|
||||||
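# Increment patch version: first set crawl4ai/__version__.py to "1.2.4" and commit (the release workflow rejects a tag that does not match the package version)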
|
|
||||||
git tag v1.2.4
|
|
||||||
git push origin v1.2.4
|
|
||||||
```
|
|
||||||
|
|
||||||
### Docker (Can Overwrite)
|
|
||||||
```bash
|
|
||||||
# Rebuild with fix
|
|
||||||
git tag docker-rebuild-v1.2.3
|
|
||||||
git push origin docker-rebuild-v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
### GitHub Release
|
|
||||||
```bash
|
|
||||||
# Delete release
|
|
||||||
gh release delete v1.2.3
|
|
||||||
|
|
||||||
# Delete tag
|
|
||||||
git push --delete origin v1.2.3
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Status Badge Markdown
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
[![Release Pipeline](https://github.com/{owner}/{repo}/actions/workflows/release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/release.yml)
|
|
||||||
|
|
||||||
[![Docker Release](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml)
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Timeline Example
|
|
||||||
|
|
||||||
```
|
|
||||||
0:00 - Push tag v1.2.3
|
|
||||||
0:01 - release.yml starts
|
|
||||||
0:02 - Version validation passes
|
|
||||||
0:03 - Package built
|
|
||||||
0:04 - PyPI upload starts
|
|
||||||
0:06 - PyPI upload complete ✓
|
|
||||||
0:07 - GitHub release created ✓
|
|
||||||
0:08 - release.yml complete
|
|
||||||
0:08 - docker-release.yml triggered
|
|
||||||
0:10 - Docker build starts
|
|
||||||
0:12 - amd64 image built (cache hit)
|
|
||||||
0:14 - arm64 image built (cache hit)
|
|
||||||
0:15 - Images pushed to Docker Hub ✓
|
|
||||||
0:16 - docker-release.yml complete
|
|
||||||
|
|
||||||
Total: ~16 minutes
|
|
||||||
Critical path (PyPI + GitHub): ~8 minutes
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Contact
|
|
||||||
|
|
||||||
For workflow issues:
|
|
||||||
1. Check Actions tab for logs
|
|
||||||
2. Review this reference
|
|
||||||
3. See [README.md](./README.md) for detailed docs
|
|
||||||
79
.github/workflows/release.yml
vendored
79
.github/workflows/release.yml
vendored
@@ -10,53 +10,53 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: write # Required for creating releases
|
contents: write # Required for creating releases
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: '3.12'
|
python-version: '3.12'
|
||||||
|
|
||||||
- name: Extract version from tag
|
- name: Extract version from tag
|
||||||
id: get_version
|
id: get_version
|
||||||
run: |
|
run: |
|
||||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||||
echo "Releasing version: $TAG_VERSION"
|
echo "Releasing version: $TAG_VERSION"
|
||||||
|
|
||||||
- name: Install package dependencies
|
- name: Install package dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install -e .
|
pip install -e .
|
||||||
|
|
||||||
- name: Check version consistency
|
- name: Check version consistency
|
||||||
run: |
|
run: |
|
||||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||||
|
|
||||||
echo "Tag version: $TAG_VERSION"
|
echo "Tag version: $TAG_VERSION"
|
||||||
echo "Package version: $PACKAGE_VERSION"
|
echo "Package version: $PACKAGE_VERSION"
|
||||||
|
|
||||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "✅ Version check passed: $TAG_VERSION"
|
echo "✅ Version check passed: $TAG_VERSION"
|
||||||
|
|
||||||
- name: Install build dependencies
|
- name: Install build dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install build twine
|
pip install build twine
|
||||||
|
|
||||||
- name: Build package
|
- name: Build package
|
||||||
run: python -m build
|
run: python -m build
|
||||||
|
|
||||||
- name: Check package
|
- name: Check package
|
||||||
run: twine check dist/*
|
run: twine check dist/*
|
||||||
|
|
||||||
- name: Upload to PyPI
|
- name: Upload to PyPI
|
||||||
env:
|
env:
|
||||||
TWINE_USERNAME: __token__
|
TWINE_USERNAME: __token__
|
||||||
@@ -65,7 +65,37 @@ jobs:
|
|||||||
echo "📦 Uploading to PyPI..."
|
echo "📦 Uploading to PyPI..."
|
||||||
twine upload dist/*
|
twine upload dist/*
|
||||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Log in to Docker Hub
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKER_TOKEN }}
|
||||||
|
|
||||||
|
- name: Extract major and minor versions
|
||||||
|
id: versions
|
||||||
|
run: |
|
||||||
|
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
|
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||||
|
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||||
|
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||||
|
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Build and push Docker images
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||||
|
unclecode/crawl4ai:latest
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
|
||||||
- name: Create GitHub Release
|
- name: Create GitHub Release
|
||||||
uses: softprops/action-gh-release@v2
|
uses: softprops/action-gh-release@v2
|
||||||
with:
|
with:
|
||||||
@@ -73,29 +103,26 @@ jobs:
|
|||||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||||
body: |
|
body: |
|
||||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||||
|
|
||||||
### 📦 Installation
|
### 📦 Installation
|
||||||
|
|
||||||
**PyPI:**
|
**PyPI:**
|
||||||
```bash
|
```bash
|
||||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Docker:**
|
**Docker:**
|
||||||
```bash
|
```bash
|
||||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
**Note:** Docker images are being built and will be available shortly.
|
|
||||||
Check the [Docker Release workflow](https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml) for build status.
|
|
||||||
|
|
||||||
### 📝 What's Changed
|
### 📝 What's Changed
|
||||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||||
draft: false
|
draft: false
|
||||||
prerelease: false
|
prerelease: false
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Summary
|
- name: Summary
|
||||||
run: |
|
run: |
|
||||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||||
@@ -105,9 +132,11 @@ jobs:
|
|||||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "Docker images are being built in a separate workflow." >> $GITHUB_STEP_SUMMARY
|
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "Check: https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml" >> $GITHUB_STEP_SUMMARY
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||||
142
.github/workflows/release.yml.backup
vendored
142
.github/workflows/release.yml.backup
vendored
@@ -1,142 +0,0 @@
|
|||||||
name: Release Pipeline
|
|
||||||
on:
|
|
||||||
push:
|
|
||||||
tags:
|
|
||||||
- 'v*'
|
|
||||||
- '!test-v*' # Exclude test tags
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
release:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
permissions:
|
|
||||||
contents: write # Required for creating releases
|
|
||||||
|
|
||||||
steps:
|
|
||||||
- name: Checkout code
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: '3.12'
|
|
||||||
|
|
||||||
- name: Extract version from tag
|
|
||||||
id: get_version
|
|
||||||
run: |
|
|
||||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
|
||||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
|
||||||
echo "Releasing version: $TAG_VERSION"
|
|
||||||
|
|
||||||
- name: Install package dependencies
|
|
||||||
run: |
|
|
||||||
pip install -e .
|
|
||||||
|
|
||||||
- name: Check version consistency
|
|
||||||
run: |
|
|
||||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
|
||||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
|
||||||
|
|
||||||
echo "Tag version: $TAG_VERSION"
|
|
||||||
echo "Package version: $PACKAGE_VERSION"
|
|
||||||
|
|
||||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
|
||||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
|
||||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "✅ Version check passed: $TAG_VERSION"
|
|
||||||
|
|
||||||
- name: Install build dependencies
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install build twine
|
|
||||||
|
|
||||||
- name: Build package
|
|
||||||
run: python -m build
|
|
||||||
|
|
||||||
- name: Check package
|
|
||||||
run: twine check dist/*
|
|
||||||
|
|
||||||
- name: Upload to PyPI
|
|
||||||
env:
|
|
||||||
TWINE_USERNAME: __token__
|
|
||||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
|
||||||
run: |
|
|
||||||
echo "📦 Uploading to PyPI..."
|
|
||||||
twine upload dist/*
|
|
||||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
|
||||||
password: ${{ secrets.DOCKER_TOKEN }}
|
|
||||||
|
|
||||||
- name: Extract major and minor versions
|
|
||||||
id: versions
|
|
||||||
run: |
|
|
||||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
|
||||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
|
||||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
|
||||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
|
||||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Build and push Docker images
|
|
||||||
uses: docker/build-push-action@v5
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: true
|
|
||||||
tags: |
|
|
||||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
|
||||||
unclecode/crawl4ai:latest
|
|
||||||
platforms: linux/amd64,linux/arm64
|
|
||||||
|
|
||||||
- name: Create GitHub Release
|
|
||||||
uses: softprops/action-gh-release@v2
|
|
||||||
with:
|
|
||||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
|
||||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
|
||||||
body: |
|
|
||||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
|
||||||
|
|
||||||
### 📦 Installation
|
|
||||||
|
|
||||||
**PyPI:**
|
|
||||||
```bash
|
|
||||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Docker:**
|
|
||||||
```bash
|
|
||||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
|
||||||
docker pull unclecode/crawl4ai:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
### 📝 What's Changed
|
|
||||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
|
||||||
draft: false
|
|
||||||
prerelease: false
|
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|
||||||
- name: Summary
|
|
||||||
run: |
|
|
||||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
|
||||||
15
.gitignore
vendored
15
.gitignore
vendored
@@ -266,11 +266,11 @@ continue_config.json
|
|||||||
.llm.env
|
.llm.env
|
||||||
.private/
|
.private/
|
||||||
|
|
||||||
.claude/
|
|
||||||
|
|
||||||
CLAUDE_MONITOR.md
|
CLAUDE_MONITOR.md
|
||||||
CLAUDE.md
|
CLAUDE.md
|
||||||
|
|
||||||
|
.claude/
|
||||||
|
|
||||||
tests/**/test_site
|
tests/**/test_site
|
||||||
tests/**/reports
|
tests/**/reports
|
||||||
tests/**/benchmark_reports
|
tests/**/benchmark_reports
|
||||||
@@ -282,3 +282,14 @@ docs/apps/linkdin/debug*/
|
|||||||
docs/apps/linkdin/samples/insights/*
|
docs/apps/linkdin/samples/insights/*
|
||||||
|
|
||||||
scripts/
|
scripts/
|
||||||
|
|
||||||
|
|
||||||
|
# Database files
|
||||||
|
*.sqlite3
|
||||||
|
*.sqlite3-journal
|
||||||
|
*.db-journal
|
||||||
|
*.db-wal
|
||||||
|
*.db-shm
|
||||||
|
*.db
|
||||||
|
*.rdb
|
||||||
|
*.ldb
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
FROM python:3.12-slim-bookworm AS build
|
FROM python:3.12-slim-bookworm AS build
|
||||||
|
|
||||||
# C4ai version
|
# C4ai version
|
||||||
ARG C4AI_VER=0.7.6
|
ARG C4AI_VER=0.7.0-r1
|
||||||
ENV C4AI_VERSION=$C4AI_VER
|
ENV C4AI_VERSION=$C4AI_VER
|
||||||
LABEL c4ai.version=$C4AI_VER
|
LABEL c4ai.version=$C4AI_VER
|
||||||
|
|
||||||
|
|||||||
88
README.md
88
README.md
@@ -27,13 +27,11 @@
|
|||||||
|
|
||||||
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
||||||
|
|
||||||
[✨ Check out latest update v0.7.6](#-recent-updates)
|
[✨ Check out latest update v0.7.4](#-recent-updates)
|
||||||
|
|
||||||
✨ **New in v0.7.6**: Complete Webhook Infrastructure for Docker Job Queue API! Real-time notifications for both `/crawl/job` and `/llm/job` endpoints with exponential backoff retry, custom headers, and flexible delivery modes. No more polling! [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.6.md)
|
✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
||||||
|
|
||||||
✨ Recent v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
|
||||||
|
|
||||||
✨ Previous v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||||
@@ -179,7 +177,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g
|
|||||||
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
||||||
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
||||||
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
||||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string and function-based APIs).
|
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
||||||
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
||||||
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
||||||
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
||||||
@@ -546,54 +544,6 @@ async def test_news_crawl():
|
|||||||
|
|
||||||
## ✨ Recent Updates
|
## ✨ Recent Updates
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary><strong>Version 0.7.5 Release Highlights - The Docker Hooks & Security Update</strong></summary>
|
|
||||||
|
|
||||||
- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points
|
|
||||||
- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support:
|
|
||||||
```python
|
|
||||||
from crawl4ai import hooks_to_string
|
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
# Define hooks as regular Python functions
|
|
||||||
async def on_page_context_created(page, context, **kwargs):
|
|
||||||
"""Block images to speed up crawling"""
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
|
||||||
return page
|
|
||||||
|
|
||||||
async def before_goto(page, context, url, **kwargs):
|
|
||||||
"""Add custom headers"""
|
|
||||||
await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'})
|
|
||||||
return page
|
|
||||||
|
|
||||||
# Option 1: Use hooks_to_string() utility for REST API
|
|
||||||
hooks_code = hooks_to_string({
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_goto": before_goto
|
|
||||||
})
|
|
||||||
|
|
||||||
# Option 2: Docker client with automatic conversion (Recommended)
|
|
||||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
|
||||||
results = await client.crawl(
|
|
||||||
urls=["https://httpbin.org/html"],
|
|
||||||
hooks={
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_goto": before_goto
|
|
||||||
}
|
|
||||||
)
|
|
||||||
# ✓ Full IDE support, type checking, and reusability!
|
|
||||||
```
|
|
||||||
|
|
||||||
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
|
||||||
- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True`
|
|
||||||
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
|
||||||
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
|
||||||
|
|
||||||
[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
||||||
|
|
||||||
@@ -969,36 +919,6 @@ We envision a future where AI is powered by real human knowledge, ensuring data
|
|||||||
For more details, see our [full mission statement](./MISSION.md).
|
For more details, see our [full mission statement](./MISSION.md).
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
## 🌟 Current Sponsors
|
|
||||||
|
|
||||||
### 🏢 Enterprise Sponsors & Partners
|
|
||||||
|
|
||||||
Our enterprise sponsors and technology partners help scale Crawl4AI to power production-grade data pipelines.
|
|
||||||
|
|
||||||
| Company | About | Sponsorship Tier |
|
|
||||||
|------|------|----------------------------|
|
|
||||||
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥈 Silver |
|
|
||||||
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
|
|
||||||
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
|
|
||||||
| <a href="https://www.alephnull.sg/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013050323_a9e8e8c4c3650421.svg" alt="Aleph null" width="120"/></a> | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education—empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
|
|
||||||
|
|
||||||
### 🧑🤝 Individual Sponsors
|
|
||||||
|
|
||||||
A heartfelt thanks to our individual supporters! Every contribution helps us keep our open-source mission alive and thriving!
|
|
||||||
|
|
||||||
<p align="left">
|
|
||||||
<a href="https://github.com/hafezparast"><img src="https://avatars.githubusercontent.com/u/14273305?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/ntohidi"><img src="https://avatars.githubusercontent.com/u/17140097?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/Sjoeborg"><img src="https://avatars.githubusercontent.com/u/17451310?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/romek-rozen"><img src="https://avatars.githubusercontent.com/u/30595969?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/Kourosh-Kiyani"><img src="https://avatars.githubusercontent.com/u/34105600?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/Etherdrake"><img src="https://avatars.githubusercontent.com/u/67021215?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/shaman247"><img src="https://avatars.githubusercontent.com/u/211010067?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
<a href="https://github.com/work-flow-manager"><img src="https://avatars.githubusercontent.com/u/217665461?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
|
||||||
</p>
|
|
||||||
|
|
||||||
> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode)
|
|
||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||||
|
|||||||
@@ -103,8 +103,7 @@ from .browser_adapter import (
|
|||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
start_colab_display_server,
|
start_colab_display_server,
|
||||||
setup_colab_environment,
|
setup_colab_environment
|
||||||
hooks_to_string
|
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -184,7 +183,6 @@ __all__ = [
|
|||||||
"ProxyConfig",
|
"ProxyConfig",
|
||||||
"start_colab_display_server",
|
"start_colab_display_server",
|
||||||
"setup_colab_environment",
|
"setup_colab_environment",
|
||||||
"hooks_to_string",
|
|
||||||
# C4A Script additions
|
# C4A Script additions
|
||||||
"c4a_compile",
|
"c4a_compile",
|
||||||
"c4a_validate",
|
"c4a_validate",
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.6"
|
__version__ = "0.7.4"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
import json
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
@@ -7,7 +7,6 @@ import asyncio
|
|||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .async_logger import AsyncLogger, LogLevel
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
from .utils import hooks_to_string
|
|
||||||
|
|
||||||
|
|
||||||
class Crawl4aiClientError(Exception):
|
class Crawl4aiClientError(Exception):
|
||||||
@@ -71,41 +70,17 @@ class Crawl4aiDockerClient:
|
|||||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||||
|
|
||||||
def _prepare_request(
|
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
||||||
self,
|
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
||||||
urls: List[str],
|
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
|
||||||
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
|
||||||
hooks_timeout: int = 30
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""Prepare request data from configs."""
|
"""Prepare request data from configs."""
|
||||||
if self._token:
|
if self._token:
|
||||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||||
|
return {
|
||||||
request_data = {
|
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
"browser_config": browser_config.dump() if browser_config else {},
|
"browser_config": browser_config.dump() if browser_config else {},
|
||||||
"crawler_config": crawler_config.dump() if crawler_config else {}
|
"crawler_config": crawler_config.dump() if crawler_config else {}
|
||||||
}
|
}
|
||||||
|
|
||||||
# Handle hooks if provided
|
|
||||||
if hooks:
|
|
||||||
# Check if hooks are already strings or need conversion
|
|
||||||
if any(callable(v) for v in hooks.values()):
|
|
||||||
# Convert function objects to strings
|
|
||||||
hooks_code = hooks_to_string(hooks)
|
|
||||||
else:
|
|
||||||
# Already in string format
|
|
||||||
hooks_code = hooks
|
|
||||||
|
|
||||||
request_data["hooks"] = {
|
|
||||||
"code": hooks_code,
|
|
||||||
"timeout": hooks_timeout
|
|
||||||
}
|
|
||||||
|
|
||||||
return request_data
|
|
||||||
|
|
||||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||||
"""Make an HTTP request with error handling."""
|
"""Make an HTTP request with error handling."""
|
||||||
url = urljoin(self.base_url, endpoint)
|
url = urljoin(self.base_url, endpoint)
|
||||||
@@ -127,42 +102,16 @@ class Crawl4aiDockerClient:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
crawler_config: Optional[CrawlerRunConfig] = None
|
||||||
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
|
||||||
hooks_timeout: int = 30
|
|
||||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""
|
"""Execute a crawl operation."""
|
||||||
Execute a crawl operation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
urls: List of URLs to crawl
|
|
||||||
browser_config: Browser configuration
|
|
||||||
crawler_config: Crawler configuration
|
|
||||||
hooks: Optional hooks - can be either:
|
|
||||||
- Dict[str, Callable]: Function objects that will be converted to strings
|
|
||||||
- Dict[str, str]: Already stringified hook code
|
|
||||||
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Single CrawlResult, list of results, or async generator for streaming
|
|
||||||
|
|
||||||
Example with function hooks:
|
|
||||||
>>> async def my_hook(page, context, **kwargs):
|
|
||||||
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
|
||||||
... return page
|
|
||||||
>>>
|
|
||||||
>>> result = await client.crawl(
|
|
||||||
... ["https://example.com"],
|
|
||||||
... hooks={"on_page_context_created": my_hook}
|
|
||||||
... )
|
|
||||||
"""
|
|
||||||
await self._check_server()
|
await self._check_server()
|
||||||
|
|
||||||
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||||
is_streaming = crawler_config and crawler_config.stream
|
is_streaming = crawler_config and crawler_config.stream
|
||||||
|
|
||||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||||
|
|
||||||
if is_streaming:
|
if is_streaming:
|
||||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||||
@@ -179,12 +128,12 @@ class Crawl4aiDockerClient:
|
|||||||
else:
|
else:
|
||||||
yield CrawlResult(**result)
|
yield CrawlResult(**result)
|
||||||
return stream_results()
|
return stream_results()
|
||||||
|
|
||||||
response = await self._request("POST", "/crawl", json=data)
|
response = await self._request("POST", "/crawl", json=data)
|
||||||
result_data = response.json()
|
result_data = response.json()
|
||||||
if not result_data.get("success", False):
|
if not result_data.get("success", False):
|
||||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||||
|
|
||||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||||
return results[0] if len(results) == 1 else results
|
return results[0] if len(results) == 1 else results
|
||||||
|
|||||||
@@ -47,7 +47,6 @@ from urllib.parse import (
|
|||||||
urljoin, urlparse, urlunparse,
|
urljoin, urlparse, urlunparse,
|
||||||
parse_qsl, urlencode, quote, unquote
|
parse_qsl, urlencode, quote, unquote
|
||||||
)
|
)
|
||||||
import inspect
|
|
||||||
|
|
||||||
|
|
||||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||||
@@ -3530,52 +3529,4 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
|||||||
available_gb = get_true_available_memory_gb()
|
available_gb = get_true_available_memory_gb()
|
||||||
used_percent = get_true_memory_usage_percent()
|
used_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
return used_percent, available_gb, total_gb
|
return used_percent, available_gb, total_gb
|
||||||
|
|
||||||
|
|
||||||
# Hook utilities for Docker API
|
|
||||||
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
    """
    Convert hook function objects to string representations for Docker API.

    This utility simplifies the process of using hooks with the Docker API by converting
    Python function objects into the string format required by the API.

    Args:
        hooks: Dictionary mapping hook point names to Python function objects.
            Functions should be async and follow hook signature requirements.

    Returns:
        Dictionary mapping hook point names to string representations of the functions.
        An empty input dictionary yields an empty result.

    Example:
        >>> async def my_hook(page, context, **kwargs):
        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
        ...     return page
        >>>
        >>> hooks_dict = {"on_page_context_created": my_hook}
        >>> api_hooks = hooks_to_string(hooks_dict)
        >>> # api_hooks is now ready to use with Docker API

    Raises:
        ValueError: If a hook is not callable or source cannot be extracted.
            When extraction fails, the underlying OSError/TypeError raised by
            ``inspect.getsource`` is chained as ``__cause__``.
    """
    result = {}

    for hook_name, hook_func in hooks.items():
        if not callable(hook_func):
            raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")

        # Only the source lookup can raise OSError/TypeError, so keep the try
        # body to that single call.
        try:
            source = inspect.getsource(hook_func)
        except (OSError, TypeError) as e:
            # Chain the original error so callers can see why inspect failed.
            raise ValueError(
                f"Cannot extract source code for hook '{hook_name}'. "
                f"Make sure the function is defined in a file (not interactively). Error: {e}"
            ) from e

        # Remove any leading indentation (e.g. nested functions or methods)
        # so the stored source starts at column 0.
        result[hook_name] = textwrap.dedent(source)

    return result
|
|
||||||
1149
deploy/docker/ARCHITECTURE.md
Normal file
1149
deploy/docker/ARCHITECTURE.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -12,7 +12,6 @@
|
|||||||
- [Python SDK](#python-sdk)
|
- [Python SDK](#python-sdk)
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
- [REST API Examples](#rest-api-examples)
|
- [REST API Examples](#rest-api-examples)
|
||||||
- [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks)
|
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
@@ -59,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
|
|||||||
|
|
||||||
#### 1. Pull the Image
|
#### 1. Pull the Image
|
||||||
|
|
||||||
Our latest stable release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||||
|
|
||||||
|
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull the latest stable version (0.7.6)
|
# Pull the release candidate (for testing new features)
|
||||||
docker pull unclecode/crawl4ai:0.7.6
|
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||||
|
|
||||||
# Or use the latest tag (points to 0.7.6)
|
# Or pull the current stable version (0.6.0)
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -100,7 +101,7 @@ EOL
|
|||||||
-p 11235:11235 \
|
-p 11235:11235 \
|
||||||
--name crawl4ai \
|
--name crawl4ai \
|
||||||
--shm-size=1g \
|
--shm-size=1g \
|
||||||
unclecode/crawl4ai:0.7.6
|
unclecode/crawl4ai:0.7.0-r1
|
||||||
```
|
```
|
||||||
|
|
||||||
* **With LLM support:**
|
* **With LLM support:**
|
||||||
@@ -111,7 +112,7 @@ EOL
|
|||||||
--name crawl4ai \
|
--name crawl4ai \
|
||||||
--env-file .llm.env \
|
--env-file .llm.env \
|
||||||
--shm-size=1g \
|
--shm-size=1g \
|
||||||
unclecode/crawl4ai:0.7.6
|
unclecode/crawl4ai:0.7.0-r1
|
||||||
```
|
```
|
||||||
|
|
||||||
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
||||||
@@ -184,7 +185,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
|||||||
```bash
|
```bash
|
||||||
# Pulls and runs the release candidate from Docker Hub
|
# Pulls and runs the release candidate from Docker Hub
|
||||||
# Automatically selects the correct architecture
|
# Automatically selects the correct architecture
|
||||||
IMAGE=unclecode/crawl4ai:0.7.6 docker compose up -d
|
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
||||||
```
|
```
|
||||||
|
|
||||||
* **Build and Run Locally:**
|
* **Build and Run Locally:**
|
||||||
@@ -647,146 +648,6 @@ async def test_stream_crawl(token: str = None): # Made token optional
|
|||||||
# asyncio.run(test_stream_crawl())
|
# asyncio.run(test_stream_crawl())
|
||||||
```
|
```
|
||||||
|
|
||||||
### Asynchronous Jobs with Webhooks
|
|
||||||
|
|
||||||
For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete.
|
|
||||||
|
|
||||||
#### Why Use Jobs & Webhooks?
|
|
||||||
|
|
||||||
- **No Polling Required** - Get notified when crawls complete instead of constantly checking status
|
|
||||||
- **Better Resource Usage** - Free up client connections while jobs run in the background
|
|
||||||
- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices
|
|
||||||
- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
|
||||||
|
|
||||||
#### How It Works
|
|
||||||
|
|
||||||
1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config`
|
|
||||||
2. **Get Task ID** → Receive a `task_id` immediately
|
|
||||||
3. **Job Runs** → Crawl executes in the background
|
|
||||||
4. **Webhook Fired** → Server POSTs completion notification to your webhook URL
|
|
||||||
5. **Fetch Results** → If data wasn't included in webhook, GET `/crawl/job/{task_id}`
|
|
||||||
|
|
||||||
#### Quick Example
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Submit a crawl job with webhook notification
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": false
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
|
|
||||||
# Response: {"task_id": "crawl_a1b2c3d4"}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Your webhook receives:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Then fetch the results:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Include Data in Webhook
|
|
||||||
|
|
||||||
Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Your webhook receives the complete data:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"data": {
|
|
||||||
"markdown": "...",
|
|
||||||
"html": "...",
|
|
||||||
"links": {...},
|
|
||||||
"metadata": {...}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Webhook Authentication
|
|
||||||
|
|
||||||
Add custom headers for authentication:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl",
|
|
||||||
"webhook_data_in_payload": false,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token",
|
|
||||||
"X-Service-ID": "crawl4ai-prod"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Global Default Webhook
|
|
||||||
|
|
||||||
Configure a default webhook URL in `config.yml` for all jobs:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: "https://myapp.com/webhooks/default"
|
|
||||||
data_in_payload: false
|
|
||||||
retry:
|
|
||||||
max_attempts: 5
|
|
||||||
initial_delay_ms: 1000
|
|
||||||
max_delay_ms: 32000
|
|
||||||
timeout_ms: 30000
|
|
||||||
```
|
|
||||||
|
|
||||||
Now jobs without `webhook_config` automatically use the default webhook.
|
|
||||||
|
|
||||||
#### Job Status Polling (Without Webhooks)
|
|
||||||
|
|
||||||
If you prefer polling instead of webhooks, just omit `webhook_config`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Submit job
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{"urls": ["https://example.com"]}'
|
|
||||||
# Response: {"task_id": "crawl_xyz"}
|
|
||||||
|
|
||||||
# Poll for status
|
|
||||||
curl http://localhost:11235/crawl/job/crawl_xyz
|
|
||||||
```
|
|
||||||
|
|
||||||
The response includes a `status` field: `"processing"`, `"completed"`, or `"failed"`.
|
|
||||||
|
|
||||||
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Metrics & Monitoring
|
## Metrics & Monitoring
|
||||||
@@ -965,11 +826,10 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK
|
||||||
- Asynchronous job queues with webhook notifications
|
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|||||||
241
deploy/docker/STRESS_TEST_PIPELINE.md
Normal file
241
deploy/docker/STRESS_TEST_PIPELINE.md
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
# Crawl4AI Docker Memory & Pool Optimization - Implementation Log
|
||||||
|
|
||||||
|
## Critical Issues Identified
|
||||||
|
|
||||||
|
### Memory Management
|
||||||
|
- **Host vs Container**: `psutil.virtual_memory()` reported host memory, not container limits
|
||||||
|
- **Browser Pooling**: No pool reuse - every endpoint created new browsers
|
||||||
|
- **Warmup Waste**: Permanent browser sat idle with mismatched config signature
|
||||||
|
- **Idle Cleanup**: 30min TTL too long, janitor ran every 60s
|
||||||
|
- **Endpoint Inconsistency**: 75% of endpoints bypassed pool (`/md`, `/html`, `/screenshot`, `/pdf`, `/execute_js`, `/llm`)
|
||||||
|
|
||||||
|
### Pool Design Flaws
|
||||||
|
- **Config Mismatch**: Permanent browser used `config.yml` args, endpoints used empty `BrowserConfig()`
|
||||||
|
- **Logging Level**: Pool hit markers at DEBUG, invisible with INFO logging
|
||||||
|
|
||||||
|
## Implementation Changes
|
||||||
|
|
||||||
|
### 1. Container-Aware Memory Detection (`utils.py`)
|
||||||
|
```python
|
||||||
|
def get_container_memory_percent() -> float:
|
||||||
|
# Try cgroup v2 → v1 → fallback to psutil
|
||||||
|
# Reads /sys/fs/cgroup/memory.{current,max} OR memory/memory.{usage,limit}_in_bytes
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Smart Browser Pool (`crawler_pool.py`)
|
||||||
|
**3-Tier System:**
|
||||||
|
- **PERMANENT**: Always-ready default browser (never cleaned)
|
||||||
|
- **HOT_POOL**: Configs used 3+ times (longer TTL)
|
||||||
|
- **COLD_POOL**: New/rare configs (short TTL)
|
||||||
|
|
||||||
|
**Key Functions:**
|
||||||
|
- `get_crawler(cfg)`: Check permanent → hot → cold → create new
|
||||||
|
- `init_permanent(cfg)`: Initialize permanent at startup
|
||||||
|
- `janitor()`: Adaptive cleanup (10s/30s/60s intervals based on memory)
|
||||||
|
- `_sig(cfg)`: SHA1 hash of config dict for pool keys
|
||||||
|
|
||||||
|
**Logging Fix**: Changed `logger.debug()` → `logger.info()` for pool hits
|
||||||
|
|
||||||
|
### 3. Endpoint Unification
|
||||||
|
**Helper Function** (`server.py`):
|
||||||
|
```python
|
||||||
|
def get_default_browser_config() -> BrowserConfig:
|
||||||
|
return BrowserConfig(
|
||||||
|
extra_args=config["crawler"]["browser"].get("extra_args", []),
|
||||||
|
**config["crawler"]["browser"].get("kwargs", {}),
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Migrated Endpoints:**
|
||||||
|
- `/html`, `/screenshot`, `/pdf`, `/execute_js` → use `get_default_browser_config()`
|
||||||
|
- `handle_llm_qa()`, `handle_markdown_request()` → same
|
||||||
|
|
||||||
|
**Result**: All endpoints now hit permanent browser pool
|
||||||
|
|
||||||
|
### 4. Config Updates (`config.yml`)
|
||||||
|
- `idle_ttl_sec: 1800` → `300` (30min → 5min base TTL)
|
||||||
|
- `port: 11234` → `11235` (fixed mismatch with Gunicorn)
|
||||||
|
|
||||||
|
### 5. Lifespan Fix (`server.py`)
|
||||||
|
```python
|
||||||
|
await init_permanent(BrowserConfig(
|
||||||
|
extra_args=config["crawler"]["browser"].get("extra_args", []),
|
||||||
|
**config["crawler"]["browser"].get("kwargs", {}),
|
||||||
|
))
|
||||||
|
```
|
||||||
|
Permanent browser now matches endpoint config signatures
|
||||||
|
|
||||||
|
## Test Results
|
||||||
|
|
||||||
|
### Test 1: Basic Health
|
||||||
|
- 10 requests to `/health`
|
||||||
|
- **Result**: 100% success, avg 3ms latency
|
||||||
|
- **Baseline**: Container starts in ~5s, 270 MB idle
|
||||||
|
|
||||||
|
### Test 2: Memory Monitoring
|
||||||
|
- 20 requests with Docker stats tracking
|
||||||
|
- **Result**: 100% success, no memory leak (-0.2 MB delta)
|
||||||
|
- **Baseline**: 269.7 MB container overhead
|
||||||
|
|
||||||
|
### Test 3: Pool Validation
|
||||||
|
- 30 requests to `/html` endpoint
|
||||||
|
- **Result**: **100% permanent browser hits**, 0 new browsers created
|
||||||
|
- **Memory**: 287 MB baseline → 396 MB active (+109 MB)
|
||||||
|
- **Latency**: Avg 4s (includes network to httpbin.org)
|
||||||
|
|
||||||
|
### Test 4: Concurrent Load
|
||||||
|
- Light (10) → Medium (50) → Heavy (100) concurrent
|
||||||
|
- **Total**: 320 requests
|
||||||
|
- **Result**: 100% success, **320/320 permanent hits**, 0 new browsers
|
||||||
|
- **Memory**: 269 MB → peak 1533 MB → final 993 MB
|
||||||
|
- **Latency**: P99 at 100 concurrent = 34s (expected with single browser)
|
||||||
|
|
||||||
|
### Test 5: Pool Stress (Mixed Configs)
|
||||||
|
- 20 requests with 4 different viewport configs
|
||||||
|
- **Result**: 4 new browsers, 4 cold hits, **4 promotions to hot**, 8 hot hits
|
||||||
|
- **Reuse Rate**: 60% (12 pool hits / 20 requests)
|
||||||
|
- **Memory**: 270 MB → 928 MB peak (+658 MB = ~165 MB per browser)
|
||||||
|
- **Proves**: Cold → hot promotion at 3 uses working perfectly
|
||||||
|
|
||||||
|
### Test 6: Multi-Endpoint
|
||||||
|
- 10 requests each: `/html`, `/screenshot`, `/pdf`, `/crawl`
|
||||||
|
- **Result**: 100% success across all 4 endpoints
|
||||||
|
- **Latency**: 5-8s avg (PDF slowest at 7.2s)
|
||||||
|
|
||||||
|
### Test 7: Cleanup Verification
|
||||||
|
- 20 requests (load spike) → 90s idle
|
||||||
|
- **Memory**: 269 MB → peak 1107 MB → final 780 MB
|
||||||
|
- **Recovery**: 327 MB freed (39% of the 838 MB growth above the 269 MB baseline) - partial cleanup
|
||||||
|
- **Note**: Hot pool browsers persist (by design), janitor working correctly
|
||||||
|
|
||||||
|
## Performance Metrics
|
||||||
|
|
||||||
|
| Metric | Before | After | Improvement |
|
||||||
|
|--------|--------|-------|-------------|
|
||||||
|
| Pool Reuse | 0% | 100% (default config) | ∞ |
|
||||||
|
| Memory Leak | Unknown | 0 MB/cycle | Stable |
|
||||||
|
| Browser Reuse | No | Yes | ~3-5s saved per request |
|
||||||
|
| Idle Memory | 500-700 MB × N | 270-400 MB | 10x reduction |
|
||||||
|
| Concurrent Capacity | ~20 | 100+ | 5x |
|
||||||
|
|
||||||
|
## Key Learnings
|
||||||
|
|
||||||
|
1. **Config Signature Matching**: Permanent browser MUST match endpoint default config exactly (SHA1 hash)
|
||||||
|
2. **Logging Levels**: Pool diagnostics need INFO level, not DEBUG
|
||||||
|
3. **Memory in Docker**: Must read cgroup files, not host metrics
|
||||||
|
4. **Janitor Timing**: 60s interval adequate, but TTLs should be short (5min) for cold pool
|
||||||
|
5. **Hot Promotion**: 3-use threshold works well for production patterns
|
||||||
|
6. **Memory Per Browser**: ~150-200 MB per Chromium instance with headless + text_mode
|
||||||
|
|
||||||
|
## Test Infrastructure
|
||||||
|
|
||||||
|
**Location**: `deploy/docker/tests/`
|
||||||
|
**Dependencies**: `httpx`, `docker` (Python SDK)
|
||||||
|
**Pattern**: Sequential build - each test adds one capability
|
||||||
|
|
||||||
|
**Files**:
|
||||||
|
- `test_1_basic.py`: Health check + container lifecycle
|
||||||
|
- `test_2_memory.py`: + Docker stats monitoring
|
||||||
|
- `test_3_pool.py`: + Log analysis for pool markers
|
||||||
|
- `test_4_concurrent.py`: + asyncio.Semaphore for concurrency control
|
||||||
|
- `test_5_pool_stress.py`: + Config variants (viewports)
|
||||||
|
- `test_6_multi_endpoint.py`: + Multiple endpoint testing
|
||||||
|
- `test_7_cleanup.py`: + Time-series memory tracking for janitor
|
||||||
|
|
||||||
|
**Run Pattern**:
|
||||||
|
```bash
|
||||||
|
cd deploy/docker/tests
|
||||||
|
pip install -r requirements.txt
|
||||||
|
# Rebuild after code changes:
|
||||||
|
cd /path/to/repo && docker buildx build -t crawl4ai-local:latest --load .
|
||||||
|
# Run test:
|
||||||
|
python test_N_name.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture Decisions
|
||||||
|
|
||||||
|
**Why Permanent Browser?**
|
||||||
|
- 90% of requests use default config → single browser serves most traffic
|
||||||
|
- Eliminates 3-5s startup overhead per request
|
||||||
|
|
||||||
|
**Why 3-Tier Pool?**
|
||||||
|
- Permanent: Zero cost for common case
|
||||||
|
- Hot: Amortized cost for frequent variants
|
||||||
|
- Cold: Lazy allocation for rare configs
|
||||||
|
|
||||||
|
**Why Adaptive Janitor?**
|
||||||
|
- Memory pressure triggers aggressive cleanup
|
||||||
|
- Low memory usage allows longer TTLs for better reuse
|
||||||
|
|
||||||
|
**Why Not Close After Each Request?**
|
||||||
|
- Browser startup: 3-5s overhead
|
||||||
|
- Pool reuse: <100ms overhead
|
||||||
|
- Net: 30-50x faster
|
||||||
|
|
||||||
|
## Future Optimizations
|
||||||
|
|
||||||
|
1. **Request Queuing**: When at capacity, queue instead of reject
|
||||||
|
2. **Pre-warming**: Predict common configs, pre-create browsers
|
||||||
|
3. **Metrics Export**: Prometheus metrics for pool efficiency
|
||||||
|
4. **Config Normalization**: Group similar viewports (e.g., 1920±50 → 1920)
|
||||||
|
|
||||||
|
## Critical Code Paths
|
||||||
|
|
||||||
|
**Browser Acquisition** (`crawler_pool.py:34-78`):
|
||||||
|
```
|
||||||
|
get_crawler(cfg) →
|
||||||
|
_sig(cfg) →
|
||||||
|
if sig == DEFAULT_CONFIG_SIG → PERMANENT
|
||||||
|
elif sig in HOT_POOL → HOT_POOL[sig]
|
||||||
|
elif sig in COLD_POOL → promote if count >= 3
|
||||||
|
else → create new in COLD_POOL
|
||||||
|
```
|
||||||
|
|
||||||
|
**Janitor Loop** (`crawler_pool.py:107-146`):
|
||||||
|
```
|
||||||
|
while True:
|
||||||
|
mem% = get_container_memory_percent()
|
||||||
|
if mem% > 80: interval=10s, cold_ttl=30s
|
||||||
|
elif mem% > 60: interval=30s, cold_ttl=60s
|
||||||
|
else: interval=60s, cold_ttl=300s
|
||||||
|
sleep(interval)
|
||||||
|
close idle browsers (COLD then HOT)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Endpoint Pattern** (`server.py` example):
|
||||||
|
```python
|
||||||
|
@app.post("/html")
|
||||||
|
async def generate_html(...):
|
||||||
|
from crawler_pool import get_crawler
|
||||||
|
crawler = await get_crawler(get_default_browser_config())
|
||||||
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
|
# No crawler.close() - returned to pool
|
||||||
|
```
|
||||||
|
|
||||||
|
## Debugging Tips
|
||||||
|
|
||||||
|
**Check Pool Activity**:
|
||||||
|
```bash
|
||||||
|
docker logs crawl4ai-test | grep -E "(🔥|♨️|❄️|🆕|⬆️)"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Verify Config Signature**:
|
||||||
|
```python
|
||||||
|
from crawl4ai import BrowserConfig
|
||||||
|
import json, hashlib
|
||||||
|
cfg = BrowserConfig(...)
|
||||||
|
sig = hashlib.sha1(json.dumps(cfg.to_dict(), sort_keys=True).encode()).hexdigest()
|
||||||
|
print(sig[:8]) # Compare with logs
|
||||||
|
```
|
||||||
|
|
||||||
|
**Monitor Memory**:
|
||||||
|
```bash
|
||||||
|
docker stats crawl4ai-test
|
||||||
|
```
|
||||||
|
|
||||||
|
## Known Limitations
|
||||||
|
|
||||||
|
- **Mac Docker Stats**: CPU metrics unreliable, memory works
|
||||||
|
- **PDF Generation**: Slowest endpoint (~7s), no optimization yet
|
||||||
|
- **Hot Pool Persistence**: May hold memory longer than needed (trade-off for performance)
|
||||||
|
- **Janitor Lag**: Up to 60s before cleanup triggers when memory usage is low (the janitor sleeps 60s between passes at or below 60% memory)
|
||||||
@@ -1,378 +0,0 @@
|
|||||||
# Webhook Feature Examples
|
|
||||||
|
|
||||||
This document provides examples of how to use the webhook feature for crawl jobs in Crawl4AI.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The webhook feature allows you to receive notifications when crawl jobs complete, eliminating the need for polling. Webhooks are sent with exponential backoff retry logic to ensure reliable delivery.
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
### Global Configuration (config.yml)
|
|
||||||
|
|
||||||
You can configure default webhook settings in `config.yml`:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: null # Optional: default webhook URL for all jobs
|
|
||||||
data_in_payload: false # Optional: default behavior for including data
|
|
||||||
retry:
|
|
||||||
max_attempts: 5
|
|
||||||
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
|
||||||
max_delay_ms: 32000
|
|
||||||
timeout_ms: 30000 # 30s timeout per webhook call
|
|
||||||
headers: # Optional: default headers to include
|
|
||||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
|
||||||
```
|
|
||||||
|
|
||||||
## API Usage Examples
|
|
||||||
|
|
||||||
### Example 1: Basic Webhook (Notification Only)
|
|
||||||
|
|
||||||
Send a webhook notification without including the crawl data in the payload.
|
|
||||||
|
|
||||||
**Request:**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": false
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Webhook Payload Received:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Your webhook handler should then fetch the results:
|
|
||||||
```bash
|
|
||||||
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example 2: Webhook with Data Included
|
|
||||||
|
|
||||||
Include the full crawl results in the webhook payload.
|
|
||||||
|
|
||||||
**Request:**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Webhook Payload Received:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"data": {
|
|
||||||
"markdown": "...",
|
|
||||||
"html": "...",
|
|
||||||
"links": {...},
|
|
||||||
"metadata": {...}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example 3: Webhook with Custom Headers
|
|
||||||
|
|
||||||
Include custom headers for authentication or identification.
|
|
||||||
|
|
||||||
**Request:**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": false,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "my-secret-token",
|
|
||||||
"X-Service-ID": "crawl4ai-production"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
The webhook will be sent with these additional headers plus the default headers from config.
|
|
||||||
|
|
||||||
### Example 4: Failure Notification
|
|
||||||
|
|
||||||
When a crawl job fails, a webhook is sent with error details.
|
|
||||||
|
|
||||||
**Webhook Payload on Failure:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_a1b2c3d4",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "failed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"error": "Connection timeout after 30s"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Example 5: Using Global Default Webhook
|
|
||||||
|
|
||||||
If you set a `default_url` in config.yml, jobs without webhook_config will use it:
|
|
||||||
|
|
||||||
**config.yml:**
|
|
||||||
```yaml
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: "https://myapp.com/webhooks/default"
|
|
||||||
data_in_payload: false
|
|
||||||
```
|
|
||||||
|
|
||||||
**Request (no webhook_config needed):**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"]
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
The webhook will be sent to the default URL configured in config.yml.
|
|
||||||
|
|
||||||
### Example 6: LLM Extraction Job with Webhook
|
|
||||||
|
|
||||||
Use webhooks with the LLM extraction endpoint for asynchronous processing.
|
|
||||||
|
|
||||||
**Request:**
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/llm/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, and publication date",
|
|
||||||
"schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
|
|
||||||
"cache": false,
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
|
||||||
"webhook_data_in_payload": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432_12345"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Webhook Payload Received:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432_12345",
|
|
||||||
"task_type": "llm_extraction",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com/article"],
|
|
||||||
"data": {
|
|
||||||
"extracted_content": {
|
|
||||||
"title": "Understanding Web Scraping",
|
|
||||||
"author": "John Doe",
|
|
||||||
"date": "2025-10-21"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Webhook Handler Example
|
|
||||||
|
|
||||||
Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from flask import Flask, request, jsonify
|
|
||||||
import requests
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
|
||||||
|
|
||||||
@app.route('/webhooks/crawl-complete', methods=['POST'])
|
|
||||||
def handle_crawl_webhook():
|
|
||||||
payload = request.json
|
|
||||||
|
|
||||||
task_id = payload['task_id']
|
|
||||||
task_type = payload['task_type']
|
|
||||||
status = payload['status']
|
|
||||||
|
|
||||||
if status == 'completed':
|
|
||||||
# If data not in payload, fetch it
|
|
||||||
if 'data' not in payload:
|
|
||||||
# Determine endpoint based on task type
|
|
||||||
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
|
||||||
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
|
||||||
data = response.json()
|
|
||||||
else:
|
|
||||||
data = payload['data']
|
|
||||||
|
|
||||||
# Process based on task type
|
|
||||||
if task_type == 'crawl':
|
|
||||||
print(f"Processing crawl results for {task_id}")
|
|
||||||
# Handle crawl results
|
|
||||||
results = data.get('results', [])
|
|
||||||
for result in results:
|
|
||||||
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
|
||||||
|
|
||||||
elif task_type == 'llm_extraction':
|
|
||||||
print(f"Processing LLM extraction for {task_id}")
|
|
||||||
# Handle LLM extraction
|
|
||||||
# Note: Webhook sends 'extracted_content', API returns 'result'
|
|
||||||
extracted = data.get('extracted_content', data.get('result', {}))
|
|
||||||
print(f" - Extracted: {extracted}")
|
|
||||||
|
|
||||||
# Your business logic here...
|
|
||||||
|
|
||||||
elif status == 'failed':
|
|
||||||
error = payload.get('error', 'Unknown error')
|
|
||||||
print(f"{task_type} job {task_id} failed: {error}")
|
|
||||||
# Handle failure...
|
|
||||||
|
|
||||||
return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
app.run(port=8080)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Retry Logic
|
|
||||||
|
|
||||||
The webhook delivery service uses exponential backoff retry logic:
|
|
||||||
|
|
||||||
- **Attempts:** Up to 5 attempts by default
|
|
||||||
- **Delays:** 1s → 2s → 4s → 8s → 16s
|
|
||||||
- **Timeout:** 30 seconds per attempt
|
|
||||||
- **Retry Conditions:**
|
|
||||||
- Server errors (5xx status codes)
|
|
||||||
- Network errors
|
|
||||||
- Timeouts
|
|
||||||
- **No Retry:**
|
|
||||||
- Client errors (4xx status codes)
|
|
||||||
- Successful delivery (2xx status codes)
|
|
||||||
|
|
||||||
## Benefits
|
|
||||||
|
|
||||||
1. **No Polling Required** - Eliminates constant API calls to check job status
|
|
||||||
2. **Real-time Notifications** - Immediate notification when jobs complete
|
|
||||||
3. **Reliable Delivery** - Retries with exponential backoff make delivery resilient to transient failures
|
|
||||||
4. **Flexible** - Choose between notification-only or full data delivery
|
|
||||||
5. **Secure** - Support for custom headers for authentication
|
|
||||||
6. **Configurable** - Global defaults or per-job configuration
|
|
||||||
7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
|
|
||||||
|
|
||||||
## TypeScript Client Example
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
interface WebhookConfig {
|
|
||||||
webhook_url: string;
|
|
||||||
webhook_data_in_payload?: boolean;
|
|
||||||
webhook_headers?: Record<string, string>;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface CrawlJobRequest {
|
|
||||||
urls: string[];
|
|
||||||
browser_config?: Record<string, any>;
|
|
||||||
crawler_config?: Record<string, any>;
|
|
||||||
webhook_config?: WebhookConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface LLMJobRequest {
|
|
||||||
url: string;
|
|
||||||
q: string;
|
|
||||||
schema?: string;
|
|
||||||
cache?: boolean;
|
|
||||||
provider?: string;
|
|
||||||
webhook_config?: WebhookConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function createCrawlJob(request: CrawlJobRequest) {
|
|
||||||
const response = await fetch('http://localhost:11235/crawl/job', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: { 'Content-Type': 'application/json' },
|
|
||||||
body: JSON.stringify(request)
|
|
||||||
});
|
|
||||||
|
|
||||||
const { task_id } = await response.json();
|
|
||||||
return task_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function createLLMJob(request: LLMJobRequest) {
|
|
||||||
const response = await fetch('http://localhost:11235/llm/job', {
|
|
||||||
method: 'POST',
|
|
||||||
headers: { 'Content-Type': 'application/json' },
|
|
||||||
body: JSON.stringify(request)
|
|
||||||
});
|
|
||||||
|
|
||||||
const { task_id } = await response.json();
|
|
||||||
return task_id;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Usage - Crawl Job
|
|
||||||
const crawlTaskId = await createCrawlJob({
|
|
||||||
urls: ['https://example.com'],
|
|
||||||
webhook_config: {
|
|
||||||
webhook_url: 'https://myapp.com/webhooks/crawl-complete',
|
|
||||||
webhook_data_in_payload: false,
|
|
||||||
webhook_headers: {
|
|
||||||
'X-Webhook-Secret': 'my-secret'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Usage - LLM Extraction Job
|
|
||||||
const llmTaskId = await createLLMJob({
|
|
||||||
url: 'https://example.com/article',
|
|
||||||
q: 'Extract the main points from this article',
|
|
||||||
provider: 'openai/gpt-4o-mini',
|
|
||||||
webhook_config: {
|
|
||||||
webhook_url: 'https://myapp.com/webhooks/llm-complete',
|
|
||||||
webhook_data_in_payload: true,
|
|
||||||
webhook_headers: {
|
|
||||||
'X-Webhook-Secret': 'my-secret'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
```
|
|
||||||
|
|
||||||
## Monitoring and Debugging
|
|
||||||
|
|
||||||
Webhook delivery attempts are logged at INFO level:
|
|
||||||
- Successful deliveries
|
|
||||||
- Retry attempts with delays
|
|
||||||
- Final failures after max attempts
|
|
||||||
|
|
||||||
Check the application logs for webhook delivery status:
|
|
||||||
```bash
|
|
||||||
docker logs crawl4ai-container | grep -i webhook
|
|
||||||
```
|
|
||||||
@@ -46,7 +46,6 @@ from utils import (
|
|||||||
get_llm_temperature,
|
get_llm_temperature,
|
||||||
get_llm_base_url
|
get_llm_base_url
|
||||||
)
|
)
|
||||||
from webhook import WebhookDeliveryService
|
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
|
|
||||||
@@ -67,6 +66,7 @@ async def handle_llm_qa(
|
|||||||
config: dict
|
config: dict
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Process QA using LLM with crawled content as context."""
|
"""Process QA using LLM with crawled content as context."""
|
||||||
|
from crawler_pool import get_crawler
|
||||||
try:
|
try:
|
||||||
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
|
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
|
||||||
url = 'https://' + url
|
url = 'https://' + url
|
||||||
@@ -75,15 +75,21 @@ async def handle_llm_qa(
|
|||||||
if last_q_index != -1:
|
if last_q_index != -1:
|
||||||
url = url[:last_q_index]
|
url = url[:last_q_index]
|
||||||
|
|
||||||
# Get markdown content
|
# Get markdown content (use default config)
|
||||||
async with AsyncWebCrawler() as crawler:
|
from utils import load_config
|
||||||
result = await crawler.arun(url)
|
cfg = load_config()
|
||||||
if not result.success:
|
browser_cfg = BrowserConfig(
|
||||||
raise HTTPException(
|
extra_args=cfg["crawler"]["browser"].get("extra_args", []),
|
||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
**cfg["crawler"]["browser"].get("kwargs", {}),
|
||||||
detail=result.error_message
|
)
|
||||||
)
|
crawler = await get_crawler(browser_cfg)
|
||||||
content = result.markdown.fit_markdown or result.markdown.raw_markdown
|
result = await crawler.arun(url)
|
||||||
|
if not result.success:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
detail=result.error_message
|
||||||
|
)
|
||||||
|
content = result.markdown.fit_markdown or result.markdown.raw_markdown
|
||||||
|
|
||||||
# Create prompt and get LLM response
|
# Create prompt and get LLM response
|
||||||
prompt = f"""Use the following content as context to answer the question.
|
prompt = f"""Use the following content as context to answer the question.
|
||||||
@@ -121,14 +127,10 @@ async def process_llm_extraction(
|
|||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
webhook_config: Optional[Dict] = None,
|
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
base_url: Optional[str] = None
|
base_url: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
# Initialize webhook service
|
|
||||||
webhook_service = WebhookDeliveryService(config)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Validate provider
|
# Validate provider
|
||||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||||
@@ -137,16 +139,6 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": error_msg
|
"error": error_msg
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on failure
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="llm_extraction",
|
|
||||||
status="failed",
|
|
||||||
urls=[url],
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
error=error_msg
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
@@ -177,40 +169,17 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": result.error_message
|
"error": result.error_message
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on failure
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="llm_extraction",
|
|
||||||
status="failed",
|
|
||||||
urls=[url],
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
error=result.error_message
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content = json.loads(result.extracted_content)
|
content = json.loads(result.extracted_content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
content = result.extracted_content
|
content = result.extracted_content
|
||||||
|
|
||||||
result_data = {"extracted_content": content}
|
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(content)
|
"result": json.dumps(content)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on successful completion
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="llm_extraction",
|
|
||||||
status="completed",
|
|
||||||
urls=[url],
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
result=result_data
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -218,16 +187,6 @@ async def process_llm_extraction(
|
|||||||
"error": str(e)
|
"error": str(e)
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on failure
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="llm_extraction",
|
|
||||||
status="failed",
|
|
||||||
urls=[url],
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
error=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
async def handle_markdown_request(
|
async def handle_markdown_request(
|
||||||
url: str,
|
url: str,
|
||||||
filter_type: FilterType,
|
filter_type: FilterType,
|
||||||
@@ -272,25 +231,32 @@ async def handle_markdown_request(
|
|||||||
|
|
||||||
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
|
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
from crawler_pool import get_crawler
|
||||||
result = await crawler.arun(
|
from utils import load_config as _load_config
|
||||||
url=decoded_url,
|
_cfg = _load_config()
|
||||||
config=CrawlerRunConfig(
|
browser_cfg = BrowserConfig(
|
||||||
markdown_generator=md_generator,
|
extra_args=_cfg["crawler"]["browser"].get("extra_args", []),
|
||||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
**_cfg["crawler"]["browser"].get("kwargs", {}),
|
||||||
cache_mode=cache_mode
|
)
|
||||||
)
|
crawler = await get_crawler(browser_cfg)
|
||||||
|
result = await crawler.arun(
|
||||||
|
url=decoded_url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
markdown_generator=md_generator,
|
||||||
|
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||||
|
cache_mode=cache_mode
|
||||||
)
|
)
|
||||||
|
)
|
||||||
if not result.success:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
|
||||||
detail=result.error_message
|
|
||||||
)
|
|
||||||
|
|
||||||
return (result.markdown.raw_markdown
|
if not result.success:
|
||||||
if filter_type == FilterType.RAW
|
raise HTTPException(
|
||||||
else result.markdown.fit_markdown)
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
|
detail=result.error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return (result.markdown.raw_markdown
|
||||||
|
if filter_type == FilterType.RAW
|
||||||
|
else result.markdown.fit_markdown)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Markdown error: {str(e)}", exc_info=True)
|
logger.error(f"Markdown error: {str(e)}", exc_info=True)
|
||||||
@@ -309,7 +275,6 @@ async def handle_llm_request(
|
|||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
webhook_config: Optional[Dict] = None,
|
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
api_base_url: Optional[str] = None
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
@@ -343,7 +308,6 @@ async def handle_llm_request(
|
|||||||
base_url,
|
base_url,
|
||||||
config,
|
config,
|
||||||
provider,
|
provider,
|
||||||
webhook_config,
|
|
||||||
temperature,
|
temperature,
|
||||||
api_base_url
|
api_base_url
|
||||||
)
|
)
|
||||||
@@ -391,7 +355,6 @@ async def create_new_task(
|
|||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None,
|
||||||
webhook_config: Optional[Dict] = None,
|
|
||||||
temperature: Optional[float] = None,
|
temperature: Optional[float] = None,
|
||||||
api_base_url: Optional[str] = None
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
@@ -402,18 +365,12 @@ async def create_new_task(
|
|||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
||||||
|
|
||||||
task_data = {
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
"status": TaskStatus.PROCESSING,
|
"status": TaskStatus.PROCESSING,
|
||||||
"created_at": datetime.now().isoformat(),
|
"created_at": datetime.now().isoformat(),
|
||||||
"url": decoded_url
|
"url": decoded_url
|
||||||
}
|
})
|
||||||
|
|
||||||
# Store webhook config if provided
|
|
||||||
if webhook_config:
|
|
||||||
task_data["webhook_config"] = json.dumps(webhook_config)
|
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping=task_data)
|
|
||||||
|
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
process_llm_extraction,
|
process_llm_extraction,
|
||||||
@@ -425,7 +382,6 @@ async def create_new_task(
|
|||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
provider,
|
provider,
|
||||||
webhook_config,
|
|
||||||
temperature,
|
temperature,
|
||||||
api_base_url
|
api_base_url
|
||||||
)
|
)
|
||||||
@@ -504,12 +460,22 @@ async def handle_crawl_request(
|
|||||||
hooks_config: Optional[dict] = None
|
hooks_config: Optional[dict] = None
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Handle non-streaming crawl requests with optional hooks."""
|
"""Handle non-streaming crawl requests with optional hooks."""
|
||||||
|
# Track request start
|
||||||
|
request_id = f"req_{uuid4().hex[:8]}"
|
||||||
|
try:
|
||||||
|
from monitor import get_monitor
|
||||||
|
await get_monitor().track_request_start(
|
||||||
|
request_id, "/crawl", urls[0] if urls else "batch", browser_config
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass # Monitor not critical
|
||||||
|
|
||||||
start_mem_mb = _get_memory_mb() # <--- Get memory before
|
start_mem_mb = _get_memory_mb() # <--- Get memory before
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
mem_delta_mb = None
|
mem_delta_mb = None
|
||||||
peak_mem_mb = start_mem_mb
|
peak_mem_mb = start_mem_mb
|
||||||
hook_manager = None
|
hook_manager = None
|
||||||
|
|
||||||
try:
|
try:
|
||||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
|
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
|
||||||
browser_config = BrowserConfig.load(browser_config)
|
browser_config = BrowserConfig.load(browser_config)
|
||||||
@@ -614,7 +580,16 @@ async def handle_crawl_request(
|
|||||||
"server_memory_delta_mb": mem_delta_mb,
|
"server_memory_delta_mb": mem_delta_mb,
|
||||||
"server_peak_memory_mb": peak_mem_mb
|
"server_peak_memory_mb": peak_mem_mb
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Track request completion
|
||||||
|
try:
|
||||||
|
from monitor import get_monitor
|
||||||
|
await get_monitor().track_request_end(
|
||||||
|
request_id, success=True, pool_hit=True, status_code=200
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
# Add hooks information if hooks were used
|
# Add hooks information if hooks were used
|
||||||
if hooks_config and hook_manager:
|
if hooks_config and hook_manager:
|
||||||
from hook_manager import UserHookManager
|
from hook_manager import UserHookManager
|
||||||
@@ -643,6 +618,16 @@ async def handle_crawl_request(
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
||||||
|
|
||||||
|
# Track request error
|
||||||
|
try:
|
||||||
|
from monitor import get_monitor
|
||||||
|
await get_monitor().track_request_end(
|
||||||
|
request_id, success=False, error=str(e), status_code=500
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
|
if 'crawler' in locals() and crawler.ready: # Check if crawler was initialized and started
|
||||||
# try:
|
# try:
|
||||||
# await crawler.close()
|
# await crawler.close()
|
||||||
@@ -738,7 +723,6 @@ async def handle_crawl_job(
|
|||||||
browser_config: Dict,
|
browser_config: Dict,
|
||||||
crawler_config: Dict,
|
crawler_config: Dict,
|
||||||
config: Dict,
|
config: Dict,
|
||||||
webhook_config: Optional[Dict] = None,
|
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Fire-and-forget version of handle_crawl_request.
|
Fire-and-forget version of handle_crawl_request.
|
||||||
@@ -746,24 +730,13 @@ async def handle_crawl_job(
|
|||||||
lets /crawl/job/{task_id} polling fetch the result.
|
lets /crawl/job/{task_id} polling fetch the result.
|
||||||
"""
|
"""
|
||||||
task_id = f"crawl_{uuid4().hex[:8]}"
|
task_id = f"crawl_{uuid4().hex[:8]}"
|
||||||
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
# Store task data in Redis
|
|
||||||
task_data = {
|
|
||||||
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
||||||
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
||||||
"url": json.dumps(urls), # store list as JSON string
|
"url": json.dumps(urls), # store list as JSON string
|
||||||
"result": "",
|
"result": "",
|
||||||
"error": "",
|
"error": "",
|
||||||
}
|
})
|
||||||
|
|
||||||
# Store webhook config if provided
|
|
||||||
if webhook_config:
|
|
||||||
task_data["webhook_config"] = json.dumps(webhook_config)
|
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping=task_data)
|
|
||||||
|
|
||||||
# Initialize webhook service
|
|
||||||
webhook_service = WebhookDeliveryService(config)
|
|
||||||
|
|
||||||
async def _runner():
|
async def _runner():
|
||||||
try:
|
try:
|
||||||
@@ -777,17 +750,6 @@ async def handle_crawl_job(
|
|||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(result),
|
"result": json.dumps(result),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on successful completion
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="crawl",
|
|
||||||
status="completed",
|
|
||||||
urls=urls,
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
result=result
|
|
||||||
)
|
|
||||||
|
|
||||||
await asyncio.sleep(5) # Give Redis time to process the update
|
await asyncio.sleep(5) # Give Redis time to process the update
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -795,15 +757,5 @@ async def handle_crawl_job(
|
|||||||
"error": str(exc),
|
"error": str(exc),
|
||||||
})
|
})
|
||||||
|
|
||||||
# Send webhook notification on failure
|
|
||||||
await webhook_service.notify_job_completion(
|
|
||||||
task_id=task_id,
|
|
||||||
task_type="crawl",
|
|
||||||
status="failed",
|
|
||||||
urls=urls,
|
|
||||||
webhook_config=webhook_config,
|
|
||||||
error=str(exc)
|
|
||||||
)
|
|
||||||
|
|
||||||
background_tasks.add_task(_runner)
|
background_tasks.add_task(_runner)
|
||||||
return {"task_id": task_id}
|
return {"task_id": task_id}
|
||||||
@@ -3,7 +3,7 @@ app:
|
|||||||
title: "Crawl4AI API"
|
title: "Crawl4AI API"
|
||||||
version: "1.0.0"
|
version: "1.0.0"
|
||||||
host: "0.0.0.0"
|
host: "0.0.0.0"
|
||||||
port: 11234
|
port: 11235
|
||||||
reload: False
|
reload: False
|
||||||
workers: 1
|
workers: 1
|
||||||
timeout_keep_alive: 300
|
timeout_keep_alive: 300
|
||||||
@@ -61,7 +61,7 @@ crawler:
|
|||||||
batch_process: 300.0 # Timeout for batch processing
|
batch_process: 300.0 # Timeout for batch processing
|
||||||
pool:
|
pool:
|
||||||
max_pages: 40 # ← GLOBAL_SEM permits
|
max_pages: 40 # ← GLOBAL_SEM permits
|
||||||
idle_ttl_sec: 1800 # ← 30 min janitor cutoff
|
idle_ttl_sec: 300 # ← 5 min janitor cutoff
|
||||||
browser:
|
browser:
|
||||||
kwargs:
|
kwargs:
|
||||||
headless: true
|
headless: true
|
||||||
@@ -87,17 +87,4 @@ observability:
|
|||||||
enabled: True
|
enabled: True
|
||||||
endpoint: "/metrics"
|
endpoint: "/metrics"
|
||||||
health_check:
|
health_check:
|
||||||
endpoint: "/health"
|
endpoint: "/health"
|
||||||
|
|
||||||
# Webhook Configuration
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: null # Optional: default webhook URL for all jobs
|
|
||||||
data_in_payload: false # Optional: default behavior for including data
|
|
||||||
retry:
|
|
||||||
max_attempts: 5
|
|
||||||
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
|
||||||
max_delay_ms: 32000
|
|
||||||
timeout_ms: 30000 # 30s timeout per webhook call
|
|
||||||
headers: # Optional: default headers to include
|
|
||||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
|
||||||
@@ -1,60 +1,170 @@
|
|||||||
# crawler_pool.py (new file)
|
# crawler_pool.py - Smart browser pool with tiered management
|
||||||
import asyncio, json, hashlib, time, psutil
|
import asyncio, json, hashlib, time
|
||||||
from contextlib import suppress
|
from contextlib import suppress
|
||||||
from typing import Dict
|
from typing import Dict, Optional
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||||
from typing import Dict
|
from utils import load_config, get_container_memory_percent
|
||||||
from utils import load_config
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
CONFIG = load_config()
|
CONFIG = load_config()
|
||||||
|
|
||||||
POOL: Dict[str, AsyncWebCrawler] = {}
|
# Pool tiers
|
||||||
|
PERMANENT: Optional[AsyncWebCrawler] = None # Always-ready default browser
|
||||||
|
HOT_POOL: Dict[str, AsyncWebCrawler] = {} # Frequent configs
|
||||||
|
COLD_POOL: Dict[str, AsyncWebCrawler] = {} # Rare configs
|
||||||
LAST_USED: Dict[str, float] = {}
|
LAST_USED: Dict[str, float] = {}
|
||||||
|
USAGE_COUNT: Dict[str, int] = {}
|
||||||
LOCK = asyncio.Lock()
|
LOCK = asyncio.Lock()
|
||||||
|
|
||||||
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this
|
# Config
|
||||||
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min
|
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0)
|
||||||
|
BASE_IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 300)
|
||||||
|
DEFAULT_CONFIG_SIG = None # Cached sig for default config
|
||||||
|
|
||||||
def _sig(cfg: BrowserConfig) -> str:
    """Return a SHA-1 hex digest that identifies a browser configuration.

    The config's ``to_dict()`` output is serialized as compact JSON with
    sorted keys before hashing, so the digest does not depend on key order.
    """
    canonical = json.dumps(cfg.to_dict(), separators=(",", ":"), sort_keys=True)
    digest = hashlib.sha1(canonical.encode())
    return digest.hexdigest()
|
||||||
|
|
||||||
|
def _is_default_config(sig: str) -> bool:
    """Tell whether ``sig`` equals the signature cached for the default config.

    ``DEFAULT_CONFIG_SIG`` starts out as ``None``, in which case no string
    signature matches.
    """
    return DEFAULT_CONFIG_SIG == sig
|
||||||
|
|
||||||
async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
    """Return a started crawler for ``cfg``, reusing a pooled browser when possible.

    Lookup order (performed while holding ``LOCK``):

    1. the permanent browser, when ``cfg`` hashes to the default signature;
    2. the hot pool;
    3. the cold pool, promoting the entry to the hot pool once its usage
       count reaches 3;
    4. otherwise a new browser is started and stored in the cold pool.

    Every hit refreshes ``LAST_USED`` and increments ``USAGE_COUNT`` for the
    signature.

    Raises:
        MemoryError: if container memory usage is at or above ``MEM_LIMIT``
            when a new browser would have to be created.
    """
    sig = _sig(cfg)
    async with LOCK:
        # Check permanent browser for default config
        if PERMANENT and _is_default_config(sig):
            LAST_USED[sig] = time.time()
            USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
            logger.info("🔥 Using permanent browser")
            return PERMANENT

        # Check hot pool
        if sig in HOT_POOL:
            LAST_USED[sig] = time.time()
            USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1
            logger.info(f"♨️ Using hot pool browser (sig={sig[:8]})")
            return HOT_POOL[sig]

        # Check cold pool (promote to hot if used 3+ times)
        if sig in COLD_POOL:
            LAST_USED[sig] = time.time()
            USAGE_COUNT[sig] = USAGE_COUNT.get(sig, 0) + 1

            if USAGE_COUNT[sig] >= 3:
                logger.info(f"⬆️ Promoting to hot pool (sig={sig[:8]}, count={USAGE_COUNT[sig]})")
                HOT_POOL[sig] = COLD_POOL.pop(sig)

                # Monitor tracking is best-effort: a missing or uninitialized
                # monitor must not fail the request. Catch Exception rather
                # than using a bare except so task cancellation still propagates.
                try:
                    from monitor import get_monitor
                    await get_monitor().track_janitor_event("promote", sig, {"count": USAGE_COUNT[sig]})
                except Exception as exc:
                    logger.debug("Could not record promotion event: %s", exc)

                return HOT_POOL[sig]

            logger.info(f"❄️ Using cold pool browser (sig={sig[:8]})")
            return COLD_POOL[sig]

        # Memory check before creating new
        mem_pct = get_container_memory_percent()
        if mem_pct >= MEM_LIMIT:
            logger.error(f"💥 Memory pressure: {mem_pct:.1f}% >= {MEM_LIMIT}%")
            raise MemoryError(f"Memory at {mem_pct:.1f}%, refusing new browser")

        # Create new in cold pool
        logger.info(f"🆕 Creating new browser in cold pool (sig={sig[:8]}, mem={mem_pct:.1f}%)")
        crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
        await crawler.start()
        COLD_POOL[sig] = crawler
        LAST_USED[sig] = time.time()
        USAGE_COUNT[sig] = 1
        return crawler
|
||||||
|
|
||||||
|
async def init_permanent(cfg: BrowserConfig):
    """Start the permanent default browser once and remember its signature.

    Does nothing if a permanent browser already exists. The module globals
    ``PERMANENT`` and ``DEFAULT_CONFIG_SIG`` are assigned only after
    ``start()`` has succeeded, so a failed start leaves the pool unchanged
    and a later call can retry.

    Args:
        cfg: Browser configuration treated as the default config.
    """
    global PERMANENT, DEFAULT_CONFIG_SIG
    async with LOCK:
        if PERMANENT:
            return
        sig = _sig(cfg)
        logger.info("🔥 Creating permanent default browser")
        crawler = AsyncWebCrawler(config=cfg, thread_safe=False)
        await crawler.start()
        # Publish only a browser that actually started.
        DEFAULT_CONFIG_SIG = sig
        PERMANENT = crawler
        LAST_USED[sig] = time.time()
        USAGE_COUNT[sig] = 0
|
||||||
|
|
||||||
|
async def close_all():
    """Close the permanent, hot and cold browsers and reset pool bookkeeping.

    Close errors are collected by ``asyncio.gather(..., return_exceptions=True)``
    and not raised. ``PERMANENT`` is reset to ``None`` afterwards so that
    ``get_crawler`` cannot hand out the closed permanent browser.
    """
    global PERMANENT
    async with LOCK:
        crawlers = [*HOT_POOL.values(), *COLD_POOL.values()]
        if PERMANENT:
            crawlers.insert(0, PERMANENT)
        await asyncio.gather(*(c.close() for c in crawlers), return_exceptions=True)
        PERMANENT = None
        HOT_POOL.clear()
        COLD_POOL.clear()
        LAST_USED.clear()
        USAGE_COUNT.clear()
|
||||||
|
|
||||||
async def _evict_idle_browsers(pool, tier, ttl, now):
    """Close and drop every browser in ``pool`` idle for more than ``ttl`` seconds.

    Must be called with ``LOCK`` held. ``tier`` ("cold" or "hot") is used in
    the log line and in the monitor event name ``close_<tier>``.
    """
    for sig in list(pool):
        # Entries without a LAST_USED timestamp count as just used.
        idle_time = now - LAST_USED.get(sig, now)
        if idle_time <= ttl:
            continue
        logger.info(f"🧹 Closing {tier} browser (sig={sig[:8]}, idle={idle_time:.0f}s)")
        with suppress(Exception):
            await pool[sig].close()
        pool.pop(sig, None)
        LAST_USED.pop(sig, None)
        USAGE_COUNT.pop(sig, None)

        # Monitor tracking is best-effort. Catch Exception rather than using a
        # bare except so cancelling the janitor task is not swallowed here.
        try:
            from monitor import get_monitor
            await get_monitor().track_janitor_event(
                f"close_{tier}", sig, {"idle_seconds": int(idle_time), "ttl": ttl}
            )
        except Exception as exc:
            logger.debug("Could not record janitor event: %s", exc)


async def janitor():
    """Periodically close idle pooled browsers, adapting to memory pressure.

    Each cycle reads container memory usage and picks the sleep interval and
    idle TTLs from it:

    * above 80%: sleep 10s, cold TTL 30s, hot TTL 120s
    * above 60%: sleep 30s, cold TTL 60s, hot TTL 300s
    * otherwise: sleep 60s, cold TTL ``BASE_IDLE_TTL``, hot TTL twice that

    Runs forever; the permanent browser is never evicted here.
    """
    while True:
        mem_pct = get_container_memory_percent()

        # Adaptive intervals and TTLs
        if mem_pct > 80:
            interval, cold_ttl, hot_ttl = 10, 30, 120
        elif mem_pct > 60:
            interval, cold_ttl, hot_ttl = 30, 60, 300
        else:
            interval, cold_ttl, hot_ttl = 60, BASE_IDLE_TTL, BASE_IDLE_TTL * 2

        await asyncio.sleep(interval)

        now = time.time()
        async with LOCK:
            # Cold pool first, then the hot pool with its longer TTL.
            await _evict_idle_browsers(COLD_POOL, "cold", cold_ttl, now)
            await _evict_idle_browsers(HOT_POOL, "hot", hot_ttl, now)

            # Log pool stats (uses the memory reading taken before the sleep)
            if mem_pct > 60:
                logger.info(f"📊 Pool: hot={len(HOT_POOL)}, cold={len(COLD_POOL)}, mem={mem_pct:.1f}%")
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ from api import (
|
|||||||
handle_crawl_job,
|
handle_crawl_job,
|
||||||
handle_task_status,
|
handle_task_status,
|
||||||
)
|
)
|
||||||
from schemas import WebhookConfig
|
|
||||||
|
|
||||||
# ------------- dependency placeholders -------------
|
# ------------- dependency placeholders -------------
|
||||||
_redis = None # will be injected from server.py
|
_redis = None # will be injected from server.py
|
||||||
@@ -38,7 +37,6 @@ class LlmJobPayload(BaseModel):
|
|||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None
|
||||||
webhook_config: Optional[WebhookConfig] = None
|
|
||||||
temperature: Optional[float] = None
|
temperature: Optional[float] = None
|
||||||
base_url: Optional[str] = None
|
base_url: Optional[str] = None
|
||||||
|
|
||||||
@@ -47,7 +45,6 @@ class CrawlJobPayload(BaseModel):
|
|||||||
urls: list[HttpUrl]
|
urls: list[HttpUrl]
|
||||||
browser_config: Dict = {}
|
browser_config: Dict = {}
|
||||||
crawler_config: Dict = {}
|
crawler_config: Dict = {}
|
||||||
webhook_config: Optional[WebhookConfig] = None
|
|
||||||
|
|
||||||
|
|
||||||
# ---------- LLM job ---------------------------------------------------------
|
# ---------- LLM job ---------------------------------------------------------
|
||||||
@@ -58,10 +55,6 @@ async def llm_job_enqueue(
|
|||||||
request: Request,
|
request: Request,
|
||||||
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
||||||
):
|
):
|
||||||
webhook_config = None
|
|
||||||
if payload.webhook_config:
|
|
||||||
webhook_config = payload.webhook_config.model_dump(mode='json')
|
|
||||||
|
|
||||||
return await handle_llm_request(
|
return await handle_llm_request(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -72,7 +65,6 @@ async def llm_job_enqueue(
|
|||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
provider=payload.provider,
|
provider=payload.provider,
|
||||||
webhook_config=webhook_config,
|
|
||||||
temperature=payload.temperature,
|
temperature=payload.temperature,
|
||||||
api_base_url=payload.base_url,
|
api_base_url=payload.base_url,
|
||||||
)
|
)
|
||||||
@@ -94,10 +86,6 @@ async def crawl_job_enqueue(
|
|||||||
background_tasks: BackgroundTasks,
|
background_tasks: BackgroundTasks,
|
||||||
_td: Dict = Depends(lambda: _token_dep()),
|
_td: Dict = Depends(lambda: _token_dep()),
|
||||||
):
|
):
|
||||||
webhook_config = None
|
|
||||||
if payload.webhook_config:
|
|
||||||
webhook_config = payload.webhook_config.model_dump(mode='json')
|
|
||||||
|
|
||||||
return await handle_crawl_job(
|
return await handle_crawl_job(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -105,7 +93,6 @@ async def crawl_job_enqueue(
|
|||||||
payload.browser_config,
|
payload.browser_config,
|
||||||
payload.crawler_config,
|
payload.crawler_config,
|
||||||
config=_config,
|
config=_config,
|
||||||
webhook_config=webhook_config,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
382
deploy/docker/monitor.py
Normal file
382
deploy/docker/monitor.py
Normal file
@@ -0,0 +1,382 @@
|
|||||||
|
# monitor.py - Real-time monitoring stats with Redis persistence
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from collections import deque
|
||||||
|
from redis import asyncio as aioredis
|
||||||
|
from utils import get_container_memory_percent
|
||||||
|
import psutil
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class MonitorStats:
    """Tracks real-time server stats with Redis persistence.

    Request, janitor and error histories live in bounded in-memory deques.
    Only the per-endpoint counters are written to Redis, under the key
    ``monitor:endpoint_stats`` with a 24h TTL.
    """

    def __init__(self, redis: "aioredis.Redis"):
        """Create an empty monitor bound to ``redis``.

        Args:
            redis: Async Redis client used to persist and reload endpoint stats.
        """
        self.redis = redis
        self.start_time = time.time()

        # In-memory state; each deque keeps only its most recent 100 entries.
        self.active_requests: Dict[str, Dict] = {}  # id -> request info
        self.completed_requests: deque = deque(maxlen=100)  # Last 100
        self.janitor_events: deque = deque(maxlen=100)
        self.errors: deque = deque(maxlen=100)

        # Endpoint stats (persisted in Redis)
        self.endpoint_stats: Dict[str, Dict] = {}  # endpoint -> {count, total_time, errors, ...}

        # Background persistence queue (max 10 pending persist requests)
        self._persist_queue: asyncio.Queue = asyncio.Queue(maxsize=10)
        self._persist_worker_task: Optional[asyncio.Task] = None

        # Timeline data (5min window, 5s resolution = 60 points)
        self.memory_timeline: deque = deque(maxlen=60)
        self.requests_timeline: deque = deque(maxlen=60)
        self.browser_timeline: deque = deque(maxlen=60)

    @staticmethod
    def _tail(items, limit: int) -> List[Dict]:
        """Return up to ``limit`` of the most recent ``items``.

        A non-positive ``limit`` yields an empty list; a plain ``[-limit:]``
        slice would return everything for ``limit == 0``.
        """
        if limit <= 0:
            return []
        return list(items)[-limit:]

    async def track_request_start(self, request_id: str, endpoint: str, url: str,
                                  config: Optional[Dict] = None):
        """Record the start of a request and bump the endpoint's request counter.

        Args:
            request_id: Key under which the request is tracked until it ends.
            endpoint: Endpoint name used to bucket statistics.
            url: Target URL; only the first 100 characters are stored.
            config: Optional mapping whose ``"sig"`` entry (default
                ``"default"``) is stored as the request's config signature.
        """
        self.active_requests[request_id] = {
            "id": request_id,
            "endpoint": endpoint,
            "url": url[:100],  # Truncate long URLs
            "start_time": time.time(),
            "config_sig": config.get("sig", "default") if config else "default",
            # Process RSS in MiB at start; track_request_end subtracts it.
            "mem_start": psutil.Process().memory_info().rss / (1024 * 1024),
        }

        # Increment endpoint counter
        stats = self.endpoint_stats.setdefault(endpoint, {
            "count": 0, "total_time": 0, "errors": 0,
            "pool_hits": 0, "success": 0,
        })
        stats["count"] += 1

        # Queue persistence (handled by background worker); when the queue is
        # full the request is dropped.
        try:
            self._persist_queue.put_nowait(True)
        except asyncio.QueueFull:
            logger.warning("Persistence queue full, skipping")

    async def track_request_end(self, request_id: str, success: bool, error: str = None,
                                pool_hit: bool = True, status_code: int = 200):
        """Record the completion of a request previously passed to track_request_start.

        Unknown ``request_id`` values are ignored. Updates the endpoint's
        counters, appends a record to ``completed_requests`` (and to
        ``errors`` for failures that carry an error message), then persists
        the endpoint stats to Redis.

        Args:
            request_id: Id given to ``track_request_start``.
            success: Whether the request succeeded.
            error: Error description for failed requests.
            pool_hit: Counted in the endpoint's ``pool_hits`` when true.
            status_code: Status code stored with the completed record.
        """
        if request_id not in self.active_requests:
            return

        req_info = self.active_requests.pop(request_id)
        end_time = time.time()
        elapsed = end_time - req_info["start_time"]
        mem_end = psutil.Process().memory_info().rss / (1024 * 1024)
        mem_delta = mem_end - req_info["mem_start"]

        # Update stats
        endpoint = req_info["endpoint"]
        if endpoint in self.endpoint_stats:
            stats = self.endpoint_stats[endpoint]
            stats["total_time"] += elapsed
            if success:
                stats["success"] += 1
            else:
                stats["errors"] += 1
            if pool_hit:
                stats["pool_hits"] += 1

        # Add to completed queue
        self.completed_requests.append({
            **req_info,
            "end_time": end_time,
            "elapsed": round(elapsed, 2),
            "mem_delta": round(mem_delta, 1),
            "success": success,
            "error": error,
            "status_code": status_code,
            "pool_hit": pool_hit,
        })

        # Track errors
        if not success and error:
            self.errors.append({
                "timestamp": end_time,
                "endpoint": endpoint,
                "url": req_info["url"],
                "error": error,
                "request_id": request_id,
            })

        await self._persist_endpoint_stats()

    async def track_janitor_event(self, event_type: str, sig: str, details: Dict):
        """Append a pool event; only the first 8 characters of ``sig`` are kept.

        Args:
            event_type: Event name such as "close_cold", "close_hot" or "promote".
            sig: Browser config signature.
            details: Extra data stored with the event as-is.
        """
        self.janitor_events.append({
            "timestamp": time.time(),
            "type": event_type,  # "close_cold", "close_hot", "promote"
            "sig": sig[:8],
            "details": details,
        })

    def _cleanup_old_entries(self, max_age_seconds: int = 300):
        """Remove entries older than max_age_seconds (default 5min).

        Entries are popped from the left of each deque while the oldest one
        is older than the cutoff; entries missing the timestamp key are
        treated as timestamp 0.
        """
        cutoff = time.time() - max_age_seconds

        for entries, key in (
            (self.completed_requests, "end_time"),
            (self.janitor_events, "timestamp"),
            (self.errors, "timestamp"),
        ):
            while entries and entries[0].get(key, 0) < cutoff:
                entries.popleft()

    async def update_timeline(self):
        """Append one data point to each timeline and prune old history.

        The request count covers completed requests whose ``end_time`` lies
        within the last 5 seconds, so this is presumably meant to be called
        about every 5 seconds by the caller.
        """
        now = time.time()
        mem_pct = get_container_memory_percent()

        # Clean old entries (keep last 5 minutes)
        self._cleanup_old_entries(max_age_seconds=300)

        # Count requests in last 5s
        recent_reqs = sum(1 for req in self.completed_requests
                          if now - req.get("end_time", 0) < 5)

        # Browser counts (acquire lock to prevent race conditions)
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
        async with LOCK:
            browser_count = {
                "permanent": 1 if PERMANENT else 0,
                "hot": len(HOT_POOL),
                "cold": len(COLD_POOL),
            }

        self.memory_timeline.append({"time": now, "value": mem_pct})
        self.requests_timeline.append({"time": now, "value": recent_reqs})
        self.browser_timeline.append({"time": now, "browsers": browser_count})

    async def _persist_endpoint_stats(self):
        """Persist endpoint stats to Redis as JSON; failures are logged, not raised."""
        try:
            await self.redis.set(
                "monitor:endpoint_stats",
                json.dumps(self.endpoint_stats),
                ex=86400,  # 24h TTL
            )
        except Exception as e:
            logger.warning(f"Failed to persist endpoint stats: {e}")

    async def _persistence_worker(self):
        """Background worker: persist stats once per item taken from the queue.

        Exits when cancelled. ``task_done()`` runs in a ``finally`` so the
        queue's unfinished-task count stays correct even if persisting raises.
        """
        while True:
            try:
                await self._persist_queue.get()
                try:
                    await self._persist_endpoint_stats()
                finally:
                    self._persist_queue.task_done()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Persistence worker error: {e}")

    def start_persistence_worker(self):
        """Start the background persistence worker if it is not already running."""
        if not self._persist_worker_task:
            self._persist_worker_task = asyncio.create_task(self._persistence_worker())
            logger.info("Started persistence worker")

    async def stop_persistence_worker(self):
        """Cancel the background persistence worker and wait for it to finish."""
        if self._persist_worker_task:
            self._persist_worker_task.cancel()
            try:
                await self._persist_worker_task
            except asyncio.CancelledError:
                pass
            self._persist_worker_task = None
            logger.info("Stopped persistence worker")

    async def cleanup(self):
        """Cleanup on shutdown - persist final stats and stop workers."""
        logger.info("Monitor cleanup starting...")
        try:
            # Persist final stats before shutdown
            await self._persist_endpoint_stats()
            # Stop background worker
            await self.stop_persistence_worker()
            logger.info("Monitor cleanup completed")
        except Exception as e:
            logger.error(f"Monitor cleanup error: {e}")

    async def load_from_redis(self):
        """Replace ``endpoint_stats`` with the copy stored in Redis, if any.

        Failures are logged as warnings and leave the current stats untouched.
        """
        try:
            data = await self.redis.get("monitor:endpoint_stats")
            if data:
                self.endpoint_stats = json.loads(data)
                logger.info("Loaded endpoint stats from Redis")
        except Exception as e:
            logger.warning(f"Failed to load from Redis: {e}")

    async def get_health_summary(self) -> Dict:
        """Get current system health snapshot.

        Returns:
            A dict with ``container`` (memory, CPU, network, uptime),
            ``pool`` (browser counts and estimated memory per tier) and
            ``janitor`` (memory pressure level) sections.
        """
        mem_pct = get_container_memory_percent()

        # cpu_percent(interval=0.1) sleeps for the interval, so run it in the
        # default executor instead of blocking the event loop.
        loop = asyncio.get_running_loop()
        cpu_pct = await loop.run_in_executor(None, psutil.cpu_percent, 0.1)

        # Network I/O counters as reported by psutil; the raw totals are
        # returned below, no delta between calls is computed here.
        net = psutil.net_io_counters()

        # Pool status (acquire lock to prevent race conditions)
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LOCK
        async with LOCK:
            # TODO: Track actual browser process memory instead of estimates
            # These are conservative estimates based on typical Chromium usage
            permanent_mem = 270 if PERMANENT else 0  # Estimate: ~270MB for permanent browser
            hot_mem = len(HOT_POOL) * 180  # Estimate: ~180MB per hot pool browser
            cold_mem = len(COLD_POOL) * 180  # Estimate: ~180MB per cold pool browser
            permanent_active = PERMANENT is not None
            hot_count = len(HOT_POOL)
            cold_count = len(COLD_POOL)

        return {
            "container": {
                "memory_percent": round(mem_pct, 1),
                "cpu_percent": round(cpu_pct, 1),
                "network_sent_mb": round(net.bytes_sent / (1024**2), 2),
                "network_recv_mb": round(net.bytes_recv / (1024**2), 2),
                "uptime_seconds": int(time.time() - self.start_time),
            },
            "pool": {
                "permanent": {"active": permanent_active, "memory_mb": permanent_mem},
                "hot": {"count": hot_count, "memory_mb": hot_mem},
                "cold": {"count": cold_count, "memory_mb": cold_mem},
                "total_memory_mb": permanent_mem + hot_mem + cold_mem,
            },
            "janitor": {
                "next_cleanup_estimate": "adaptive",  # Would need janitor state
                "memory_pressure": "LOW" if mem_pct < 60 else "MEDIUM" if mem_pct < 80 else "HIGH",
            },
        }

    def get_active_requests(self) -> List[Dict]:
        """Get currently active requests, each with ``elapsed`` seconds and status "running"."""
        now = time.time()
        return [
            {
                **req,
                "elapsed": round(now - req["start_time"], 1),
                "status": "running",
            }
            for req in self.active_requests.values()
        ]

    def get_completed_requests(self, limit: int = 50, filter_status: str = "all") -> List[Dict]:
        """Get recent completed requests, oldest first.

        The status filter is applied before the limit, so up to ``limit``
        matching requests are returned.

        Args:
            limit: Maximum number of requests to return; values <= 0 give [].
            filter_status: "success" or "error" to filter; anything else
                returns all completed requests.
        """
        requests = list(self.completed_requests)
        if filter_status == "success":
            requests = [r for r in requests if r.get("success")]
        elif filter_status == "error":
            requests = [r for r in requests if not r.get("success")]
        return self._tail(requests, limit)

    async def get_browser_list(self) -> List[Dict]:
        """Get detailed browser pool information.

        Returns one dict per browser (permanent first, then hot, then cold).
        ``age_seconds`` is measured from monitor start for every browser, and
        ``memory_mb`` is a fixed estimate, not a measurement.
        """
        from crawler_pool import PERMANENT, HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, DEFAULT_CONFIG_SIG, LOCK

        browsers = []
        now = time.time()
        age = int(now - self.start_time)  # Approximation shared by all entries

        # Acquire lock to prevent race conditions during iteration
        async with LOCK:
            if PERMANENT:
                browsers.append({
                    "type": "permanent",
                    "sig": DEFAULT_CONFIG_SIG[:8] if DEFAULT_CONFIG_SIG else "unknown",
                    "age_seconds": age,
                    "last_used_seconds": int(now - LAST_USED.get(DEFAULT_CONFIG_SIG, now)),
                    "memory_mb": 270,
                    "hits": USAGE_COUNT.get(DEFAULT_CONFIG_SIG, 0),
                    "killable": False,
                })

            for tier, pool in (("hot", HOT_POOL), ("cold", COLD_POOL)):
                for sig in pool:
                    browsers.append({
                        "type": tier,
                        "sig": sig[:8],
                        "age_seconds": age,
                        "last_used_seconds": int(now - LAST_USED.get(sig, now)),
                        "memory_mb": 180,  # Estimate
                        "hits": USAGE_COUNT.get(sig, 0),
                        "killable": True,
                    })

        return browsers

    def get_endpoint_stats_summary(self) -> Dict[str, Dict]:
        """Get aggregated endpoint statistics.

        Averages and rates are computed against ``count``, which is
        incremented when a request starts.
        """
        summary = {}
        for endpoint, stats in self.endpoint_stats.items():
            count = stats["count"]
            avg_time = (stats["total_time"] / count) if count > 0 else 0
            success_rate = (stats["success"] / count * 100) if count > 0 else 0
            pool_hit_rate = (stats["pool_hits"] / count * 100) if count > 0 else 0

            summary[endpoint] = {
                "count": count,
                "avg_latency_ms": round(avg_time * 1000, 1),
                "success_rate_percent": round(success_rate, 1),
                "pool_hit_rate_percent": round(pool_hit_rate, 1),
                "errors": stats["errors"],
            }
        return summary

    def get_timeline_data(self, metric: str, window: str = "5m") -> Dict:
        """Get timeline data for charts.

        Args:
            metric: "memory", "requests" or "browsers"; any other value
                returns empty lists.
            window: Currently ignored; only the 5m window is kept.
        """
        # For now, only 5m window supported
        if metric == "memory":
            data = list(self.memory_timeline)
        elif metric == "requests":
            data = list(self.requests_timeline)
        elif metric == "browsers":
            data = list(self.browser_timeline)
        else:
            return {"timestamps": [], "values": []}

        return {
            "timestamps": [int(d["time"]) for d in data],
            # Browser points store their payload under "browsers", not "value".
            "values": [d.get("value", d.get("browsers")) for d in data],
        }

    def get_janitor_log(self, limit: int = 100) -> List[Dict]:
        """Get up to ``limit`` recent janitor events (``[]`` for limit <= 0)."""
        return self._tail(self.janitor_events, limit)

    def get_errors_log(self, limit: int = 100) -> List[Dict]:
        """Get up to ``limit`` recent errors (``[]`` for limit <= 0)."""
        return self._tail(self.errors, limit)
|
||||||
|
|
||||||
|
# Global instance (initialized in server.py)
|
||||||
|
monitor_stats: Optional[MonitorStats] = None
|
||||||
|
|
||||||
|
def get_monitor() -> MonitorStats:
    """Return the module-level ``monitor_stats`` instance.

    Raises:
        RuntimeError: if ``monitor_stats`` is still ``None``.
    """
    instance = monitor_stats
    if instance is not None:
        return instance
    raise RuntimeError("Monitor not initialized")
|
||||||
405
deploy/docker/monitor_routes.py
Normal file
405
deploy/docker/monitor_routes.py
Normal file
@@ -0,0 +1,405 @@
|
|||||||
|
# monitor_routes.py - Monitor API endpoints
|
||||||
|
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from typing import Optional
|
||||||
|
from monitor import get_monitor
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/monitor", tags=["monitor"])
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
async def get_health():
    """Return the monitor's current health summary.

    Any failure is logged and reported to the client as HTTP 500 with the
    exception text as detail.
    """
    try:
        summary = await get_monitor().get_health_summary()
    except Exception as e:
        logger.error(f"Error getting health: {e}")
        raise HTTPException(500, str(e))
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/requests")
async def get_requests(status: str = "all", limit: int = 50):
    """Return active and/or completed requests tracked by the monitor.

    Args:
        status: One of 'all', 'active', 'completed', 'success' or 'error'.
            'active' returns only running requests; the other specific
            values return only completed requests (filtered for 'success'
            and 'error'); 'all' returns both.
        limit: Max number of completed requests to return (1-1000, default 50).

    Raises:
        HTTPException: 400 for an invalid ``status`` or ``limit``; 500 if
            reading from the monitor fails.
    """
    # Input validation
    allowed = ("all", "active", "completed", "success", "error")
    if status not in allowed:
        raise HTTPException(400, f"Invalid status: {status}. Must be one of: all, active, completed, success, error")
    if not 1 <= limit <= 1000:
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
        monitor = get_monitor()

        active = monitor.get_active_requests() if status in ("all", "active") else []

        if status == "active":
            completed = []
        elif status in ("success", "error"):
            completed = monitor.get_completed_requests(limit, status)
        else:  # "all" or "completed"
            completed = monitor.get_completed_requests(limit)

        return {"active": active, "completed": completed}
    except Exception as e:
        logger.error(f"Error getting requests: {e}")
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/browsers")
async def get_browsers():
    """Return the browser list together with summary statistics.

    The summary holds the browser count, the sum of each browser's
    ``memory_mb`` and the percentage of the last 100 completed requests
    flagged with ``pool_hit``.

    Raises:
        HTTPException: 500 when any lookup or calculation fails.
    """
    try:
        monitor = get_monitor()
        browsers = await monitor.get_browser_list()

        # Aggregate figures over the current browser list.
        browser_count = len(browsers)
        memory_total = 0
        for entry in browsers:
            memory_total += entry["memory_mb"]

        # Reuse rate: share of recent completed requests served by a pooled browser.
        recent = monitor.get_completed_requests(100)
        hits = len([r for r in recent if r.get("pool_hit", False)])
        if recent:
            reuse_rate = hits / len(recent) * 100
        else:
            reuse_rate = 0

        summary = {
            "total_count": browser_count,
            "total_memory_mb": memory_total,
            "reuse_rate_percent": round(reuse_rate, 1),
        }
        return {"browsers": browsers, "summary": summary}
    except Exception as e:
        logger.error(f"Error getting browsers: {e}")
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/endpoints/stats")
async def get_endpoint_stats():
    """Return the monitor's aggregated per-endpoint statistics.

    Raises:
        HTTPException: 500 carrying the error text when the lookup fails.
    """
    try:
        stats = get_monitor().get_endpoint_stats_summary()
    except Exception as e:
        logger.error(f"Error getting endpoint stats: {e}")
        raise HTTPException(500, str(e))
    return stats
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/timeline")
async def get_timeline(metric: str = "memory", window: str = "5m"):
    """Return timeline data for the dashboard charts.

    Args:
        metric: 'memory', 'requests' or 'browsers'.
        window: Time window; only '5m' is accepted at the moment.

    Raises:
        HTTPException: 400 for an unsupported ``metric`` or ``window``;
            500 when the monitor lookup fails.
    """
    # Validate the query parameters before touching the monitor.
    known_metrics = ("memory", "requests", "browsers")
    if metric not in known_metrics:
        raise HTTPException(400, f"Invalid metric: {metric}. Must be one of: memory, requests, browsers")
    if window not in ("5m",):
        raise HTTPException(400, f"Invalid window: {window}. Only '5m' is currently supported")

    try:
        series = get_monitor().get_timeline_data(metric, window)
    except Exception as e:
        logger.error(f"Error getting timeline: {e}")
        raise HTTPException(500, str(e))
    return series
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/logs/janitor")
async def get_janitor_log(limit: int = 100):
    """Return recent janitor cleanup events.

    Args:
        limit: Maximum number of events to return (1-1000).

    Raises:
        HTTPException: 400 for an out-of-range ``limit``; 500 when the
            monitor lookup fails.
    """
    if not 1 <= limit <= 1000:
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
        events = get_monitor().get_janitor_log(limit)
    except Exception as e:
        logger.error(f"Error getting janitor log: {e}")
        raise HTTPException(500, str(e))
    return {"events": events}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/logs/errors")
async def get_errors_log(limit: int = 100):
    """Return recent errors recorded by the monitor.

    Args:
        limit: Maximum number of errors to return (1-1000).

    Raises:
        HTTPException: 400 for an out-of-range ``limit``; 500 when the
            monitor lookup fails.
    """
    if not 1 <= limit <= 1000:
        raise HTTPException(400, f"Invalid limit: {limit}. Must be between 1 and 1000")

    try:
        errors = get_monitor().get_errors_log(limit)
    except Exception as e:
        logger.error(f"Error getting errors log: {e}")
        raise HTTPException(500, str(e))
    return {"errors": errors}
|
||||||
|
|
||||||
|
|
||||||
|
# ========== Control Actions ==========
|
||||||
|
|
||||||
|
class KillBrowserRequest(BaseModel):
    """Request body for the browser control actions.

    ``sig`` is matched as a prefix against pooled browser config signatures
    by the kill and restart endpoints; the restart endpoint additionally
    accepts the literal value "permanent".
    """

    sig: str
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/actions/cleanup")
async def force_cleanup():
    """Force an immediate cleanup by closing every cold-pool browser.

    Each cold-pool browser is closed on a best-effort basis (errors raised by
    ``close()`` are suppressed) and its pool, last-used and usage-count
    entries are removed. A "force_cleanup" janitor event is recorded
    afterwards.

    Returns:
        ``{"success": True, "killed_browsers": <count>}``; the count includes
        browsers whose ``close()`` raised.

    Raises:
        HTTPException: 500 carrying the error text on any other failure.
    """
    try:
        from crawler_pool import COLD_POOL, LAST_USED, USAGE_COUNT, LOCK
        from contextlib import suppress

        killed_count = 0

        async with LOCK:
            # Iterate over a snapshot of the keys: entries are removed in the loop.
            for sig in list(COLD_POOL.keys()):
                logger.info(f"🧹 Force cleanup: closing cold browser (sig={sig[:8]})")
                # Best effort: a browser that fails to close is still dropped.
                with suppress(Exception):
                    await COLD_POOL[sig].close()
                COLD_POOL.pop(sig, None)
                LAST_USED.pop(sig, None)
                USAGE_COUNT.pop(sig, None)
                killed_count += 1

        monitor = get_monitor()
        await monitor.track_janitor_event("force_cleanup", "manual", {"killed": killed_count})

        return {"success": True, "killed_browsers": killed_count}
    except Exception as e:
        logger.error(f"Error during force cleanup: {e}")
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/actions/kill_browser")
async def kill_browser(req: KillBrowserRequest):
    """Kill one hot- or cold-pool browser selected by signature prefix.

    The hot pool is searched before the cold pool and the first signature
    starting with ``req.sig`` wins.

    Args:
        req: Request body; ``req.sig`` is a prefix of the browser config
            signature (typically its first 8 characters).

    Returns:
        ``{"success": True, "killed_sig": <first 8 chars>,
        "pool_type": "hot" | "cold"}``.

    Raises:
        HTTPException: 400 if ``req.sig`` is empty; 403 if the prefix matches
            the permanent browser's signature; 404 if no pooled browser
            matches; 500 on any other failure.
    """
    try:
        from crawler_pool import HOT_POOL, COLD_POOL, LAST_USED, USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG
        from contextlib import suppress

        # An empty prefix matches every signature via str.startswith, which
        # would select an arbitrary browser instead of a specific one.
        if not req.sig:
            raise HTTPException(400, "Browser signature must not be empty")

        target_sig = None
        pool_type = None
        source_pool = None

        async with LOCK:
            # Hot pool first, then cold; stop at the first prefix match.
            for candidate_type, pool in (("hot", HOT_POOL), ("cold", COLD_POOL)):
                match = next((sig for sig in pool if sig.startswith(req.sig)), None)
                if match is not None:
                    target_sig, pool_type, source_pool = match, candidate_type, pool
                    break

            # The permanent browser may only be restarted, never killed.
            if DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig):
                raise HTTPException(403, "Cannot kill permanent browser. Use restart instead.")

            if target_sig is None:
                raise HTTPException(404, f"Browser with sig={req.sig} not found")

            # Warn if there are active requests (the browser might be in use).
            monitor = get_monitor()
            active_count = len(monitor.get_active_requests())
            if active_count > 0:
                logger.warning(f"Killing browser {target_sig[:8]} while {active_count} requests are active - may cause failures")

            # Remove the browser from its pool, then close it on a best-effort basis.
            browser = source_pool.pop(target_sig)
            with suppress(Exception):
                await browser.close()

            LAST_USED.pop(target_sig, None)
            USAGE_COUNT.pop(target_sig, None)

        logger.info(f"🔪 Killed {pool_type} browser (sig={target_sig[:8]})")

        await monitor.track_janitor_event("kill_browser", target_sig, {"pool": pool_type, "manual": True})

        return {"success": True, "killed_sig": target_sig[:8], "pool_type": pool_type}
    except HTTPException:
        # Deliberate 4xx responses must not be rewrapped as 500.
        raise
    except Exception as e:
        logger.error(f"Error killing browser: {e}")
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/actions/restart_browser")
async def restart_browser(req: KillBrowserRequest):
    """Restart a browser; unlike kill, this also works for the permanent one.

    For the permanent browser the current instance is closed and
    ``init_permanent`` is called with a BrowserConfig rebuilt from
    ``load_config()``. For a hot- or cold-pool browser the instance is only
    closed and removed; its configuration is not stored, so it is recreated
    by the next request that needs it.

    Args:
        req: Request body; ``req.sig`` is a prefix of the browser config
            signature (typically its first 8 characters) or the literal
            "permanent".

    Returns:
        ``{"success": True, "restarted": "permanent"}`` for the permanent
        browser, otherwise ``{"success": True, "restarted_sig": <first 8
        chars>, "note": ...}``.

    Raises:
        HTTPException: 400 if ``req.sig`` is empty; 404 if no pooled browser
            matches; 500 on any other failure.
    """
    try:
        from crawler_pool import (PERMANENT, HOT_POOL, COLD_POOL, LAST_USED,
                                  USAGE_COUNT, LOCK, DEFAULT_CONFIG_SIG, init_permanent)
        from crawl4ai import BrowserConfig
        from contextlib import suppress

        # An empty prefix matches every signature via str.startswith, which
        # would restart the permanent browser unintentionally.
        if not req.sig:
            raise HTTPException(400, "Browser signature must not be empty")

        # Handle permanent browser restart
        if req.sig == "permanent" or (DEFAULT_CONFIG_SIG and DEFAULT_CONFIG_SIG.startswith(req.sig)):
            async with LOCK:
                if PERMANENT:
                    with suppress(Exception):
                        await PERMANENT.close()

                # Reinitialize the permanent browser from the current configuration.
                # NOTE(review): init_permanent is awaited while LOCK is held; if it
                # acquires LOCK itself this would deadlock - confirm in crawler_pool.
                # NOTE(review): presumably init_permanent replaces the closed
                # instance - confirm in crawler_pool.
                from utils import load_config
                config = load_config()
                await init_permanent(BrowserConfig(
                    extra_args=config["crawler"]["browser"].get("extra_args", []),
                    **config["crawler"]["browser"].get("kwargs", {}),
                ))

            logger.info("🔄 Restarted permanent browser")
            return {"success": True, "restarted": "permanent"}

        # Handle hot/cold browser restart
        target_sig = None
        pool_type = None
        source_pool = None

        async with LOCK:
            # Hot pool first, then cold; stop at the first prefix match.
            for candidate_type, pool in (("hot", HOT_POOL), ("cold", COLD_POOL)):
                match = next((sig for sig in pool if sig.startswith(req.sig)), None)
                if match is not None:
                    target_sig, pool_type, source_pool = match, candidate_type, pool
                    break

            if target_sig is None:
                raise HTTPException(404, f"Browser with sig={req.sig} not found")

            # Remove the existing instance, then close it on a best-effort basis.
            browser = source_pool.pop(target_sig)
            with suppress(Exception):
                await browser.close()

            # The original config is not stored, so the browser cannot be
            # recreated here; a later request creates a fresh one.
            LAST_USED.pop(target_sig, None)
            USAGE_COUNT.pop(target_sig, None)

        logger.info(f"🔄 Restarted {pool_type} browser (sig={target_sig[:8]})")

        monitor = get_monitor()
        await monitor.track_janitor_event("restart_browser", target_sig, {"pool": pool_type})

        return {"success": True, "restarted_sig": target_sig[:8], "note": "Browser will be recreated on next request"}
    except HTTPException:
        # Deliberate 4xx responses must not be rewrapped as 500.
        raise
    except Exception as e:
        logger.error(f"Error restarting browser: {e}")
        raise HTTPException(500, str(e))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/stats/reset")
async def reset_stats():
    """Clear the monitor's endpoint counters and persist the empty state.

    Raises:
        HTTPException: 500 carrying the error text when clearing or
            persisting fails.
    """
    try:
        stats_monitor = get_monitor()
        stats_monitor.endpoint_stats.clear()
        await stats_monitor._persist_endpoint_stats()
    except Exception as e:
        logger.error(f"Error resetting stats: {e}")
        raise HTTPException(500, str(e))
    return {"success": True, "message": "Endpoint stats reset"}
|
||||||
|
|
||||||
|
|
||||||
|
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Stream monitoring snapshots to a WebSocket client every 2 seconds.

    Each message is a JSON object with:
    - timestamp: the event loop's clock value (not wall-clock time)
    - health: health summary
    - requests: active requests and the 10 most recent completed ones
    - browsers: browser pool list
    - timeline: 5-minute memory / requests / browsers series
    - janitor: last 10 janitor events
    - errors: last 10 errors

    A failure while gathering or serializing a snapshot is logged and retried
    after 2 seconds. A transport failure while sending ends the stream, so a
    dropped client cannot leave this task looping forever.
    """
    await websocket.accept()
    logger.info("WebSocket client connected")

    try:
        while True:
            # Gather all monitoring data; these errors are worth retrying.
            try:
                monitor = get_monitor()

                data = {
                    "timestamp": asyncio.get_running_loop().time(),
                    "health": await monitor.get_health_summary(),
                    "requests": {
                        "active": monitor.get_active_requests(),
                        "completed": monitor.get_completed_requests(limit=10)
                    },
                    "browsers": await monitor.get_browser_list(),
                    "timeline": {
                        "memory": monitor.get_timeline_data("memory", "5m"),
                        "requests": monitor.get_timeline_data("requests", "5m"),
                        "browsers": monitor.get_timeline_data("browsers", "5m")
                    },
                    "janitor": monitor.get_janitor_log(limit=10),
                    "errors": monitor.get_errors_log(limit=10)
                }
            except Exception as e:
                logger.error(f"WebSocket error: {e}", exc_info=True)
                await asyncio.sleep(2)  # Continue trying
                continue

            # Send update to client
            try:
                await websocket.send_json(data)
            except WebSocketDisconnect:
                logger.info("WebSocket client disconnected")
                break
            except (TypeError, ValueError) as e:
                # The snapshot could not be JSON-encoded; the connection
                # itself is still usable, so try again with the next snapshot.
                logger.error(f"WebSocket error: {e}", exc_info=True)
                await asyncio.sleep(2)
                continue
            except Exception as e:
                # Any other send failure means the connection is unusable.
                logger.error(f"WebSocket send failed, closing stream: {e}", exc_info=True)
                break

            # Wait 2 seconds before next update
            await asyncio.sleep(2)
    except Exception as e:
        logger.error(f"WebSocket connection error: {e}", exc_info=True)
    finally:
        logger.info("WebSocket connection closed")
|
||||||
@@ -12,6 +12,6 @@ pydantic>=2.11
|
|||||||
rank-bm25==0.2.2
|
rank-bm25==0.2.2
|
||||||
anyio==4.9.0
|
anyio==4.9.0
|
||||||
PyJWT==2.10.1
|
PyJWT==2.10.1
|
||||||
mcp>=1.18.0
|
mcp>=1.6.0
|
||||||
websockets>=15.0.1
|
websockets>=15.0.1
|
||||||
httpx[http2]>=0.27.2
|
httpx[http2]>=0.27.2
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from typing import List, Optional, Dict
|
from typing import List, Optional, Dict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field, HttpUrl
|
from pydantic import BaseModel, Field
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
|
|
||||||
|
|
||||||
@@ -85,22 +85,4 @@ class JSEndpointRequest(BaseModel):
|
|||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
...,
|
||||||
description="List of separated JavaScript snippets to execute"
|
description="List of separated JavaScript snippets to execute"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class WebhookConfig(BaseModel):
|
|
||||||
"""Configuration for webhook notifications."""
|
|
||||||
webhook_url: HttpUrl
|
|
||||||
webhook_data_in_payload: bool = False
|
|
||||||
webhook_headers: Optional[Dict[str, str]] = None
|
|
||||||
|
|
||||||
|
|
||||||
class WebhookPayload(BaseModel):
|
|
||||||
"""Payload sent to webhook endpoints."""
|
|
||||||
task_id: str
|
|
||||||
task_type: str # "crawl", "llm_extraction", etc.
|
|
||||||
status: str # "completed" or "failed"
|
|
||||||
timestamp: str # ISO 8601 format
|
|
||||||
urls: List[str]
|
|
||||||
error: Optional[str] = None
|
|
||||||
data: Optional[Dict] = None # Included only if webhook_data_in_payload=True
|
|
||||||
@@ -16,6 +16,7 @@ from fastapi import Request, Depends
|
|||||||
from fastapi.responses import FileResponse
|
from fastapi.responses import FileResponse
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
from api import (
|
from api import (
|
||||||
handle_markdown_request, handle_llm_qa,
|
handle_markdown_request, handle_llm_qa,
|
||||||
@@ -78,6 +79,14 @@ __version__ = "0.5.1-d1"
|
|||||||
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
|
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
|
||||||
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
|
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
|
||||||
|
|
||||||
|
# ── default browser config helper ─────────────────────────────
|
||||||
|
def get_default_browser_config() -> BrowserConfig:
    """Build the default BrowserConfig from the loaded configuration.

    Reads ``extra_args`` (default ``[]``) and ``kwargs`` (default ``{}``)
    from the ``crawler.browser`` section of the module-level ``config``.
    """
    browser_section = config["crawler"]["browser"]
    extra_args = browser_section.get("extra_args", [])
    browser_kwargs = browser_section.get("kwargs", {})
    return BrowserConfig(extra_args=extra_args, **browser_kwargs)
|
||||||
|
|
||||||
# import logging
|
# import logging
|
||||||
# page_log = logging.getLogger("page_cap")
|
# page_log = logging.getLogger("page_cap")
|
||||||
# orig_arun = AsyncWebCrawler.arun
|
# orig_arun = AsyncWebCrawler.arun
|
||||||
@@ -103,15 +112,52 @@ AsyncWebCrawler.arun = capped_arun
|
|||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(_: FastAPI):
|
async def lifespan(_: FastAPI):
|
||||||
await get_crawler(BrowserConfig(
|
from crawler_pool import init_permanent
|
||||||
|
from monitor import MonitorStats
|
||||||
|
import monitor as monitor_module
|
||||||
|
|
||||||
|
# Initialize monitor
|
||||||
|
monitor_module.monitor_stats = MonitorStats(redis)
|
||||||
|
await monitor_module.monitor_stats.load_from_redis()
|
||||||
|
monitor_module.monitor_stats.start_persistence_worker()
|
||||||
|
|
||||||
|
# Initialize browser pool
|
||||||
|
await init_permanent(BrowserConfig(
|
||||||
extra_args=config["crawler"]["browser"].get("extra_args", []),
|
extra_args=config["crawler"]["browser"].get("extra_args", []),
|
||||||
**config["crawler"]["browser"].get("kwargs", {}),
|
**config["crawler"]["browser"].get("kwargs", {}),
|
||||||
)) # warm‑up
|
))
|
||||||
app.state.janitor = asyncio.create_task(janitor()) # idle GC
|
|
||||||
|
# Start background tasks
|
||||||
|
app.state.janitor = asyncio.create_task(janitor())
|
||||||
|
app.state.timeline_updater = asyncio.create_task(_timeline_updater())
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
app.state.janitor.cancel()
|
app.state.janitor.cancel()
|
||||||
|
app.state.timeline_updater.cancel()
|
||||||
|
|
||||||
|
# Monitor cleanup (persist stats and stop workers)
|
||||||
|
from monitor import get_monitor
|
||||||
|
try:
|
||||||
|
await get_monitor().cleanup()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Monitor cleanup failed: {e}")
|
||||||
|
|
||||||
await close_all()
|
await close_all()
|
||||||
|
|
||||||
|
async def _timeline_updater():
    """Refresh the monitor timeline every 5 seconds, forever.

    Each refresh is capped at 4 seconds; timeouts and other errors are
    logged as warnings and the loop carries on.
    """
    from monitor import get_monitor

    while True:
        await asyncio.sleep(5)
        try:
            refresh = get_monitor().update_timeline()
            await asyncio.wait_for(refresh, timeout=4.0)
        except asyncio.TimeoutError:
            logger.warning("Timeline update timeout after 4s")
        except Exception as e:
            logger.warning(f"Timeline update error: {e}")
|
||||||
|
|
||||||
# ───────────────────── FastAPI instance ──────────────────────
|
# ───────────────────── FastAPI instance ──────────────────────
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title=config["app"]["title"],
|
title=config["app"]["title"],
|
||||||
@@ -129,6 +175,25 @@ app.mount(
|
|||||||
name="play",
|
name="play",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ── static monitor dashboard ────────────────────────────────
|
||||||
|
MONITOR_DIR = pathlib.Path(__file__).parent / "static" / "monitor"
|
||||||
|
if not MONITOR_DIR.exists():
|
||||||
|
raise RuntimeError(f"Monitor assets not found at {MONITOR_DIR}")
|
||||||
|
app.mount(
|
||||||
|
"/dashboard",
|
||||||
|
StaticFiles(directory=MONITOR_DIR, html=True),
|
||||||
|
name="monitor_ui",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── static assets (logo, etc) ────────────────────────────────
|
||||||
|
ASSETS_DIR = pathlib.Path(__file__).parent / "static" / "assets"
|
||||||
|
if ASSETS_DIR.exists():
|
||||||
|
app.mount(
|
||||||
|
"/static/assets",
|
||||||
|
StaticFiles(directory=ASSETS_DIR),
|
||||||
|
name="assets",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def root():
|
async def root():
|
||||||
@@ -212,6 +277,12 @@ def _safe_eval_config(expr: str) -> dict:
|
|||||||
# ── job router ──────────────────────────────────────────────
|
# ── job router ──────────────────────────────────────────────
|
||||||
app.include_router(init_job_router(redis, config, token_dep))
|
app.include_router(init_job_router(redis, config, token_dep))
|
||||||
|
|
||||||
|
# ── monitor router ──────────────────────────────────────────
|
||||||
|
from monitor_routes import router as monitor_router
|
||||||
|
app.include_router(monitor_router)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ──────────────────────── Endpoints ──────────────────────────
|
# ──────────────────────── Endpoints ──────────────────────────
|
||||||
@app.post("/token")
|
@app.post("/token")
|
||||||
async def get_token(req: TokenRequest):
|
async def get_token(req: TokenRequest):
|
||||||
@@ -266,27 +337,20 @@ async def generate_html(
|
|||||||
Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
|
Crawls the URL, preprocesses the raw HTML for schema extraction, and returns the processed HTML.
|
||||||
Use when you need sanitized HTML structures for building schemas or further processing.
|
Use when you need sanitized HTML structures for building schemas or further processing.
|
||||||
"""
|
"""
|
||||||
|
from crawler_pool import get_crawler
|
||||||
cfg = CrawlerRunConfig()
|
cfg = CrawlerRunConfig()
|
||||||
try:
|
try:
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
crawler = await get_crawler(get_default_browser_config())
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
# Check if the crawl was successful
|
|
||||||
if not results[0].success:
|
if not results[0].success:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
|
||||||
status_code=500,
|
|
||||||
detail=results[0].error_message or "Crawl failed"
|
|
||||||
)
|
|
||||||
|
|
||||||
raw_html = results[0].html
|
raw_html = results[0].html
|
||||||
from crawl4ai.utils import preprocess_html_for_schema
|
from crawl4ai.utils import preprocess_html_for_schema
|
||||||
processed_html = preprocess_html_for_schema(raw_html)
|
processed_html = preprocess_html_for_schema(raw_html)
|
||||||
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
return JSONResponse({"html": processed_html, "url": body.url, "success": True})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Log and raise as HTTP 500 for other exceptions
|
raise HTTPException(500, detail=str(e))
|
||||||
raise HTTPException(
|
|
||||||
status_code=500,
|
|
||||||
detail=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Screenshot endpoint
|
# Screenshot endpoint
|
||||||
|
|
||||||
@@ -304,16 +368,13 @@ async def generate_screenshot(
|
|||||||
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
|
Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot.
|
||||||
Then in result instead of the screenshot you will get a path to the saved file.
|
Then in result instead of the screenshot you will get a path to the saved file.
|
||||||
"""
|
"""
|
||||||
|
from crawler_pool import get_crawler
|
||||||
try:
|
try:
|
||||||
cfg = CrawlerRunConfig(
|
cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
|
||||||
screenshot=True, screenshot_wait_for=body.screenshot_wait_for)
|
crawler = await get_crawler(get_default_browser_config())
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
|
||||||
if not results[0].success:
|
if not results[0].success:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
|
||||||
status_code=500,
|
|
||||||
detail=results[0].error_message or "Crawl failed"
|
|
||||||
)
|
|
||||||
screenshot_data = results[0].screenshot
|
screenshot_data = results[0].screenshot
|
||||||
if body.output_path:
|
if body.output_path:
|
||||||
abs_path = os.path.abspath(body.output_path)
|
abs_path = os.path.abspath(body.output_path)
|
||||||
@@ -323,10 +384,7 @@ async def generate_screenshot(
|
|||||||
return {"success": True, "path": abs_path}
|
return {"success": True, "path": abs_path}
|
||||||
return {"success": True, "screenshot": screenshot_data}
|
return {"success": True, "screenshot": screenshot_data}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=str(e))
|
||||||
status_code=500,
|
|
||||||
detail=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
# PDF endpoint
|
# PDF endpoint
|
||||||
|
|
||||||
@@ -344,15 +402,13 @@ async def generate_pdf(
|
|||||||
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
|
Use when you need a printable or archivable snapshot of the page. It is recommended to provide an output path to save the PDF.
|
||||||
Then in result instead of the PDF you will get a path to the saved file.
|
Then in result instead of the PDF you will get a path to the saved file.
|
||||||
"""
|
"""
|
||||||
|
from crawler_pool import get_crawler
|
||||||
try:
|
try:
|
||||||
cfg = CrawlerRunConfig(pdf=True)
|
cfg = CrawlerRunConfig(pdf=True)
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
crawler = await get_crawler(get_default_browser_config())
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
if not results[0].success:
|
if not results[0].success:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
|
||||||
status_code=500,
|
|
||||||
detail=results[0].error_message or "Crawl failed"
|
|
||||||
)
|
|
||||||
pdf_data = results[0].pdf
|
pdf_data = results[0].pdf
|
||||||
if body.output_path:
|
if body.output_path:
|
||||||
abs_path = os.path.abspath(body.output_path)
|
abs_path = os.path.abspath(body.output_path)
|
||||||
@@ -362,10 +418,7 @@ async def generate_pdf(
|
|||||||
return {"success": True, "path": abs_path}
|
return {"success": True, "path": abs_path}
|
||||||
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=str(e))
|
||||||
status_code=500,
|
|
||||||
detail=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/execute_js")
|
@app.post("/execute_js")
|
||||||
@@ -421,23 +474,17 @@ async def execute_js(
|
|||||||
```
|
```
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
from crawler_pool import get_crawler
|
||||||
try:
|
try:
|
||||||
cfg = CrawlerRunConfig(js_code=body.scripts)
|
cfg = CrawlerRunConfig(js_code=body.scripts)
|
||||||
async with AsyncWebCrawler(config=BrowserConfig()) as crawler:
|
crawler = await get_crawler(get_default_browser_config())
|
||||||
results = await crawler.arun(url=body.url, config=cfg)
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
if not results[0].success:
|
if not results[0].success:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=results[0].error_message or "Crawl failed")
|
||||||
status_code=500,
|
|
||||||
detail=results[0].error_message or "Crawl failed"
|
|
||||||
)
|
|
||||||
# Return JSON-serializable dict of the first CrawlResult
|
|
||||||
data = results[0].model_dump()
|
data = results[0].model_dump()
|
||||||
return JSONResponse(data)
|
return JSONResponse(data)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(
|
raise HTTPException(500, detail=str(e))
|
||||||
status_code=500,
|
|
||||||
detail=str(e)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/llm/{url:path}")
|
@app.get("/llm/{url:path}")
|
||||||
|
|||||||
BIN
deploy/docker/static/assets/crawl4ai-logo.jpg
Normal file
BIN
deploy/docker/static/assets/crawl4ai-logo.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 5.8 KiB |
BIN
deploy/docker/static/assets/crawl4ai-logo.png
Normal file
BIN
deploy/docker/static/assets/crawl4ai-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.6 KiB |
BIN
deploy/docker/static/assets/logo.png
Normal file
BIN
deploy/docker/static/assets/logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 11 KiB |
1070
deploy/docker/static/monitor/index.html
Normal file
1070
deploy/docker/static/monitor/index.html
Normal file
File diff suppressed because it is too large
Load Diff
@@ -167,11 +167,14 @@
|
|||||||
</a>
|
</a>
|
||||||
</h1>
|
</h1>
|
||||||
|
|
||||||
<div class="ml-auto flex space-x-2">
|
<div class="ml-auto flex items-center space-x-4">
|
||||||
<button id="play-tab"
|
<a href="/dashboard" class="text-xs text-secondary hover:text-primary underline">Monitor</a>
|
||||||
class="px-3 py-1 rounded-t bg-surface border border-b-0 border-border text-primary">Playground</button>
|
<div class="flex space-x-2">
|
||||||
<button id="stress-tab" class="px-3 py-1 rounded-t border border-border hover:bg-surface">Stress
|
<button id="play-tab"
|
||||||
Test</button>
|
class="px-3 py-1 rounded-t bg-surface border border-b-0 border-border text-primary">Playground</button>
|
||||||
|
<button id="stress-tab" class="px-3 py-1 rounded-t border border-border hover:bg-surface">Stress
|
||||||
|
Test</button>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
|||||||
34
deploy/docker/test-websocket.py
Executable file
34
deploy/docker/test-websocket.py
Executable file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick WebSocket test - Connect to monitor WebSocket and print updates
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import websockets
|
||||||
|
import json
|
||||||
|
|
||||||
|
async def test_websocket():
    """Connect to the monitor WebSocket and print five updates.

    Returns:
        0 when five updates were received and printed, 1 on any error.
    """
    uri = "ws://localhost:11235/monitor/ws"
    print(f"Connecting to {uri}...")

    try:
        async with websockets.connect(uri) as websocket:
            print("✅ Connected!")

            # Receive and print 5 updates
            for i in range(5):
                data = json.loads(await websocket.recv())
                print(f"\n📊 Update #{i+1}:")
                print(f" - Health: CPU {data['health']['container']['cpu_percent']}%, Memory {data['health']['container']['memory_percent']}%")
                print(f" - Active Requests: {len(data['requests']['active'])}")
                print(f" - Browsers: {len(data['browsers'])}")
    except Exception as e:
        print(f"❌ Error: {e}")
        return 1
    else:
        print("\n✅ WebSocket test passed!")
        return 0
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
exit(asyncio.run(test_websocket()))
|
||||||
164
deploy/docker/tests/demo_monitor_dashboard.py
Executable file
164
deploy/docker/tests/demo_monitor_dashboard.py
Executable file
@@ -0,0 +1,164 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Monitor Dashboard Demo Script
|
||||||
|
Generates varied activity to showcase all monitoring features for video recording.
|
||||||
|
"""
|
||||||
|
import httpx
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:11235"
|
||||||
|
|
||||||
|
async def demo_dashboard():
    """Drive varied traffic at the server so the monitor dashboard fills up.

    Runs seven phases against ``BASE_URL``: sequential default crawls,
    repeated crawls with three viewport configs, a 10-request concurrent
    burst, one call each to /md, /screenshot and /pdf, an intentionally
    invalid crawl, a 40 second idle period, and a final read-out of the
    /monitor endpoints.

    Request failures in phases 1-5 are printed and do not stop the demo.
    Errors while reading the final monitor endpoints propagate to the caller.
    """
    print("🎬 Monitor Dashboard Demo - Starting...\n")
    print(f"📊 Dashboard: {BASE_URL}/dashboard")
    print("=" * 60)

    crawl_url = f"{BASE_URL}/crawl"

    async with httpx.AsyncClient(timeout=60.0) as client:

        # Phase 1: Simple requests (permanent browser)
        print("\n🔷 Phase 1: Testing permanent browser pool")
        print("-" * 60)
        for i in range(5):
            print(f" {i+1}/5 Request to /crawl (default config)...")
            try:
                r = await client.post(
                    crawl_url,
                    json={"urls": [f"https://httpbin.org/html?req={i}"], "crawler_config": {}},
                )
                print(f" ✅ Status: {r.status_code}, Time: {r.elapsed.total_seconds():.2f}s")
            except Exception as e:
                print(f" ❌ Error: {e}")
            await asyncio.sleep(1)  # Small delay between requests

        # Phase 2: Create variant browsers (different configs)
        print("\n🔶 Phase 2: Testing cold→hot pool promotion")
        print("-" * 60)
        viewports = [
            {"width": 1920, "height": 1080},
            {"width": 1280, "height": 720},
            {"width": 800, "height": 600},
        ]

        for idx, viewport in enumerate(viewports):
            print(f" Viewport {viewport['width']}x{viewport['height']}:")
            for i in range(4):  # 4 requests each to trigger promotion at 3
                try:
                    r = await client.post(
                        crawl_url,
                        json={
                            "urls": [f"https://httpbin.org/json?v={idx}&r={i}"],
                            "browser_config": {"viewport": viewport},
                            "crawler_config": {},
                        },
                    )
                    print(f" {i+1}/4 ✅ {r.status_code} - Should see cold→hot after 3 uses")
                except Exception as e:
                    print(f" {i+1}/4 ❌ {e}")
                await asyncio.sleep(0.5)

        # Phase 3: Concurrent burst (stress pool)
        print("\n🔷 Phase 3: Concurrent burst (10 parallel)")
        print("-" * 60)
        tasks = [
            client.post(
                crawl_url,
                json={"urls": [f"https://httpbin.org/delay/2?burst={i}"], "crawler_config": {}},
            )
            for i in range(10)
        ]

        print(" Sending 10 concurrent requests...")
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by a wall-clock adjustment.
        start = time.perf_counter()
        results = await asyncio.gather(*tasks, return_exceptions=True)
        elapsed = time.perf_counter() - start

        successes = sum(
            1 for r in results if not isinstance(r, Exception) and r.status_code == 200
        )
        print(f" ✅ {successes}/10 succeeded in {elapsed:.2f}s")

        # Phase 4: Multi-endpoint coverage
        print("\n🔶 Phase 4: Testing multiple endpoints")
        print("-" * 60)
        endpoints = [
            ("/md", {"url": "https://httpbin.org/html", "f": "fit", "c": "0"}),
            ("/screenshot", {"url": "https://httpbin.org/html"}),
            ("/pdf", {"url": "https://httpbin.org/html"}),
        ]

        for endpoint, payload in endpoints:
            print(f" Testing {endpoint}...")
            try:
                # Every endpoint takes the same kind of call: POST with a JSON body.
                r = await client.post(f"{BASE_URL}{endpoint}", json=payload)
                print(f" ✅ {r.status_code}")
            except Exception as e:
                print(f" ❌ {e}")
            await asyncio.sleep(1)

        # Phase 5: Intentional error (to populate errors tab)
        print("\n🔷 Phase 5: Generating error examples")
        print("-" * 60)
        print(" Triggering invalid URL error...")
        try:
            r = await client.post(
                crawl_url,
                json={"urls": ["invalid://bad-url"], "crawler_config": {}},
            )
            print(f" Response: {r.status_code}")
        except Exception as e:
            print(f" ✅ Error captured: {type(e).__name__}")

        # Phase 6: Wait for janitor activity
        print("\n🔶 Phase 6: Waiting for janitor cleanup...")
        print("-" * 60)
        print(" Idle for 40s to allow janitor to clean cold pool browsers...")
        for i in range(40, 0, -10):
            print(f" {i}s remaining... (Check dashboard for cleanup events)")
            await asyncio.sleep(10)

        # Phase 7: Final stats check
        print("\n🔷 Phase 7: Final dashboard state")
        print("-" * 60)

        r = await client.get(f"{BASE_URL}/monitor/health")
        # Fail with the HTTP status rather than an opaque JSON/KeyError below.
        r.raise_for_status()
        health = r.json()
        pool = health['pool']
        print(f" Memory: {health['container']['memory_percent']:.1f}%")
        print(f" Browsers: Perm={pool['permanent']['active']}, "
              f"Hot={pool['hot']['count']}, Cold={pool['cold']['count']}")

        r = await client.get(f"{BASE_URL}/monitor/endpoints/stats")
        r.raise_for_status()
        stats = r.json()
        print("\n Endpoint Stats:")
        for endpoint, data in stats.items():
            print(f" {endpoint}: {data['count']} req, "
                  f"{data['avg_latency_ms']:.0f}ms avg, "
                  f"{data['success_rate_percent']:.1f}% success")

        r = await client.get(f"{BASE_URL}/monitor/browsers")
        r.raise_for_status()
        summary = r.json()['summary']
        print("\n Pool Efficiency:")
        print(f" Total browsers: {summary['total_count']}")
        print(f" Memory usage: {summary['total_memory_mb']} MB")
        print(f" Reuse rate: {summary['reuse_rate_percent']:.1f}%")

    print("\n" + "=" * 60)
    print("✅ Demo complete! Dashboard is now populated with rich data.")
    print(f"\n📹 Recording tip: Refresh {BASE_URL}/dashboard")
    print(" You should see:")
    print(" • Active & completed requests")
    print(" • Browser pool (permanent + hot/cold)")
    print(" • Janitor cleanup events")
    print(" • Endpoint analytics")
    print(" • Memory timeline")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Report the outcome through the exit status as well as on stdout, so a
    # failed or interrupted demo is not mistaken for a successful run.
    try:
        asyncio.run(demo_dashboard())
    except KeyboardInterrupt:
        print("\n\n⚠️ Demo interrupted by user")
        raise SystemExit(130)  # conventional status for termination by Ctrl-C
    except Exception as e:
        print(f"\n\n❌ Demo failed: {e}")
        raise SystemExit(1)
|
||||||
2
deploy/docker/tests/requirements.txt
Normal file
2
deploy/docker/tests/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
httpx>=0.25.0
|
||||||
|
docker>=7.0.0
|
||||||
138
deploy/docker/tests/test_1_basic.py
Executable file
138
deploy/docker/tests/test_1_basic.py
Executable file
@@ -0,0 +1,138 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 1: Basic Container Health + Single Endpoint
|
||||||
|
- Starts container
|
||||||
|
- Hits /health endpoint 10 times
|
||||||
|
- Reports success rate and basic latency
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
# Config
|
||||||
|
IMAGE = "crawl4ai-local:latest"
|
||||||
|
CONTAINER_NAME = "crawl4ai-test"
|
||||||
|
PORT = 11235
|
||||||
|
REQUESTS = 10
|
||||||
|
|
||||||
|
async def test_endpoint(url: str, count: int):
    """GET ``url`` ``count`` times sequentially and record each outcome.

    Args:
        url: Endpoint to request.
        count: Number of requests to send.

    Returns:
        A list with one dict per request. Completed requests carry
        ``success`` (status was 200), ``latency_ms`` and ``status``; failed
        requests carry ``success`` False, ``latency_ms`` None and ``error``.
    """
    results = []
    async with httpx.AsyncClient(timeout=30.0) as client:
        for i in range(count):
            # perf_counter is monotonic; time.time() can jump when the wall
            # clock is adjusted and would corrupt the latency figure.
            start = time.perf_counter()
            try:
                resp = await client.get(url)
            except Exception as e:
                results.append({
                    "success": False,
                    "latency_ms": None,
                    "error": str(e),
                })
                print(f" [{i+1}/{count}] ✗ Error: {e}")
                continue

            elapsed = (time.perf_counter() - start) * 1000  # ms
            results.append({
                "success": resp.status_code == 200,
                "latency_ms": elapsed,
                "status": resp.status_code,
            })
            print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
    return results
|
||||||
|
|
||||||
|
def start_container(client, image: str, name: str, port: int):
    """Start a fresh container and wait until its /health endpoint returns 200.

    Any existing container with the same name is stopped and removed first.

    Args:
        client: Docker client exposing ``containers.get`` / ``containers.run``.
        image: Image to run.
        name: Container name.
        port: Port published on localhost and used for the health probe.

    Returns:
        The running container object.

    Raises:
        TimeoutError: If the container is not healthy after 30 one-second
            attempts. The container is removed before raising so it does not
            linger, since the caller never receives a handle to clean it up.
    """
    # Clean up existing
    try:
        old = client.containers.get(name)
        print(f"🧹 Stopping existing container '{name}'...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print(f"🚀 Starting container '{name}' from image '{image}'...")
    container = client.containers.run(
        image,
        name=name,
        ports={f"{port}/tcp": port},
        detach=True,
        shm_size="1g",
        environment={"PYTHON_ENV": "production"},
    )

    # Wait for health
    print("⏳ Waiting for container to be healthy...")
    health_url = f"http://localhost:{port}/health"
    for _ in range(30):  # 30s timeout
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            # httpx is already a declared dependency of this script; only
            # transport-level failures (server not accepting yet) are retried.
            resp = httpx.get(health_url, timeout=2)
        except httpx.HTTPError:
            continue
        if resp.status_code == 200:
            print("✅ Container healthy!")
            return container

    # Best-effort cleanup; the timeout below is the error worth reporting.
    try:
        container.remove(force=True)
    except docker.errors.APIError as e:
        print(f"⚠️ Could not remove unhealthy container: {e}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
def stop_container(container):
    """Stop and remove the container.

    Removal is forced and runs even when the graceful stop raises, so a
    failed stop does not leave the named container behind. An exception from
    ``stop()`` still propagates after the removal attempt.
    """
    print("🛑 Stopping container...")
    try:
        container.stop()
    finally:
        container.remove(force=True)
    print("✅ Container removed")
|
||||||
|
|
||||||
|
async def main():
    """Run Test 1: start the container, hit /health, report success and latency.

    Returns:
        0 when every request succeeded, 1 when any request failed or an error
        occurred (including failure to reach the Docker daemon).
    """
    print("="*60)
    print("TEST 1: Basic Container Health + Single Endpoint")
    print("="*60)

    client = None
    container = None

    try:
        # Created inside the try so an unreachable Docker daemon is reported
        # as a test error instead of an unhandled traceback.
        client = docker.from_env()

        # Start container
        container = start_container(client, IMAGE, CONTAINER_NAME, PORT)

        # Test /health endpoint
        print(f"\n📊 Testing /health endpoint ({REQUESTS} requests)...")
        url = f"http://localhost:{PORT}/health"
        results = await test_endpoint(url, REQUESTS)

        # Calculate stats
        total = len(results)
        successes = sum(1 for r in results if r["success"])
        # An empty run counts as 0% rather than dividing by zero.
        success_rate = (successes / total) * 100 if total else 0.0
        latencies = [r["latency_ms"] for r in results if r["latency_ms"] is not None]
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        # Print results
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Success Rate: {success_rate:.1f}% ({successes}/{total})")
        print(f" Avg Latency: {avg_latency:.0f}ms")
        if latencies:
            print(f" Min Latency: {min(latencies):.0f}ms")
            print(f" Max Latency: {max(latencies):.0f}ms")
        print(f"{'='*60}")

        # Pass/Fail
        if success_rate >= 100:
            print("✅ TEST PASSED")
            return 0
        print("❌ TEST FAILED (expected 100% success rate)")
        return 1

    except Exception as e:
        print(f"\n❌ TEST ERROR: {e}")
        return 1
    finally:
        try:
            if container:
                stop_container(container)
        finally:
            if client is not None:
                client.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use the test result as the process exit status. SystemExit is raised
    # directly; the `exit` helper comes from the `site` module and is meant
    # for interactive sessions.
    raise SystemExit(asyncio.run(main()))
|
||||||
205
deploy/docker/tests/test_2_memory.py
Executable file
205
deploy/docker/tests/test_2_memory.py
Executable file
@@ -0,0 +1,205 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 2: Docker Stats Monitoring
|
||||||
|
- Extends Test 1 with real-time container stats
|
||||||
|
- Monitors memory % and CPU during requests
|
||||||
|
- Reports baseline, peak, and final memory
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Config
|
||||||
|
IMAGE = "crawl4ai-local:latest"
|
||||||
|
CONTAINER_NAME = "crawl4ai-test"
|
||||||
|
PORT = 11235
|
||||||
|
REQUESTS = 20 # More requests to see memory usage
|
||||||
|
|
||||||
|
# Stats tracking
|
||||||
|
stats_history = []
|
||||||
|
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Background thread body: append memory/CPU samples to ``stats_history``.

    Reads the container's streaming stats until ``stop_monitoring`` is set.
    Each sample is a dict with ``timestamp``, ``memory_mb``,
    ``memory_percent`` and ``cpu_percent``. Samples with missing or
    malformed fields are skipped.
    """
    mib = 1024 * 1024
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break

        try:
            # Extract memory stats
            mem_usage = stat['memory_stats'].get('usage', 0) / mib  # MB
            mem_limit = stat['memory_stats'].get('limit', 1) / mib
            mem_percent = (mem_usage / mem_limit * 100) if mem_limit > 0 else 0

            # Extract CPU stats (handle missing fields on Mac)
            cpu_percent = 0
            try:
                cpu_delta = (stat['cpu_stats']['cpu_usage']['total_usage']
                             - stat['precpu_stats']['cpu_usage']['total_usage'])
                system_delta = (stat['cpu_stats'].get('system_cpu_usage', 0)
                                - stat['precpu_stats'].get('system_cpu_usage', 0))
                if system_delta > 0:
                    num_cpus = stat['cpu_stats'].get('online_cpus', 1)
                    cpu_percent = cpu_delta / system_delta * num_cpus * 100.0
            except (KeyError, ZeroDivisionError):
                pass

            stats_history.append({
                'timestamp': time.time(),
                'memory_mb': mem_usage,
                'memory_percent': mem_percent,
                'cpu_percent': cpu_percent,
            })
        except (KeyError, TypeError, AttributeError):
            # Skip malformed stats (missing section or non-numeric value);
            # anything else is a real bug and should surface.
            pass

        # Pause between samples, but wake immediately when asked to stop so
        # the caller's join() does not have to wait out the full delay.
        if stop_monitoring.wait(0.5):
            break
|
||||||
|
|
||||||
|
async def test_endpoint(url: str, count: int):
    """GET ``url`` ``count`` times sequentially and record each outcome.

    Progress is printed for every fifth completed request and for every
    failure.

    Returns:
        A list with one dict per request: ``success`` and ``latency_ms`` for
        completed requests, ``success`` False and ``error`` for failed ones
        (failed entries have no ``latency_ms`` key).
    """
    results = []
    async with httpx.AsyncClient(timeout=30.0) as client:
        for i in range(count):
            # Monotonic clock: immune to wall-clock adjustments mid-request.
            start = time.perf_counter()
            try:
                resp = await client.get(url)
            except Exception as e:
                results.append({"success": False, "error": str(e)})
                print(f" [{i+1}/{count}] ✗ Error: {e}")
                continue

            elapsed = (time.perf_counter() - start) * 1000
            results.append({
                "success": resp.status_code == 200,
                "latency_ms": elapsed,
            })
            if (i + 1) % 5 == 0:  # Print every 5 requests
                print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
    return results
|
||||||
|
|
||||||
|
def start_container(client, image: str, name: str, port: int):
    """Start a fresh container (4g memory limit) and wait for /health to return 200.

    Any existing container with the same name is stopped and removed first.

    Returns:
        The running container object.

    Raises:
        TimeoutError: If the container is not healthy after 30 one-second
            attempts. The container is removed before raising because the
            caller never receives a handle to clean it up.
    """
    try:
        old = client.containers.get(name)
        print(f"🧹 Stopping existing container '{name}'...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print(f"🚀 Starting container '{name}'...")
    container = client.containers.run(
        image,
        name=name,
        ports={f"{port}/tcp": port},
        detach=True,
        shm_size="1g",
        mem_limit="4g",  # Set explicit memory limit
    )

    print("⏳ Waiting for health...")
    health_url = f"http://localhost:{port}/health"
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            # httpx is already a declared dependency of this script; only
            # transport-level failures (server not accepting yet) are retried.
            resp = httpx.get(health_url, timeout=2)
        except httpx.HTTPError:
            continue
        if resp.status_code == 200:
            print("✅ Container healthy!")
            return container

    # Best-effort cleanup; the timeout below is the error worth reporting.
    try:
        container.remove(force=True)
    except docker.errors.APIError as e:
        print(f"⚠️ Could not remove unhealthy container: {e}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
def stop_container(container):
    """Stop and remove the container.

    Removal is forced and runs even when the graceful stop raises, so a
    failed stop does not leave the named container behind. An exception from
    ``stop()`` still propagates after the removal attempt.
    """
    print("🛑 Stopping container...")
    try:
        container.stop()
    finally:
        container.remove(force=True)
|
||||||
|
|
||||||
|
async def main():
    """Run Test 2: hit /health while sampling container memory in a thread.

    Returns:
        0 when every request succeeded and memory grew by less than 100 MB
        between the baseline and the final sample; 1 otherwise or on error.
    """
    print("="*60)
    print("TEST 2: Docker Stats Monitoring")
    print("="*60)

    client = None
    container = None
    monitor_thread = None

    try:
        # Inside the try so an unreachable Docker daemon is reported as a
        # test error rather than an unhandled traceback.
        client = docker.from_env()

        # Start container
        container = start_container(client, IMAGE, CONTAINER_NAME, PORT)

        # Start stats monitoring in background
        print("\n📊 Starting stats monitor...")
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        # Wait a bit for baseline
        await asyncio.sleep(2)
        # The stats stream can take longer than that to deliver its first
        # sample; allow a little extra time instead of assuming 0 MB.
        for _ in range(10):
            if stats_history:
                break
            await asyncio.sleep(0.5)
        baseline_sampled = bool(stats_history)
        baseline_mem = stats_history[-1]['memory_mb'] if baseline_sampled else 0
        print(f"📏 Baseline memory: {baseline_mem:.1f} MB")

        # Test /health endpoint
        print(f"\n🔄 Running {REQUESTS} requests to /health...")
        url = f"http://localhost:{PORT}/health"
        results = await test_endpoint(url, REQUESTS)

        # Wait a bit to capture peak
        await asyncio.sleep(1)

        # Stop monitoring
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Calculate stats
        total = len(results)
        successes = sum(1 for r in results if r.get("success"))
        success_rate = (successes / total) * 100 if total else 0.0
        latencies = [r["latency_ms"] for r in results if "latency_ms" in r]
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        # Memory stats (snapshot: the monitor thread may still be appending)
        memory_samples = [s['memory_mb'] for s in list(stats_history)]
        peak_mem = max(memory_samples) if memory_samples else 0
        final_mem = memory_samples[-1] if memory_samples else 0
        if not baseline_sampled and memory_samples:
            # No sample existed at baseline time; comparing against 0 would
            # report the whole footprint as growth and fail spuriously.
            baseline_mem = memory_samples[0]
        mem_delta = final_mem - baseline_mem

        # Print results
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Success Rate: {success_rate:.1f}% ({successes}/{total})")
        print(f" Avg Latency: {avg_latency:.0f}ms")
        print("\n Memory Stats:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB")
        print(f" Final: {final_mem:.1f} MB")
        print(f" Delta: {mem_delta:+.1f} MB")
        print(f"{'='*60}")

        # Pass/Fail
        if success_rate >= 100 and mem_delta < 100:  # No significant memory growth
            print("✅ TEST PASSED")
            return 0

        if success_rate < 100:
            print("❌ TEST FAILED (success rate < 100%)")
        if mem_delta >= 100:
            print(f"⚠️ WARNING: Memory grew by {mem_delta:.1f} MB")
        return 1

    except Exception as e:
        print(f"\n❌ TEST ERROR: {e}")
        return 1
    finally:
        stop_monitoring.set()
        try:
            if container:
                stop_container(container)
        finally:
            if client is not None:
                client.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use the test result as the process exit status. SystemExit is raised
    # directly; the `exit` helper comes from the `site` module and is meant
    # for interactive sessions.
    raise SystemExit(asyncio.run(main()))
|
||||||
229
deploy/docker/tests/test_3_pool.py
Executable file
229
deploy/docker/tests/test_3_pool.py
Executable file
@@ -0,0 +1,229 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 3: Pool Validation - Permanent Browser Reuse
|
||||||
|
- Tests /html endpoint (should use permanent browser)
|
||||||
|
- Monitors container logs for pool hit markers
|
||||||
|
- Validates browser reuse rate
|
||||||
|
- Checks memory after browser creation
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Config
|
||||||
|
IMAGE = "crawl4ai-local:latest"
|
||||||
|
CONTAINER_NAME = "crawl4ai-test"
|
||||||
|
PORT = 11235
|
||||||
|
REQUESTS = 30
|
||||||
|
|
||||||
|
# Stats tracking
|
||||||
|
stats_history = []
|
||||||
|
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Background thread body: append memory samples to ``stats_history``.

    Reads the container's streaming stats until ``stop_monitoring`` is set.
    Each sample is a dict with ``timestamp`` and ``memory_mb``. Samples with
    missing or malformed memory fields are skipped.
    """
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break
        try:
            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
            stats_history.append({
                'timestamp': time.time(),
                'memory_mb': mem_usage,
            })
        except (KeyError, TypeError, AttributeError):
            # Malformed sample: skip it. A bare except here would also hide
            # genuine programming errors and interpreter-exit signals.
            pass
        # Pause between samples, but wake immediately when asked to stop.
        if stop_monitoring.wait(0.5):
            break
|
||||||
|
|
||||||
|
def count_log_markers(container):
    """Count browser-pool usage markers in the container's logs.

    Returns:
        Dict with ``permanent_hits``, ``hot_hits``, ``cold_hits``,
        ``new_created`` and ``total_hits`` (sum of the three hit counters).
    """
    # Logs can contain bytes that are not valid UTF-8; replace them instead
    # of letting a decode error abort the analysis after the run completed.
    logs = container.logs().decode('utf-8', errors='replace')

    permanent_hits = logs.count("🔥 Using permanent browser")
    hot_hits = logs.count("♨️ Using hot pool browser")
    cold_hits = logs.count("❄️ Using cold pool browser")
    new_created = logs.count("🆕 Creating new browser")

    return {
        'permanent_hits': permanent_hits,
        'hot_hits': hot_hits,
        'cold_hits': cold_hits,
        'new_created': new_created,
        'total_hits': permanent_hits + hot_hits + cold_hits,
    }
|
||||||
|
|
||||||
|
async def test_endpoint(url: str, count: int):
    """POST a fixed httpbin page to ``url`` ``count`` times and record outcomes.

    Progress is printed for every tenth completed request and for every
    failure.

    Returns:
        A list with one dict per request: ``success`` and ``latency_ms`` for
        completed requests, ``success`` False and ``error`` for failed ones
        (failed entries have no ``latency_ms`` key).
    """
    results = []
    async with httpx.AsyncClient(timeout=60.0) as client:
        for i in range(count):
            # Monotonic clock: immune to wall-clock adjustments mid-request.
            start = time.perf_counter()
            try:
                resp = await client.post(url, json={"url": "https://httpbin.org/html"})
            except Exception as e:
                results.append({"success": False, "error": str(e)})
                print(f" [{i+1}/{count}] ✗ Error: {e}")
                continue

            elapsed = (time.perf_counter() - start) * 1000
            results.append({
                "success": resp.status_code == 200,
                "latency_ms": elapsed,
            })
            if (i + 1) % 10 == 0:
                print(f" [{i+1}/{count}] ✓ {resp.status_code} - {elapsed:.0f}ms")
    return results
|
||||||
|
|
||||||
|
def start_container(client, image: str, name: str, port: int):
    """Start a fresh container (4g memory limit) and wait for /health to return 200.

    Any existing container with the same name is stopped and removed first.

    Returns:
        The running container object.

    Raises:
        TimeoutError: If the container is not healthy after 30 one-second
            attempts. The container is removed before raising because the
            caller never receives a handle to clean it up.
    """
    try:
        old = client.containers.get(name)
        print("🧹 Stopping existing container...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print("🚀 Starting container...")
    container = client.containers.run(
        image,
        name=name,
        ports={f"{port}/tcp": port},
        detach=True,
        shm_size="1g",
        mem_limit="4g",
    )

    print("⏳ Waiting for health...")
    health_url = f"http://localhost:{port}/health"
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            # httpx is already a declared dependency of this script; only
            # transport-level failures (server not accepting yet) are retried.
            resp = httpx.get(health_url, timeout=2)
        except httpx.HTTPError:
            continue
        if resp.status_code == 200:
            print("✅ Container healthy!")
            return container

    # Best-effort cleanup; the timeout below is the error worth reporting.
    try:
        container.remove(force=True)
    except docker.errors.APIError as e:
        print(f"⚠️ Could not remove unhealthy container: {e}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
def stop_container(container):
    """Stop and remove the container.

    Removal is forced and runs even when the graceful stop raises, so a
    failed stop does not leave the named container behind. An exception from
    ``stop()`` still propagates after the removal attempt.
    """
    print("🛑 Stopping container...")
    try:
        container.stop()
    finally:
        container.remove(force=True)
|
||||||
|
|
||||||
|
async def main():
    """Run Test 3: hit /html repeatedly and validate browser-pool reuse.

    Pool usage is derived from marker strings counted in the container logs.

    Returns:
        0 when every request succeeded and the reuse rate (pool hits per
        request) is at least 80%; 1 otherwise or on error. Low permanent-hit
        counts and memory growth above 200 MB only produce warnings.
    """
    print("="*60)
    print("TEST 3: Pool Validation - Permanent Browser Reuse")
    print("="*60)

    client = None
    container = None
    monitor_thread = None

    try:
        # Inside the try so an unreachable Docker daemon is reported as a
        # test error rather than an unhandled traceback.
        client = docker.from_env()

        # Start container
        container = start_container(client, IMAGE, CONTAINER_NAME, PORT)

        # Wait for permanent browser initialization
        print("\n⏳ Waiting for permanent browser init (3s)...")
        await asyncio.sleep(3)

        # Start stats monitoring
        print("📊 Starting stats monitor...")
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        await asyncio.sleep(1)
        # The stats stream may not have delivered a sample after one second;
        # allow a little extra time instead of assuming a 0 MB baseline.
        for _ in range(10):
            if stats_history:
                break
            await asyncio.sleep(0.5)
        baseline_sampled = bool(stats_history)
        baseline_mem = stats_history[-1]['memory_mb'] if baseline_sampled else 0
        print(f"📏 Baseline (with permanent browser): {baseline_mem:.1f} MB")

        # Test /html endpoint (uses permanent browser for default config)
        print(f"\n🔄 Running {REQUESTS} requests to /html...")
        url = f"http://localhost:{PORT}/html"
        results = await test_endpoint(url, REQUESTS)

        # Wait a bit
        await asyncio.sleep(1)

        # Stop monitoring
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Analyze logs for pool markers
        print("\n📋 Analyzing pool usage...")
        pool_stats = count_log_markers(container)

        # Calculate request stats
        total_requests = len(results)
        successes = sum(1 for r in results if r.get("success"))
        success_rate = (successes / total_requests) * 100 if total_requests else 0.0
        latencies = [r["latency_ms"] for r in results if "latency_ms" in r]
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        # Memory stats (snapshot: the monitor thread may still be appending)
        memory_samples = [s['memory_mb'] for s in list(stats_history)]
        peak_mem = max(memory_samples) if memory_samples else 0
        final_mem = memory_samples[-1] if memory_samples else 0
        if not baseline_sampled and memory_samples:
            # No sample existed at baseline time; comparing against 0 would
            # report the whole footprint as growth.
            baseline_mem = memory_samples[0]
        mem_delta = final_mem - baseline_mem

        # Calculate reuse rate
        total_pool_hits = pool_stats['total_hits']
        reuse_rate = (total_pool_hits / total_requests * 100) if total_requests > 0 else 0

        # Print results
        print(f"\n{'='*60}")
        print("RESULTS:")
        print(f" Success Rate: {success_rate:.1f}% ({successes}/{total_requests})")
        print(f" Avg Latency: {avg_latency:.0f}ms")
        print("\n Pool Stats:")
        print(f" 🔥 Permanent Hits: {pool_stats['permanent_hits']}")
        print(f" ♨️ Hot Pool Hits: {pool_stats['hot_hits']}")
        print(f" ❄️ Cold Pool Hits: {pool_stats['cold_hits']}")
        print(f" 🆕 New Created: {pool_stats['new_created']}")
        print(f" 📊 Reuse Rate: {reuse_rate:.1f}%")
        print("\n Memory Stats:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB")
        print(f" Final: {final_mem:.1f} MB")
        print(f" Delta: {mem_delta:+.1f} MB")
        print(f"{'='*60}")

        # Pass/Fail
        passed = True
        if success_rate < 100:
            print(f"❌ FAIL: Success rate {success_rate:.1f}% < 100%")
            passed = False
        if reuse_rate < 80:
            print(f"❌ FAIL: Reuse rate {reuse_rate:.1f}% < 80% (expected high permanent browser usage)")
            passed = False
        if pool_stats['permanent_hits'] < (total_requests * 0.8):
            print(f"⚠️ WARNING: Only {pool_stats['permanent_hits']} permanent hits out of {total_requests} requests")
        if mem_delta > 200:
            print(f"⚠️ WARNING: Memory grew by {mem_delta:.1f} MB (possible browser leak)")

        if not passed:
            return 1
        print("✅ TEST PASSED")
        return 0

    except Exception as e:
        print(f"\n❌ TEST ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        stop_monitoring.set()
        try:
            if container:
                stop_container(container)
        finally:
            if client is not None:
                client.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use the test result as the process exit status. SystemExit is raised
    # directly; the `exit` helper comes from the `site` module and is meant
    # for interactive sessions.
    raise SystemExit(asyncio.run(main()))
|
||||||
236
deploy/docker/tests/test_4_concurrent.py
Executable file
236
deploy/docker/tests/test_4_concurrent.py
Executable file
@@ -0,0 +1,236 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 4: Concurrent Load Testing
|
||||||
|
- Tests pool under concurrent load
|
||||||
|
- Escalates: 10 → 50 → 100 concurrent requests
|
||||||
|
- Validates latency distribution (P50, P95, P99)
|
||||||
|
- Monitors memory stability
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Config
|
||||||
|
IMAGE = "crawl4ai-local:latest"
|
||||||
|
CONTAINER_NAME = "crawl4ai-test"
|
||||||
|
PORT = 11235
|
||||||
|
LOAD_LEVELS = [
|
||||||
|
{"name": "Light", "concurrent": 10, "requests": 20},
|
||||||
|
{"name": "Medium", "concurrent": 50, "requests": 100},
|
||||||
|
{"name": "Heavy", "concurrent": 100, "requests": 200},
|
||||||
|
]
|
||||||
|
|
||||||
|
# Stats
|
||||||
|
stats_history = []
|
||||||
|
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Background thread body: append memory samples to ``stats_history``.

    Reads the container's streaming stats until ``stop_monitoring`` is set.
    Each sample is a dict with ``timestamp`` and ``memory_mb``. Samples with
    missing or malformed memory fields are skipped.
    """
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break
        try:
            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
        except (KeyError, TypeError, AttributeError):
            # Malformed sample: skip it. A bare except here would also hide
            # genuine programming errors and interpreter-exit signals.
            pass
        # Pause between samples, but wake immediately when asked to stop.
        if stop_monitoring.wait(0.5):
            break
|
||||||
|
|
||||||
|
def count_log_markers(container):
    """Count browser-pool usage markers in the container's logs.

    Returns:
        Dict with the number of occurrences of each marker, keyed
        ``permanent``, ``hot``, ``cold`` and ``new``.
    """
    # Logs can contain bytes that are not valid UTF-8; replace them instead
    # of letting a decode error abort the analysis.
    logs = container.logs().decode('utf-8', errors='replace')
    return {
        'permanent': logs.count("🔥 Using permanent browser"),
        'hot': logs.count("♨️ Using hot pool browser"),
        'cold': logs.count("❄️ Using cold pool browser"),
        'new': logs.count("🆕 Creating new browser"),
    }
|
||||||
|
|
||||||
|
async def hit_endpoint(client, url, payload, semaphore):
    """POST ``payload`` to ``url`` once, holding ``semaphore`` for the duration.

    Args:
        client: Async HTTP client exposing ``post(url, json=..., timeout=...)``.
        url: Target endpoint.
        payload: JSON-serialisable request body.
        semaphore: Limits how many requests are in flight at once.

    Returns:
        ``{"success": bool, "latency_ms": float}`` when a response arrived
        (success means status 200), or ``{"success": False, "error": str}``
        when the request raised.
    """
    async with semaphore:
        # Timed after the semaphore is acquired so queueing is excluded;
        # perf_counter is monotonic, unlike the wall clock.
        start = time.perf_counter()
        try:
            resp = await client.post(url, json=payload, timeout=60.0)
        except Exception as e:
            return {"success": False, "error": str(e)}
        elapsed = (time.perf_counter() - start) * 1000
        return {"success": resp.status_code == 200, "latency_ms": elapsed}
|
||||||
|
|
||||||
|
async def run_concurrent_test(url, payload, concurrent, total_requests):
    """Send ``total_requests`` POSTs with at most ``concurrent`` in flight.

    All requests share one HTTP client; the returned list holds one result
    dict per request, in submission order.
    """
    limiter = asyncio.Semaphore(concurrent)
    async with httpx.AsyncClient() as http:
        pending = []
        for _ in range(total_requests):
            pending.append(hit_endpoint(http, url, payload, limiter))
        outcomes = await asyncio.gather(*pending)
    return outcomes
|
||||||
|
|
||||||
|
def calculate_percentiles(latencies):
    """Return the (P50, P95, P99) latencies using the nearest-rank method.

    The p-th percentile is the smallest sample such that at least p% of the
    samples are less than or equal to it, i.e. the value at 1-based rank
    ``ceil(p / 100 * n)`` of the sorted data. Indexing with ``int(n * p)``
    would land one rank too high (for example, P50 of two samples would be
    the larger one and P99 of 100 samples would be the maximum).

    Args:
        latencies: Iterable of numeric latency samples; may be empty.

    Returns:
        Tuple ``(p50, p95, p99)``; ``(0, 0, 0)`` when there are no samples.
    """
    if not latencies:
        return 0, 0, 0
    sorted_lat = sorted(latencies)
    n = len(sorted_lat)

    def nearest_rank(percent):
        # Integer ceiling of percent * n / 100 avoids float rounding error;
        # the result is in 1..n for percent in 1..100, so rank - 1 is valid.
        rank = (percent * n + 99) // 100
        return sorted_lat[rank - 1]

    return nearest_rank(50), nearest_rank(95), nearest_rank(99)
|
||||||
|
|
||||||
|
def start_container(client, image, name, port):
    """Start a fresh container (4g memory limit) and wait for /health to return 200.

    Any existing container with the same name is stopped and removed first.

    Returns:
        The running container object.

    Raises:
        TimeoutError: If the container is not healthy after 30 one-second
            attempts. The container is removed before raising because the
            caller never receives a handle to clean it up.
    """
    try:
        old = client.containers.get(name)
        print("🧹 Stopping existing container...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print("🚀 Starting container...")
    container = client.containers.run(
        image, name=name, ports={f"{port}/tcp": port},
        detach=True, shm_size="1g", mem_limit="4g",
    )

    print("⏳ Waiting for health...")
    health_url = f"http://localhost:{port}/health"
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            # httpx is already a declared dependency of this script; only
            # transport-level failures (server not accepting yet) are retried.
            resp = httpx.get(health_url, timeout=2)
        except httpx.HTTPError:
            continue
        if resp.status_code == 200:
            print("✅ Container healthy!")
            return container

    # Best-effort cleanup; the timeout below is the error worth reporting.
    try:
        container.remove(force=True)
    except docker.errors.APIError as e:
        print(f"⚠️ Could not remove unhealthy container: {e}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
async def main():
    """Run the concurrent load test and return a process exit code.

    Starts the container, samples its memory in a background thread, runs
    every entry of ``LOAD_LEVELS`` through ``run_concurrent_test`` and prints
    per-level and overall statistics.

    Returns:
        0 when every load level reached a success rate of at least 99%,
        1 otherwise or when any exception occurred.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 4: Concurrent Load Testing")
    print(rule)

    docker_client = docker.from_env()
    container = None
    monitor_thread = None

    try:
        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)

        print("\n⏳ Waiting for permanent browser init (3s)...")
        await asyncio.sleep(3)

        # Reset the shared monitoring state, then begin sampling.
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        # Give the monitor a moment to record a first sample.
        await asyncio.sleep(1)
        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
        print(f"📏 Baseline: {baseline_mem:.1f} MB\n")

        target_url = f"http://localhost:{PORT}/html"
        body = {"url": "https://httpbin.org/html"}

        all_results = []
        level_stats = []

        for level in LOAD_LEVELS:
            print(rule)
            print(f"🔄 {level['name']} Load: {level['concurrent']} concurrent, {level['requests']} total")
            print(rule)

            began = time.time()
            results = await run_concurrent_test(target_url, body, level['concurrent'], level['requests'])
            duration = time.time() - began

            total = len(results)
            successes = sum(bool(r.get("success")) for r in results)
            success_rate = (successes / total) * 100
            latencies = [r["latency_ms"] for r in results if "latency_ms" in r]
            p50, p95, p99 = calculate_percentiles(latencies)
            avg_lat = sum(latencies) / len(latencies) if latencies else 0

            print(f" Duration: {duration:.1f}s")
            print(f" Success: {success_rate:.1f}% ({successes}/{total})")
            print(f" Avg Latency: {avg_lat:.0f}ms")
            print(f" P50/P95/P99: {p50:.0f}ms / {p95:.0f}ms / {p99:.0f}ms")

            summary = {
                'name': level['name'],
                'concurrent': level['concurrent'],
                'success_rate': success_rate,
                'avg_latency': avg_lat,
                'p50': p50, 'p95': p95, 'p99': p99,
            }
            level_stats.append(summary)
            all_results.extend(results)

            await asyncio.sleep(2)  # Cool down between levels

        # Let a final sample land, then stop the monitor thread.
        await asyncio.sleep(1)
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Final stats
        pool_stats = count_log_markers(container)
        memory_samples = [sample['memory_mb'] for sample in stats_history]
        if memory_samples:
            peak_mem = max(memory_samples)
            final_mem = memory_samples[-1]
        else:
            peak_mem = 0
            final_mem = 0
        mem_delta = final_mem - baseline_mem

        print(f"\n{rule}")
        print("FINAL RESULTS:")
        print(rule)
        print(f" Total Requests: {len(all_results)}")
        print("\n Pool Utilization:")
        print(f" 🔥 Permanent: {pool_stats['permanent']}")
        print(f" ♨️ Hot: {pool_stats['hot']}")
        print(f" ❄️ Cold: {pool_stats['cold']}")
        print(f" 🆕 New: {pool_stats['new']}")
        print("\n Memory:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB")
        print(f" Final: {final_mem:.1f} MB")
        print(f" Delta: {mem_delta:+.1f} MB")
        print(rule)

        # Pass/Fail: only the success rate fails the test; latency and
        # memory growth merely produce warnings.
        passed = True
        for entry in level_stats:
            if entry['success_rate'] < 99:
                print(f"❌ FAIL: {entry['name']} success rate {entry['success_rate']:.1f}% < 99%")
                passed = False
            if entry['p99'] > 10000:  # 10s threshold
                print(f"⚠️ WARNING: {entry['name']} P99 latency {entry['p99']:.0f}ms very high")

        if mem_delta > 300:
            print(f"⚠️ WARNING: Memory grew {mem_delta:.1f} MB")

        if not passed:
            return 1
        print("✅ TEST PASSED")
        return 0

    except Exception as exc:
        print(f"\n❌ TEST ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        # Always stop the monitor and tear the container down.
        stop_monitoring.set()
        if container:
            print("🛑 Stopping container...")
            container.stop()
            container.remove()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit(asyncio.run(main()))
|
||||||
267
deploy/docker/tests/test_5_pool_stress.py
Executable file
267
deploy/docker/tests/test_5_pool_stress.py
Executable file
@@ -0,0 +1,267 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 5: Pool Stress - Mixed Configs
|
||||||
|
- Tests hot/cold pool with different browser configs
|
||||||
|
- Uses different viewports to create config variants
|
||||||
|
- Validates cold → hot promotion after 3 uses
|
||||||
|
- Monitors pool tier distribution
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
import random
|
||||||
|
|
||||||
|
# Config
# IMAGE, CONTAINER_NAME and PORT are handed to start_container() by main();
# PORT is also used to build the request URL.
IMAGE = "crawl4ai-local:latest"
CONTAINER_NAME = "crawl4ai-test"
PORT = 11235
REQUESTS_PER_CONFIG = 5  # 5 requests per config variant

# Different viewport configs to test pool tiers
VIEWPORT_CONFIGS = [
    None,  # Default (permanent browser)
    {"width": 1920, "height": 1080},  # Desktop
    {"width": 1024, "height": 768},  # Tablet
    {"width": 375, "height": 667},  # Mobile
]

# Stats
# stats_history collects the memory samples appended by monitor_stats();
# stop_monitoring is the event that tells monitor_stats() to stop.
stats_history = []
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Record container memory samples in ``stats_history`` until stopped.

    Started by ``main()`` as a background thread. Every record read from the
    container's stats stream adds one ``{'timestamp', 'memory_mb'}`` entry;
    the loop ends once ``stop_monitoring`` is set.

    Args:
        container: Container object whose ``stats(decode=True, stream=True)``
            stream is consumed.
    """
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break
        try:
            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
        except (KeyError, TypeError, AttributeError):
            # Incomplete or malformed record: skip this sample. Only the
            # lookup errors are ignored, so other failures stay visible.
            pass
        else:
            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
        # Pause between samples, but wake up as soon as a stop is requested
        # so the thread can finish within the caller's join() timeout.
        if stop_monitoring.wait(0.5):
            break
|
||||||
|
|
||||||
|
def analyze_pool_logs(container):
    """Count browser-pool marker lines in the container's log output.

    Args:
        container: Object whose ``logs()`` method returns the log as bytes.

    Returns:
        Dict with the occurrence counts ``permanent``, ``hot``, ``cold``,
        ``new`` and ``promotions``, plus ``total`` (permanent + hot + cold).
    """
    # errors="replace": container output is arbitrary bytes; one invalid
    # sequence must not abort the analysis after the run has finished.
    logs = container.logs().decode('utf-8', errors='replace')

    markers = {
        'permanent': "🔥 Using permanent browser",
        'hot': "♨️ Using hot pool browser",
        'cold': "❄️ Using cold pool browser",
        'new': "🆕 Creating new browser",
        'promotions': "⬆️ Promoting to hot pool",
    }
    counts = {key: logs.count(text) for key, text in markers.items()}
    # "total" covers requests served by an existing browser of any tier.
    counts['total'] = counts['permanent'] + counts['hot'] + counts['cold']
    return counts
|
||||||
|
|
||||||
|
async def crawl_with_viewport(client, url, viewport):
    """POST one crawl request, optionally with a custom browser viewport.

    Args:
        client: Async HTTP client exposing ``post(url, json=..., timeout=...)``.
        url: Crawl endpoint to call.
        viewport: ``{"width", "height"}`` dict, or a falsy value to send an
            empty ``browser_config``.

    Returns:
        ``{"success", "latency_ms", "viewport"}`` when a response arrived
        (success means HTTP 200), or ``{"success": False, "error",
        "viewport"}`` when the request raised.
    """
    browser_config = {}
    if viewport:
        # A non-empty browser_config carries the requested viewport.
        browser_config = {
            "type": "BrowserConfig",
            "params": {
                "viewport": {"type": "dict", "value": viewport},
                "headless": True,
                "text_mode": True,
                "extra_args": [
                    "--no-sandbox",
                    "--disable-dev-shm-usage",
                    "--disable-gpu",
                    "--disable-software-rasterizer",
                    "--disable-web-security",
                    "--allow-insecure-localhost",
                    "--ignore-certificate-errors"
                ]
            }
        }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "browser_config": browser_config,
        "crawler_config": {}
    }

    began = time.time()
    try:
        response = await client.post(url, json=payload, timeout=60.0)
        latency_ms = (time.time() - began) * 1000
        outcome = {"success": response.status_code == 200, "latency_ms": latency_ms, "viewport": viewport}
    except Exception as exc:
        outcome = {"success": False, "error": str(exc), "viewport": viewport}
    return outcome
|
||||||
|
|
||||||
|
def start_container(client, image, name, port):
    """Start a fresh container and wait until its ``/health`` endpoint returns 200.

    A container already registered under ``name`` is stopped and removed first.

    Args:
        client: Docker SDK client (``docker.from_env()`` in this script).
        image: Image reference to run.
        name: Name assigned to the new container.
        port: TCP port published on the host under the same number and used
            for the health probe.

    Returns:
        The started container object once the health probe succeeds.

    Raises:
        TimeoutError: If the probe has not succeeded after 30 one-second
            polls. The container is stopped and removed before raising.
    """
    # Imported once, up front: a missing ``requests`` package now fails
    # immediately instead of being swallowed by the polling loop and
    # reported as a misleading start-up timeout.
    import requests

    try:
        old = client.containers.get(name)
        print("🧹 Stopping existing container...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print("🚀 Starting container...")
    container = client.containers.run(
        image, name=name, ports={f"{port}/tcp": port},
        detach=True, shm_size="1g", mem_limit="4g",
    )

    print("⏳ Waiting for health...")
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            healthy = requests.get(f"http://localhost:{port}/health", timeout=2).status_code == 200
        except requests.RequestException:
            # Server not accepting connections yet; keep polling. Only
            # request errors are ignored, so Ctrl-C still interrupts the wait.
            healthy = False
        if healthy:
            print("✅ Container healthy!")
            return container

    # The caller never receives the handle on failure, so release the
    # container here rather than leaking it.
    try:
        container.stop()
        container.remove()
    except docker.errors.APIError as exc:
        print(f"⚠️ Could not clean up container: {exc}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
async def main():
    """Run the mixed-config pool stress test and return a process exit code.

    Starts the container, samples its memory in a background thread, sends
    ``REQUESTS_PER_CONFIG`` requests for every entry of ``VIEWPORT_CONFIGS``
    in shuffled order, then reports success rate, pool usage counted from the
    container logs, and memory figures.

    Returns:
        0 when the overall success rate is at least 99%, 1 otherwise or when
        any exception occurred.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 5: Pool Stress - Mixed Configs")
    print(rule)

    docker_client = docker.from_env()
    container = None
    monitor_thread = None

    try:
        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)

        print("\n⏳ Waiting for permanent browser init (3s)...")
        await asyncio.sleep(3)

        # Reset the shared monitoring state, then begin sampling.
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        # Give the monitor a moment to record a first sample.
        await asyncio.sleep(1)
        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
        print(f"📏 Baseline: {baseline_mem:.1f} MB\n")

        crawl_url = f"http://localhost:{PORT}/crawl"

        print(f"Testing {len(VIEWPORT_CONFIGS)} different configs:")
        for number, vp in enumerate(VIEWPORT_CONFIGS, start=1):
            vp_str = "Default" if vp is None else f"{vp['width']}x{vp['height']}"
            print(f" {number}. {vp_str}")
        print()

        # Every config appears REQUESTS_PER_CONFIG times; shuffling mixes
        # the configs across the run.
        all_results = []
        config_sequence = list(VIEWPORT_CONFIGS) * REQUESTS_PER_CONFIG
        random.shuffle(config_sequence)
        planned = len(config_sequence)

        print(f"🔄 Running {planned} requests with mixed configs...")

        async with httpx.AsyncClient() as http_client:
            for done, viewport in enumerate(config_sequence, start=1):
                result = await crawl_with_viewport(http_client, crawl_url, viewport)
                all_results.append(result)

                # Progress line for every fifth request only.
                if done % 5 != 0:
                    continue
                if result['viewport'] is None:
                    vp_str = "default"
                else:
                    vp_str = f"{result['viewport']['width']}x{result['viewport']['height']}"
                status = "✓" if result.get('success') else "✗"
                lat = f"{result.get('latency_ms', 0):.0f}ms" if 'latency_ms' in result else "error"
                print(f" [{done}/{planned}] {status} {vp_str} - {lat}")

        # Let final samples land, then stop the monitor thread.
        await asyncio.sleep(2)
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Analyze results
        pool_stats = analyze_pool_logs(container)

        total = len(all_results)
        successes = sum(bool(r.get("success")) for r in all_results)
        success_rate = (successes / total) * 100
        latencies = [r["latency_ms"] for r in all_results if "latency_ms" in r]
        avg_lat = sum(latencies) / len(latencies) if latencies else 0

        memory_samples = [sample['memory_mb'] for sample in stats_history]
        if memory_samples:
            peak_mem = max(memory_samples)
            final_mem = memory_samples[-1]
        else:
            peak_mem = 0
            final_mem = 0
        mem_delta = final_mem - baseline_mem

        print(f"\n{rule}")
        print("RESULTS:")
        print(rule)
        print(f" Requests: {total}")
        print(f" Success Rate: {success_rate:.1f}% ({successes}/{total})")
        print(f" Avg Latency: {avg_lat:.0f}ms")
        print("\n Pool Statistics:")
        print(f" 🔥 Permanent: {pool_stats['permanent']}")
        print(f" ♨️ Hot: {pool_stats['hot']}")
        print(f" ❄️ Cold: {pool_stats['cold']}")
        print(f" 🆕 New: {pool_stats['new']}")
        print(f" ⬆️ Promotions: {pool_stats['promotions']}")
        print(f" 📊 Reuse: {(pool_stats['total'] / total * 100):.1f}%")
        print("\n Memory:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB")
        print(f" Final: {final_mem:.1f} MB")
        print(f" Delta: {mem_delta:+.1f} MB")
        print(rule)

        # Pass/Fail: only the success rate fails the test; the remaining
        # checks just print warnings or notes.
        passed = True

        if success_rate < 99:
            print(f"❌ FAIL: Success rate {success_rate:.1f}% < 99%")
            passed = False

        # Should see promotions since we repeat each config 5 times
        expected_promotions = len(VIEWPORT_CONFIGS) - 1  # -1 for default
        if pool_stats['promotions'] < expected_promotions:
            print(f"⚠️ WARNING: Only {pool_stats['promotions']} promotions (expected ~{expected_promotions})")

        # Should have created some browsers for different configs
        if pool_stats['new'] == 0:
            print("⚠️ NOTE: No new browsers created (all used default?)")

        if pool_stats['permanent'] == total:
            print("⚠️ NOTE: All requests used permanent browser (configs not varying enough?)")

        if mem_delta > 500:
            print(f"⚠️ WARNING: Memory grew {mem_delta:.1f} MB")

        if not passed:
            return 1
        print("✅ TEST PASSED")
        return 0

    except Exception as exc:
        print(f"\n❌ TEST ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        # Always stop the monitor and tear the container down.
        stop_monitoring.set()
        if container:
            print("🛑 Stopping container...")
            container.stop()
            container.remove()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit(asyncio.run(main()))
|
||||||
234
deploy/docker/tests/test_6_multi_endpoint.py
Executable file
234
deploy/docker/tests/test_6_multi_endpoint.py
Executable file
@@ -0,0 +1,234 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 6: Multi-Endpoint Testing
|
||||||
|
- Tests multiple endpoints together: /html, /screenshot, /pdf, /crawl
|
||||||
|
- Validates each endpoint works correctly
|
||||||
|
- Monitors success rates per endpoint
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Config
# IMAGE, CONTAINER_NAME and PORT are handed to start_container() by main();
# PORT is also used to build the base URL.
IMAGE = "crawl4ai-local:latest"
CONTAINER_NAME = "crawl4ai-test"
PORT = 11235
REQUESTS_PER_ENDPOINT = 10  # Request count passed to each endpoint test

# Stats
# stats_history collects the memory samples appended by monitor_stats();
# stop_monitoring is the event that tells monitor_stats() to stop.
stats_history = []
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Record container memory samples in ``stats_history`` until stopped.

    Started by ``main()`` as a background thread. Every record read from the
    container's stats stream adds one ``{'timestamp', 'memory_mb'}`` entry;
    the loop ends once ``stop_monitoring`` is set.

    Args:
        container: Container object whose ``stats(decode=True, stream=True)``
            stream is consumed.
    """
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break
        try:
            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
        except (KeyError, TypeError, AttributeError):
            # Incomplete or malformed record: skip this sample. Only the
            # lookup errors are ignored, so other failures stay visible.
            pass
        else:
            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
        # Pause between samples, but wake up as soon as a stop is requested
        # so the thread can finish within the caller's join() timeout.
        if stop_monitoring.wait(0.5):
            break
|
||||||
|
|
||||||
|
async def test_html(client, base_url, count):
|
||||||
|
"""Test /html endpoint."""
|
||||||
|
url = f"{base_url}/html"
|
||||||
|
results = []
|
||||||
|
for _ in range(count):
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
resp = await client.post(url, json={"url": "https://httpbin.org/html"}, timeout=30.0)
|
||||||
|
elapsed = (time.time() - start) * 1000
|
||||||
|
results.append({"success": resp.status_code == 200, "latency_ms": elapsed})
|
||||||
|
except Exception as e:
|
||||||
|
results.append({"success": False, "error": str(e)})
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def test_screenshot(client, base_url, count):
|
||||||
|
"""Test /screenshot endpoint."""
|
||||||
|
url = f"{base_url}/screenshot"
|
||||||
|
results = []
|
||||||
|
for _ in range(count):
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
resp = await client.post(url, json={"url": "https://httpbin.org/html"}, timeout=30.0)
|
||||||
|
elapsed = (time.time() - start) * 1000
|
||||||
|
results.append({"success": resp.status_code == 200, "latency_ms": elapsed})
|
||||||
|
except Exception as e:
|
||||||
|
results.append({"success": False, "error": str(e)})
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def test_pdf(client, base_url, count):
|
||||||
|
"""Test /pdf endpoint."""
|
||||||
|
url = f"{base_url}/pdf"
|
||||||
|
results = []
|
||||||
|
for _ in range(count):
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
resp = await client.post(url, json={"url": "https://httpbin.org/html"}, timeout=30.0)
|
||||||
|
elapsed = (time.time() - start) * 1000
|
||||||
|
results.append({"success": resp.status_code == 200, "latency_ms": elapsed})
|
||||||
|
except Exception as e:
|
||||||
|
results.append({"success": False, "error": str(e)})
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def test_crawl(client, base_url, count):
|
||||||
|
"""Test /crawl endpoint."""
|
||||||
|
url = f"{base_url}/crawl"
|
||||||
|
results = []
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"browser_config": {},
|
||||||
|
"crawler_config": {}
|
||||||
|
}
|
||||||
|
for _ in range(count):
|
||||||
|
start = time.time()
|
||||||
|
try:
|
||||||
|
resp = await client.post(url, json=payload, timeout=30.0)
|
||||||
|
elapsed = (time.time() - start) * 1000
|
||||||
|
results.append({"success": resp.status_code == 200, "latency_ms": elapsed})
|
||||||
|
except Exception as e:
|
||||||
|
results.append({"success": False, "error": str(e)})
|
||||||
|
return results
|
||||||
|
|
||||||
|
def start_container(client, image, name, port):
    """Start a fresh container and wait until its ``/health`` endpoint returns 200.

    A container already registered under ``name`` is stopped and removed first.

    Args:
        client: Docker SDK client (``docker.from_env()`` in this script).
        image: Image reference to run.
        name: Name assigned to the new container.
        port: TCP port published on the host under the same number and used
            for the health probe.

    Returns:
        The started container object once the health probe succeeds.

    Raises:
        TimeoutError: If the probe has not succeeded after 30 one-second
            polls. The container is stopped and removed before raising.
    """
    # Imported once, up front: a missing ``requests`` package now fails
    # immediately instead of being swallowed by the polling loop and
    # reported as a misleading start-up timeout.
    import requests

    try:
        old = client.containers.get(name)
        print("🧹 Stopping existing container...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print("🚀 Starting container...")
    container = client.containers.run(
        image, name=name, ports={f"{port}/tcp": port},
        detach=True, shm_size="1g", mem_limit="4g",
    )

    print("⏳ Waiting for health...")
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            healthy = requests.get(f"http://localhost:{port}/health", timeout=2).status_code == 200
        except requests.RequestException:
            # Server not accepting connections yet; keep polling. Only
            # request errors are ignored, so Ctrl-C still interrupts the wait.
            healthy = False
        if healthy:
            print("✅ Container healthy!")
            return container

    # The caller never receives the handle on failure, so release the
    # container here rather than leaking it.
    try:
        container.stop()
        container.remove()
    except docker.errors.APIError as exc:
        print(f"⚠️ Could not clean up container: {exc}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
async def main():
    """Run the multi-endpoint test and return a process exit code.

    Starts the container, samples its memory in a background thread, then
    sends ``REQUESTS_PER_ENDPOINT`` requests to each of ``/html``,
    ``/screenshot``, ``/pdf`` and ``/crawl`` and prints per-endpoint and
    memory statistics.

    Returns:
        0 when every endpoint reached a 100% success rate, 1 otherwise or
        when any exception occurred.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 6: Multi-Endpoint Testing")
    print(rule)

    docker_client = docker.from_env()
    container = None
    monitor_thread = None

    try:
        container = start_container(docker_client, IMAGE, CONTAINER_NAME, PORT)

        print("\n⏳ Waiting for permanent browser init (3s)...")
        await asyncio.sleep(3)

        # Reset the shared monitoring state, then begin sampling.
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        # Give the monitor a moment to record a first sample.
        await asyncio.sleep(1)
        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
        print(f"📏 Baseline: {baseline_mem:.1f} MB\n")

        base_url = f"http://localhost:{PORT}"

        # Endpoint path -> coroutine function that exercises it.
        endpoints = {
            "/html": test_html,
            "/screenshot": test_screenshot,
            "/pdf": test_pdf,
            "/crawl": test_crawl,
        }

        all_endpoint_stats = {}

        async with httpx.AsyncClient() as http_client:
            for endpoint_name, runner in endpoints.items():
                print(f"🔄 Testing {endpoint_name} ({REQUESTS_PER_ENDPOINT} requests)...")
                results = await runner(http_client, base_url, REQUESTS_PER_ENDPOINT)

                total = len(results)
                successes = sum(bool(r.get("success")) for r in results)
                success_rate = (successes / total) * 100
                latencies = [r["latency_ms"] for r in results if "latency_ms" in r]
                avg_lat = sum(latencies) / len(latencies) if latencies else 0

                all_endpoint_stats[endpoint_name] = {
                    'success_rate': success_rate,
                    'avg_latency': avg_lat,
                    'total': total,
                    'successes': successes
                }

                print(f" ✓ Success: {success_rate:.1f}% ({successes}/{total}), Avg: {avg_lat:.0f}ms")

        # Let a final sample land, then stop the monitor thread.
        await asyncio.sleep(1)
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Final stats
        memory_samples = [sample['memory_mb'] for sample in stats_history]
        if memory_samples:
            peak_mem = max(memory_samples)
            final_mem = memory_samples[-1]
        else:
            peak_mem = 0
            final_mem = 0

        print(f"\n{rule}")
        print("RESULTS:")
        print(rule)
        for endpoint, stats in all_endpoint_stats.items():
            print(f" {endpoint:12} Success: {stats['success_rate']:5.1f}% Avg: {stats['avg_latency']:6.0f}ms")

        print("\n Memory:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB")
        print(f" Final: {final_mem:.1f} MB")
        print(f" Delta: {final_mem - baseline_mem:+.1f} MB")
        print(rule)

        # Pass/Fail: every endpoint must succeed on every request.
        passed = True
        for endpoint, stats in all_endpoint_stats.items():
            if stats['success_rate'] < 100:
                print(f"❌ FAIL: {endpoint} success rate {stats['success_rate']:.1f}% < 100%")
                passed = False

        if not passed:
            return 1
        print("✅ TEST PASSED")
        return 0

    except Exception as exc:
        print(f"\n❌ TEST ERROR: {exc}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        # Always stop the monitor and tear the container down.
        stop_monitoring.set()
        if container:
            print("🛑 Stopping container...")
            container.stop()
            container.remove()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit(asyncio.run(main()))
|
||||||
199
deploy/docker/tests/test_7_cleanup.py
Executable file
199
deploy/docker/tests/test_7_cleanup.py
Executable file
@@ -0,0 +1,199 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test 7: Cleanup Verification (Janitor)
|
||||||
|
- Creates load spike then goes idle
|
||||||
|
- Verifies memory returns to near baseline
|
||||||
|
- Tests janitor cleanup of idle browsers
|
||||||
|
- Monitors memory recovery time
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import docker
|
||||||
|
import httpx
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Config
# IMAGE, CONTAINER_NAME and PORT are handed to start_container() by main();
# PORT is also used to build the request URL.
IMAGE = "crawl4ai-local:latest"
CONTAINER_NAME = "crawl4ai-test"
PORT = 11235
SPIKE_REQUESTS = 20  # Create some browsers
IDLE_TIME = 90  # Wait 90s for janitor (runs every 60s)

# Stats
# stats_history collects the memory samples appended by monitor_stats();
# stop_monitoring is the event that tells monitor_stats() to stop.
stats_history = []
stop_monitoring = Event()
|
||||||
|
|
||||||
|
def monitor_stats(container):
    """Record container memory samples in ``stats_history`` until stopped.

    Started by ``main()`` as a background thread. Every record read from the
    container's stats stream adds one ``{'timestamp', 'memory_mb'}`` entry;
    the loop ends once ``stop_monitoring`` is set.

    Args:
        container: Container object whose ``stats(decode=True, stream=True)``
            stream is consumed.
    """
    for stat in container.stats(decode=True, stream=True):
        if stop_monitoring.is_set():
            break
        try:
            mem_usage = stat['memory_stats'].get('usage', 0) / (1024 * 1024)
        except (KeyError, TypeError, AttributeError):
            # Incomplete or malformed record: skip this sample. Only the
            # lookup errors are ignored, so other failures stay visible.
            pass
        else:
            stats_history.append({'timestamp': time.time(), 'memory_mb': mem_usage})
        # Sample every 1s for this test, but wake up as soon as a stop is
        # requested so the thread can finish within the caller's join() timeout.
        if stop_monitoring.wait(1):
            break
|
||||||
|
|
||||||
|
def start_container(client, image, name, port):
    """Start a fresh container and wait until its ``/health`` endpoint returns 200.

    A container already registered under ``name`` is stopped and removed first.

    Args:
        client: Docker SDK client (``docker.from_env()`` in this script).
        image: Image reference to run.
        name: Name assigned to the new container.
        port: TCP port published on the host under the same number and used
            for the health probe.

    Returns:
        The started container object once the health probe succeeds.

    Raises:
        TimeoutError: If the probe has not succeeded after 30 one-second
            polls. The container is stopped and removed before raising.
    """
    # Imported once, up front: a missing ``requests`` package now fails
    # immediately instead of being swallowed by the polling loop and
    # reported as a misleading start-up timeout.
    import requests

    try:
        old = client.containers.get(name)
        print("🧹 Stopping existing container...")
        old.stop()
        old.remove()
    except docker.errors.NotFound:
        pass

    print("🚀 Starting container...")
    container = client.containers.run(
        image, name=name, ports={f"{port}/tcp": port},
        detach=True, shm_size="1g", mem_limit="4g",
    )

    print("⏳ Waiting for health...")
    for _ in range(30):
        time.sleep(1)
        container.reload()
        if container.status != "running":
            continue
        try:
            healthy = requests.get(f"http://localhost:{port}/health", timeout=2).status_code == 200
        except requests.RequestException:
            # Server not accepting connections yet; keep polling. Only
            # request errors are ignored, so Ctrl-C still interrupts the wait.
            healthy = False
        if healthy:
            print("✅ Container healthy!")
            return container

    # The caller never receives the handle on failure, so release the
    # container here rather than leaking it.
    try:
        container.stop()
        container.remove()
    except docker.errors.APIError as exc:
        print(f"⚠️ Could not clean up container: {exc}")
    raise TimeoutError("Container failed to start")
|
||||||
|
|
||||||
|
async def main():
    """Run the janitor cleanup test and return a process exit code.

    Starts the container, samples its memory in a background thread, fires
    ``SPIKE_REQUESTS`` concurrent crawl requests that cycle through three
    viewports, then stays idle for ``IDLE_TIME`` seconds while printing
    memory readings and finally reports how much memory was recovered.

    Returns:
        0 when the run completed and at least one spike request succeeded,
        1 when no spike request succeeded or any exception occurred. The
        memory thresholds only produce warnings.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 7: Cleanup Verification (Janitor)")
    print(rule)

    client = docker.from_env()
    container = None
    monitor_thread = None

    try:
        container = start_container(client, IMAGE, CONTAINER_NAME, PORT)

        print("\n⏳ Waiting for permanent browser init (3s)...")
        await asyncio.sleep(3)

        # Reset the shared monitoring state, then begin sampling.
        stop_monitoring.clear()
        stats_history.clear()
        monitor_thread = Thread(target=monitor_stats, args=(container,), daemon=True)
        monitor_thread.start()

        # Give the monitor time to record a first sample.
        await asyncio.sleep(2)
        baseline_mem = stats_history[-1]['memory_mb'] if stats_history else 0
        print(f"📏 Baseline: {baseline_mem:.1f} MB\n")

        # Create load spike with different configs to populate pool
        print(f"🔥 Creating load spike ({SPIKE_REQUESTS} requests with varied configs)...")
        url = f"http://localhost:{PORT}/crawl"

        viewports = [
            {"width": 1920, "height": 1080},
            {"width": 1024, "height": 768},
            {"width": 375, "height": 667},
        ]

        async with httpx.AsyncClient(timeout=60.0) as http_client:
            tasks = []
            for i in range(SPIKE_REQUESTS):
                vp = viewports[i % len(viewports)]
                payload = {
                    "urls": ["https://httpbin.org/html"],
                    "browser_config": {
                        "type": "BrowserConfig",
                        "params": {
                            "viewport": {"type": "dict", "value": vp},
                            "headless": True,
                            "text_mode": True,
                            "extra_args": [
                                "--no-sandbox", "--disable-dev-shm-usage",
                                "--disable-gpu", "--disable-software-rasterizer",
                                "--disable-web-security", "--allow-insecure-localhost",
                                "--ignore-certificate-errors"
                            ]
                        }
                    },
                    "crawler_config": {}
                }
                tasks.append(http_client.post(url, json=payload))

            # return_exceptions=True: a failed request shows up as an
            # exception object in the results instead of aborting the spike.
            results = await asyncio.gather(*tasks, return_exceptions=True)
            successes = sum(1 for r in results if hasattr(r, 'status_code') and r.status_code == 200)
            print(f" ✓ Spike completed: {successes}/{len(results)} successful")

        # Measure peak
        await asyncio.sleep(2)
        peak_mem = max(s['memory_mb'] for s in stats_history) if stats_history else baseline_mem
        print(f" 📊 Peak memory: {peak_mem:.1f} MB (+{peak_mem - baseline_mem:.1f} MB)")

        # Now go idle and wait for janitor
        print(f"\n⏸️ Going idle for {IDLE_TIME}s (janitor cleanup)...")
        print(" (Janitor runs every 60s, checking for idle browsers)")

        for elapsed in range(0, IDLE_TIME, 10):
            await asyncio.sleep(10)
            current_mem = stats_history[-1]['memory_mb'] if stats_history else 0
            print(f" [{elapsed+10:3d}s] Memory: {current_mem:.1f} MB")

        # Stop monitoring
        stop_monitoring.set()
        if monitor_thread:
            monitor_thread.join(timeout=2)

        # Analyze memory recovery
        final_mem = stats_history[-1]['memory_mb'] if stats_history else 0
        spike_growth = peak_mem - baseline_mem
        leftover = final_mem - baseline_mem
        recovery_mb = peak_mem - final_mem
        # Share of the spike's growth that was released again; 0 when the
        # spike produced no growth at all.
        recovery_pct = (recovery_mb / spike_growth * 100) if spike_growth > 0 else 0

        print(f"\n{rule}")
        print("RESULTS:")
        print(rule)
        print(" Memory Journey:")
        print(f" Baseline: {baseline_mem:.1f} MB")
        print(f" Peak: {peak_mem:.1f} MB (+{spike_growth:.1f} MB)")
        print(f" Final: {final_mem:.1f} MB (+{leftover:.1f} MB)")
        print(f" Recovered: {recovery_mb:.1f} MB ({recovery_pct:.1f}%)")
        print(rule)

        # Pass/Fail
        passed = True

        # A spike in which nothing succeeded exercised no browsers, so the
        # memory figures above say nothing about janitor cleanup.
        if successes == 0:
            print(f"❌ FAIL: No spike requests succeeded (0/{len(results)})")
            passed = False

        # Should have created some memory pressure
        if spike_growth < 100:
            print(f"⚠️ WARNING: Peak increase only {spike_growth:.1f} MB (expected more browsers)")

        # Should recover most memory (within 100MB of baseline)
        if leftover > 100:
            print(f"⚠️ WARNING: Memory didn't recover well (still +{leftover:.1f} MB above baseline)")
        else:
            print("✅ Good memory recovery!")

        # Baseline + 50MB tolerance
        if leftover < 50:
            print("✅ Excellent cleanup (within 50MB of baseline)")

        if not passed:
            return 1
        print("✅ TEST PASSED")
        return 0

    except Exception as e:
        print(f"\n❌ TEST ERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1
    finally:
        # Always stop the monitor and tear the container down.
        stop_monitoring.set()
        if container:
            print("🛑 Stopping container...")
            container.stop()
            container.remove()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status. SystemExit
    # is raised directly because the bare ``exit`` helper is injected by the
    # ``site`` module for interactive use and is not guaranteed to exist.
    raise SystemExit(asyncio.run(main()))
|
||||||
57
deploy/docker/tests/test_monitor_demo.py
Normal file
57
deploy/docker/tests/test_monitor_demo.py
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Quick test to generate monitor dashboard activity"""
|
||||||
|
import httpx
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
async def test_dashboard(base_url: str = "http://localhost:11235"):
    """Generate request traffic so the monitor dashboard has data to show.

    Issues two crawl requests, then reads the monitor's health, request-log
    and endpoint-stats endpoints and prints a short summary of each.

    Args:
        base_url: Root URL of the running server. Defaults to the local
            address the script originally hard-coded.

    Raises:
        httpx.HTTPStatusError: If one of the monitor endpoints answers with
            an error status (raised before its JSON body is inspected).
    """
    # Normalise once so every endpoint below can be appended with a single "/".
    base_url = base_url.rstrip("/")

    async with httpx.AsyncClient(timeout=30.0) as client:
        print("📊 Generating dashboard activity...")

        # Test 1: Simple crawl
        print("\n1️⃣ Running simple crawl...")
        r1 = await client.post(
            f"{base_url}/crawl",
            json={"urls": ["https://httpbin.org/html"], "crawler_config": {}},
        )
        print(f" Status: {r1.status_code}")

        # Test 2: Multiple URLs
        print("\n2️⃣ Running multi-URL crawl...")
        r2 = await client.post(
            f"{base_url}/crawl",
            json={
                "urls": [
                    "https://httpbin.org/html",
                    "https://httpbin.org/json",
                ],
                "crawler_config": {},
            },
        )
        print(f" Status: {r2.status_code}")

        # Test 3: Check monitor health
        print("\n3️⃣ Checking monitor health...")
        r3 = await client.get(f"{base_url}/monitor/health")
        # Fail with the HTTP status rather than a KeyError on an error body.
        r3.raise_for_status()
        health = r3.json()
        print(f" Memory: {health['container']['memory_percent']}%")
        print(f" Browsers: {health['pool']['permanent']['active']}")

        # Test 4: Check requests
        print("\n4️⃣ Checking request log...")
        r4 = await client.get(f"{base_url}/monitor/requests")
        r4.raise_for_status()
        reqs = r4.json()
        print(f" Active: {len(reqs['active'])}")
        print(f" Completed: {len(reqs['completed'])}")

        # Test 5: Check endpoint stats
        print("\n5️⃣ Checking endpoint stats...")
        r5 = await client.get(f"{base_url}/monitor/endpoints/stats")
        r5.raise_for_status()
        stats = r5.json()
        for endpoint, data in stats.items():
            print(f" {endpoint}: {data['count']} requests, {data['avg_latency_ms']}ms avg")

        print("\n✅ Dashboard should now show activity!")
        print(f"\n🌐 Open: {base_url}/dashboard")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: build the demo coroutine, then run it to
    # completion on a fresh event loop.
    demo = test_dashboard()
    asyncio.run(demo)
|
||||||
@@ -178,4 +178,29 @@ def verify_email_domain(email: str) -> bool:
|
|||||||
records = dns.resolver.resolve(domain, 'MX')
|
records = dns.resolver.resolve(domain, 'MX')
|
||||||
return True if records else False
|
return True if records else False
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_container_memory_percent() -> float:
    """Return memory usage as a percentage of the container's memory limit.

    Reads the cgroup v2 files first and falls back to the cgroup v1 files
    when the v2 usage file does not exist. If the cgroup reports no limit,
    the host's total memory (via psutil) is used as the limit instead. If
    the cgroup files cannot be read or parsed at all, the host-wide usage
    percentage from psutil is returned.

    Returns:
        Usage divided by limit, times 100.
    """
    try:
        # Try cgroup v2 first
        usage_path = Path("/sys/fs/cgroup/memory.current")
        limit_path = Path("/sys/fs/cgroup/memory.max")
        if not usage_path.exists():
            # Fall back to cgroup v1
            usage_path = Path("/sys/fs/cgroup/memory/memory.usage_in_bytes")
            limit_path = Path("/sys/fs/cgroup/memory/memory.limit_in_bytes")

        usage = int(usage_path.read_text())
        raw_limit = limit_path.read_text().strip()

        # Unlimited is spelled differently per version: cgroup v2 writes the
        # literal string "max" (which int() cannot parse), while cgroup v1
        # writes a huge sentinel number (> 1e18).
        limit = None if raw_limit == "max" else int(raw_limit)

        # No usable limit (unlimited, or a non-positive value that would make
        # the division meaningless): measure against host memory instead.
        if limit is None or limit <= 0 or limit > 1e18:
            import psutil
            limit = psutil.virtual_memory().total

        return (usage / limit) * 100
    except Exception:
        # Non-container or unsupported: fallback to host.
        # Deliberately broad so this stays best-effort, but no longer a bare
        # `except:` that would also swallow KeyboardInterrupt/SystemExit.
        import psutil
        return psutil.virtual_memory().percent
|
||||||
@@ -1,159 +0,0 @@
|
|||||||
"""
|
|
||||||
Webhook delivery service for Crawl4AI.
|
|
||||||
|
|
||||||
This module provides webhook notification functionality with exponential backoff retry logic.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
import httpx
|
|
||||||
import logging
|
|
||||||
from typing import Dict, Optional
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class WebhookDeliveryService:
    """Handles webhook delivery with exponential backoff retry logic."""

    def __init__(self, config: Dict):
        """
        Initialize the webhook delivery service.

        Args:
            config: Application configuration dictionary containing webhook settings.
                Settings are read from its "webhooks" section; a missing or empty
                (None) section, or a missing or empty "retry" sub-section, falls
                back to the defaults used below.
        """
        # `or {}` instead of a .get() default: a key that is present but set to
        # None (e.g. an empty section in a config file) would otherwise make the
        # chained .get() calls raise AttributeError.
        self.config = config.get("webhooks") or {}
        retry = self.config.get("retry") or {}

        self.max_attempts = retry.get("max_attempts", 5)
        # Retry settings are configured in milliseconds and stored in seconds.
        self.initial_delay = retry.get("initial_delay_ms", 1000) / 1000
        self.max_delay = retry.get("max_delay_ms", 32000) / 1000
        self.timeout = retry.get("timeout_ms", 30000) / 1000

    async def send_webhook(
        self,
        webhook_url: str,
        payload: Dict,
        headers: Optional[Dict[str, str]] = None
    ) -> bool:
        """
        Send webhook with exponential backoff retry logic.

        Server errors (status >= 500), timeouts and request errors are retried
        up to ``max_attempts`` times. Any non-2xx status below 500 is treated
        as a rejection and is not retried.

        Args:
            webhook_url: The URL to send the webhook to
            payload: The JSON payload to send
            headers: Optional custom headers; these override the configured
                default headers on key conflicts

        Returns:
            bool: True if delivered successfully, False otherwise
        """
        default_headers = self.config.get("headers") or {}
        merged_headers = {**default_headers, **(headers or {})}
        # Set last so neither default nor custom headers can change it.
        merged_headers["Content-Type"] = "application/json"

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            for attempt in range(self.max_attempts):
                try:
                    logger.info(
                        f"Sending webhook (attempt {attempt + 1}/{self.max_attempts}) to {webhook_url}"
                    )

                    response = await client.post(
                        webhook_url,
                        json=payload,
                        headers=merged_headers
                    )

                    # Success or client error (don't retry client errors)
                    if response.status_code < 500:
                        if 200 <= response.status_code < 300:
                            logger.info(f"Webhook delivered successfully to {webhook_url}")
                            return True
                        else:
                            logger.warning(
                                f"Webhook rejected with status {response.status_code}: {response.text[:200]}"
                            )
                            return False  # Client error - don't retry

                    # Server error - retry with backoff
                    logger.warning(
                        f"Webhook failed with status {response.status_code}, will retry"
                    )

                except httpx.TimeoutException as exc:
                    logger.error(f"Webhook timeout (attempt {attempt + 1}): {exc}")
                except httpx.RequestError as exc:
                    logger.error(f"Webhook request error (attempt {attempt + 1}): {exc}")
                except Exception as exc:
                    logger.error(f"Webhook delivery error (attempt {attempt + 1}): {exc}")

                # Calculate exponential backoff delay; there is no wait after
                # the final attempt.
                if attempt < self.max_attempts - 1:
                    delay = min(self.initial_delay * (2 ** attempt), self.max_delay)
                    logger.info(f"Retrying in {delay}s...")
                    await asyncio.sleep(delay)

        logger.error(
            f"Webhook delivery failed after {self.max_attempts} attempts to {webhook_url}"
        )
        return False

    async def notify_job_completion(
        self,
        task_id: str,
        task_type: str,
        status: str,
        urls: list,
        webhook_config: Optional[Dict],
        result: Optional[Dict] = None,
        error: Optional[str] = None
    ) -> None:
        """
        Notify webhook of job completion.

        The per-job ``webhook_config`` takes precedence; the configured
        "default_url" is used when the job supplies no URL. Nothing is sent
        when no URL can be resolved or when webhooks are disabled.

        Args:
            task_id: The task identifier
            task_type: Type of task (e.g., "crawl", "llm_extraction")
            status: Task status ("completed" or "failed")
            urls: List of URLs that were crawled
            webhook_config: Webhook configuration from the job request
            result: Optional crawl result data
            error: Optional error message if failed
        """
        # Determine webhook URL
        webhook_url = None
        data_in_payload = self.config.get("data_in_payload", False)
        custom_headers = None

        if webhook_config:
            webhook_url = webhook_config.get("webhook_url")
            data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload)
            custom_headers = webhook_config.get("webhook_headers")

        if not webhook_url:
            webhook_url = self.config.get("default_url")

        if not webhook_url:
            logger.debug("No webhook URL configured, skipping notification")
            return

        # Check if webhooks are enabled
        if not self.config.get("enabled", True):
            logger.debug("Webhooks are disabled, skipping notification")
            return

        # Build payload
        payload = {
            "task_id": task_id,
            "task_type": task_type,
            "status": status,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "urls": urls
        }

        if error:
            payload["error"] = error

        if data_in_payload and result:
            payload["data"] = result

        # Send webhook. This awaits delivery, including any retries and backoff
        # sleeps, so the call returns only once send_webhook has finished; the
        # delivery outcome itself is not reported to the caller.
        await self.send_webhook(webhook_url, payload, custom_headers)
|
|
||||||
@@ -10,6 +10,7 @@ Today I'm releasing Crawl4AI v0.7.4—the Intelligent Table Extraction & Perform
|
|||||||
|
|
||||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||||
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
||||||
|
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
||||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||||
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
||||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||||
@@ -157,6 +158,40 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
- **Monitoring Systems**: Faster health checks and status page monitoring
|
- **Monitoring Systems**: Faster health checks and status page monitoring
|
||||||
- **Data Aggregation**: Improved performance for real-time data collection
|
- **Data Aggregation**: Improved performance for real-time data collection
|
||||||
|
|
||||||
|
## 🧹 Memory Management Refactor: Cleaner Architecture
|
||||||
|
|
||||||
|
**The Problem:** Memory utilities were scattered and difficult to maintain, with potential import conflicts and unclear organization.
|
||||||
|
|
||||||
|
**My Solution:** I consolidated all memory-related utilities into the main `utils.py` module, creating a cleaner, more maintainable architecture.
|
||||||
|
|
||||||
|
### Improved Memory Handling
|
||||||
|
|
||||||
|
```python
|
||||||
|
# All memory utilities now consolidated
|
||||||
|
from crawl4ai.utils import get_true_memory_usage_percent, MemoryMonitor
|
||||||
|
|
||||||
|
# Enhanced memory monitoring
|
||||||
|
monitor = MemoryMonitor()
|
||||||
|
monitor.start_monitoring()
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Memory-efficient batch processing
|
||||||
|
results = await crawler.arun_many(large_url_list)
|
||||||
|
|
||||||
|
# Get accurate memory metrics
|
||||||
|
memory_usage = get_true_memory_usage_percent()
|
||||||
|
memory_report = monitor.get_report()
|
||||||
|
|
||||||
|
print(f"Memory efficiency: {memory_report['efficiency']:.1f}%")
|
||||||
|
print(f"Peak usage: {memory_report['peak_mb']:.1f} MB")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Expected Real-World Impact:**
|
||||||
|
- **Production Stability**: More reliable memory tracking and management
|
||||||
|
- **Code Maintainability**: Cleaner architecture for easier debugging
|
||||||
|
- **Import Clarity**: Resolved potential conflicts and import issues
|
||||||
|
- **Developer Experience**: Simpler API for memory monitoring
|
||||||
|
|
||||||
## 🔧 Critical Stability Fixes
|
## 🔧 Critical Stability Fixes
|
||||||
|
|
||||||
### Browser Manager Race Condition Resolution
|
### Browser Manager Race Condition Resolution
|
||||||
|
|||||||
@@ -1,318 +0,0 @@
|
|||||||
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
|
||||||
|
|
||||||
*September 29, 2025 • 8 min read*
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
|
||||||
|
|
||||||
## 🎯 What's New at a Glance
|
|
||||||
|
|
||||||
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
|
||||||
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
|
||||||
- **Enhanced LLM Integration**: Custom providers with temperature control
|
|
||||||
- **HTTPS Preservation**: Secure internal link handling
|
|
||||||
- **Bug Fixes**: Resolved multiple community-reported issues
|
|
||||||
- **Improved Docker Error Handling**: Better debugging and reliability
|
|
||||||
|
|
||||||
## 🔧 Docker Hooks System: Pipeline Customization
|
|
||||||
|
|
||||||
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
|
||||||
|
|
||||||
### Real Example: Authentication & Performance
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Real working hooks for httpbin.org
|
|
||||||
hooks_config = {
|
|
||||||
"on_page_context_created": """
|
|
||||||
async def hook(page, context, **kwargs):
|
|
||||||
print("Hook: Setting up page context")
|
|
||||||
# Block images to speed up crawling
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
print("Hook: Images blocked")
|
|
||||||
return page
|
|
||||||
""",
|
|
||||||
|
|
||||||
"before_retrieve_html": """
|
|
||||||
async def hook(page, context, **kwargs):
|
|
||||||
print("Hook: Before retrieving HTML")
|
|
||||||
# Scroll to bottom to load lazy content
|
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
|
||||||
await page.wait_for_timeout(1000)
|
|
||||||
print("Hook: Scrolled to bottom")
|
|
||||||
return page
|
|
||||||
""",
|
|
||||||
|
|
||||||
"before_goto": """
|
|
||||||
async def hook(page, context, url, **kwargs):
|
|
||||||
print(f"Hook: About to navigate to {url}")
|
|
||||||
# Add custom headers
|
|
||||||
await page.set_extra_http_headers({
|
|
||||||
'X-Test-Header': 'crawl4ai-hooks-test'
|
|
||||||
})
|
|
||||||
return page
|
|
||||||
"""
|
|
||||||
}
|
|
||||||
|
|
||||||
# Test with Docker API
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://httpbin.org/html"],
|
|
||||||
"hooks": {
|
|
||||||
"code": hooks_config,
|
|
||||||
"timeout": 30
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
||||||
result = response.json()
|
|
||||||
|
|
||||||
if result.get('success'):
|
|
||||||
print("✅ Hooks executed successfully!")
|
|
||||||
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Available Hook Points:**
|
|
||||||
- `on_browser_created`: Browser setup
|
|
||||||
- `on_page_context_created`: Page context configuration
|
|
||||||
- `before_goto`: Pre-navigation setup
|
|
||||||
- `after_goto`: Post-navigation processing
|
|
||||||
- `on_user_agent_updated`: User agent changes
|
|
||||||
- `on_execution_started`: Crawl initialization
|
|
||||||
- `before_retrieve_html`: Pre-extraction processing
|
|
||||||
- `before_return_html`: Final HTML processing
|
|
||||||
|
|
||||||
### Function-Based Hooks API
|
|
||||||
|
|
||||||
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
|
||||||
|
|
||||||
**Option 1: Using the `hooks_to_string()` Utility**
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import hooks_to_string
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Define hooks as regular Python functions (with full IDE support!)
|
|
||||||
async def on_page_context_created(page, context, **kwargs):
|
|
||||||
"""Block images to speed up crawling"""
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
|
||||||
return page
|
|
||||||
|
|
||||||
async def before_goto(page, context, url, **kwargs):
|
|
||||||
"""Add custom headers"""
|
|
||||||
await page.set_extra_http_headers({
|
|
||||||
'X-Crawl4AI': 'v0.7.5',
|
|
||||||
'X-Custom-Header': 'my-value'
|
|
||||||
})
|
|
||||||
return page
|
|
||||||
|
|
||||||
# Convert functions to strings
|
|
||||||
hooks_code = hooks_to_string({
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_goto": before_goto
|
|
||||||
})
|
|
||||||
|
|
||||||
# Use with REST API
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://httpbin.org/html"],
|
|
||||||
"hooks": {"code": hooks_code, "timeout": 30}
|
|
||||||
}
|
|
||||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
# Define hooks as functions (same as above)
|
|
||||||
async def on_page_context_created(page, context, **kwargs):
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
return page
|
|
||||||
|
|
||||||
async def before_retrieve_html(page, context, **kwargs):
|
|
||||||
# Scroll to load lazy content
|
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
|
||||||
await page.wait_for_timeout(1000)
|
|
||||||
return page
|
|
||||||
|
|
||||||
# Use Docker client - conversion happens automatically!
|
|
||||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
|
||||||
|
|
||||||
results = await client.crawl(
|
|
||||||
urls=["https://httpbin.org/html"],
|
|
||||||
hooks={
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_retrieve_html": before_retrieve_html
|
|
||||||
},
|
|
||||||
hooks_timeout=30
|
|
||||||
)
|
|
||||||
|
|
||||||
if results and results.success:
|
|
||||||
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits of Function-Based Hooks:**
|
|
||||||
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
|
||||||
- ✅ Type checking and linting
|
|
||||||
- ✅ Easier to test and debug
|
|
||||||
- ✅ Reusable across projects
|
|
||||||
- ✅ Automatic conversion in Docker client
|
|
||||||
- ✅ No breaking changes - string hooks still work!
|
|
||||||
|
|
||||||
## 🤖 Enhanced LLM Integration
|
|
||||||
|
|
||||||
Enhanced LLM integration with custom providers, temperature control, and base URL configuration.
|
|
||||||
|
|
||||||
### Multi-Provider Support
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
|
||||||
|
|
||||||
# Test with different providers
|
|
||||||
async def test_llm_providers():
|
|
||||||
# Gemini with custom temperature
|
|
||||||
openai_strategy = LLMExtractionStrategy(
|
|
||||||
provider="gemini/gemini-2.5-flash-lite",
|
|
||||||
api_token="your-api-token",
|
|
||||||
temperature=0.7, # New in v0.7.5
|
|
||||||
instruction="Summarize this page in one sentence"
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://example.com",
|
|
||||||
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.success:
|
|
||||||
print("✅ LLM extraction completed")
|
|
||||||
print(result.extracted_content)
|
|
||||||
|
|
||||||
# Docker API with enhanced LLM config
|
|
||||||
llm_payload = {
|
|
||||||
"url": "https://example.com",
|
|
||||||
"f": "llm",
|
|
||||||
"q": "Summarize this page in one sentence.",
|
|
||||||
"provider": "gemini/gemini-2.5-flash-lite",
|
|
||||||
"temperature": 0.7
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
|
||||||
```
|
|
||||||
|
|
||||||
**New Features:**
|
|
||||||
- Custom `temperature` parameter for creativity control
|
|
||||||
- `base_url` for custom API endpoints
|
|
||||||
- Multi-provider environment variable support
|
|
||||||
- Docker API integration
|
|
||||||
|
|
||||||
## 🔒 HTTPS Preservation
|
|
||||||
|
|
||||||
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
|
||||||
|
|
||||||
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
|
||||||
|
|
||||||
async def test_https_preservation():
|
|
||||||
# Enable HTTPS preservation
|
|
||||||
url_filter = URLPatternFilter(
|
|
||||||
patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
|
||||||
)
|
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
exclude_external_links=True,
|
|
||||||
preserve_https_for_internal_links=True, # New in v0.7.5
|
|
||||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
|
||||||
max_depth=2,
|
|
||||||
max_pages=5,
|
|
||||||
filter_chain=FilterChain([url_filter])
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
async for result in await crawler.arun(
|
|
||||||
url="https://quotes.toscrape.com",
|
|
||||||
config=config
|
|
||||||
):
|
|
||||||
# All internal links maintain HTTPS
|
|
||||||
internal_links = [link['href'] for link in result.links['internal']]
|
|
||||||
https_links = [link for link in internal_links if link.startswith('https://')]
|
|
||||||
|
|
||||||
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
|
||||||
for link in https_links[:3]:
|
|
||||||
print(f" → {link}")
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🛠️ Bug Fixes and Improvements
|
|
||||||
|
|
||||||
### Major Fixes
|
|
||||||
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
|
||||||
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
|
||||||
- **Docker Error Handling**: Comprehensive error messages with status codes
|
|
||||||
- **Memory Management**: Fixed leaks in long-running sessions
|
|
||||||
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
|
||||||
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
|
||||||
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
|
||||||
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
|
||||||
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
|
||||||
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
|
||||||
|
|
||||||
### Community-Reported Issues Fixed
|
|
||||||
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
|
||||||
- Fixed browser configuration reference errors
|
|
||||||
- Resolved dependency conflicts with cssselect
|
|
||||||
- Improved error messaging for failed authentications
|
|
||||||
- Enhanced compatibility with various proxy configurations
|
|
||||||
- Fixed edge cases in URL normalization
|
|
||||||
|
|
||||||
### Configuration Updates
|
|
||||||
```python
|
|
||||||
# Old proxy config (deprecated)
|
|
||||||
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
|
||||||
|
|
||||||
# New enhanced proxy config
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
proxy_config={
|
|
||||||
"server": "http://proxy:8080",
|
|
||||||
"username": "optional-user",
|
|
||||||
"password": "optional-pass"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔄 Breaking Changes
|
|
||||||
|
|
||||||
1. **Python 3.10+ Required**: Upgrade from Python 3.9
|
|
||||||
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
|
||||||
3. **New Dependency**: Added `cssselect` for better CSS handling
|
|
||||||
|
|
||||||
## 🚀 Get Started
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install latest version
|
|
||||||
pip install crawl4ai==0.7.5
|
|
||||||
|
|
||||||
# Docker deployment
|
|
||||||
docker pull unclecode/crawl4ai:latest
|
|
||||||
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
**Try the Demo:**
|
|
||||||
```bash
|
|
||||||
# Run working examples
|
|
||||||
python docs/releases_review/demo_v0.7.5.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Resources:**
|
|
||||||
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
|
||||||
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
|
||||||
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
|
||||||
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
|
||||||
|
|
||||||
Happy crawling! 🕷️
|
|
||||||
@@ -1,314 +0,0 @@
|
|||||||
# Crawl4AI v0.7.6 Release Notes
|
|
||||||
|
|
||||||
*Release Date: October 22, 2025*
|
|
||||||
|
|
||||||
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
|
||||||
|
|
||||||
## 🎯 What's New
|
|
||||||
|
|
||||||
### Webhook Support for Docker Job Queue API
|
|
||||||
|
|
||||||
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
|
||||||
|
|
||||||
**Key Capabilities:**
|
|
||||||
|
|
||||||
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
|
||||||
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
|
||||||
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (up to 5 attempts, waiting 1s → 2s → 4s → 8s between them)
|
|
||||||
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
|
||||||
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
|
||||||
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
Instead of constantly checking job status:
|
|
||||||
|
|
||||||
**OLD WAY (Polling):**
|
|
||||||
```python
|
|
||||||
# Submit job
|
|
||||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
|
||||||
task_id = response.json()['task_id']
|
|
||||||
|
|
||||||
# Poll until complete
|
|
||||||
while True:
|
|
||||||
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
|
||||||
if status.json()['status'] == 'completed':
|
|
||||||
break
|
|
||||||
time.sleep(5) # Wait and try again
|
|
||||||
```
|
|
||||||
|
|
||||||
**NEW WAY (Webhooks):**
|
|
||||||
```python
|
|
||||||
# Submit job with webhook
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhook",
|
|
||||||
"webhook_data_in_payload": True
|
|
||||||
}
|
|
||||||
}
|
|
||||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
|
||||||
|
|
||||||
# Done! Webhook will notify you when complete
|
|
||||||
# Your webhook handler receives the results automatically
|
|
||||||
```
|
|
||||||
|
|
||||||
### Crawl Job Webhooks
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"browser_config": {"headless": true},
|
|
||||||
"crawler_config": {"cache_mode": "bypass"},
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": false,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### LLM Extraction Job Webhooks (NEW!)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/llm/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, and publication date",
|
|
||||||
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
|
||||||
"webhook_data_in_payload": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Payload Structure
|
|
||||||
|
|
||||||
**Success (with data):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432",
|
|
||||||
"task_type": "llm_extraction",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com/article"],
|
|
||||||
"data": {
|
|
||||||
"extracted_content": {
|
|
||||||
"title": "Understanding Web Scraping",
|
|
||||||
"author": "John Doe",
|
|
||||||
"date": "2025-10-22"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Failure:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_abc123",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "failed",
|
|
||||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"error": "Connection timeout after 30s"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Simple Webhook Handler Example
|
|
||||||
|
|
||||||
```python
|
|
||||||
from flask import Flask, request, jsonify
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
|
||||||
|
|
||||||
@app.route('/webhook', methods=['POST'])
|
|
||||||
def handle_webhook():
|
|
||||||
payload = request.json
|
|
||||||
|
|
||||||
task_id = payload['task_id']
|
|
||||||
task_type = payload['task_type']
|
|
||||||
status = payload['status']
|
|
||||||
|
|
||||||
if status == 'completed':
|
|
||||||
if 'data' in payload:
|
|
||||||
# Process data directly
|
|
||||||
data = payload['data']
|
|
||||||
else:
|
|
||||||
# Fetch from API
|
|
||||||
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
|
||||||
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
# Your business logic here
|
|
||||||
print(f"Job {task_id} completed!")
|
|
||||||
|
|
||||||
elif status == 'failed':
|
|
||||||
error = payload.get('error', 'Unknown error')
|
|
||||||
print(f"Job {task_id} failed: {error}")
|
|
||||||
|
|
||||||
return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
app.run(port=8080)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📊 Performance Improvements
|
|
||||||
|
|
||||||
- **Reduced Server Load**: Eliminates constant polling requests
|
|
||||||
- **Lower Latency**: Instant notification vs. polling interval delay
|
|
||||||
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
|
||||||
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
|
||||||
|
|
||||||
## 🐛 Bug Fixes
|
|
||||||
|
|
||||||
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
|
||||||
- Improved error handling in webhook delivery service
|
|
||||||
- Enhanced Redis task storage for webhook config persistence
|
|
||||||
|
|
||||||
## 🌍 Expected Real-World Impact
|
|
||||||
|
|
||||||
### For Web Scraping Workflows
|
|
||||||
- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
|
|
||||||
- **Better UX**: Instant notifications improve user experience
|
|
||||||
- **Scalability**: Handle 100s of concurrent jobs without polling overhead
|
|
||||||
|
|
||||||
### For LLM Extraction Pipelines
|
|
||||||
- **Async Processing**: Submit LLM extraction jobs and move on
|
|
||||||
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
|
||||||
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
|
||||||
|
|
||||||
### For Microservices
|
|
||||||
- **Event-Driven**: Perfect for event-driven microservice architectures
|
|
||||||
- **Decoupling**: Decouple job submission from result processing
|
|
||||||
- **Reliability**: Automatic retries ensure webhooks are delivered
|
|
||||||
|
|
||||||
## 🔄 Breaking Changes
|
|
||||||
|
|
||||||
**None!** This release is fully backward compatible.
|
|
||||||
|
|
||||||
- Webhook configuration is optional
|
|
||||||
- Existing code continues to work without modification
|
|
||||||
- Polling is still supported for jobs without webhook config
|
|
||||||
|
|
||||||
## 📚 Documentation
|
|
||||||
|
|
||||||
### New Documentation
|
|
||||||
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
|
||||||
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
|
||||||
|
|
||||||
### Updated Documentation
|
|
||||||
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
|
||||||
- API documentation with webhook examples
|
|
||||||
|
|
||||||
## 🛠️ Migration Guide
|
|
||||||
|
|
||||||
No migration needed! Webhooks are opt-in:
|
|
||||||
|
|
||||||
1. **To use webhooks**: Add `webhook_config` to your job payload
|
|
||||||
2. **To keep polling**: Continue using your existing code
|
|
||||||
|
|
||||||
### Quick Start
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Just add webhook_config to your existing payload
|
|
||||||
payload = {
|
|
||||||
# Your existing configuration
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"browser_config": {...},
|
|
||||||
"crawler_config": {...},
|
|
||||||
|
|
||||||
# NEW: Add webhook configuration
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhook",
|
|
||||||
"webhook_data_in_payload": True
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Configuration
|
|
||||||
|
|
||||||
### Global Webhook Configuration (config.yml)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: "https://myapp.com/webhooks/default" # Optional
|
|
||||||
data_in_payload: false
|
|
||||||
retry:
|
|
||||||
max_attempts: 5
|
|
||||||
initial_delay_ms: 1000
|
|
||||||
max_delay_ms: 32000
|
|
||||||
timeout_ms: 30000
|
|
||||||
headers:
|
|
||||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🚀 Upgrade Instructions
|
|
||||||
|
|
||||||
### Docker
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Pull the latest image
|
|
||||||
docker pull unclecode/crawl4ai:0.7.6
|
|
||||||
|
|
||||||
# Or use latest tag
|
|
||||||
docker pull unclecode/crawl4ai:latest
|
|
||||||
|
|
||||||
# Run with webhook support
|
|
||||||
docker run -d \
|
|
||||||
-p 11235:11235 \
|
|
||||||
--env-file .llm.env \
|
|
||||||
--name crawl4ai \
|
|
||||||
unclecode/crawl4ai:0.7.6
|
|
||||||
```
|
|
||||||
|
|
||||||
### Python Package
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install --upgrade crawl4ai
|
|
||||||
```
|
|
||||||
|
|
||||||
## 💡 Pro Tips
|
|
||||||
|
|
||||||
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
|
||||||
2. **Set custom headers** for webhook authentication and request tracking
|
|
||||||
3. **Configure global default webhook** for consistent handling across all jobs
|
|
||||||
4. **Implement idempotent webhook handlers** - same webhook may be delivered multiple times on retry
|
|
||||||
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
|
||||||
|
|
||||||
## 🎬 Demo
|
|
||||||
|
|
||||||
Try the release demo:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python docs/releases_review/demo_v0.7.6.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This comprehensive demo showcases:
|
|
||||||
- Crawl job webhooks (notification-only and with data)
|
|
||||||
- LLM extraction webhooks (with JSON schema support)
|
|
||||||
- Custom headers for authentication
|
|
||||||
- Webhook retry mechanism
|
|
||||||
- Real-time webhook receiver
|
|
||||||
|
|
||||||
## 🙏 Acknowledgments
|
|
||||||
|
|
||||||
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
|
||||||
|
|
||||||
## 📞 Support
|
|
||||||
|
|
||||||
- **Documentation**: https://docs.crawl4ai.com
|
|
||||||
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
|
||||||
- **Discord**: https://discord.gg/crawl4ai
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Happy crawling with webhooks!** 🕷️🪝
|
|
||||||
|
|
||||||
*- unclecode*
|
|
||||||
@@ -1,522 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Comprehensive hooks examples using Docker Client with function objects.
|
|
||||||
|
|
||||||
This approach is recommended because:
|
|
||||||
- Write hooks as regular Python functions
|
|
||||||
- Full IDE support (autocomplete, type checking)
|
|
||||||
- Automatic conversion to API format
|
|
||||||
- Reusable and testable code
|
|
||||||
- Clean, readable syntax
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
from crawl4ai import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
# Base URL of the Crawl4AI Docker server that every example below connects to.
# NOTE(review): port 11234 looks like a local override of the 11235 alternative
# kept in the commented line below - confirm before running.
# API_BASE_URL = "http://localhost:11235"
API_BASE_URL = "http://localhost:11234"
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Hook Function Definitions
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
# --- All Hooks Demo ---
|
|
||||||
async def browser_created_hook(browser, **kwargs):
    """Announce that the browser exists and hand it back unchanged.

    Args:
        browser: Browser object supplied by the crawler.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The same ``browser`` object that was passed in.
    """
    message = "[HOOK] Browser created and ready"
    print(message)
    return browser
|
|
||||||
|
|
||||||
|
|
||||||
async def page_context_hook(page, context, **kwargs):
    """Prepare a freshly created page/context pair.

    Sets a 1920x1080 viewport, installs a ``test_session`` cookie for
    ``.httpbin.org`` and aborts requests for common image extensions and
    ``/analytics/`` paths.

    Args:
        page: Page object; must provide ``set_viewport_size``.
        context: Browser context; must provide ``add_cookies`` and ``route``.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Setting up page environment")

    # Set viewport
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)

    # Add cookies
    session_cookie = {
        "name": "test_session",
        "value": "abc123xyz",
        "domain": ".httpbin.org",
        "path": "/",
    }
    await context.add_cookies([session_cookie])

    # Block resources: every matching request is aborted
    blocked_patterns = ("**/*.{png,jpg,jpeg,gif}", "**/analytics/*")
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print("[HOOK] Environment configured")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def user_agent_hook(page, context, user_agent, **kwargs):
    """Log the (truncated) user agent string and return the page.

    Args:
        page: Page object; returned untouched.
        context: Browser context; unused.
        user_agent: User agent string; only its first 50 characters are printed.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    preview = user_agent[:50]
    print(f"[HOOK] User agent: {preview}...")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def before_goto_hook(page, context, url, **kwargs):
    """Attach custom request headers right before navigation.

    Args:
        page: Page object; must provide ``set_extra_http_headers``.
        context: Browser context; unused.
        url: URL about to be visited (only logged).
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] Navigating to: {url}")

    extra_headers = {
        "X-Custom-Header": "crawl4ai-test",
        "Accept-Language": "en-US",
    }
    await page.set_extra_http_headers(extra_headers)

    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def after_goto_hook(page, context, url, response, **kwargs):
    """Give the page a moment to settle after navigation.

    Waits one second, then waits up to two seconds for ``<body>`` to appear.
    A failure of the selector wait is reported and otherwise ignored, so the
    crawl continues.

    Args:
        page: Page object; must provide ``wait_for_timeout`` and
            ``wait_for_selector``.
        context: Browser context; unused.
        url: URL that was loaded (only logged).
        response: Navigation response; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] Page loaded: {url}")

    await page.wait_for_timeout(1000)

    try:
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element ready")
    except Exception:
        # Deliberately not a bare ``except``: task cancellation and
        # KeyboardInterrupt derive from BaseException and must propagate.
        print("[HOOK] Timeout, continuing")

    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def execution_started_hook(page, context, **kwargs):
    """Emit a marker in the page console when custom JS execution starts.

    Args:
        page: Page object; must provide ``evaluate``.
        context: Browser context; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] JS execution started")
    console_marker = "console.log('[HOOK] Custom JS');"
    await page.evaluate(console_marker)
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def before_retrieve_hook(page, context, **kwargs):
    """Scroll to the bottom and back to the top before HTML is read.

    Args:
        page: Page object; must provide ``evaluate`` and ``wait_for_timeout``.
        context: Browser context; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Preparing HTML retrieval")

    # Scroll for lazy content: down, short pause, then back up
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    scroll_to_top = "window.scrollTo(0, 0);"
    await page.evaluate(scroll_to_bottom)
    await page.wait_for_timeout(500)
    await page.evaluate(scroll_to_top)

    print("[HOOK] Scrolling complete")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def before_return_hook(page, context, html, **kwargs):
    """Report the HTML size and a few DOM counters before the HTML is returned.

    Args:
        page: Page object; must provide ``evaluate``.
        context: Browser context; unused.
        html: HTML string about to be returned; only its length is printed.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] HTML ready: {len(html)} chars")

    # Counts are computed in the page; ``scripts`` is collected but not printed.
    counts = await page.evaluate('''() => ({
        images: document.images.length,
        links: document.links.length,
        scripts: document.scripts.length
    })''')

    image_count = counts['images']
    link_count = counts['links']
    print(f"[HOOK] Metrics - Images: {image_count}, Links: {link_count}")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
# --- Authentication Hooks ---
|
|
||||||
async def auth_context_hook(page, context, **kwargs):
    """Seed the context with a demo auth cookie and localStorage entries.

    Args:
        page: Page object; must provide ``evaluate``.
        context: Browser context; must provide ``add_cookies``.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Setting up authentication")

    # Add auth cookies
    auth_cookie = {
        "name": "auth_token",
        "value": "fake_jwt_token",
        "domain": ".httpbin.org",
        "path": "/",
        "httpOnly": True,
    }
    await context.add_cookies([auth_cookie])

    # Set localStorage
    # NOTE(review): localStorage is origin-scoped - confirm the page already
    # has an origin when this hook runs.
    storage_script = '''
        localStorage.setItem('user_id', '12345');
        localStorage.setItem('auth_time', new Date().toISOString());
    '''
    await page.evaluate(storage_script)

    print("[HOOK] Auth context ready")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def auth_headers_hook(page, context, url, **kwargs):
    """Attach HTTP Basic credentials and a demo API key to outgoing requests.

    Args:
        page: Page object; must provide ``set_extra_http_headers``.
        context: Browser context; unused.
        url: URL about to be visited (only logged).
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] Adding auth headers for {url}")

    # Imported locally so the function body is self-contained.
    import base64

    encoded = base64.b64encode(b"user:passwd")
    credentials = encoded.decode('ascii')

    auth_headers = {
        'Authorization': f'Basic {credentials}',
        'X-API-Key': 'test-key-123',
    }
    await page.set_extra_http_headers(auth_headers)

    return page
|
|
||||||
|
|
||||||
|
|
||||||
# --- Performance Optimization Hooks ---
|
|
||||||
async def performance_hook(page, context, **kwargs):
    """Cut page weight by aborting heavy requests and disabling animations.

    Args:
        page: Page object; must provide ``add_style_tag``.
        context: Browser context; must provide ``route``.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Optimizing for performance")

    # Block resource-heavy content (images, fonts, media, trackers);
    # routes are registered in this order.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg}",
        "**/*.{woff,woff2,ttf}",
        "**/*.{mp4,webm,ogg}",
        "**/googletagmanager.com/*",
        "**/google-analytics.com/*",
        "**/facebook.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda r: r.abort())

    # Disable animations
    no_animation_css = '''
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    '''
    await page.add_style_tag(content=no_animation_css)

    print("[HOOK] Optimizations applied")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def cleanup_hook(page, context, **kwargs):
    """Strip ad/popup style elements plus all script and style tags from the DOM.

    Args:
        page: Page object; must provide ``evaluate``.
        context: Browser context; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Cleaning page")

    removal_script = '''() => {
        const selectors = [
            '.ad', '.ads', '.advertisement',
            '.popup', '.modal', '.overlay',
            '.cookie-banner', '.newsletter'
        ];

        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => el.remove());
        });

        document.querySelectorAll('script, style').forEach(el => el.remove());
    }'''
    await page.evaluate(removal_script)

    print("[HOOK] Page cleaned")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
# --- Content Extraction Hooks ---
|
|
||||||
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
    """Wait for late-loading content and click a "Load More" control if present.

    Waits two seconds, then makes a best-effort attempt to find and click a
    "Load More" element. Ordinary failures of that attempt are reported and
    skipped so the crawl continues.

    Args:
        page: Page object; must provide ``wait_for_timeout`` and
            ``query_selector``.
        context: Browser context; unused.
        url: URL that was loaded (only logged).
        response: Navigation response; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] Waiting for dynamic content on {url}")

    await page.wait_for_timeout(2000)

    # Click "Load More" if exists
    try:
        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
        if load_more:
            await load_more.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More'")
    except Exception as exc:
        # Best effort only, but not a bare ``except``: cancellation and
        # KeyboardInterrupt must propagate, and the reason is worth showing.
        print(f"[HOOK] 'Load More' step skipped: {exc}")

    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def extract_metadata_hook(page, context, **kwargs):
    """Print basic page metadata, then scroll to the bottom three times.

    Args:
        page: Page object; must provide ``evaluate`` and ``wait_for_timeout``.
        context: Browser context; unused.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print("[HOOK] Extracting metadata")

    # Reads <title> and the description/author/keywords meta tags in the page.
    metadata_script = '''() => {
        const getMeta = (name) => {
            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return el ? el.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
        };
    }'''
    metadata = await page.evaluate(metadata_script)

    print(f"[HOOK] Metadata: {metadata}")

    # Infinite scroll
    for step in range(1, 4):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll {step}/3")

    return page
|
|
||||||
|
|
||||||
|
|
||||||
# --- Multi-URL Hooks ---
|
|
||||||
async def url_specific_hook(page, context, url, **kwargs):
    """Set an ``X-Type`` header chosen from a substring of the URL.

    ``html`` is checked before ``json``; URLs containing neither leave the
    page untouched.

    Args:
        page: Page object; must provide ``set_extra_http_headers``.
        context: Browser context; unused.
        url: URL about to be visited.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    print(f"[HOOK] Processing URL: {url}")

    # URL-specific headers: first matching marker wins
    for marker, label in (("html", "HTML"), ("json", "JSON")):
        if marker in url:
            await page.set_extra_http_headers({"X-Type": label})
            break

    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def track_progress_hook(page, context, url, response, **kwargs):
    """Log the HTTP status of a loaded URL.

    Args:
        page: Page object; returned untouched.
        context: Browser context; unused.
        url: URL that was loaded.
        response: Navigation response exposing ``status``; a falsy value is
            reported as ``unknown``.
        **kwargs: Additional hook arguments; ignored.

    Returns:
        The ``page`` that was passed in.
    """
    if response:
        status = response.status
    else:
        status = 'unknown'
    print(f"[HOOK] Loaded {url} - Status: {status}")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Test Functions
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def test_all_hooks_comprehensive():
    """Crawl one page with a hook registered for each of the eight hook points."""
    banner = "=" * 70
    print(banner)
    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
    print(banner)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling with all 8 hooks...")

        # Hook point name -> plain Python function object
        hooks = dict(
            on_browser_created=browser_created_hook,
            on_page_context_created=page_context_hook,
            on_user_agent_updated=user_agent_hook,
            before_goto=before_goto_hook,
            after_goto=after_goto_hook,
            on_execution_started=execution_started_hook,
            before_retrieve_html=before_retrieve_hook,
            before_return_html=before_return_hook,
        )

        target_urls = ["https://httpbin.org/html"]
        result = await client.crawl(target_urls, hooks=hooks, hooks_timeout=30)

        print("\n✅ Success!")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" HTML: {len(result.html)} chars")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_authentication_workflow():
    """Crawl httpbin's basic-auth endpoint using the authentication hooks."""
    banner = "=" * 70
    print("\n" + banner)
    print("Test 2: Authentication Workflow (Docker Client)")
    print(banner)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting authentication...")

        hooks = {
            "on_page_context_created": auth_context_hook,
            "before_goto": auth_headers_hook,
        }

        result = await client.crawl(
            ["https://httpbin.org/basic-auth/user/passwd"],
            hooks=hooks,
            hooks_timeout=15,
        )

        print("\n✅ Authentication completed")

        if not result.success:
            print(f" ❌ Failed: {result.error_message}")
            return

        # The endpoint's success body contains "authenticated" and true.
        if '"authenticated"' in result.html and 'true' in result.html:
            print(" ✅ Basic auth successful!")
        else:
            print(" ⚠️ Auth status unclear")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_performance_optimization():
    """Crawl a page with the resource-blocking and cleanup hooks enabled."""
    banner = "=" * 70
    print("\n" + banner)
    print("Test 3: Performance Optimization (Docker Client)")
    print(banner)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting performance hooks...")

        hooks = dict(
            on_page_context_created=performance_hook,
            before_retrieve_html=cleanup_hook,
        )

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks,
            hooks_timeout=10,
        )

        html_size = len(result.html)
        print("\n✅ Optimization completed")
        print(f" HTML size: {html_size:,} chars")
        print(" Resources blocked, ads removed")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_content_extraction():
    """Crawl a page with the dynamic-content and metadata hooks enabled."""
    banner = "=" * 70
    print("\n" + banner)
    print("Test 4: Content Extraction (Docker Client)")
    print(banner)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting extraction hooks...")

        hooks = dict(
            after_goto=wait_dynamic_content_hook,
            before_retrieve_html=extract_metadata_hook,
        )

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20,
        )

        print("\n✅ Extraction completed")
        summary_lines = (
            f" URL: {result.url}",
            f" Success: {result.success}",
            f" Metadata: {result.metadata}",
        )
        for line in summary_lines:
            print(line)
|
|
||||||
|
|
||||||
|
|
||||||
async def test_multi_url_crawl():
    """Crawl three httpbin URLs with the per-URL and progress hooks enabled."""
    banner = "=" * 70
    print("\n" + banner)
    print("Test 5: Multi-URL Crawl (Docker Client)")
    print(banner)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling multiple URLs...")

        hooks = dict(
            before_goto=url_specific_hook,
            after_goto=track_progress_hook,
        )

        target_urls = [
            "https://httpbin.org/html",
            "https://httpbin.org/json",
            "https://httpbin.org/xml",
        ]
        results = await client.crawl(target_urls, hooks=hooks, hooks_timeout=15)

        print("\n✅ Multi-URL crawl completed")
        print(f"\n Crawled {len(results)} URLs:")
        for position, crawl_result in enumerate(results, 1):
            if crawl_result.success:
                status = "✅"
            else:
                status = "❌"
            print(f" {status} {position}. {crawl_result.url}")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_reusable_hook_library():
    """Crawl a page using hooks taken from a small reusable hook class."""
    banner = "=" * 70
    print("\n" + banner)
    print("Test 6: Reusable Hook Library (Docker Client)")
    print(banner)

    # Create a library of reusable hooks
    class HookLibrary:
        @staticmethod
        async def block_images(page, context, **kwargs):
            """Abort requests for common image extensions."""
            image_pattern = "**/*.{png,jpg,jpeg,gif}"
            await context.route(image_pattern, lambda r: r.abort())
            print("[LIBRARY] Images blocked")
            return page

        @staticmethod
        async def block_analytics(page, context, **kwargs):
            """Abort requests to analytics paths and google-analytics.com."""
            for pattern in ("**/analytics/*", "**/google-analytics.com/*"):
                await context.route(pattern, lambda r: r.abort())
            print("[LIBRARY] Analytics blocked")
            return page

        @staticmethod
        async def scroll_infinite(page, context, **kwargs):
            """Scroll down up to five times, stopping once the height is stable."""
            height_script = "document.body.scrollHeight"
            for _ in range(5):
                height_before = await page.evaluate(height_script)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                await page.wait_for_timeout(1000)
                height_after = await page.evaluate(height_script)
                if height_after == height_before:
                    break
            print("[LIBRARY] Infinite scroll complete")
            return page

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nUsing hook library...")

        hooks = dict(
            on_page_context_created=HookLibrary.block_images,
            before_retrieve_html=HookLibrary.scroll_infinite,
        )

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20,
        )

        print("\n✅ Library hooks completed")
        print(f" Success: {result.success}")
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# Main
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def main():
    """Run every Docker client hook example in order, reporting each outcome.

    A failing example is reported with its traceback and does not stop the
    remaining examples.
    """
    import traceback

    banner = "=" * 70
    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
    print("Using Python function objects with automatic conversion")
    print(banner)

    tests = (
        ("All Hooks Demo", test_all_hooks_comprehensive),
        ("Authentication", test_authentication_workflow),
        ("Performance", test_performance_optimization),
        ("Extraction", test_content_extraction),
        ("Multi-URL", test_multi_url_crawl),
        ("Hook Library", test_reusable_hook_library),
    )
    total = len(tests)

    for number, (label, run_example) in enumerate(tests, 1):
        try:
            await run_example()
            print(f"\n✅ Test {number}/{total}: {label} completed\n")
        except Exception as e:
            print(f"\n❌ Test {number}/{total}: {label} failed: {e}\n")
            traceback.print_exc()

    print(banner)
    print("🎉 All Docker client hook examples completed!")
    print("\n💡 Key Benefits of Function-Based Hooks:")
    benefits = (
        " • Write as regular Python functions",
        " • Full IDE support (autocomplete, types)",
        " • Automatic conversion to API format",
        " • Reusable across projects",
        " • Clean, readable code",
        " • Easy to test and debug",
    )
    for benefit in benefits:
        print(benefit)
    print(banner)


if __name__ == "__main__":
    asyncio.run(main())
|
|
||||||
@@ -1,461 +0,0 @@
|
|||||||
"""
|
|
||||||
Docker Webhook Example for Crawl4AI
|
|
||||||
|
|
||||||
This example demonstrates how to use webhooks with the Crawl4AI job queue API.
|
|
||||||
Instead of polling for results, webhooks notify your application when jobs complete.
|
|
||||||
|
|
||||||
Supports both:
|
|
||||||
- /crawl/job - Raw crawling with markdown extraction
|
|
||||||
- /llm/job - LLM-powered content extraction
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
1. Crawl4AI Docker container running on localhost:11235
|
|
||||||
2. Flask installed: pip install flask requests
|
|
||||||
3. LLM API key configured in .llm.env (for LLM extraction examples)
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
1. Run this script: python docker_webhook_example.py
|
|
||||||
2. The webhook server will start on http://localhost:8080
|
|
||||||
3. Jobs will be submitted and webhooks will be received automatically
|
|
||||||
"""
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
from flask import Flask, request, jsonify
|
|
||||||
from threading import Thread
|
|
||||||
|
|
||||||
# Configuration
# Base URL of the Crawl4AI server: jobs are POSTed to it and results are
# fetched from it by the webhook handlers below.
CRAWL4AI_BASE_URL = "http://localhost:11235"
WEBHOOK_BASE_URL = "http://localhost:8080"  # Your webhook receiver URL

# Initialize Flask app for webhook receiver
app = Flask(__name__)

# Store received webhook data for demonstration
# (each handler appends the raw payload it received)
received_webhooks = []
|
|
||||||
|
|
||||||
|
|
||||||
@app.route('/webhooks/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
    """
    Webhook handler that receives notifications when crawl jobs complete.

    Payload structure:
    {
        "task_id": "crawl_abc123",
        "task_type": "crawl",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com"],
        "error": "error message" (only if failed),
        "data": {...} (only if webhook_data_in_payload=True)
    }

    When a completed job carries no inline data, the results are fetched from
    ``{CRAWL4AI_BASE_URL}/crawl/job/<task_id>``. That request is bounded by a
    timeout, and a failed fetch is reported rather than raised, so the webhook
    is still acknowledged.

    Returns:
        A JSON body ``{"status": "received"}`` with HTTP status 200.
    """
    payload = request.json
    print(f"\n{'='*60}")
    print(f"📬 Webhook received for task: {payload['task_id']}")
    print(f" Status: {payload['status']}")
    print(f" Timestamp: {payload['timestamp']}")
    print(f" URLs: {payload['urls']}")

    if payload['status'] == 'completed':
        results = []
        if 'data' in payload:
            # If data is in payload, process it directly
            print(" ✅ Data included in webhook")
            results = payload['data'].get('results', [])
        else:
            # Fetch results from API if not included
            print(" 📥 Fetching results from API...")
            task_id = payload['task_id']
            try:
                # A timeout keeps a stalled API from blocking this handler forever.
                result_response = requests.get(
                    f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}",
                    timeout=30,
                )
            except requests.RequestException as exc:
                print(f" ❌ Could not fetch results: {exc}")
            else:
                if result_response.ok:
                    data = result_response.json()
                    print(" ✅ Results fetched successfully")
                    # Tolerate a missing or null 'result' field.
                    results = (data.get('result') or {}).get('results', [])
                else:
                    print(f" ❌ Could not fetch results: HTTP {result_response.status_code}")

        # Process the crawl results here
        for result in results:
            print(f" - Crawled: {result.get('url')}")
            print(f" - Markdown length: {len(result.get('markdown') or '')}")

    elif payload['status'] == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
|
|
||||||
@app.route('/webhooks/llm-complete', methods=['POST'])
def handle_llm_webhook():
    """
    Webhook handler that receives notifications when LLM extraction jobs complete.

    Payload structure:
    {
        "task_id": "llm_1698765432_12345",
        "task_type": "llm_extraction",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com/article"],
        "error": "error message" (only if failed),
        "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
    }

    When a completed job carries no inline data, the result is fetched from
    ``{CRAWL4AI_BASE_URL}/llm/job/<task_id>``. That request is bounded by a
    timeout, and a failed fetch is reported rather than raised, so the webhook
    is still acknowledged.

    Returns:
        A JSON body ``{"status": "received"}`` with HTTP status 200.
    """
    payload = request.json
    # An empty or missing URL list must not crash the handler.
    urls = payload.get('urls') or []
    first_url = urls[0] if urls else 'n/a'

    print(f"\n{'='*60}")
    print(f"🤖 LLM Webhook received for task: {payload['task_id']}")
    print(f" Task Type: {payload['task_type']}")
    print(f" Status: {payload['status']}")
    print(f" Timestamp: {payload['timestamp']}")
    print(f" URL: {first_url}")

    if payload['status'] == 'completed':
        if 'data' in payload:
            # If data is in payload, process it directly
            print(" ✅ Data included in webhook")
            data = payload['data']
            # Webhook wraps extracted content in 'extracted_content' field
            extracted = data.get('extracted_content', {})
            print(" - Extracted content:")
            print(f" {json.dumps(extracted, indent=8)}")
        else:
            # Fetch results from API if not included
            print(" 📥 Fetching results from API...")
            task_id = payload['task_id']
            try:
                # A timeout keeps a stalled API from blocking this handler forever.
                result_response = requests.get(
                    f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}",
                    timeout=30,
                )
            except requests.RequestException as exc:
                print(f" ❌ Could not fetch results: {exc}")
            else:
                if result_response.ok:
                    data = result_response.json()
                    print(" ✅ Results fetched successfully")
                    # API returns unwrapped content in 'result' field
                    extracted = data.get('result')
                    print(" - Extracted content:")
                    print(f" {json.dumps(extracted, indent=8)}")
                else:
                    print(f" ❌ Could not fetch results: HTTP {result_response.status_code}")

    elif payload['status'] == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
|
|
||||||
def start_webhook_server():
    """Run the Flask webhook receiver (blocking call).

    The server listens on all interfaces, port 8080, with debug mode and
    the auto-reloader switched off. `main()` runs this function as the
    target of a daemon thread.
    """
    server_options = {
        "host": '0.0.0.0',
        "port": 8080,
        "debug": False,
        "use_reloader": False,
    }
    app.run(**server_options)
|
|
||||||
|
|
||||||
|
|
||||||
def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False, timeout=30):
    """
    Submit a crawl job with webhook notification.

    Args:
        urls: List of URLs to crawl
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload
        timeout: Seconds to wait for the submission request before giving up

    Returns:
        task_id: The job's task identifier, or None if the submission failed
        (network error, non-2xx response, or a response without a task_id)
    """
    payload = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        },
    }

    print("\n🚀 Submitting crawl job...")
    print(f" URLs: {urls}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server; connection failures are reported instead of crashing the demo.
    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    # A 2xx response may still carry a body that is not the expected JSON.
    try:
        task_id = response.json()['task_id']
    except (ValueError, KeyError, TypeError):
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
|
||||||
|
|
||||||
|
|
||||||
def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None, timeout=30):
    """
    Submit an LLM extraction job with webhook notification.

    Args:
        url: URL to extract content from
        query: Instruction for the LLM (e.g., "Extract article title and author")
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload
        schema: Optional JSON schema for structured extraction
        provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")
        timeout: Seconds to wait for the submission request before giving up

    Returns:
        task_id: The job's task identifier, or None if the submission failed
        (network error, non-2xx response, or a response without a task_id)
    """
    payload = {
        "url": url,
        "q": query,
        "cache": False,
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        },
    }

    # Optional fields are only sent when the caller supplied them.
    if schema:
        payload["schema"] = schema

    if provider:
        payload["provider"] = provider

    print("\n🤖 Submitting LLM extraction job...")
    print(f" URL: {url}")
    print(f" Query: {query}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")
    if provider:
        print(f" Provider: {provider}")

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server; connection failures are reported instead of crashing the demo.
    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/llm/job",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    # A 2xx response may still carry a body that is not the expected JSON.
    try:
        task_id = response.json()['task_id']
    except (ValueError, KeyError, TypeError):
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
|
||||||
|
|
||||||
|
|
||||||
def submit_job_without_webhook(urls, timeout=30):
    """
    Submit a job without webhook (traditional polling approach).

    Args:
        urls: List of URLs to crawl
        timeout: Seconds to wait for the submission request before giving up

    Returns:
        task_id: The job's task identifier, or None if the submission failed
        (network error, non-2xx response, or a response without a task_id)
    """
    payload = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
    }

    print("\n🚀 Submitting crawl job (without webhook)...")
    print(f" URLs: {urls}")

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server; connection failures are reported instead of crashing the demo.
    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job",
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    # A 2xx response may still carry a body that is not the expected JSON.
    try:
        task_id = response.json()['task_id']
    except (ValueError, KeyError, TypeError):
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
|
||||||
|
|
||||||
|
|
||||||
def poll_job_status(task_id, timeout=60, poll_interval=2, request_timeout=10):
    """
    Poll for job status (used when webhook is not configured).

    Args:
        task_id: The job's task identifier
        timeout: Maximum time to wait in seconds
        poll_interval: Seconds to sleep between status checks
        request_timeout: Upper bound, in seconds, for each status request

    Returns:
        The status payload (dict) once the job reports 'completed' or
        'failed'; None if a status request fails or the timeout is reached.
    """
    print("\n⏳ Polling for job status...")
    # A monotonic clock keeps the deadline unaffected by wall-clock changes.
    deadline = time.monotonic() + timeout

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Bound each request so a stalled connection cannot outlive the
        # overall polling timeout.
        try:
            response = requests.get(
                f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}",
                timeout=min(request_timeout, remaining),
            )
        except requests.RequestException as exc:
            print(f" ❌ Failed to get status: {exc}")
            return None

        if not response.ok:
            print(f" ❌ Failed to get status: {response.text}")
            return None

        try:
            data = response.json()
            status = data.get('status', 'unknown')
        except (ValueError, AttributeError):
            # Body was not JSON, or not a JSON object.
            print(f" ❌ Failed to get status: {response.text}")
            return None

        if status == 'completed':
            print(" ✅ Job completed!")
            return data
        if status == 'failed':
            print(f" ❌ Job failed: {data.get('error', 'Unknown error')}")
            return data

        print(f" ⏳ Status: {status}, waiting...")
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(poll_interval, deadline - time.monotonic())))

    print(" ⏰ Timeout reached")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run the webhook demonstration.

    Checks that the Crawl4AI server is reachable, starts the local webhook
    receiver in a daemon thread, submits four webhook-enabled jobs (two
    crawl, two LLM extraction) plus one polled job, waits for the webhooks
    to arrive, and prints a summary of everything received.
    """
    separator = '=' * 60

    # Check if Crawl4AI is running. Only network/JSON failures are treated
    # as "not running"; Ctrl+C and other unrelated errors are not swallowed.
    try:
        health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
        print(f"✅ Crawl4AI is running: {health.json()}")
    except (requests.RequestException, ValueError) as exc:
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print(f" Reason: {exc}")
        print(" Please make sure Docker container is running:")
        print(" docker run -d -p 11235:11235 --name crawl4ai unclecode/crawl4ai:latest")
        return

    # Start webhook server in background thread
    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
    webhook_thread = Thread(target=start_webhook_server, daemon=True)
    webhook_thread.start()
    time.sleep(2)  # Give server time to start

    # Example 1: Job with webhook (notification only, fetch data separately)
    print(f"\n{separator}")
    print("Example 1: Webhook Notification Only")
    print(f"{separator}")
    # The returned task id is not needed here: the outcome arrives via webhook.
    submit_crawl_job_with_webhook(
        urls=["https://example.com"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=False
    )

    # Example 2: Job with webhook (data included in payload)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{separator}")
    print("Example 2: Webhook with Full Data")
    print(f"{separator}")
    submit_crawl_job_with_webhook(
        urls=["https://www.python.org"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=True
    )

    # Example 3: LLM extraction with webhook (notification only)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{separator}")
    print("Example 3: LLM Extraction with Webhook (Notification Only)")
    print(f"{separator}")
    submit_llm_job_with_webhook(
        url="https://www.example.com",
        query="Extract the main heading and description from this page.",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=False,
        provider="openai/gpt-4o-mini"
    )

    # Example 4: LLM extraction with webhook (data included + schema)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{separator}")
    print("Example 4: LLM Extraction with Schema and Full Data")
    print(f"{separator}")

    # Define a schema for structured extraction (sent as a JSON string)
    schema = json.dumps({
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"}
        },
        "required": ["title"]
    })

    submit_llm_job_with_webhook(
        url="https://www.python.org",
        query="Extract the title and description of this website",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=True,
        schema=schema,
        provider="openai/gpt-4o-mini"
    )

    # Example 5: Traditional polling (no webhook)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{separator}")
    print("Example 5: Traditional Polling (No Webhook)")
    print(f"{separator}")
    task_id_5 = submit_job_without_webhook(
        urls=["https://github.com"]
    )
    if task_id_5:
        result = poll_job_status(task_id_5)
        if result and result.get('status') == 'completed':
            print(" ✅ Results retrieved via polling")

    # Wait for webhooks to arrive
    print("\n⏳ Waiting for webhooks to be received...")
    time.sleep(30)  # Give jobs time to complete and webhooks to arrive (longer for LLM)

    # Summary
    print(f"\n{separator}")
    print("Summary")
    print(f"{separator}")
    print(f"Total webhooks received: {len(received_webhooks)}")

    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']

    print("\n📊 Breakdown:")
    print(f" - Crawl webhooks: {len(crawl_webhooks)}")
    print(f" - LLM extraction webhooks: {len(llm_webhooks)}")

    print("\n📋 Details:")
    for i, webhook in enumerate(received_webhooks, 1):
        task_type = webhook['task_type']
        icon = "🕷️" if task_type == "crawl" else "🤖"
        print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")

    print("\n✅ Demo completed!")
    print("\n💡 Pro tips:")
    print(" - In production, your webhook URL should be publicly accessible")
    print(" (e.g., https://myapp.com/webhooks) or use ngrok for testing")
    print(" - Both /crawl/job and /llm/job support the same webhook configuration")
    print(" - Use webhook_data_in_payload=true to get results directly in the webhook")
    print(" - LLM jobs may take longer, adjust timeouts accordingly")
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
||||||
Binary file not shown.
@@ -20,43 +20,17 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
|||||||
|
|
||||||
## Latest Release
|
## Latest Release
|
||||||
|
|
||||||
### [Crawl4AI v0.7.6 – The Webhook Infrastructure Update](../blog/release-v0.7.6.md)
|
|
||||||
*October 22, 2025*
|
|
||||||
|
|
||||||
Crawl4AI v0.7.6 introduces comprehensive webhook support for the Docker job queue API, bringing real-time notifications to both crawling and LLM extraction workflows. No more polling!
|
|
||||||
|
|
||||||
Key highlights:
|
|
||||||
- **🪝 Complete Webhook Support**: Real-time notifications for both `/crawl/job` and `/llm/job` endpoints
|
|
||||||
- **🔄 Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
|
||||||
- **🔐 Custom Authentication**: Add custom headers for webhook authentication
|
|
||||||
- **📊 Flexible Delivery**: Choose notification-only or include full data in payload
|
|
||||||
- **⚙️ Global Configuration**: Set default webhook URL in config.yml for all jobs
|
|
||||||
- **🎯 Zero Breaking Changes**: Fully backward compatible, webhooks are opt-in
|
|
||||||
|
|
||||||
[Read full release notes →](../blog/release-v0.7.6.md)
|
|
||||||
|
|
||||||
## Recent Releases
|
|
||||||
|
|
||||||
### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md)
|
|
||||||
*September 29, 2025*
|
|
||||||
|
|
||||||
Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues.
|
|
||||||
|
|
||||||
Key highlights:
|
|
||||||
- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization
|
|
||||||
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
|
||||||
- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications
|
|
||||||
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
|
||||||
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
|
||||||
|
|
||||||
[Read full release notes →](../blog/release-v0.7.5.md)
|
|
||||||
|
|
||||||
## Recent Releases
|
|
||||||
|
|
||||||
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
||||||
*August 17, 2025*
|
*August 17, 2025*
|
||||||
|
|
||||||
Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes.
|
Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads.
|
||||||
|
|
||||||
|
Key highlights:
|
||||||
|
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||||
|
- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks
|
||||||
|
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
||||||
|
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||||
|
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||||
|
|
||||||
[Read full release notes →](../blog/release-v0.7.4.md)
|
[Read full release notes →](../blog/release-v0.7.4.md)
|
||||||
|
|
||||||
|
|||||||
@@ -1,314 +0,0 @@
|
|||||||
# Crawl4AI v0.7.6 Release Notes
|
|
||||||
|
|
||||||
*Release Date: October 22, 2025*
|
|
||||||
|
|
||||||
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
|
||||||
|
|
||||||
## 🎯 What's New
|
|
||||||
|
|
||||||
### Webhook Support for Docker Job Queue API
|
|
||||||
|
|
||||||
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
|
||||||
|
|
||||||
**Key Capabilities:**
|
|
||||||
|
|
||||||
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
|
||||||
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
|
||||||
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
|
||||||
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
|
||||||
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
|
||||||
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
|
||||||
|
|
||||||
### How It Works
|
|
||||||
|
|
||||||
Instead of constantly checking job status:
|
|
||||||
|
|
||||||
**OLD WAY (Polling):**
|
|
||||||
```python
|
|
||||||
# Submit job
|
|
||||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
|
||||||
task_id = response.json()['task_id']
|
|
||||||
|
|
||||||
# Poll until complete
|
|
||||||
while True:
|
|
||||||
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
|
||||||
if status.json()['status'] == 'completed':
|
|
||||||
break
|
|
||||||
time.sleep(5) # Wait and try again
|
|
||||||
```
|
|
||||||
|
|
||||||
**NEW WAY (Webhooks):**
|
|
||||||
```python
|
|
||||||
# Submit job with webhook
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhook",
|
|
||||||
"webhook_data_in_payload": True
|
|
||||||
}
|
|
||||||
}
|
|
||||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
|
||||||
|
|
||||||
# Done! Webhook will notify you when complete
|
|
||||||
# Your webhook handler receives the results automatically
|
|
||||||
```
|
|
||||||
|
|
||||||
### Crawl Job Webhooks
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/crawl/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"browser_config": {"headless": true},
|
|
||||||
"crawler_config": {"cache_mode": "bypass"},
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
|
||||||
"webhook_data_in_payload": false,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### LLM Extraction Job Webhooks (NEW!)
|
|
||||||
|
|
||||||
```bash
|
|
||||||
curl -X POST http://localhost:11235/llm/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, and publication date",
|
|
||||||
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
|
||||||
"webhook_data_in_payload": true
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Payload Structure
|
|
||||||
|
|
||||||
**Success (with data):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432",
|
|
||||||
"task_type": "llm_extraction",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com/article"],
|
|
||||||
"data": {
|
|
||||||
"extracted_content": {
|
|
||||||
"title": "Understanding Web Scraping",
|
|
||||||
"author": "John Doe",
|
|
||||||
"date": "2025-10-22"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Failure:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_abc123",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "failed",
|
|
||||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"error": "Connection timeout after 30s"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Simple Webhook Handler Example
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/webhook', methods=['POST'])
def handle_webhook():
    """Receive a Crawl4AI job notification and acknowledge it with 200."""
    payload = request.json

    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']

    if status == 'completed':
        if 'data' in payload:
            # Process data directly
            data = payload['data']
        else:
            # Fetch from API (notification-only mode); the timeout keeps a
            # stalled server from blocking the webhook handler indefinitely
            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
            response = requests.get(
                f'http://localhost:11235/{endpoint}/job/{task_id}',
                timeout=10,
            )
            data = response.json()

        # Your business logic here
        print(f"Job {task_id} completed!")

    elif status == 'failed':
        error = payload.get('error', 'Unknown error')
        print(f"Job {task_id} failed: {error}")

    return jsonify({"status": "received"}), 200

app.run(port=8080)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📊 Performance Improvements
|
|
||||||
|
|
||||||
- **Reduced Server Load**: Eliminates constant polling requests
|
|
||||||
- **Lower Latency**: Instant notification vs. polling interval delay
|
|
||||||
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
|
||||||
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
|
||||||
|
|
||||||
## 🐛 Bug Fixes
|
|
||||||
|
|
||||||
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
|
||||||
- Improved error handling in webhook delivery service
|
|
||||||
- Enhanced Redis task storage for webhook config persistence
|
|
||||||
|
|
||||||
## 🌍 Expected Real-World Impact
|
|
||||||
|
|
||||||
### For Web Scraping Workflows
|
|
||||||
- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
|
|
||||||
- **Better UX**: Instant notifications improve user experience
|
|
||||||
- **Scalability**: Handle hundreds of concurrent jobs without polling overhead
|
|
||||||
|
|
||||||
### For LLM Extraction Pipelines
|
|
||||||
- **Async Processing**: Submit LLM extraction jobs and move on
|
|
||||||
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
|
||||||
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
|
||||||
|
|
||||||
### For Microservices
|
|
||||||
- **Event-Driven**: Perfect for event-driven microservice architectures
|
|
||||||
- **Decoupling**: Decouple job submission from result processing
|
|
||||||
- **Reliability**: Automatic retries ensure webhooks are delivered
|
|
||||||
|
|
||||||
## 🔄 Breaking Changes
|
|
||||||
|
|
||||||
**None!** This release is fully backward compatible.
|
|
||||||
|
|
||||||
- Webhook configuration is optional
|
|
||||||
- Existing code continues to work without modification
|
|
||||||
- Polling is still supported for jobs without webhook config
|
|
||||||
|
|
||||||
## 📚 Documentation
|
|
||||||
|
|
||||||
### New Documentation
|
|
||||||
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
|
||||||
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
|
||||||
|
|
||||||
### Updated Documentation
|
|
||||||
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
|
||||||
- API documentation with webhook examples
|
|
||||||
|
|
||||||
## 🛠️ Migration Guide
|
|
||||||
|
|
||||||
No migration needed! Webhooks are opt-in:
|
|
||||||
|
|
||||||
1. **To use webhooks**: Add `webhook_config` to your job payload
|
|
||||||
2. **To keep polling**: Continue using your existing code
|
|
||||||
|
|
||||||
### Quick Start
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Just add webhook_config to your existing payload
|
|
||||||
payload = {
|
|
||||||
# Your existing configuration
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"browser_config": {...},
|
|
||||||
"crawler_config": {...},
|
|
||||||
|
|
||||||
# NEW: Add webhook configuration
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhook",
|
|
||||||
"webhook_data_in_payload": True
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Configuration
|
|
||||||
|
|
||||||
### Global Webhook Configuration (config.yml)
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
webhooks:
|
|
||||||
enabled: true
|
|
||||||
default_url: "https://myapp.com/webhooks/default" # Optional
|
|
||||||
data_in_payload: false
|
|
||||||
retry:
|
|
||||||
max_attempts: 5
|
|
||||||
initial_delay_ms: 1000
|
|
||||||
max_delay_ms: 32000
|
|
||||||
timeout_ms: 30000
|
|
||||||
headers:
|
|
||||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🚀 Upgrade Instructions
|
|
||||||
|
|
||||||
### Docker
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Pull the latest image
|
|
||||||
docker pull unclecode/crawl4ai:0.7.6
|
|
||||||
|
|
||||||
# Or use latest tag
|
|
||||||
docker pull unclecode/crawl4ai:latest
|
|
||||||
|
|
||||||
# Run with webhook support
|
|
||||||
docker run -d \
|
|
||||||
-p 11235:11235 \
|
|
||||||
--env-file .llm.env \
|
|
||||||
--name crawl4ai \
|
|
||||||
unclecode/crawl4ai:0.7.6
|
|
||||||
```
|
|
||||||
|
|
||||||
### Python Package
|
|
||||||
|
|
||||||
```bash
|
|
||||||
pip install --upgrade crawl4ai
|
|
||||||
```
|
|
||||||
|
|
||||||
## 💡 Pro Tips
|
|
||||||
|
|
||||||
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
|
||||||
2. **Set custom headers** for webhook authentication and request tracking
|
|
||||||
3. **Configure global default webhook** for consistent handling across all jobs
|
|
||||||
4. **Implement idempotent webhook handlers** - the same webhook may be delivered multiple times on retry
|
|
||||||
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
|
||||||
|
|
||||||
## 🎬 Demo
|
|
||||||
|
|
||||||
Try the release demo:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python docs/releases_review/demo_v0.7.6.py
|
|
||||||
```
|
|
||||||
|
|
||||||
This comprehensive demo showcases:
|
|
||||||
- Crawl job webhooks (notification-only and with data)
|
|
||||||
- LLM extraction webhooks (with JSON schema support)
|
|
||||||
- Custom headers for authentication
|
|
||||||
- Webhook retry mechanism
|
|
||||||
- Real-time webhook receiver
|
|
||||||
|
|
||||||
## 🙏 Acknowledgments
|
|
||||||
|
|
||||||
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
|
||||||
|
|
||||||
## 📞 Support
|
|
||||||
|
|
||||||
- **Documentation**: https://docs.crawl4ai.com
|
|
||||||
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
|
||||||
- **Discord**: https://discord.gg/crawl4ai
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Happy crawling with webhooks!** 🕷️🪝
|
|
||||||
|
|
||||||
*- unclecode*
|
|
||||||
@@ -1,318 +0,0 @@
|
|||||||
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
|
||||||
|
|
||||||
*September 29, 2025 • 8 min read*
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
|
||||||
|
|
||||||
## 🎯 What's New at a Glance
|
|
||||||
|
|
||||||
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
|
||||||
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
|
||||||
- **Enhanced LLM Integration**: Custom providers with temperature control
|
|
||||||
- **HTTPS Preservation**: Secure internal link handling
|
|
||||||
- **Bug Fixes**: Resolved multiple community-reported issues
|
|
||||||
- **Improved Docker Error Handling**: Better debugging and reliability
|
|
||||||
|
|
||||||
## 🔧 Docker Hooks System: Pipeline Customization
|
|
||||||
|
|
||||||
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
|
||||||
|
|
||||||
### Real Example: Authentication & Performance
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Real working hooks for httpbin.org
|
|
||||||
hooks_config = {
|
|
||||||
"on_page_context_created": """
|
|
||||||
async def hook(page, context, **kwargs):
|
|
||||||
print("Hook: Setting up page context")
|
|
||||||
# Block images to speed up crawling
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
print("Hook: Images blocked")
|
|
||||||
return page
|
|
||||||
""",
|
|
||||||
|
|
||||||
"before_retrieve_html": """
|
|
||||||
async def hook(page, context, **kwargs):
|
|
||||||
print("Hook: Before retrieving HTML")
|
|
||||||
# Scroll to bottom to load lazy content
|
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
|
||||||
await page.wait_for_timeout(1000)
|
|
||||||
print("Hook: Scrolled to bottom")
|
|
||||||
return page
|
|
||||||
""",
|
|
||||||
|
|
||||||
"before_goto": """
|
|
||||||
async def hook(page, context, url, **kwargs):
|
|
||||||
print(f"Hook: About to navigate to {url}")
|
|
||||||
# Add custom headers
|
|
||||||
await page.set_extra_http_headers({
|
|
||||||
'X-Test-Header': 'crawl4ai-hooks-test'
|
|
||||||
})
|
|
||||||
return page
|
|
||||||
"""
|
|
||||||
}
|
|
||||||
|
|
||||||
# Test with Docker API
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://httpbin.org/html"],
|
|
||||||
"hooks": {
|
|
||||||
"code": hooks_config,
|
|
||||||
"timeout": 30
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
||||||
result = response.json()
|
|
||||||
|
|
||||||
if result.get('success'):
|
|
||||||
print("✅ Hooks executed successfully!")
|
|
||||||
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Available Hook Points:**
|
|
||||||
- `on_browser_created`: Browser setup
|
|
||||||
- `on_page_context_created`: Page context configuration
|
|
||||||
- `before_goto`: Pre-navigation setup
|
|
||||||
- `after_goto`: Post-navigation processing
|
|
||||||
- `on_user_agent_updated`: User agent changes
|
|
||||||
- `on_execution_started`: Crawl initialization
|
|
||||||
- `before_retrieve_html`: Pre-extraction processing
|
|
||||||
- `before_return_html`: Final HTML processing
|
|
||||||
|
|
||||||
### Function-Based Hooks API
|
|
||||||
|
|
||||||
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
|
||||||
|
|
||||||
**Option 1: Using the `hooks_to_string()` Utility**
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import hooks_to_string
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Define hooks as regular Python functions (with full IDE support!)
|
|
||||||
async def on_page_context_created(page, context, **kwargs):
|
|
||||||
"""Block images to speed up crawling"""
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
|
||||||
return page
|
|
||||||
|
|
||||||
async def before_goto(page, context, url, **kwargs):
|
|
||||||
"""Add custom headers"""
|
|
||||||
await page.set_extra_http_headers({
|
|
||||||
'X-Crawl4AI': 'v0.7.5',
|
|
||||||
'X-Custom-Header': 'my-value'
|
|
||||||
})
|
|
||||||
return page
|
|
||||||
|
|
||||||
# Convert functions to strings
|
|
||||||
hooks_code = hooks_to_string({
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_goto": before_goto
|
|
||||||
})
|
|
||||||
|
|
||||||
# Use with REST API
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://httpbin.org/html"],
|
|
||||||
"hooks": {"code": hooks_code, "timeout": 30}
|
|
||||||
}
|
|
||||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
# Define hooks as functions (same as above)
|
|
||||||
async def on_page_context_created(page, context, **kwargs):
|
|
||||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
|
||||||
return page
|
|
||||||
|
|
||||||
async def before_retrieve_html(page, context, **kwargs):
|
|
||||||
# Scroll to load lazy content
|
|
||||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
|
||||||
await page.wait_for_timeout(1000)
|
|
||||||
return page
|
|
||||||
|
|
||||||
# Use Docker client - conversion happens automatically!
|
|
||||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
|
||||||
|
|
||||||
results = await client.crawl(
|
|
||||||
urls=["https://httpbin.org/html"],
|
|
||||||
hooks={
|
|
||||||
"on_page_context_created": on_page_context_created,
|
|
||||||
"before_retrieve_html": before_retrieve_html
|
|
||||||
},
|
|
||||||
hooks_timeout=30
|
|
||||||
)
|
|
||||||
|
|
||||||
if results and results.success:
|
|
||||||
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Benefits of Function-Based Hooks:**
|
|
||||||
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
|
||||||
- ✅ Type checking and linting
|
|
||||||
- ✅ Easier to test and debug
|
|
||||||
- ✅ Reusable across projects
|
|
||||||
- ✅ Automatic conversion in Docker client
|
|
||||||
- ✅ No breaking changes - string hooks still work!
|
|
||||||
|
|
||||||
## 🤖 Enhanced LLM Integration
|
|
||||||
|
|
||||||
v0.7.5 extends the LLM integration with custom provider selection, temperature control, and base URL configuration.
|
|
||||||
|
|
||||||
### Multi-Provider Support
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
||||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
|
||||||
|
|
||||||
# Test with different providers
|
|
||||||
async def test_llm_providers():
    """Run one LLM-backed extraction with an explicit provider and temperature.

    Builds an ``LLMExtractionStrategy`` for the Gemini provider string, crawls
    https://example.com with it, and prints the extracted content when the
    crawl reports success.
    """
    # Gemini (selected via the provider string) with a custom temperature
    llm_strategy = LLMExtractionStrategy(
        provider="gemini/gemini-2.5-flash-lite",
        api_token="your-api-token",
        temperature=0.7,  # New in v0.7.5
        instruction="Summarize this page in one sentence"
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=CrawlerRunConfig(extraction_strategy=llm_strategy)
        )

        if result.success:
            print("✅ LLM extraction completed")
            print(result.extracted_content)
|
|
||||||
|
|
||||||
# Docker API with enhanced LLM config
import requests  # this snippet's import header does not pull in requests

llm_payload = {
    "url": "https://example.com",
    "f": "llm",
    "q": "Summarize this page in one sentence.",
    "provider": "gemini/gemini-2.5-flash-lite",
    "temperature": 0.7
}

# A timeout keeps the example from hanging forever if the container stalls.
response = requests.post("http://localhost:11235/md", json=llm_payload, timeout=60)
|
|
||||||
```
|
|
||||||
|
|
||||||
**New Features:**
|
|
||||||
- Custom `temperature` parameter for creativity control
|
|
||||||
- `base_url` for custom API endpoints
|
|
||||||
- Multi-provider environment variable support
|
|
||||||
- Docker API integration
|
|
||||||
|
|
||||||
## 🔒 HTTPS Preservation
|
|
||||||
|
|
||||||
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
|
||||||
|
|
||||||
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
|
||||||
|
|
||||||
async def test_https_preservation():
    """Deep-crawl quotes.toscrape.com and report how many internal links use HTTPS."""
    # Keep the deep crawl on quotes.toscrape.com. The pattern is a raw string
    # so the regex escapes are not parsed as (invalid) Python string escapes.
    url_filter = URLPatternFilter(
        patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
    )

    config = CrawlerRunConfig(
        exclude_external_links=True,
        stream=True,  # results are consumed with `async for` below
        preserve_https_for_internal_links=True,  # New in v0.7.5 - enables HTTPS preservation
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            filter_chain=FilterChain([url_filter])
        )
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://quotes.toscrape.com",
            config=config
        ):
            # Count how many of this page's internal links kept the https:// scheme
            internal_links = [link['href'] for link in result.links['internal']]
            https_links = [link for link in internal_links if link.startswith('https://')]

            print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
            for link in https_links[:3]:
                print(f" → {link}")
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🛠️ Bug Fixes and Improvements
|
|
||||||
|
|
||||||
### Major Fixes
|
|
||||||
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
|
||||||
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
|
||||||
- **Docker Error Handling**: Comprehensive error messages with status codes
|
|
||||||
- **Memory Management**: Fixed leaks in long-running sessions
|
|
||||||
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
|
||||||
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
|
||||||
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
|
||||||
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
|
||||||
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
|
||||||
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
|
||||||
|
|
||||||
### Community-Reported Issues Fixed
|
|
||||||
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
|
||||||
- Fixed browser configuration reference errors
|
|
||||||
- Resolved dependency conflicts with cssselect
|
|
||||||
- Improved error messaging for failed authentications
|
|
||||||
- Enhanced compatibility with various proxy configurations
|
|
||||||
- Fixed edge cases in URL normalization
|
|
||||||
|
|
||||||
### Configuration Updates
|
|
||||||
```python
|
|
||||||
# Old proxy config (deprecated)
|
|
||||||
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
|
||||||
|
|
||||||
# New enhanced proxy config
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
proxy_config={
|
|
||||||
"server": "http://proxy:8080",
|
|
||||||
"username": "optional-user",
|
|
||||||
"password": "optional-pass"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔄 Breaking Changes
|
|
||||||
|
|
||||||
1. **Python 3.10+ Required**: Python 3.9 is no longer supported; upgrade to Python 3.10 or newer
|
|
||||||
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
|
||||||
3. **New Dependency**: Added `cssselect` for better CSS handling
|
|
||||||
|
|
||||||
## 🚀 Get Started
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install latest version
|
|
||||||
pip install crawl4ai==0.7.5
|
|
||||||
|
|
||||||
# Docker deployment
|
|
||||||
docker pull unclecode/crawl4ai:latest
|
|
||||||
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
|
||||||
```
|
|
||||||
|
|
||||||
**Try the Demo:**
|
|
||||||
```bash
|
|
||||||
# Run working examples
|
|
||||||
python docs/releases_review/demo_v0.7.5.py
|
|
||||||
```
|
|
||||||
|
|
||||||
**Resources:**
|
|
||||||
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
|
||||||
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
|
||||||
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
|
||||||
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
|
||||||
|
|
||||||
Happy crawling! 🕷️
|
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -59,27 +59,6 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
|
|
||||||
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
||||||
|
|
||||||
## 🆕 AI Assistant Skill Now Available!
|
|
||||||
|
|
||||||
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; margin: 20px 0; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
|
|
||||||
<h3 style="color: white; margin: 0 0 10px 0;">🤖 Crawl4AI Skill for Claude & AI Assistants</h3>
|
|
||||||
<p style="color: white; margin: 10px 0;">Supercharge your AI coding assistant with complete Crawl4AI knowledge! Download our comprehensive skill package that includes:</p>
|
|
||||||
<ul style="color: white; margin: 10px 0;">
|
|
||||||
<li>📚 Complete SDK reference (23K+ words)</li>
|
|
||||||
<li>🚀 Ready-to-use extraction scripts</li>
|
|
||||||
<li>⚡ Schema generation for efficient scraping</li>
|
|
||||||
<li>🔧 Version 0.7.4 compatible</li>
|
|
||||||
</ul>
|
|
||||||
<div style="text-align: center; margin-top: 15px;">
|
|
||||||
<a href="assets/crawl4ai-skill.zip" download style="background: white; color: #667eea; padding: 12px 30px; border-radius: 5px; text-decoration: none; font-weight: bold; display: inline-block; transition: transform 0.2s;">
|
|
||||||
📦 Download Skill Package
|
|
||||||
</a>
|
|
||||||
</div>
|
|
||||||
<p style="color: white; margin: 15px 0 0 0; font-size: 0.9em; text-align: center;">
|
|
||||||
Works with Claude, Cursor, Windsurf, and other AI coding assistants. Import the .zip file into your AI assistant's skill/knowledge system.
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
## 🎯 New: Adaptive Web Crawling
|
## 🎯 New: Adaptive Web Crawling
|
||||||
|
|
||||||
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
||||||
|
|||||||
@@ -1,338 +0,0 @@
|
|||||||
"""
|
|
||||||
🚀 Crawl4AI v0.7.5 Release Demo - Working Examples
|
|
||||||
==================================================
|
|
||||||
This demo showcases key features introduced in v0.7.5 with real, executable examples.
|
|
||||||
|
|
||||||
Featured Demos:
|
|
||||||
1. ✅ Docker Hooks System - Real API calls with custom hooks (string & function-based)
|
|
||||||
2. ✅ Enhanced LLM Integration - Working LLM configurations
|
|
||||||
3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance
|
|
||||||
|
|
||||||
Requirements:
|
|
||||||
- crawl4ai v0.7.5 installed
|
|
||||||
- Docker running with crawl4ai image (optional for Docker demos)
|
|
||||||
- Valid API keys for LLM demos (optional)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
import sys
|
|
||||||
|
|
||||||
from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig,
|
|
||||||
CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy,
|
|
||||||
hooks_to_string)
|
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
|
|
||||||
def print_section(title: str, description: str = "") -> None:
    """Print a section header.

    Emits a blank line, a 60-character ``=`` rule, the title, the optional
    description (skipped when empty), a closing rule, and a trailing blank line.

    Args:
        title: Heading text printed on its own line.
        description: Optional second line printed under the title.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(title)
    if description:
        print(description)
    print(f"{rule}\n")
|
|
||||||
|
|
||||||
|
|
||||||
async def demo_1_docker_hooks_system():
    """Demo 1: Docker Hooks System - Real API calls with custom hooks.

    Probes the Docker service on localhost:11235 and returns early with setup
    instructions when it is not reachable. Otherwise it crawls
    https://httpbin.org/html twice: first with hooks written as source strings
    posted to the REST ``/crawl`` endpoint, then with hooks written as Python
    functions handed to ``Crawl4aiDockerClient``.
    """
    print_section(
        "Demo 1: Docker Hooks System",
        "Testing both string-based and function-based hooks (NEW in v0.7.5!)"
    )

    # Check Docker service availability
    def check_docker_service():
        """Return True when the service root answers with HTTP 200."""
        try:
            response = requests.get("http://localhost:11235/", timeout=3)
            return response.status_code == 200
        except requests.RequestException:
            # Refused connection / timeout just means "not running". Catching
            # only request errors lets Ctrl+C propagate instead of being
            # reported as a missing service.
            return False

    print("Checking Docker service...")
    docker_running = check_docker_service()

    if not docker_running:
        print("⚠️ Docker service not running on localhost:11235")
        print("To test Docker hooks:")
        print("1. Run: docker run -p 11235:11235 unclecode/crawl4ai:latest")
        print("2. Wait for service to start")
        print("3. Re-run this demo\n")
        return

    print("✓ Docker service detected!")

    # ============================================================================
    # PART 1: Traditional String-Based Hooks (Works with REST API)
    # ============================================================================
    print("\n" + "─" * 60)
    print("Part 1: String-Based Hooks (REST API)")
    print("─" * 60)

    # Hook sources are sent verbatim in the request body, keyed by hook point.
    hooks_config_string = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("[String Hook] Setting up page context")
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    return page
""",
        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("[String Hook] Before retrieving HTML")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_config_string,
            "timeout": 30
        }
    }

    print("🔧 Using string-based hooks for REST API...")
    try:
        start_time = time.time()
        response = requests.post("http://localhost:11235/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()
            print(f"✅ String-based hooks executed in {execution_time:.2f}s")
            if result.get('results') and result['results'][0].get('success'):
                html_length = len(result['results'][0].get('html', ''))
                print(f" 📄 HTML length: {html_length} characters")
        else:
            print(f"❌ Request failed: {response.status_code}")
    except Exception as e:
        # Demo boundary: report and move on to Part 2.
        print(f"❌ Error: {str(e)}")

    # ============================================================================
    # PART 2: NEW Function-Based Hooks with Docker Client (v0.7.5)
    # ============================================================================
    print("\n" + "─" * 60)
    print("Part 2: Function-Based Hooks with Docker Client (✨ NEW!)")
    print("─" * 60)

    # Define hooks as regular Python functions
    async def on_page_context_created_func(page, context, **kwargs):
        """Block images to speed up crawling"""
        print("[Function Hook] Setting up page context")
        await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def before_goto_func(page, context, url, **kwargs):
        """Add custom headers before navigation"""
        print(f"[Function Hook] About to navigate to {url}")
        await page.set_extra_http_headers({
            'X-Crawl4AI': 'v0.7.5-function-hooks',
            'X-Test-Header': 'demo'
        })
        return page

    async def before_retrieve_html_func(page, context, **kwargs):
        """Scroll to load lazy content"""
        print("[Function Hook] Scrolling page for lazy-loaded content")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(500)
        await page.evaluate("window.scrollTo(0, 0)")
        return page

    # Use the hooks_to_string utility (can be used standalone)
    print("\n📦 Converting functions to strings with hooks_to_string()...")
    hooks_as_strings = hooks_to_string({
        "on_page_context_created": on_page_context_created_func,
        "before_goto": before_goto_func,
        "before_retrieve_html": before_retrieve_html_func
    })
    print(f" ✓ Converted {len(hooks_as_strings)} hooks to string format")

    # OR use Docker Client which does conversion automatically!
    print("\n🐳 Using Docker Client with automatic conversion...")
    try:
        client = Crawl4aiDockerClient(base_url="http://localhost:11235")

        # Pass function objects directly - conversion happens automatically!
        results = await client.crawl(
            urls=["https://httpbin.org/html"],
            hooks={
                "on_page_context_created": on_page_context_created_func,
                "before_goto": before_goto_func,
                "before_retrieve_html": before_retrieve_html_func
            },
            hooks_timeout=30
        )

        if results and results.success:
            print("✅ Function-based hooks executed successfully!")
            print(f" 📄 HTML length: {len(results.html)} characters")
            print(f" 🎯 URL: {results.url}")
        else:
            print("⚠️ Crawl completed but may have warnings")

    except Exception as e:
        # Demo boundary: a failing client call must not abort the remaining output.
        print(f"❌ Docker client error: {str(e)}")

    # Show the benefits
    print("\n" + "=" * 60)
    print("✨ Benefits of Function-Based Hooks:")
    print("=" * 60)
    print("✓ Full IDE support (autocomplete, syntax highlighting)")
    print("✓ Type checking and linting")
    print("✓ Easier to test and debug")
    print("✓ Reusable across projects")
    print("✓ Automatic conversion in Docker client")
    print("=" * 60)
|
|
||||||
|
|
||||||
|
|
||||||
async def demo_2_enhanced_llm_integration():
    """Demo 2: Enhanced LLM Integration - Working LLM configurations.

    Posts a single LLM-filtered markdown request to the Docker ``/md``
    endpoint with an explicit provider and temperature, then prints a short
    summary of the response or the failure.
    """
    print_section(
        "Demo 2: Enhanced LLM Integration",
        "Testing custom LLM providers and configurations"
    )

    print("🤖 Testing Enhanced LLM Integration Features")

    provider = "gemini/gemini-2.5-flash-lite"
    payload = {
        "url": "https://example.com",
        "f": "llm",
        "q": "Summarize this page in one sentence.",
        "provider": provider,  # Explicitly set provider
        "temperature": 0.7
    }
    try:
        response = requests.post(
            "http://localhost:11235/md",
            json=payload,
            timeout=60
        )
        if response.status_code == 200:
            result = response.json()
            print(f"✓ Request successful with provider: {provider}")
            print(f" - Response keys: {list(result.keys())}")
            print(f" - Content length: {len(result.get('markdown', ''))} characters")
            print(" - Note: Actual LLM call may fail without valid API key")
        else:
            print(f"❌ Request failed: {response.status_code}")
            print(f" - Response: {response.text[:500]}")

    except Exception as e:
        # Plain print() does not render rich-style "[red]...[/]" markup, so
        # use the same ❌ prefix as the other demos.
        print(f"❌ Error: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
async def demo_3_https_preservation():
    """Demo 3: HTTPS Preservation - Live crawling with HTTPS maintenance.

    Streams a small BFS deep crawl of quotes.toscrape.com with
    ``preserve_https_for_internal_links`` enabled and prints the internal
    links found on every crawled page.
    """
    print_section(
        "Demo 3: HTTPS Preservation",
        "Testing HTTPS preservation for internal links"
    )

    print("🔒 Testing HTTPS Preservation Feature")

    # Test with HTTPS preservation enabled
    print("\nTest 1: HTTPS Preservation ENABLED")

    # Raw string: "\/" and "\." are regex escapes, not valid Python string
    # escapes, and trigger a warning in a normal literal on recent Pythons.
    url_filter = URLPatternFilter(
        patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
    )
    config = CrawlerRunConfig(
        exclude_external_links=True,
        stream=True,
        verbose=False,
        preserve_https_for_internal_links=True,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            filter_chain=FilterChain([url_filter])
        )
    )

    test_url = "https://quotes.toscrape.com"
    print(f"🎯 Testing URL: {test_url}")

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(url=test_url, config=config):
            print("✓ HTTPS Preservation Test Completed")
            internal_links = [i['href'] for i in result.links['internal']]
            for link in internal_links:
                print(f" → {link}")
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
    """Run all demos.

    Prints a short environment check, runs the three demos in order with an
    "press Enter" pause between them, and finishes with a feature summary.
    A failing demo is reported and the next one still runs; Ctrl+C stops the
    sequence.
    """
    print("\n" + "=" * 60)
    print("🚀 Crawl4AI v0.7.5 Working Demo")
    print("=" * 60)

    # Check system requirements
    print("🔍 System Requirements Check:")
    print(f" - Python version: {sys.version.split()[0]} {'✓' if sys.version_info >= (3, 10) else '❌ (3.10+ required)'}")

    try:
        import requests
        print(" - Requests library: ✓")
    except ImportError:
        print(" - Requests library: ❌")

    print()

    demos = [
        ("Docker Hooks System", demo_1_docker_hooks_system),
        ("Enhanced LLM Integration", demo_2_enhanced_llm_integration),
        ("HTTPS Preservation", demo_3_https_preservation),
    ]

    for i, (name, demo_func) in enumerate(demos, 1):
        try:
            print(f"\n📍 Starting Demo {i}/{len(demos)}: {name}")
            await demo_func()

            if i < len(demos):
                print(f"\n✨ Demo {i} complete! Press Enter for next demo...")
                try:
                    input()
                except EOFError:
                    # Non-interactive stdin (piped/CI): skip the pause rather
                    # than reporting the finished demo as failed.
                    pass

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as e:
            print(f"❌ Demo {i} error: {str(e)}")
            print("Continuing to next demo...")

    print("\n" + "=" * 60)
    print("🎉 Demo Complete!")
    print("=" * 60)
    print("You've experienced the power of Crawl4AI v0.7.5!")
    print("")
    print("Key Features Demonstrated:")
    print("🔧 Docker Hooks - String-based & function-based (NEW!)")
    print(" • hooks_to_string() utility for function conversion")
    print(" • Docker client with automatic conversion")
    print(" • Full IDE support and type checking")
    print("🤖 Enhanced LLM - Better AI integration")
    print("🔒 HTTPS Preservation - Secure link handling")
    print("")
    print("Ready to build something amazing? 🚀")
    print("")
    print("📖 Docs: https://docs.crawl4ai.com/")
    print("🐙 GitHub: https://github.com/unclecode/crawl4ai")
    print("=" * 60)
|
|
||||||
|
|
||||||
|
|
||||||
# Script entry point: announce the demo, run it, and turn Ctrl+C or an
# unexpected failure into a short message instead of a traceback.
if __name__ == "__main__":
    print("🚀 Crawl4AI v0.7.5 Live Demo Starting...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.5!")
    except Exception as exc:
        print(f"\n❌ Demo error: {exc}")
        print("Make sure you have the required dependencies installed.")
|
|
||||||
@@ -1,359 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Crawl4AI v0.7.6 Release Demo
|
|
||||||
============================
|
|
||||||
|
|
||||||
This demo showcases the major feature in v0.7.6:
|
|
||||||
**Webhook Support for Docker Job Queue API**
|
|
||||||
|
|
||||||
Features Demonstrated:
|
|
||||||
1. Asynchronous job processing with webhook notifications
|
|
||||||
2. Webhook support for /crawl/job endpoint
|
|
||||||
3. Webhook support for /llm/job endpoint
|
|
||||||
4. Notification-only vs data-in-payload modes
|
|
||||||
5. Custom webhook headers for authentication
|
|
||||||
6. Structured extraction with JSON schemas
|
|
||||||
7. Exponential backoff retry for reliable delivery
|
|
||||||
|
|
||||||
Prerequisites:
|
|
||||||
- Crawl4AI Docker container running on localhost:11235
|
|
||||||
- Flask installed: pip install flask requests
|
|
||||||
- LLM API key configured (for LLM examples)
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python docs/releases_review/demo_v0.7.6.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
from flask import Flask, request, jsonify
|
|
||||||
from threading import Thread
|
|
||||||
|
|
||||||
# Configuration
# Base URL of the Crawl4AI Docker API that jobs are submitted to.
CRAWL4AI_BASE_URL = "http://localhost:11235"
# Base URL the jobs are told to call back.
# NOTE(review): this URL is resolved by the Crawl4AI container, where
# "localhost" is presumably the container itself - confirm the Docker
# networking (e.g. host networking or host.docker.internal) before relying on it.
WEBHOOK_BASE_URL = "http://localhost:8080"

# Flask app for webhook receiver
app = Flask(__name__)
# Payloads of every webhook delivery handled so far, in arrival order.
received_webhooks = []
|
|
||||||
|
|
||||||
|
|
||||||
@app.route('/webhook', methods=['POST'])
def webhook_handler():
    """Universal webhook handler for both crawl and LLM extraction jobs.

    Prints a summary of the delivery, stores the payload in
    ``received_webhooks`` and acknowledges it.

    Returns:
        ``({"status": "received"}, 200)`` for a well-formed delivery, or
        ``({"status": "error", ...}, 400)`` when the body is not a JSON object
        carrying ``task_id``, ``task_type``, ``status`` and ``timestamp``.
    """
    # silent=True yields None for a missing/invalid JSON body instead of raising.
    payload = request.get_json(silent=True)
    required_keys = ('task_id', 'task_type', 'status', 'timestamp')
    if not isinstance(payload, dict) or any(key not in payload for key in required_keys):
        # Answer malformed deliveries with a 400 rather than letting a
        # KeyError/TypeError surface as a 500.
        return jsonify({"status": "error", "message": "invalid webhook payload"}), 400

    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']

    print(f"\n{'='*70}")
    print("📬 Webhook Received!")
    print(f" Task ID: {task_id}")
    print(f" Task Type: {task_type}")
    print(f" Status: {status}")
    print(f" Timestamp: {payload['timestamp']}")

    if status == 'completed':
        if 'data' in payload:
            print(" ✅ Data included in webhook")
            if task_type == 'crawl':
                results = payload['data'].get('results', [])
                print(f" 📊 Crawled {len(results)} URL(s)")
            elif task_type == 'llm_extraction':
                extracted = payload['data'].get('extracted_content', {})
                print(f" 🤖 Extracted: {json.dumps(extracted, indent=6)}")
        else:
            print(" 📥 Notification only (fetch data separately)")
    elif status == 'failed':
        print(f" ❌ Error: {payload.get('error', 'Unknown')}")

    print(f"{'='*70}\n")
    received_webhooks.append(payload)

    return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
|
|
||||||
def start_webhook_server():
    """Serve the Flask webhook receiver on port 8080 (the call blocks while serving)."""
    # The reloader is disabled along with debug mode.
    server_options = {
        "host": '0.0.0.0',
        "port": 8080,
        "debug": False,
        "use_reloader": False,
    }
    app.run(**server_options)
|
|
||||||
|
|
||||||
|
|
||||||
def demo_1_crawl_webhook_notification_only():
    """Demo 1: Crawl job with webhook notification (data fetched separately).

    Returns:
        The ``task_id`` reported by ``/crawl/job``, or ``None`` when the API
        answers with an error status.

    Raises:
        requests.RequestException: If the API cannot be reached or does not
            answer within the timeout.
    """
    print("\n" + "="*70)
    print("DEMO 1: Crawl Job - Webhook Notification Only")
    print("="*70)
    print("Submitting crawl job with webhook notification...")

    payload = {
        "urls": ["https://example.com"],
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": False,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "crawl"
            }
        }
    }

    # Submitting only enqueues the job; the timeout keeps a stalled server
    # from hanging the demo indefinitely.
    response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload, timeout=30)
    if response.ok:
        task_id = response.json()['task_id']
        print(f"✅ Job submitted: {task_id}")
        print("⏳ Webhook will notify when complete...")
        return task_id

    print(f"❌ Failed: {response.text}")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def demo_2_crawl_webhook_with_data():
    """Demo 2: Crawl job with full data in webhook payload.

    Returns:
        The ``task_id`` reported by ``/crawl/job``, or ``None`` when the API
        answers with an error status.

    Raises:
        requests.RequestException: If the API cannot be reached or does not
            answer within the timeout.
    """
    print("\n" + "="*70)
    print("DEMO 2: Crawl Job - Webhook with Full Data")
    print("="*70)
    print("Submitting crawl job with data included in webhook...")

    payload = {
        "urls": ["https://www.python.org"],
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "crawl-with-data"
            }
        }
    }

    # Submitting only enqueues the job; the timeout keeps a stalled server
    # from hanging the demo indefinitely.
    response = requests.post(f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload, timeout=30)
    if response.ok:
        task_id = response.json()['task_id']
        print(f"✅ Job submitted: {task_id}")
        print("⏳ Webhook will include full results...")
        return task_id

    print(f"❌ Failed: {response.text}")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def demo_3_llm_webhook_notification_only():
    """Demo 3: LLM extraction with webhook notification (NEW in v0.7.6!).

    Returns:
        The ``task_id`` reported by ``/llm/job``, or ``None`` when the API
        answers with an error status.

    Raises:
        requests.RequestException: If the API cannot be reached or does not
            answer within the timeout.
    """
    print("\n" + "="*70)
    print("DEMO 3: LLM Extraction - Webhook Notification Only (NEW!)")
    print("="*70)
    print("Submitting LLM extraction job with webhook notification...")

    payload = {
        "url": "https://www.example.com",
        "q": "Extract the main heading and description from this page",
        "provider": "openai/gpt-4o-mini",
        "cache": False,
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": False,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "llm"
            }
        }
    }

    # Submitting only enqueues the job; the timeout keeps a stalled server
    # from hanging the demo indefinitely.
    response = requests.post(f"{CRAWL4AI_BASE_URL}/llm/job", json=payload, timeout=30)
    if response.ok:
        task_id = response.json()['task_id']
        print(f"✅ Job submitted: {task_id}")
        print("⏳ Webhook will notify when LLM extraction completes...")
        return task_id

    print(f"❌ Failed: {response.text}")
    return None
|
|
||||||
|
|
||||||
|
|
||||||
def demo_4_llm_webhook_with_schema():
    """Demo 4: LLM extraction with JSON schema and data in webhook (NEW in v0.7.6!).

    Builds a JSON schema (title, description, main_topics), serialises it with
    ``json.dumps`` and submits it to ``/llm/job`` with
    ``webhook_data_in_payload`` set to ``True``.

    Returns:
        The ``task_id`` from the JSON response when the submission succeeds,
        otherwise ``None`` (non-2xx response or a transport-level failure).
    """
    print("\n" + "="*70)
    print("DEMO 4: LLM Extraction - Schema + Full Data in Webhook (NEW!)")
    print("="*70)
    print("Submitting LLM extraction with JSON schema...")

    schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"},
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Main topics covered"
            }
        },
        "required": ["title"]
    }

    payload = {
        "url": "https://www.python.org",
        "q": "Extract the title, description, and main topics from this website",
        # The schema is sent as a JSON string, not as a nested object.
        "schema": json.dumps(schema),
        "provider": "openai/gpt-4o-mini",
        "cache": False,
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "llm-with-schema"
            }
        }
    }

    try:
        # Bound the wait so an unresponsive server cannot hang the demo forever.
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/llm/job", json=payload, timeout=30
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure.
        print(f"❌ Failed: {exc}")
        return None

    if not response.ok:
        print(f"❌ Failed: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f"✅ Job submitted: {task_id}")
    print("⏳ Webhook will include structured extraction results...")
    return task_id
|
|
||||||
|
|
||||||
|
|
||||||
def demo_5_global_webhook_config():
    """Demo 5: Using global webhook configuration from config.yml.

    Purely informational: prints an example ``webhooks`` section and explains
    that jobs submitted without ``webhook_config`` use the default. Makes no
    network calls and returns ``None``.
    """
    print("\n" + "="*70)
    print("DEMO 5: Global Webhook Configuration")
    print("="*70)
    print("💡 You can configure a default webhook URL in config.yml:")
    # The YAML nesting matters: the retry settings belong under `retry`,
    # and everything belongs under `webhooks`.
    print("""
webhooks:
  enabled: true
  default_url: "https://myapp.com/webhooks/default"
  data_in_payload: false
  retry:
    max_attempts: 5
    initial_delay_ms: 1000
    max_delay_ms: 32000
    timeout_ms: 30000
""")
    print("Then submit jobs WITHOUT webhook_config - they'll use the default!")
    print("This is useful for consistent webhook handling across all jobs.")
|
|
||||||
|
|
||||||
|
|
||||||
def demo_6_webhook_retry_logic():
    """Demo 6: Webhook retry mechanism with exponential backoff.

    Purely informational: prints a fixed description of the retry policy.
    Makes no network calls and returns ``None``.
    """
    rule = "="*70
    report = (
        "\n" + rule,
        "DEMO 6: Webhook Retry Logic",
        rule,
        "🔄 Webhook delivery uses exponential backoff retry:",
        " • Max attempts: 5",
        " • Delays: 1s → 2s → 4s → 8s → 16s",
        " • Timeout: 30s per attempt",
        " • Retries on: 5xx errors, network errors, timeouts",
        " • No retry on: 4xx client errors",
        "\nThis ensures reliable webhook delivery even with temporary failures!",
    )
    for line in report:
        print(line)
|
|
||||||
|
|
||||||
|
|
||||||
def print_summary():
    """Print demo summary and results.

    Reads the module-level ``received_webhooks`` list and prints the total,
    a per-``task_type`` breakdown ('crawl' vs 'llm_extraction'), one line per
    webhook (``task_id`` and ``status``), then a fixed feature/benefit recap.
    """
    rule = "="*70

    print("\n" + rule)
    print("📊 DEMO SUMMARY")
    print(rule)
    print(f"Total webhooks received: {len(received_webhooks)}")

    crawl_count = sum(1 for hook in received_webhooks if hook['task_type'] == 'crawl')
    llm_count = sum(
        1 for hook in received_webhooks if hook['task_type'] == 'llm_extraction'
    )

    print("\nBreakdown:")
    print(f" 🕷️ Crawl jobs: {crawl_count}")
    print(f" 🤖 LLM extraction jobs: {llm_count}")

    print("\nDetails:")
    for position, hook in enumerate(received_webhooks, 1):
        # Anything that is not a crawl job is shown with the LLM icon.
        if hook['task_type'] == 'crawl':
            icon = "🕷️"
        else:
            icon = "🤖"
        print(f" {position}. {icon} {hook['task_id']}: {hook['status']}")

    print("\n" + rule)
    print("✨ v0.7.6 KEY FEATURES DEMONSTRATED:")
    print(rule)
    recap = (
        "✅ Webhook support for /crawl/job",
        "✅ Webhook support for /llm/job (NEW!)",
        "✅ Notification-only mode (fetch data separately)",
        "✅ Data-in-payload mode (get full results in webhook)",
        "✅ Custom headers for authentication",
        "✅ JSON schema for structured LLM extraction",
        "✅ Exponential backoff retry for reliable delivery",
        "✅ Global webhook configuration support",
        "✅ Universal webhook handler for both job types",
        "\n💡 Benefits:",
        " • No more polling - get instant notifications",
        " • Better resource utilization",
        " • Reliable delivery with automatic retries",
        " • Consistent API across crawl and LLM jobs",
        " • Production-ready webhook infrastructure",
    )
    for line in recap:
        print(line)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Run all demos.

    Checks that the Crawl4AI server answers on ``/health``, starts the local
    webhook receiver in a daemon thread, runs demos 1-6 with pauses between
    the job-submitting ones, waits for webhooks to arrive and prints the
    summary. Returns early (after printing start-up instructions) when the
    health check fails.
    """
    print("\n" + "="*70)
    print("🚀 Crawl4AI v0.7.6 Release Demo")
    print("="*70)
    print("Feature: Webhook Support for Docker Job Queue API")
    print("="*70)

    # Check if server is running
    try:
        health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
        # A non-2xx answer also means the server is not usable.
        health.raise_for_status()
    except requests.RequestException:
        # Only request failures are handled here, so Ctrl+C and programming
        # errors are no longer silently swallowed.
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print("Please start Docker container:")
        print(" docker run -d -p 11235:11235 --env-file .llm.env unclecode/crawl4ai:0.7.6")
        return
    print("✅ Crawl4AI server is running")

    # Start webhook server
    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
    webhook_thread = Thread(target=start_webhook_server, daemon=True)
    webhook_thread.start()
    time.sleep(2)  # give the receiver a moment to start listening

    # Run demos
    demo_1_crawl_webhook_notification_only()
    time.sleep(5)

    demo_2_crawl_webhook_with_data()
    time.sleep(5)

    demo_3_llm_webhook_notification_only()
    time.sleep(5)

    demo_4_llm_webhook_with_schema()
    time.sleep(5)

    demo_5_global_webhook_config()
    demo_6_webhook_retry_logic()

    # Wait for webhooks
    print("\n⏳ Waiting for all webhooks to arrive...")
    time.sleep(30)

    # Print summary
    print_summary()

    print("\n" + "="*70)
    print("✅ Demo completed!")
    print("="*70)
    print("\n📚 Documentation:")
    print(" • deploy/docker/WEBHOOK_EXAMPLES.md")
    print(" • docs/examples/docker_webhook_example.py")
    print("\n🔗 Upgrade:")
    print(" docker pull unclecode/crawl4ai:0.7.6")


if __name__ == "__main__":
    main()
|
|
||||||
@@ -1,655 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
🚀 Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration
|
|
||||||
================================================================
|
|
||||||
|
|
||||||
This file demonstrates the NEW Docker Hooks System introduced in v0.7.5.
|
|
||||||
|
|
||||||
The Docker Hooks System is a completely NEW feature that provides pipeline
|
|
||||||
customization through user-provided Python functions. It offers three approaches:
|
|
||||||
|
|
||||||
1. String-based hooks for REST API
|
|
||||||
2. hooks_to_string() utility to convert functions
|
|
||||||
3. Docker Client with automatic conversion (most convenient)
|
|
||||||
|
|
||||||
All three approaches are part of this NEW v0.7.5 feature!
|
|
||||||
|
|
||||||
Perfect for video recording and demonstration purposes.
|
|
||||||
|
|
||||||
Requirements:
|
|
||||||
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
|
|
||||||
- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import requests
|
|
||||||
import json
|
|
||||||
import time
|
|
||||||
from typing import Dict, Any
|
|
||||||
|
|
||||||
# Import Crawl4AI components
|
|
||||||
from crawl4ai import hooks_to_string
|
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
|
||||||
|
|
||||||
# Configuration
# Base URL used for every request to the Crawl4AI Docker API in this demo.
DOCKER_URL = "http://localhost:11235"
# DOCKER_URL = "http://localhost:11234"

# Pages the demos crawl; index 0 is the default target.
TEST_URLS = [
    # "https://httpbin.org/html",
    "https://www.kidocode.com",
    "https://quotes.toscrape.com",
]
|
|
||||||
|
|
||||||
|
|
||||||
def print_section(title: str, description: str = ""):
    """Print a formatted section header.

    Args:
        title: Heading text, printed between two 70-character rules.
        description: Optional second line; omitted when empty.
    """
    rule = "=" * 70
    heading = [f" {title}"]
    if description:
        heading.append(f" {description}")

    print("\n" + rule)
    for line in heading:
        print(line)
    print(rule + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def check_docker_service() -> bool:
    """Check if Docker service is running.

    Returns:
        True when ``GET {DOCKER_URL}/health`` answers with HTTP 200 within
        3 seconds; False on any other status or on a request failure.
    """
    try:
        response = requests.get(f"{DOCKER_URL}/health", timeout=3)
    except requests.RequestException:
        # Only request failures mean "not running"; Ctrl+C and programming
        # errors are no longer swallowed by a bare except.
        return False
    return response.status_code == 200
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# REUSABLE HOOK LIBRARY (NEW in v0.7.5)
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def performance_optimization_hook(page, context, **kwargs):
    """
    Performance Hook: abort requests for images, analytics and ads.

    Registers one aborting route per URL pattern on ``context`` and returns
    ``page`` unchanged.
    """
    print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")

    # Registration order is kept: images first, then ads and analytics.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",
        "**/analytics/*",
        "**/ads/*",
        "**/google-analytics.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print(" [Hook] ✓ Performance optimization applied")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def viewport_setup_hook(page, context, **kwargs):
    """
    Viewport Hook: set the page viewport to 1920x1080.

    Returns ``page`` after the viewport size has been applied.
    """
    viewport = {"width": 1920, "height": 1080}

    print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
    await page.set_viewport_size(viewport)
    print(" [Hook] ✓ Viewport configured")

    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def authentication_headers_hook(page, context, url, **kwargs):
    """
    Headers Hook: attach a fixed set of extra HTTP headers to the page.

    ``url`` is only used for the progress message (first 50 characters).
    Returns ``page`` after the headers have been set.
    """
    url_preview = url[:50]
    print(f" [Hook] 🔐 Adding custom headers for {url_preview}...")

    extra_headers = {
        'X-Crawl4AI-Version': '0.7.5',
        'X-Custom-Hook': 'function-based-demo',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)'
    }
    await page.set_extra_http_headers(extra_headers)

    print(" [Hook] ✓ Custom headers added")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def lazy_loading_handler_hook(page, context, **kwargs):
    """
    Content Hook: scroll the page to trigger lazy-loaded content.

    Scrolls to the bottom, the middle and back to the top, pausing after each
    step, then returns ``page``.
    """
    print(" [Hook] 📜 Scrolling to load lazy content...")

    # (script to evaluate, pause in milliseconds) for each scroll step.
    scroll_steps = (
        ("window.scrollTo(0, document.body.scrollHeight)", 1000),      # bottom
        ("window.scrollTo(0, document.body.scrollHeight / 2)", 500),   # middle
        ("window.scrollTo(0, 0)", 500),                                # top
    )
    for script, pause_ms in scroll_steps:
        await page.evaluate(script)
        await page.wait_for_timeout(pause_ms)

    print(" [Hook] ✓ Lazy content loaded")
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def page_analytics_hook(page, context, **kwargs):
    """
    Analytics Hook: evaluate a script in the page and print basic counts.

    The script returns the document title plus the number of images, links,
    scripts, h1-h3 headings and paragraphs. Returns ``page``.
    """
    print(" [Hook] 📊 Collecting page analytics...")

    metrics_script = '''
        () => ({
            title: document.title,
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length,
            headings: document.querySelectorAll('h1, h2, h3').length,
            paragraphs: document.querySelectorAll('p').length
        })
    '''
    metrics = await page.evaluate(metrics_script)

    title_preview = metrics['title'][:50]
    print(f" [Hook] 📈 Page: {title_preview}...")
    counts_line = (
        f" Links: {metrics['links']}, Images: {metrics['images']}, "
        f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}"
    )
    print(counts_line)

    return page
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# DEMO 1: String-Based Hooks (NEW Docker Hooks System)
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
def demo_1_string_based_hooks():
    """
    Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System)

    Posts three hook sources (as strings) together with the first test URL to
    ``{DOCKER_URL}/crawl`` and prints the crawl result and, when present, the
    hook execution report from the response.
    """
    print_section(
        "DEMO 1: String-Based Hooks (REST API)",
        "Part of the NEW Docker Hooks System - hooks as strings"
    )

    # Hook sources are sent as text, so each must be valid Python on its own.
    context_created_src = """
async def hook(page, context, **kwargs):
    print(" [String Hook] Setting up page context...")
    # Block images for performance
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
"""

    before_goto_src = """
async def hook(page, context, url, **kwargs):
    print(f" [String Hook] Navigating to {url[:50]}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI': 'string-based-hooks',
        'X-Demo': 'v0.7.5'
    })
    return page
"""

    before_retrieve_src = """
async def hook(page, context, **kwargs):
    print(" [String Hook] Scrolling page...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""

    hooks_config = {
        "on_page_context_created": context_created_src,
        "before_goto": before_goto_src,
        "before_retrieve_html": before_retrieve_src,
    }

    # Prepare request payload
    target_url = TEST_URLS[0]
    payload = {
        "urls": [target_url],
        "hooks": {"code": hooks_config, "timeout": 30},
        "crawler_config": {"cache_mode": "bypass"},
    }

    print(f"🎯 Target URL: {target_url}")
    print(f"🔧 Configured {len(hooks_config)} string-based hooks")
    print("📡 Sending request to Docker API...\n")

    try:
        started = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        elapsed = time.time() - started

        if response.status_code != 200:
            print(f"❌ Request failed with status {response.status_code}")
            print(f" Error: {response.text[:200]}")
        else:
            result = response.json()
            print(f"\n✅ Request successful! (took {elapsed:.2f}s)")

            # Only the first result is inspected.
            crawl_ok = result.get('results') and result['results'][0].get('success')
            if not crawl_ok:
                print("⚠️ Crawl completed but no results")
            else:
                first = result['results'][0]
                html_chars = len(first.get('html', ''))
                markdown_chars = len(first.get('markdown', ''))

                print("\n📊 Results:")
                print(f" • HTML length: {html_chars:,} characters")
                print(f" • Markdown length: {markdown_chars:,} characters")
                print(f" • URL: {first.get('url')}")

                # Hook execution report is optional in the response.
                if 'hooks' in result:
                    report = result['hooks']
                    status = report['status']
                    print("\n🎣 Hooks Execution:")
                    print(f" • Status: {status['status']}")
                    print(f" • Attached hooks: {len(status['attached_hooks'])}")

                    if 'summary' in report:
                        totals = report['summary']
                        print(f" • Total executions: {totals['total_executions']}")
                        print(f" • Successful: {totals['successful']}")
                        print(f" • Success rate: {totals['success_rate']:.1f}%")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except Exception as exc:
        print(f"❌ Error: {exc!s}")

    print("\n" + "─" * 70)
    print("✓ String-based hooks demo complete\n")
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# DEMO 2: Function-Based Hooks with hooks_to_string() Utility
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
def demo_2_hooks_to_string_utility():
    """
    Demonstrate the new hooks_to_string() utility for converting functions

    Converts three module-level hook functions to source strings with
    ``hooks_to_string``, previews the first one, then posts them with the
    first test URL to ``{DOCKER_URL}/crawl``.
    """
    print_section(
        "DEMO 2: hooks_to_string() Utility (NEW! ✨)",
        "Convert Python functions to strings for REST API"
    )

    for line in (
        "📦 Creating hook functions...",
        " • performance_optimization_hook",
        " • viewport_setup_hook",
        " • authentication_headers_hook",
        " • lazy_loading_handler_hook",
        # Convert function objects to strings using the NEW utility
        "\n🔄 Converting functions to strings with hooks_to_string()...",
    ):
        print(line)

    hooks_as_strings = hooks_to_string({
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
    })

    print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")

    # Show a preview
    divider = "─" * 70
    first_converted = list(hooks_as_strings.values())[0]
    print("\n📝 Sample converted hook (first 250 characters):")
    print(divider)
    print(first_converted[:250] + "...")
    print(divider)

    # Use the converted hooks with REST API
    print("\n📡 Using converted hooks with REST API...")

    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {"code": hooks_as_strings, "timeout": 30},
    }

    try:
        started = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        elapsed = time.time() - started

        if response.status_code != 200:
            print(f"❌ Request failed: {response.status_code}")
        else:
            result = response.json()
            print(f"\n✅ Request successful! (took {elapsed:.2f}s)")

            # Only the first result is inspected; an unsuccessful one prints nothing.
            if result.get('results') and result['results'][0].get('success'):
                first = result['results'][0]
                print(f" • HTML length: {len(first.get('html', '')):,} characters")
                print(" • Hooks executed successfully!")

    except Exception as exc:
        print(f"❌ Error: {exc!s}")

    for line in (
        "\n💡 Benefits of hooks_to_string():",
        " ✓ Write hooks as regular Python functions",
        " ✓ Full IDE support (autocomplete, syntax highlighting)",
        " ✓ Type checking and linting",
        " ✓ Easy to test and debug",
        " ✓ Reusable across projects",
        " ✓ Works with any REST API client",
        "\n" + divider,
        "✓ hooks_to_string() utility demo complete\n",
    ):
        print(line)
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟)
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def demo_3_docker_client_auto_conversion():
    """
    Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)

    Passes hook function objects straight to ``Crawl4aiDockerClient.crawl``
    and prints the result's URL, sizes, metadata and link counts. Errors are
    printed with a traceback rather than raised.
    """
    print_section(
        "DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)",
        "Pass function objects directly - conversion happens automatically!"
    )

    # One variable for both the announcement and the crawl, so the printed
    # target always matches the URL that is actually requested.
    target_url = TEST_URLS[0]

    print("🐳 Initializing Crawl4AI Docker Client...")
    # The context manager closes the client when the demo is done.
    async with Crawl4aiDockerClient(base_url=DOCKER_URL) as client:
        print("✅ Client ready!\n")

        # Use our reusable hook library - just pass the function objects!
        print("📚 Using reusable hook library:")
        print(" • performance_optimization_hook")
        print(" • viewport_setup_hook")
        print(" • authentication_headers_hook")
        print(" • lazy_loading_handler_hook")
        print(" • page_analytics_hook")

        print("\n🎯 Target URL: " + target_url)
        print("🚀 Starting crawl with automatic hook conversion...\n")

        try:
            start_time = time.time()

            # Pass function objects directly - NO manual conversion needed! ✨
            results = await client.crawl(
                urls=[target_url],
                hooks={
                    "on_page_context_created": performance_optimization_hook,
                    "before_goto": authentication_headers_hook,
                    "before_retrieve_html": lazy_loading_handler_hook,
                    "before_return_html": page_analytics_hook,
                },
                hooks_timeout=30
            )

            execution_time = time.time() - start_time

            print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")

            # Display results
            if results and results.success:
                result = results
                print("📊 Results:")
                print(f" • URL: {result.url}")
                print(f" • Success: {result.success}")
                print(f" • HTML length: {len(result.html):,} characters")
                print(f" • Markdown length: {len(result.markdown):,} characters")

                # Show metadata
                if result.metadata:
                    print("\n📋 Metadata:")
                    print(f" • Title: {result.metadata.get('title', 'N/A')}")
                    print(f" • Description: {result.metadata.get('description', 'N/A')}")

                # Show links
                if result.links:
                    internal_count = len(result.links.get('internal', []))
                    external_count = len(result.links.get('external', []))
                    print("\n🔗 Links Found:")
                    print(f" • Internal: {internal_count}")
                    print(f" • External: {external_count}")
            else:
                print("⚠️ Crawl completed but no successful results")
                if results:
                    print(f" Error: {results.error_message}")

        except Exception as e:
            print(f"❌ Error: {str(e)}")
            import traceback
            traceback.print_exc()

    print("\n🌟 Why Docker Client is RECOMMENDED:")
    print(" ✓ Automatic function-to-string conversion")
    print(" ✓ No manual hooks_to_string() calls needed")
    print(" ✓ Cleaner, more Pythonic code")
    print(" ✓ Full type hints and IDE support")
    print(" ✓ Built-in error handling")
    print(" ✓ Async/await support")

    print("\n" + "─" * 70)
    print("✓ Docker Client auto-conversion demo complete\n")
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# DEMO 4: Advanced Use Case - Complete Hook Pipeline
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def demo_4_complete_hook_pipeline():
    """
    Demonstrate a complete hook pipeline using all 8 hook points

    Defines one small hook per hook point, attaches all of them to a single
    crawl of the first test URL through ``Crawl4aiDockerClient`` and prints
    whether the crawl succeeded. Errors are printed rather than raised.
    """
    print_section(
        "DEMO 4: Complete Hook Pipeline",
        "Using all 8 available hook points for comprehensive control"
    )

    # Define all 8 hooks
    async def on_browser_created_hook(browser, **kwargs):
        """Hook 1: Called after browser is created"""
        print(" [Pipeline] 1/8 Browser created")
        return browser

    async def on_page_context_created_hook(page, context, **kwargs):
        """Hook 2: Called after page context is created"""
        print(" [Pipeline] 2/8 Page context created - setting up...")
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def on_user_agent_updated_hook(page, context, user_agent, **kwargs):
        """Hook 3: Called when user agent is updated"""
        print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...")
        return page

    async def before_goto_hook(page, context, url, **kwargs):
        """Hook 4: Called before navigating to URL"""
        print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...")
        return page

    async def after_goto_hook(page, context, url, response, **kwargs):
        """Hook 5: Called after navigation completes"""
        print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}")
        await page.wait_for_timeout(1000)
        return page

    async def on_execution_started_hook(page, context, **kwargs):
        """Hook 6: Called when JavaScript execution starts"""
        print(" [Pipeline] 6/8 JavaScript execution started")
        return page

    async def before_retrieve_html_hook(page, context, **kwargs):
        """Hook 7: Called before retrieving HTML"""
        print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return page

    async def before_return_html_hook(page, context, html, **kwargs):
        """Hook 8: Called before returning HTML"""
        print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars")
        return page

    print("🎯 Target URL: " + TEST_URLS[0])
    print("🔧 Configured ALL 8 hook points for complete pipeline control\n")

    # The context manager closes the client when the demo is done.
    async with Crawl4aiDockerClient(base_url=DOCKER_URL) as client:
        try:
            print("🚀 Starting complete pipeline crawl...\n")
            start_time = time.time()

            results = await client.crawl(
                urls=[TEST_URLS[0]],
                hooks={
                    "on_browser_created": on_browser_created_hook,
                    "on_page_context_created": on_page_context_created_hook,
                    "on_user_agent_updated": on_user_agent_updated_hook,
                    "before_goto": before_goto_hook,
                    "after_goto": after_goto_hook,
                    "on_execution_started": on_execution_started_hook,
                    "before_retrieve_html": before_retrieve_html_hook,
                    "before_return_html": before_return_html_hook,
                },
                hooks_timeout=45
            )

            execution_time = time.time() - start_time

            if results and results.success:
                print(f"\n✅ Complete pipeline executed successfully! (took {execution_time:.2f}s)")
                print(" • All 8 hooks executed in sequence")
                print(f" • HTML length: {len(results.html):,} characters")
            else:
                print("⚠️ Pipeline completed with warnings")

        except Exception as e:
            print(f"❌ Error: {str(e)}")

    print("\n📚 Available Hook Points:")
    print(" 1. on_browser_created - Browser initialization")
    print(" 2. on_page_context_created - Page context setup")
    print(" 3. on_user_agent_updated - User agent configuration")
    print(" 4. before_goto - Pre-navigation setup")
    print(" 5. after_goto - Post-navigation processing")
    print(" 6. on_execution_started - JavaScript execution start")
    print(" 7. before_retrieve_html - Pre-extraction processing")
    print(" 8. before_return_html - Final HTML processing")

    print("\n" + "─" * 70)
    print("✓ Complete hook pipeline demo complete\n")
|
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
|
||||||
# MAIN EXECUTION
|
|
||||||
# ============================================================================
|
|
||||||
|
|
||||||
async def main():
    """
    Run all demonstrations

    Verifies the Docker service with ``check_docker_service`` (returning early
    with start-up instructions when it is down), runs each enabled demo in
    order, and prints a closing summary. A failing demo is reported with a
    traceback and the run moves on; Ctrl+C stops the remaining demos.
    """
    print("\n" + "=" * 70)
    print(" 🚀 Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration")
    print("=" * 70)

    # Check Docker service
    print("\n🔍 Checking Docker service status...")
    if not check_docker_service():
        print("❌ Docker service is not running!")
        print("\n📋 To start the Docker service:")
        print(" docker run -p 11235:11235 unclecode/crawl4ai:latest")
        print("\nPlease start the service and run this demo again.")
        return

    print("✅ Docker service is running!\n")

    # Run all demos: (display name, callable, whether it must be awaited)
    demos = [
        ("String-Based Hooks (REST API)", demo_1_string_based_hooks, False),
        ("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False),
        ("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True),
        # ("Complete Hook Pipeline", demo_4_complete_hook_pipeline, True),
    ]

    for i, (name, demo_func, is_async) in enumerate(demos, 1):
        print(f"\n{'🔷' * 35}")
        print(f"Starting Demo {i}/{len(demos)}: {name}")
        print(f"{'🔷' * 35}\n")

        try:
            if is_async:
                await demo_func()
            else:
                demo_func()

            print(f"✅ Demo {i} completed successfully!")

            # The run is non-interactive (no input() call), so announce the
            # transition instead of asking for a key press that is never read.
            if i < len(demos):
                print("\n⏭️ Continuing to next demo...")

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as e:
            print(f"\n❌ Demo {i} failed: {str(e)}")
            import traceback
            traceback.print_exc()
            print("\nContinuing to next demo...\n")

    # Final summary
    print("\n" + "=" * 70)
    print(" 🎉 All Demonstrations Complete!")
    print("=" * 70)

    print("\n📊 Summary of v0.7.5 Docker Hooks System:")
    print("\n🆕 COMPLETELY NEW FEATURE in v0.7.5:")
    print(" The Docker Hooks System lets you customize the crawling pipeline")
    print(" with user-provided Python functions at 8 strategic points.")

    print("\n✨ Three Ways to Use Docker Hooks (All NEW!):")
    print(" 1. String-based - Write hooks as strings for REST API")
    print(" 2. hooks_to_string() - Convert Python functions to strings")
    print(" 3. Docker Client - Automatic conversion (RECOMMENDED)")

    print("\n💡 Key Benefits:")
    print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
    print(" ✓ Type checking and linting")
    print(" ✓ Easy to test and debug")
    print(" ✓ Reusable across projects")
    print(" ✓ Complete pipeline control")

    print("\n🎯 8 Hook Points Available:")
    print(" • on_browser_created, on_page_context_created")
    print(" • on_user_agent_updated, before_goto, after_goto")
    print(" • on_execution_started, before_retrieve_html, before_return_html")

    print("\n📚 Resources:")
    print(" • Docs: https://docs.crawl4ai.com")
    print(" • GitHub: https://github.com/unclecode/crawl4ai")
    print(" • Discord: https://discord.gg/jP8KfhDhyN")

    print("\n" + "=" * 70)
    print(" Happy Crawling with v0.7.5! 🕷️")
    print("=" * 70 + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: announce the demo, then drive the async main().
    for banner_line in (
        "\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...",
        "Press Ctrl+C anytime to exit\n",
    ):
        print(banner_line)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is the documented way out, so leave quietly.
        print("\n\n👋 Demo stopped by user. Thanks for exploring Crawl4AI v0.7.5!")
    except Exception as exc:
        # Anything else is unexpected: show the message and the full traceback.
        print(f"\n\n❌ Demo error: {str(exc)}")
        import traceback

        traceback.print_exc()
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,6 @@ docs_dir: docs/md_v2
|
|||||||
|
|
||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
- "📚 Complete SDK Reference": "complete-sdk-reference.md"
|
|
||||||
- "Ask AI": "core/ask-ai.md"
|
- "Ask AI": "core/ask-ai.md"
|
||||||
- "Quick Start": "core/quickstart.md"
|
- "Quick Start": "core/quickstart.md"
|
||||||
- "Code Examples": "core/examples.md"
|
- "Code Examples": "core/examples.md"
|
||||||
@@ -19,7 +18,7 @@ nav:
|
|||||||
- "Marketplace Admin": "marketplace/admin/index.html"
|
- "Marketplace Admin": "marketplace/admin/index.html"
|
||||||
- Setup & Installation:
|
- Setup & Installation:
|
||||||
- "Installation": "core/installation.md"
|
- "Installation": "core/installation.md"
|
||||||
- "Docker Deployment": "core/docker-deployment.md"
|
- "Self-Hosting Guide": "core/self-hosting.md"
|
||||||
- "Blog & Changelog":
|
- "Blog & Changelog":
|
||||||
- "Blog Home": "blog/index.md"
|
- "Blog Home": "blog/index.md"
|
||||||
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
|
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
|
||||||
|
|||||||
@@ -1,401 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script to validate webhook implementation for /llm/job endpoint.
|
|
||||||
|
|
||||||
This tests that the /llm/job endpoint now supports webhooks
|
|
||||||
following the same pattern as /crawl/job.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Add deploy/docker to path
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
|
||||||
|
|
||||||
def test_llm_job_payload_model():
    """Test that LlmJobPayload includes webhook_config field

    Builds one payload carrying a full webhook configuration and one without
    any, printing what was parsed.

    Returns:
        bool: True when both payloads are accepted and the minimal payload
        leaves ``webhook_config`` as None; False on any import or validation
        error (the traceback is printed) or when the field is not optional.
    """
    import traceback

    print("=" * 60)
    print("TEST 1: LlmJobPayload Model")
    print("=" * 60)

    try:
        from job import LlmJobPayload

        # Test with webhook_config
        payload_dict = {
            "url": "https://example.com",
            "q": "Extract main content",
            "schema": None,
            "cache": False,
            "provider": None,
            "webhook_config": {
                "webhook_url": "https://myapp.com/webhook",
                "webhook_data_in_payload": True,
                "webhook_headers": {"X-Secret": "token"},
            },
        }

        payload = LlmJobPayload(**payload_dict)

        print("✅ LlmJobPayload accepts webhook_config")
        print(f" - URL: {payload.url}")
        print(f" - Query: {payload.q}")
        print(f" - Webhook URL: {payload.webhook_config.webhook_url}")
        print(f" - Data in payload: {payload.webhook_config.webhook_data_in_payload}")

        # Test without webhook_config (should be optional)
        minimal_payload = {
            "url": "https://example.com",
            "q": "Extract content",
        }

        payload2 = LlmJobPayload(**minimal_payload)
        # Explicit check rather than `assert`, which is stripped under `python -O`
        # and would let this test pass without verifying anything.
        if payload2.webhook_config is not None:
            print("❌ Failed: webhook_config should be optional")
            return False
        print("✅ LlmJobPayload works without webhook_config (optional)")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_handle_llm_request_signature():
    """Test that handle_llm_request accepts webhook_config parameter

    Inspects the function signature and reports whether ``webhook_config`` is
    present and whether it defaults to None.

    Returns:
        bool: True when the parameter exists (whatever its default), False when
        it is missing or the function cannot be imported/inspected.
    """
    import inspect
    import traceback

    print("\n" + "=" * 60)
    print("TEST 2: handle_llm_request Function Signature")
    print("=" * 60)

    try:
        from api import handle_llm_request

        sig = inspect.signature(handle_llm_request)
        params = list(sig.parameters)

        print(f"Function parameters: {params}")

        if 'webhook_config' not in sig.parameters:
            print("❌ handle_llm_request missing webhook_config parameter")
            return False

        print("✅ handle_llm_request has webhook_config parameter")

        # Check that it's optional with default None.  Parameter.empty means
        # "no default at all", i.e. the argument is required, so it must not
        # be reported as optional.
        default = sig.parameters['webhook_config'].default
        if default is None:
            print(f"✅ webhook_config is optional (default: {default})")
        elif default is inspect.Parameter.empty:
            print("⚠️ webhook_config has no default value (required parameter)")
        else:
            print(f"⚠️ webhook_config default is: {default}")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_process_llm_extraction_signature():
    """Test that process_llm_extraction accepts webhook_config parameter

    Inspects the function signature and reports whether ``webhook_config`` is
    present and whether it defaults to None.

    Returns:
        bool: True when the parameter exists (whatever its default), False when
        it is missing or the function cannot be imported/inspected.
    """
    import inspect
    import traceback

    print("\n" + "=" * 60)
    print("TEST 3: process_llm_extraction Function Signature")
    print("=" * 60)

    try:
        from api import process_llm_extraction

        sig = inspect.signature(process_llm_extraction)
        params = list(sig.parameters)

        print(f"Function parameters: {params}")

        if 'webhook_config' not in sig.parameters:
            print("❌ process_llm_extraction missing webhook_config parameter")
            return False

        print("✅ process_llm_extraction has webhook_config parameter")

        # Parameter.empty marks a parameter without a default (required), so
        # only a literal None default counts as "optional".
        default = sig.parameters['webhook_config'].default
        if default is None:
            print(f"✅ webhook_config is optional (default: {default})")
        elif default is inspect.Parameter.empty:
            print("⚠️ webhook_config has no default value (required parameter)")
        else:
            print(f"⚠️ webhook_config default is: {default}")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_webhook_integration_in_api():
    """Test that api.py properly integrates webhook notifications

    Performs a textual scan of ``deploy/docker/api.py`` (located relative to
    this file): the service must be initialised, the ``llm_extraction`` task
    type must be used, and the ``process_llm_extraction`` function must exist.
    The number of notification calls inside that function is reported; fewer
    than three only produces a warning.

    Returns:
        bool: True when the mandatory markers are found, False otherwise or
        when the file cannot be read.
    """
    import traceback

    print("\n" + "=" * 60)
    print("TEST 4: Webhook Integration in process_llm_extraction")
    print("=" * 60)

    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        # Explicit encoding: the locale default may not be UTF-8.
        with open(api_file, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # Check for WebhookDeliveryService initialization
        if 'webhook_service = WebhookDeliveryService(config)' in api_content:
            print("✅ process_llm_extraction initializes WebhookDeliveryService")
        else:
            print("❌ Missing WebhookDeliveryService initialization in process_llm_extraction")
            return False

        # Check for notify_job_completion calls with llm_extraction
        if 'task_type="llm_extraction"' in api_content:
            print("✅ Uses correct task_type='llm_extraction' for notifications")
        else:
            print("❌ Missing task_type='llm_extraction' in webhook notifications")
            return False

        # Restrict the count to the body of process_llm_extraction.
        llm_func_start = api_content.find('async def process_llm_extraction')
        if llm_func_start == -1:
            # Without this guard a -1 start would slice the last character only.
            print("❌ process_llm_extraction not found in api.py")
            return False
        llm_func_end = api_content.find('\nasync def ', llm_func_start + 1)
        if llm_func_end == -1:
            llm_func_end = len(api_content)

        llm_func_content = api_content[llm_func_start:llm_func_end]
        llm_notification_count = llm_func_content.count('await webhook_service.notify_job_completion')

        print(f"✅ Found {llm_notification_count} webhook notification calls in process_llm_extraction")

        # Expect at least 3 calls: success + 2 failure paths.
        if llm_notification_count >= 3:
            print("✅ Sufficient notification points (success + failure paths)")
        else:
            print(f"⚠️ Expected at least 3 notification calls, found {llm_notification_count}")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_job_endpoint_integration():
    """Test that /llm/job endpoint extracts and passes webhook_config

    Performs a textual scan of the ``llm_job_enqueue`` function inside
    ``deploy/docker/job.py`` (located relative to this file) for the statements
    that initialise, check, convert and forward ``webhook_config``.

    Returns:
        bool: True when every expected statement is present, False when one is
        missing, the function is not found, or the file cannot be read.
    """
    import traceback

    print("\n" + "=" * 60)
    print("TEST 5: /llm/job Endpoint Integration")
    print("=" * 60)

    try:
        job_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'job.py')

        # Explicit encoding: the locale default may not be UTF-8.
        with open(job_file, 'r', encoding='utf-8') as f:
            job_content = f.read()

        # Find the llm_job_enqueue function
        llm_job_start = job_content.find('async def llm_job_enqueue')
        if llm_job_start == -1:
            print("❌ llm_job_enqueue not found in job.py")
            return False

        llm_job_end = job_content.find('\n\n@router', llm_job_start + 1)
        if llm_job_end == -1:
            llm_job_end = job_content.find('\n\nasync def', llm_job_start + 1)
        if llm_job_end == -1:
            # Last function in the file: take everything to the end instead of
            # slicing with -1, which would silently drop the final character.
            llm_job_end = len(job_content)

        llm_job_func = job_content[llm_job_start:llm_job_end]

        # (text that must appear, success message, failure message)
        checks = (
            (
                'webhook_config = None',
                "✅ llm_job_enqueue initializes webhook_config variable",
                "❌ Missing webhook_config initialization",
            ),
            (
                'if payload.webhook_config:',
                "✅ llm_job_enqueue checks for payload.webhook_config",
                "❌ Missing webhook_config check",
            ),
            (
                "webhook_config = payload.webhook_config.model_dump(mode='json')",
                "✅ llm_job_enqueue converts webhook_config to dict",
                "❌ Missing webhook_config.model_dump conversion",
            ),
            (
                'webhook_config=webhook_config',
                "✅ llm_job_enqueue passes webhook_config to handle_llm_request",
                "❌ Missing webhook_config parameter in handle_llm_request call",
            ),
        )
        for needle, ok_message, fail_message in checks:
            if needle not in llm_job_func:
                print(fail_message)
                return False
            print(ok_message)

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_create_new_task_integration():
    """Test that create_new_task stores webhook_config in Redis

    Performs a textual scan of the ``create_new_task`` function inside
    ``deploy/docker/api.py`` (located relative to this file).  The
    ``webhook_config`` check and its storage in ``task_data`` are mandatory;
    forwarding to the background task is only reported as a warning.

    Returns:
        bool: True when the mandatory statements are present, False when one is
        missing, the function is not found, or the file cannot be read.
    """
    import traceback

    print("\n" + "=" * 60)
    print("TEST 6: create_new_task Webhook Storage")
    print("=" * 60)

    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        # Explicit encoding: the locale default may not be UTF-8.
        with open(api_file, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # Find create_new_task function
        create_task_start = api_content.find('async def create_new_task')
        if create_task_start == -1:
            # Without this guard a -1 start would slice the last character only.
            print("❌ create_new_task not found in api.py")
            return False
        create_task_end = api_content.find('\nasync def ', create_task_start + 1)
        if create_task_end == -1:
            create_task_end = len(api_content)

        create_task_func = api_content[create_task_start:create_task_end]

        # Check for webhook_config storage
        if 'if webhook_config:' in create_task_func:
            print("✅ create_new_task checks for webhook_config")
        else:
            print("❌ Missing webhook_config check in create_new_task")
            return False

        if 'task_data["webhook_config"] = json.dumps(webhook_config)' in create_task_func:
            print("✅ create_new_task stores webhook_config in Redis task data")
        else:
            print("❌ Missing webhook_config storage in task_data")
            return False

        # Check that webhook_config is passed to process_llm_extraction
        # (heuristic only, so a miss is a warning rather than a failure).
        if 'webhook_config' in create_task_func and 'background_tasks.add_task' in create_task_func:
            print("✅ create_new_task passes webhook_config to background task")
        else:
            print("⚠️ Could not verify webhook_config passed to background task")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_pattern_consistency():
    """Test that /llm/job follows the same pattern as /crawl/job

    Extracts the source text of ``handle_crawl_job`` and
    ``process_llm_extraction`` from ``deploy/docker/api.py`` (located relative
    to this file) and checks that both initialise the webhook service and
    notify on success and on failure.

    Returns:
        bool: True when both functions show all three markers, False on any
        mismatch, when a function is not found, or the file cannot be read.
    """
    import traceback

    def _function_source(content, signature):
        """Return text from *signature* up to the next top-level ``async def``; None if absent."""
        start = content.find(signature)
        if start == -1:
            return None
        end = content.find('\nasync def ', start + 1)
        return content[start:] if end == -1 else content[start:end]

    print("\n" + "=" * 60)
    print("TEST 7: Pattern Consistency with /crawl/job")
    print("=" * 60)

    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        # Explicit encoding: the locale default may not be UTF-8.
        with open(api_file, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # Find handle_crawl_job and process_llm_extraction to compare patterns
        crawl_job_func = _function_source(api_content, 'async def handle_crawl_job')
        llm_extract_func = _function_source(api_content, 'async def process_llm_extraction')
        if crawl_job_func is None or llm_extract_func is None:
            print(
                "❌ Could not locate functions in api.py "
                f"(handle_crawl_job: {crawl_job_func is not None}, "
                f"process_llm_extraction: {llm_extract_func is not None})"
            )
            return False

        print("Checking pattern consistency...")

        # Both should initialize WebhookDeliveryService
        crawl_has_service = 'webhook_service = WebhookDeliveryService(config)' in crawl_job_func
        llm_has_service = 'webhook_service = WebhookDeliveryService(config)' in llm_extract_func

        if crawl_has_service and llm_has_service:
            print("✅ Both initialize WebhookDeliveryService")
        else:
            print(f"❌ Service initialization mismatch (crawl: {crawl_has_service}, llm: {llm_has_service})")
            return False

        # Both should call notify_job_completion on success
        crawl_notifies_success = 'status="completed"' in crawl_job_func and 'notify_job_completion' in crawl_job_func
        llm_notifies_success = 'status="completed"' in llm_extract_func and 'notify_job_completion' in llm_extract_func

        if crawl_notifies_success and llm_notifies_success:
            print("✅ Both notify on success")
        else:
            print(f"❌ Success notification mismatch (crawl: {crawl_notifies_success}, llm: {llm_notifies_success})")
            return False

        # Both should call notify_job_completion on failure
        crawl_notifies_failure = 'status="failed"' in crawl_job_func and 'error=' in crawl_job_func
        llm_notifies_failure = 'status="failed"' in llm_extract_func and 'error=' in llm_extract_func

        if crawl_notifies_failure and llm_notifies_failure:
            print("✅ Both notify on failure")
        else:
            print(f"❌ Failure notification mismatch (crawl: {crawl_notifies_failure}, llm: {llm_notifies_failure})")
            return False

        print("✅ /llm/job follows the same pattern as /crawl/job")
        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def main():
    """Run all tests

    Executes every validation function in order, prints a PASS/FAIL line per
    test plus a totals banner, and returns a process exit code: 0 when all
    tests passed, 1 otherwise.
    """
    rule = "=" * 60

    print("\n🧪 LLM Job Webhook Feature Validation")
    print(rule)
    print("Testing that /llm/job now supports webhooks like /crawl/job")
    print(rule + "\n")

    # Run all tests (label, callable) in their original order.
    suite = (
        ("LlmJobPayload Model", test_llm_job_payload_model),
        ("handle_llm_request Signature", test_handle_llm_request_signature),
        ("process_llm_extraction Signature", test_process_llm_extraction_signature),
        ("Webhook Integration", test_webhook_integration_in_api),
        ("/llm/job Endpoint", test_job_endpoint_integration),
        ("create_new_task Storage", test_create_new_task_integration),
        ("Pattern Consistency", test_pattern_consistency),
    )
    results = [(label, check()) for label, check in suite]

    # Print summary
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    total = len(results)
    passed = len([outcome for _, outcome in results if outcome])

    for label, outcome in results:
        verdict = "✅ PASS" if outcome else "❌ FAIL"
        print(f"{verdict} - {label}")

    print(f"\n{rule}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{rule}")

    if passed != total:
        print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
        return 1

    print("\n🎉 All tests passed! /llm/job webhook feature is correctly implemented.")
    print("\n📝 Summary of changes:")
    for change in (
        " 1. LlmJobPayload model includes webhook_config field",
        " 2. /llm/job endpoint extracts and passes webhook_config",
        " 3. handle_llm_request accepts webhook_config parameter",
        " 4. create_new_task stores webhook_config in Redis",
        " 5. process_llm_extraction sends webhook notifications",
        " 6. Follows the same pattern as /crawl/job",
    ):
        print(change)
    return 0
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the site module and is unavailable under `python -S` or in frozen builds.
    sys.exit(main())
|
|
||||||
@@ -1,307 +0,0 @@
|
|||||||
"""
|
|
||||||
Simple test script to validate webhook implementation without running full server.
|
|
||||||
|
|
||||||
This script tests:
|
|
||||||
1. Webhook module imports and syntax
|
|
||||||
2. WebhookDeliveryService initialization
|
|
||||||
3. Payload construction logic
|
|
||||||
4. Configuration parsing
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import json
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
|
|
||||||
# Add deploy/docker to path to import modules
|
|
||||||
# sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
|
|
||||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
|
||||||
|
|
||||||
def test_imports():
    """Test that all webhook-related modules can be imported

    Returns:
        bool: True when both the ``webhook`` and ``schemas`` imports succeed,
        False as soon as one of them raises.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 1: Module Imports")
    print(rule)

    # Delivery service first; bail out immediately when it is unavailable.
    try:
        from webhook import WebhookDeliveryService
        print("✅ webhook.WebhookDeliveryService imported successfully")
    except Exception as err:
        print(f"❌ Failed to import webhook module: {err}")
        return False

    # Then the Pydantic schema classes.
    try:
        from schemas import WebhookConfig, WebhookPayload
        for class_name in ("WebhookConfig", "WebhookPayload"):
            print(f"✅ schemas.{class_name} imported successfully")
    except Exception as err:
        print(f"❌ Failed to import schemas: {err}")
        return False

    return True
|
|
||||||
|
|
||||||
def test_webhook_service_init():
    """Test WebhookDeliveryService initialization

    Builds the service from a sample configuration and checks the retry
    attributes it exposes (attempt count plus delays/timeout in seconds).

    Returns:
        bool: True when the service is created and all attributes match,
        False on any error (the traceback is printed).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 2: WebhookDeliveryService Initialization")
    print(rule)

    try:
        from webhook import WebhookDeliveryService

        # Test with default config
        retry_settings = {
            "max_attempts": 5,
            "initial_delay_ms": 1000,
            "max_delay_ms": 32000,
            "timeout_ms": 30000,
        }
        config = {
            "webhooks": {
                "enabled": True,
                "default_url": None,
                "data_in_payload": False,
                "retry": retry_settings,
                "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
            }
        }

        service = WebhookDeliveryService(config)

        print("✅ Service initialized successfully")
        print(f" - Max attempts: {service.max_attempts}")
        print(f" - Initial delay: {service.initial_delay}s")
        print(f" - Max delay: {service.max_delay}s")
        print(f" - Timeout: {service.timeout}s")

        # Verify calculations: (attribute, expected value, failure message)
        expectations = (
            ("max_attempts", 5, "Max attempts should be 5"),
            ("initial_delay", 1.0, "Initial delay should be 1.0s"),
            ("max_delay", 32.0, "Max delay should be 32.0s"),
            ("timeout", 30.0, "Timeout should be 30.0s"),
        )
        for attribute, wanted, message in expectations:
            assert getattr(service, attribute) == wanted, message

        print("✅ All configuration values correct")
        return True
    except Exception as err:
        print(f"❌ Service initialization failed: {err}")
        import traceback

        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_webhook_config_model():
    """Test WebhookConfig Pydantic model

    Exercises three cases: a fully specified configuration, a URL-only
    configuration, and an invalid URL that must raise ``ValidationError``.

    Returns:
        bool: True when the valid configurations are accepted and the invalid
        URL is rejected; False otherwise (unexpected errors print a traceback).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 3: WebhookConfig Model Validation")
    print(rule)

    try:
        from schemas import WebhookConfig
        from pydantic import ValidationError

        # Test valid config
        full = WebhookConfig(
            webhook_url="https://example.com/webhook",
            webhook_data_in_payload=True,
            webhook_headers={"X-Secret": "token123"},
        )
        print("✅ Valid config accepted:")
        print(f" - URL: {full.webhook_url}")
        print(f" - Data in payload: {full.webhook_data_in_payload}")
        print(f" - Headers: {full.webhook_headers}")

        # Test minimal config
        minimal = WebhookConfig(webhook_url="https://example.com/webhook")
        print("✅ Minimal config accepted (defaults applied):")
        print(f" - URL: {minimal.webhook_url}")
        print(f" - Data in payload: {minimal.webhook_data_in_payload}")
        print(f" - Headers: {minimal.webhook_headers}")

        # Test invalid URL: construction itself is expected to raise.
        try:
            WebhookConfig(webhook_url="not-a-url")
        except ValidationError:
            print("✅ Invalid URL correctly rejected")
            return True

        print("❌ Invalid URL should have been rejected")
        return False
    except Exception as err:
        print(f"❌ Model validation test failed: {err}")
        import traceback

        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_payload_construction():
    """Test webhook payload construction logic

    Builds three sample payload dictionaries (basic, with an error, with
    result data) and prints each as indented JSON.

    Returns:
        bool: True when all payloads serialise, False on any error (the
        traceback is printed).
    """

    def _build(task_id, status, **extra):
        # Common fields first, then any case-specific keys, so the JSON key
        # order stays task_id, task_type, status, timestamp, urls, <extra>.
        body = {
            "task_id": task_id,
            "task_type": "crawl",
            "status": status,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "urls": ["https://example.com"],
        }
        body.update(extra)
        return body

    rule = "=" * 60
    print("\n" + rule)
    print("TEST 4: Payload Construction")
    print(rule)

    try:
        # Simulate payload construction from notify_job_completion
        payload = _build("crawl_abc123", "completed")
        print("✅ Basic payload constructed:")
        print(json.dumps(payload, indent=2))

        # Test with error
        error_payload = _build("crawl_xyz789", "failed", error="Connection timeout")
        print("\n✅ Error payload constructed:")
        print(json.dumps(error_payload, indent=2))

        # Test with data
        data_payload = _build(
            "crawl_def456",
            "completed",
            data={
                "results": [
                    {"url": "https://example.com", "markdown": "# Example"}
                ]
            },
        )
        print("\n✅ Data payload constructed:")
        print(json.dumps(data_payload, indent=2))

        return True
    except Exception as err:
        print(f"❌ Payload construction failed: {err}")
        import traceback

        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
def test_exponential_backoff():
    """Test exponential backoff calculation

    Computes the capped, doubling delay for five attempts (1 s start, 32 s
    cap), prints each value and checks the sequence 1, 2, 4, 8, 16 seconds.

    Returns:
        bool: True when the sequence matches, False on any error.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 5: Exponential Backoff Calculation")
    print(rule)

    try:
        initial_delay = 1.0  # 1 second
        max_delay = 32.0  # 32 seconds

        # Delay doubles per attempt and is clamped to max_delay.
        actual = [min(initial_delay * (2 ** step), max_delay) for step in range(5)]

        print("Backoff delays for 5 attempts:")
        for number, delay in enumerate(actual, start=1):
            print(f" Attempt {number}: {delay}s")

        # Verify the sequence: 1s, 2s, 4s, 8s, 16s
        expected = [1.0, 2.0, 4.0, 8.0, 16.0]
        assert actual == expected, f"Expected {expected}, got {actual}"
        print("✅ Exponential backoff sequence correct")

        return True
    except Exception as err:
        print(f"❌ Backoff calculation failed: {err}")
        return False
|
|
||||||
|
|
||||||
def test_api_integration():
    """Test that api.py imports webhook module correctly

    Performs a textual scan of ``deploy/docker/api.py`` (located relative to
    this file) for the webhook import, the service construction and a
    ``notify_job_completion`` reference.

    Returns:
        bool: True when all three markers are present, False when one is
        missing or the file cannot be read.
    """
    print("\n" + "=" * 60)
    print("TEST 6: API Integration")
    print("=" * 60)

    try:
        # Check if api.py can import webhook module
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        # Explicit encoding: the locale default may not be UTF-8.
        with open(api_path, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # (text that must appear, success message, failure message)
        checks = (
            (
                'from webhook import WebhookDeliveryService',
                "✅ api.py imports WebhookDeliveryService",
                "❌ api.py missing webhook import",
            ),
            (
                'WebhookDeliveryService(config)',
                "✅ api.py initializes WebhookDeliveryService",
                "❌ api.py doesn't initialize WebhookDeliveryService",
            ),
            (
                'notify_job_completion',
                "✅ api.py calls notify_job_completion",
                "❌ api.py doesn't call notify_job_completion",
            ),
        )
        for needle, ok_message, fail_message in checks:
            if needle not in api_content:
                print(fail_message)
                return False
            print(ok_message)

        return True
    except Exception as e:
        print(f"❌ API integration check failed: {e}")
        return False
|
|
||||||
|
|
||||||
def main():
    """Run all tests

    Executes every validation function in order, prints a PASS/FAIL line per
    test plus a totals banner, and returns a process exit code: 0 when all
    tests passed, 1 otherwise.
    """
    rule = "=" * 60

    print("\n🧪 Webhook Implementation Validation Tests")
    print(rule)

    # Run tests (label, callable) in their original order.
    suite = (
        ("Module Imports", test_imports),
        ("Service Initialization", test_webhook_service_init),
        ("Config Model", test_webhook_config_model),
        ("Payload Construction", test_payload_construction),
        ("Exponential Backoff", test_exponential_backoff),
        ("API Integration", test_api_integration),
    )
    results = [(label, check()) for label, check in suite]

    # Print summary
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    total = len(results)
    passed = len([outcome for _, outcome in results if outcome])

    for label, outcome in results:
        verdict = "✅ PASS" if outcome else "❌ FAIL"
        print(f"{verdict} - {label}")

    print(f"\n{rule}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{rule}")

    if passed != total:
        print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
        return 1

    print("\n🎉 All tests passed! Webhook implementation is valid.")
    return 0
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the site module and is unavailable under `python -S` or in frozen builds.
    sys.exit(main())
|
|
||||||
@@ -1,251 +0,0 @@
|
|||||||
# Webhook Feature Test Script
|
|
||||||
|
|
||||||
This directory contains a comprehensive test script for the webhook feature implementation.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The `test_webhook_feature.sh` script automates the entire process of testing the webhook feature:
|
|
||||||
|
|
||||||
1. ✅ Fetches and switches to the webhook feature branch
|
|
||||||
2. ✅ Activates the virtual environment
|
|
||||||
3. ✅ Installs all required dependencies
|
|
||||||
4. ✅ Starts Redis server in background
|
|
||||||
5. ✅ Starts Crawl4AI server in background
|
|
||||||
6. ✅ Runs webhook integration test
|
|
||||||
7. ✅ Verifies job completion via webhook
|
|
||||||
8. ✅ Cleans up and returns to original branch
|
|
||||||
|
|
||||||
## Prerequisites
|
|
||||||
|
|
||||||
- Python 3.10+
|
|
||||||
- Virtual environment at `venv/` in the project root (the script creates it with `python3 -m venv` if it is missing)
|
|
||||||
- Git repository with the webhook feature branch
|
|
||||||
- `redis-server` (script will attempt to install if missing)
|
|
||||||
- `curl` and `lsof` commands available
|
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
### Quick Start
|
|
||||||
|
|
||||||
From the project root:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
./tests/test_webhook_feature.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
Or from the tests directory:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd tests
|
|
||||||
./test_webhook_feature.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
### What the Script Does
|
|
||||||
|
|
||||||
#### Step 1: Branch Management
|
|
||||||
- Saves your current branch
|
|
||||||
- Fetches the webhook feature branch from remote
|
|
||||||
- Switches to the webhook feature branch
|
|
||||||
|
|
||||||
#### Step 2: Environment Setup
|
|
||||||
- Activates your existing virtual environment
|
|
||||||
- Installs dependencies from `deploy/docker/requirements.txt`
|
|
||||||
- Installs Flask for the webhook receiver
|
|
||||||
|
|
||||||
#### Step 3: Service Startup
|
|
||||||
- Starts Redis server on port 6379
|
|
||||||
- Starts Crawl4AI server on port 11235
|
|
||||||
- Waits for server health check to pass
|
|
||||||
|
|
||||||
#### Step 4: Webhook Test
|
|
||||||
- Creates a webhook receiver on port 8080
|
|
||||||
- Submits a crawl job for `https://example.com` with webhook config
|
|
||||||
- Waits for webhook notification (60s timeout)
|
|
||||||
- Verifies webhook payload contains expected data
|
|
||||||
|
|
||||||
#### Step 5: Cleanup
|
|
||||||
- Stops webhook receiver
|
|
||||||
- Stops Crawl4AI server
|
|
||||||
- Stops Redis server
|
|
||||||
- Returns to your original branch
|
|
||||||
|
|
||||||
## Expected Output
|
|
||||||
|
|
||||||
```
|
|
||||||
[INFO] Starting webhook feature test script
|
|
||||||
[INFO] Project root: /path/to/crawl4ai
|
|
||||||
[INFO] Step 1: Fetching PR branch...
|
|
||||||
[INFO] Current branch: develop
|
|
||||||
[SUCCESS] Branch fetched
|
|
||||||
[INFO] Step 2: Switching to branch: claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp
|
|
||||||
[SUCCESS] Switched to webhook feature branch
|
|
||||||
[INFO] Step 3: Activating virtual environment...
|
|
||||||
[SUCCESS] Virtual environment activated
|
|
||||||
[INFO] Step 4: Installing server dependencies...
|
|
||||||
[SUCCESS] Dependencies installed
|
|
||||||
[INFO] Step 5a: Starting Redis...
|
|
||||||
[SUCCESS] Redis started (PID: 12345)
|
|
||||||
[INFO] Step 5b: Starting server on port 11235...
|
|
||||||
[INFO] Server started (PID: 12346)
|
|
||||||
[INFO] Waiting for server to be ready...
|
|
||||||
[SUCCESS] Server is ready!
|
|
||||||
[INFO] Step 6: Creating webhook test script...
|
|
||||||
[INFO] Running webhook test...
|
|
||||||
|
|
||||||
🚀 Submitting crawl job with webhook...
|
|
||||||
✅ Job submitted successfully, task_id: crawl_abc123
|
|
||||||
⏳ Waiting for webhook notification...
|
|
||||||
|
|
||||||
✅ Webhook received: {
|
|
||||||
"task_id": "crawl_abc123",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T00:00:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"data": { ... }
|
|
||||||
}
|
|
||||||
|
|
||||||
✅ Webhook received!
|
|
||||||
Task ID: crawl_abc123
|
|
||||||
Status: completed
|
|
||||||
URLs: ['https://example.com']
|
|
||||||
✅ Data included in webhook payload
|
|
||||||
📄 Crawled 1 URL(s)
|
|
||||||
- https://example.com: 1234 chars
|
|
||||||
|
|
||||||
🎉 Webhook test PASSED!
|
|
||||||
|
|
||||||
[INFO] Step 7: Verifying test results...
|
|
||||||
[SUCCESS] ✅ Webhook test PASSED!
|
|
||||||
[SUCCESS] All tests completed successfully! 🎉
|
|
||||||
[INFO] Cleanup will happen automatically...
|
|
||||||
[INFO] Starting cleanup...
|
|
||||||
[INFO] Stopping webhook receiver...
|
|
||||||
[INFO] Stopping server...
|
|
||||||
[INFO] Stopping Redis...
|
|
||||||
[INFO] Switching back to branch: develop
|
|
||||||
[SUCCESS] Cleanup complete
|
|
||||||
```
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
### Server Failed to Start
|
|
||||||
|
|
||||||
If the server fails to start, check the logs:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
tail -100 /tmp/crawl4ai_server.log
|
|
||||||
```
|
|
||||||
|
|
||||||
Common issues:
|
|
||||||
- Port 11235 already in use: `lsof -ti:11235 | xargs kill -9`
|
|
||||||
- Missing dependencies: Check that all packages are installed
|
|
||||||
|
|
||||||
### Redis Connection Failed
|
|
||||||
|
|
||||||
Check if Redis is running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
redis-cli ping
|
|
||||||
# Should return: PONG
|
|
||||||
```
|
|
||||||
|
|
||||||
If not running:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
redis-server --port 6379 --daemonize yes
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Not Received
|
|
||||||
|
|
||||||
The script has a 60-second timeout for webhook delivery. If the webhook isn't received:
|
|
||||||
|
|
||||||
1. Check server logs: `/tmp/crawl4ai_server.log`
|
|
||||||
2. Verify webhook receiver is running on port 8080
|
|
||||||
3. Check network connectivity between components
|
|
||||||
|
|
||||||
### Script Interruption
|
|
||||||
|
|
||||||
If the script is interrupted (Ctrl+C), cleanup happens automatically via trap. The script will:
|
|
||||||
- Kill all background processes
|
|
||||||
- Stop Redis
|
|
||||||
- Return to your original branch
|
|
||||||
|
|
||||||
To clean up manually if needed:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Kill processes by port
|
|
||||||
lsof -ti:11235 | xargs kill -9 # Server
|
|
||||||
lsof -ti:8080 | xargs kill -9 # Webhook receiver
|
|
||||||
lsof -ti:6379 | xargs kill -9 # Redis
|
|
||||||
|
|
||||||
# Return to your branch
|
|
||||||
git checkout develop # or your branch name
|
|
||||||
```
|
|
||||||
|
|
||||||
## Testing Different URLs
|
|
||||||
|
|
||||||
To test with a different URL, modify the script or create a custom test:
|
|
||||||
|
|
||||||
```python
|
|
||||||
payload = {
|
|
||||||
"urls": ["https://your-url-here.com"],
|
|
||||||
"browser_config": {"headless": True},
|
|
||||||
"crawler_config": {"cache_mode": "bypass"},
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "http://localhost:8080/webhook",
|
|
||||||
"webhook_data_in_payload": True
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Files Generated
|
|
||||||
|
|
||||||
The script creates temporary files:
|
|
||||||
|
|
||||||
- `/tmp/crawl4ai_server.log` - Server output logs
|
|
||||||
- `/tmp/test_webhook.py` - Webhook test Python script
|
|
||||||
|
|
||||||
These are not cleaned up automatically so you can review them after the test.
|
|
||||||
|
|
||||||
## Exit Codes
|
|
||||||
|
|
||||||
- `0` - All tests passed successfully
|
|
||||||
- `1` - Test failed (check output for details)
|
|
||||||
|
|
||||||
## Safety Features
|
|
||||||
|
|
||||||
- ✅ Automatic cleanup on exit, interrupt, or error
|
|
||||||
- ✅ Returns to original branch on completion
|
|
||||||
- ✅ Kills all background processes
|
|
||||||
- ✅ Comprehensive error handling
|
|
||||||
- ✅ Colored output for easy reading
|
|
||||||
- ✅ Detailed logging at each step
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
- The script uses `set -e` to exit on any command failure
|
|
||||||
- All background processes are tracked and cleaned up
|
|
||||||
- The virtual environment (`venv/`) is created automatically if it does not already exist
|
|
||||||
- Redis must be available (installed or installable via apt-get/brew)
|
|
||||||
|
|
||||||
## Integration with CI/CD
|
|
||||||
|
|
||||||
This script can be integrated into CI/CD pipelines:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# Example GitHub Actions
|
|
||||||
- name: Test Webhook Feature
|
|
||||||
run: |
|
|
||||||
chmod +x tests/test_webhook_feature.sh
|
|
||||||
./tests/test_webhook_feature.sh
|
|
||||||
```
|
|
||||||
|
|
||||||
## Support
|
|
||||||
|
|
||||||
If you encounter issues:
|
|
||||||
|
|
||||||
1. Check the troubleshooting section above
|
|
||||||
2. Review server logs at `/tmp/crawl4ai_server.log`
|
|
||||||
3. Ensure all prerequisites are met
|
|
||||||
4. Open an issue with the full output of the script
|
|
||||||
@@ -1,193 +0,0 @@
|
|||||||
"""
|
|
||||||
Test script demonstrating the hooks_to_string utility and Docker client integration.
|
|
||||||
"""
|
|
||||||
import asyncio
|
|
||||||
from crawl4ai import Crawl4aiDockerClient, hooks_to_string
|
|
||||||
|
|
||||||
|
|
||||||
# Define hook functions as regular Python functions
|
|
||||||
async def auth_hook(page, context, **kwargs):
    """Attach a fixed test cookie to the browser context.

    Args:
        page: Page object handed to the hook; returned unchanged.
        context: Object exposing an awaitable ``add_cookies(list)``.
        **kwargs: Extra hook arguments; ignored.

    Returns:
        The ``page`` object that was passed in.
    """
    test_cookie = {
        'name': 'test_cookie',
        'value': 'test_value',
        'domain': '.httpbin.org',
        'path': '/',
    }
    await context.add_cookies([test_cookie])
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def scroll_hook(page, context, **kwargs):
    """Scroll the page to the bottom, then pause for one second.

    Args:
        page: Object exposing awaitable ``evaluate(str)`` and
            ``wait_for_timeout(int)``.
        context: Unused.
        **kwargs: Extra hook arguments; ignored.

    Returns:
        The ``page`` object that was passed in.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"
    settle_ms = 1000  # pause after scrolling so lazy content can load

    await page.evaluate(scroll_to_bottom)
    await page.wait_for_timeout(settle_ms)
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def viewport_hook(page, context, **kwargs):
    """Resize the page viewport to 1920x1080.

    Args:
        page: Object exposing an awaitable ``set_viewport_size(dict)``.
        context: Unused.
        **kwargs: Extra hook arguments; ignored.

    Returns:
        The ``page`` object that was passed in.
    """
    full_hd = dict(width=1920, height=1080)
    await page.set_viewport_size(full_hd)
    return page
|
|
||||||
|
|
||||||
|
|
||||||
async def test_hooks_utility():
    """Convert two hook functions with ``hooks_to_string`` and print the result.

    Prints the converted hook names and the first 200 characters of the
    ``on_page_context_created`` entry.

    Returns:
        Whatever ``hooks_to_string`` returned for the two hooks.
    """
    banner = "=" * 60
    rule = "-" * 60

    print(banner)
    print("Testing hooks_to_string utility")
    print(banner)

    # Hand the function objects straight to the converter.
    hooks_string = hooks_to_string({
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    })

    print("\n✓ Successfully converted function objects to strings")
    print(f"\n✓ Converted {len(hooks_string)} hooks:")
    for hook_name in hooks_string:
        print(f" - {hook_name}")

    preview = hooks_string["on_page_context_created"][:200]
    print("\n✓ Preview of converted hook:")
    print(rule)
    print(preview + "...")
    print(rule)

    return hooks_string
|
|
||||||
|
|
||||||
|
|
||||||
async def test_docker_client_with_functions():
    """Crawl via the Docker client, passing hook functions directly.

    The crawl below is live code: it needs a Crawl4AI Docker server
    reachable at http://localhost:11234.
    """
    print("\n" + "=" * 60)
    print("Testing Docker Client with Function Objects")
    print("=" * 60)

    # Function objects are passed as-is; no manual conversion is done here.
    hook_functions = {
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    }
    target_urls = ["https://httpbin.org/html"]

    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(target_urls, hooks=hook_functions, hooks_timeout=30)
        print(f"\n✓ Crawl successful: {result.success}")
        print(f"✓ URL: {result.url}")

    print("\n✓ Docker client accepts function objects directly")
    print("✓ Automatic conversion happens internally")
    print("✓ No manual string formatting needed!")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_docker_client_with_strings():
    """Crawl via the Docker client, passing hooks pre-converted by ``hooks_to_string``.

    The crawl below is live code: it needs a Crawl4AI Docker server
    reachable at http://localhost:11234.
    """
    print("\n" + "=" * 60)
    print("Testing Docker Client with String Hooks")
    print("=" * 60)

    # Convert up front, then hand the converted hooks to the client.
    hooks_string = hooks_to_string({
        "on_page_context_created": viewport_hook,
        "before_retrieve_html": scroll_hook,
    })
    target_urls = ["https://httpbin.org/html"]

    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(target_urls, hooks=hooks_string, hooks_timeout=30)
        print(f"\n✓ Crawl successful: {result.success}")

    print("\n✓ Docker client also accepts pre-converted strings")
    print("✓ Backward compatible with existing code")
|
|
||||||
|
|
||||||
|
|
||||||
async def show_usage_patterns():
    """Print three example ways of supplying hooks to ``client.crawl``.

    The snippets are printed verbatim, so they are kept as properly
    indented Python: a reader can copy them from the terminal and run
    them without re-indenting.
    """
    # (heading, snippet) pairs, printed in order. Snippet text starts at
    # column 0 on purpose: it is emitted exactly as written.
    patterns = (
        ("1. Direct function usage (simplest):", """
async def my_hook(page, context, **kwargs):
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page

result = await client.crawl(
    ["https://example.com"],
    hooks={"on_page_context_created": my_hook}
)
"""),
        ("2. Convert then use:", """
hooks_dict = {"on_page_context_created": my_hook}
hooks_string = hooks_to_string(hooks_dict)

result = await client.crawl(
    ["https://example.com"],
    hooks=hooks_string
)
"""),
        ("3. Manual string (backward compatible):", """
hooks_string = {
    "on_page_context_created": '''
async def hook(page, context, **kwargs):
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
'''
}

result = await client.crawl(
    ["https://example.com"],
    hooks=hooks_string
)
"""),
    )

    print("\n" + "=" * 60)
    print("Usage Patterns")
    print("=" * 60)

    for heading, snippet in patterns:
        print(f"\n{heading}")
        print("-" * 60)
        print(snippet)
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
    """Entry coroutine for the hooks demo.

    Only the string-hooks Docker client check is enabled. The utility
    check, the function-object client check, the usage-pattern printout
    and the closing summary are disabled.
    """
    print("\n🚀 Crawl4AI Hooks Utility Test Suite\n")

    # Disabled steps, kept for reference:
    #   await test_hooks_utility()
    #   await test_docker_client_with_functions()
    #   await show_usage_patterns()
    await test_docker_client_with_strings()


if __name__ == "__main__":
    # Drive the coroutine to completion on a fresh event loop.
    asyncio.run(main())
|
|
||||||
@@ -1,305 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
#############################################################################
|
|
||||||
# Webhook Feature Test Script
|
|
||||||
#
|
|
||||||
# This script tests the webhook feature implementation by:
|
|
||||||
# 1. Switching to the webhook feature branch
|
|
||||||
# 2. Installing dependencies
|
|
||||||
# 3. Starting the server
|
|
||||||
# 4. Running webhook tests
|
|
||||||
# 5. Cleaning up and returning to original branch
|
|
||||||
#
|
|
||||||
# Usage: ./test_webhook_feature.sh
|
|
||||||
#############################################################################
|
|
||||||
|
|
||||||
set -e  # Exit on error

# --- Test configuration -----------------------------------------------------
# Repository root, resolved relative to this script's own location.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
BRANCH_NAME="claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp"
VENV_PATH="venv"
SERVER_PORT=11235
WEBHOOK_PORT=8080

# --- Output colours (escape sequences, expanded later by `echo -e`) ----------
NC='\033[0m' # No Color
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'

# --- PIDs of started background processes; read by cleanup -------------------
WEBHOOK_PID=""
SERVER_PID=""
REDIS_PID=""
|
|
||||||
|
|
||||||
#############################################################################
|
|
||||||
# Utility Functions
|
|
||||||
#############################################################################
|
|
||||||
|
|
||||||
# Print message $3 prefixed with a "[$2]" tag in colour $1.
# The colour is reset (NC) before the message text.
_log() {
    local colour="$1" tag="$2"
    shift 2
    echo -e "${colour}[${tag}]${NC} $1"
}

# Informational progress message.
log_info() {
    _log "$BLUE" "INFO" "$1"
}

# A step finished successfully.
log_success() {
    _log "$GREEN" "SUCCESS" "$1"
}

# Something unexpected that the script can recover from.
log_warning() {
    _log "$YELLOW" "WARNING" "$1"
}

# A failure message.
log_error() {
    _log "$RED" "ERROR" "$1"
}
|
|
||||||
|
|
||||||
# Send SIGTERM to every still-running PID in "$2..." and log it under label $1.
# Each PID is handled on its own, so one dead PID does not skip the rest.
_stop_pids() {
    local label="$1" pid
    shift
    for pid in "$@"; do
        if kill -0 "$pid" 2>/dev/null; then
            log_info "Stopping $label (PID: $pid)..."
            kill "$pid" 2>/dev/null || true
        fi
    done
}

# Stop everything the script started and return to the original branch.
# Every step is best-effort: nothing here may abort the script under `set -e`.
cleanup() {
    log_info "Starting cleanup..."

    # PID variables are left unquoted on purpose: REDIS_PID comes from
    # `pgrep` and may hold several whitespace-separated PIDs.
    _stop_pids "webhook receiver" $WEBHOOK_PID
    _stop_pids "server" $SERVER_PID
    _stop_pids "Redis" $REDIS_PID

    # Also kill by port if PIDs didn't work.
    # NOTE: this kills whatever listens on these ports, including processes
    # that were not started by this script.
    lsof -ti:$SERVER_PORT | xargs kill -9 2>/dev/null || true
    lsof -ti:$WEBHOOK_PORT | xargs kill -9 2>/dev/null || true
    lsof -ti:6379 | xargs kill -9 2>/dev/null || true

    # Return to original branch. A failed checkout (for example because of
    # local changes) is reported instead of being silently ignored.
    if [ -n "$ORIGINAL_BRANCH" ]; then
        log_info "Switching back to branch: $ORIGINAL_BRANCH"
        if ! git checkout "$ORIGINAL_BRANCH" 2>/dev/null; then
            log_warning "Could not switch back to $ORIGINAL_BRANCH; run 'git checkout $ORIGINAL_BRANCH' manually"
        fi
    fi

    log_success "Cleanup complete"
}
|
|
||||||
|
|
||||||
# Run cleanup whenever the shell exits, for any reason.
trap cleanup EXIT
# A signal handler that merely returns lets the script resume after the
# interrupted command. Exit explicitly (128 + signal number) so the script
# really stops; the EXIT trap above then runs cleanup exactly once.
trap 'exit 130' INT
trap 'exit 143' TERM
|
|
||||||
|
|
||||||
#############################################################################
|
|
||||||
# Main Script
|
|
||||||
#############################################################################
|
|
||||||
|
|
||||||
log_info "Starting webhook feature test script"
log_info "Project root: $PROJECT_ROOT"

cd "$PROJECT_ROOT"

# Step 1: Save current branch and fetch PR
log_info "Step 1: Fetching PR branch..."
ORIGINAL_BRANCH=$(git rev-parse --abbrev-ref HEAD)
# On a detached HEAD, --abbrev-ref prints the literal "HEAD", and checking
# that out later would leave the repo on the feature branch. Remember the
# commit hash instead so cleanup can return to the original commit.
if [ "$ORIGINAL_BRANCH" = "HEAD" ]; then
    ORIGINAL_BRANCH=$(git rev-parse HEAD)
fi
log_info "Current branch: $ORIGINAL_BRANCH"

git fetch origin "$BRANCH_NAME"
log_success "Branch fetched"

# Step 2: Switch to new branch
log_info "Step 2: Switching to branch: $BRANCH_NAME"
git checkout "$BRANCH_NAME"
log_success "Switched to webhook feature branch"
|
|
||||||
|
|
||||||
# Step 3: make sure the virtual environment exists, then activate it
log_info "Step 3: Activating virtual environment..."
if [ -d "$VENV_PATH" ]; then
    :  # environment already present; nothing to create
else
    log_error "Virtual environment not found at $VENV_PATH"
    log_info "Creating virtual environment..."
    python3 -m venv "$VENV_PATH"
fi

# `.` is the portable spelling of `source`.
. "$VENV_PATH/bin/activate"
log_success "Virtual environment activated: $(which python)"

# Step 4: install the server's Python requirements into the active environment
log_info "Step 4: Installing server dependencies..."
pip install -q -r deploy/docker/requirements.txt
log_success "Dependencies installed"
|
|
||||||
|
|
||||||
# Succeeds when command $1 can be found by the shell.
_has_command() {
    command -v "$1" > /dev/null 2>&1
}

# Make sure redis-server is installed. If it is missing, try apt-get, then
# Homebrew, and give up when neither package manager is present.
log_info "Checking Redis availability..."
if ! _has_command redis-server; then
    log_warning "Redis not found, attempting to install..."
    if _has_command apt-get; then
        sudo apt-get update && sudo apt-get install -y redis-server
    elif _has_command brew; then
        brew install redis
    else
        log_error "Cannot install Redis automatically. Please install Redis manually."
        exit 1
    fi
fi
|
|
||||||
|
|
||||||
# Step 5: Start Redis in background
log_info "Step 5a: Starting Redis..."
redis-server --port 6379 --daemonize yes
sleep 2

# pgrep exits non-zero when nothing matches. Under `set -e` that would end
# the script here without any message, so tolerate the miss and report it.
REDIS_PID=$(pgrep redis-server || true)
if [ -z "$REDIS_PID" ]; then
    log_error "Redis failed to start on port 6379"
    exit 1
fi
log_success "Redis started (PID: $REDIS_PID)"
|
|
||||||
|
|
||||||
# Step 5b: Start server in background
log_info "Step 5b: Starting server on port $SERVER_PORT..."
cd deploy/docker

# Output goes to a log file so it can be shown if startup or the test fails.
python3 -m uvicorn server:app --host 0.0.0.0 --port "$SERVER_PORT" > /tmp/crawl4ai_server.log 2>&1 &
SERVER_PID=$!
cd "$PROJECT_ROOT"

log_info "Server started (PID: $SERVER_PID)"

# Wait for server to be ready (poll the health endpoint once per second)
log_info "Waiting for server to be ready..."
for i in {1..30}; do
    # -f makes curl fail on HTTP error statuses, so an error page from a
    # half-started server is not mistaken for "healthy".
    if curl -sf "http://localhost:$SERVER_PORT/health" > /dev/null 2>&1; then
        log_success "Server is ready!"
        break
    fi
    # Stop waiting as soon as the server process itself is gone.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        echo ""
        log_error "Server process exited before becoming ready"
        log_info "Server logs:"
        tail -50 /tmp/crawl4ai_server.log
        exit 1
    fi
    if [ "$i" -eq 30 ]; then
        log_error "Server failed to start within 30 seconds"
        log_info "Server logs:"
        tail -50 /tmp/crawl4ai_server.log
        exit 1
    fi
    echo -n "."
    sleep 1
done
echo ""
|
|
||||||
|
|
||||||
# Step 6: Create and run webhook test
log_info "Step 6: Creating webhook test script..."

# The quoted delimiter ('PYTHON_SCRIPT') disables shell expansion, so the
# Python source below is written to disk verbatim. Its hard-coded ports
# match SERVER_PORT (11235) and WEBHOOK_PORT (8080) in this script.
cat > /tmp/test_webhook.py << 'PYTHON_SCRIPT'
import json
import sys
import time
from threading import Event, Thread

import requests
from flask import Flask, jsonify, request

# Configuration
CRAWL4AI_BASE_URL = "http://localhost:11235"
WEBHOOK_BASE_URL = "http://localhost:8080"
# Upper bound (seconds) for each HTTP call; without it a stalled server
# would hang this test, and the shell script waiting on it, forever.
REQUEST_TIMEOUT = 30

# Flask app for webhook receiver
app = Flask(__name__)
webhook_received = Event()
webhook_data = {}


@app.route('/webhook', methods=['POST'])
def handle_webhook():
    """Store the posted JSON body and wake up the waiting main thread."""
    global webhook_data
    webhook_data = request.json
    webhook_received.set()
    print(f"\n✅ Webhook received: {json.dumps(webhook_data, indent=2)}")
    return jsonify({"status": "received"}), 200


def start_webhook_server():
    """Serve the webhook receiver on port 8080 (blocks the calling thread)."""
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)


# Start webhook server in background; daemon=True lets the process exit
# without having to shut Flask down first.
webhook_thread = Thread(target=start_webhook_server, daemon=True)
webhook_thread.start()
time.sleep(2)

print("🚀 Submitting crawl job with webhook...")

# Submit job with webhook
payload = {
    "urls": ["https://example.com"],
    "browser_config": {"headless": True},
    "crawler_config": {"cache_mode": "bypass"},
    "webhook_config": {
        "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
        "webhook_data_in_payload": True
    }
}

try:
    response = requests.post(
        f"{CRAWL4AI_BASE_URL}/crawl/job",
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=REQUEST_TIMEOUT,
    )
except requests.RequestException as exc:
    print(f"❌ Failed to submit job: {exc}")
    sys.exit(1)

if not response.ok:
    print(f"❌ Failed to submit job: {response.text}")
    sys.exit(1)

task_id = response.json()['task_id']
print(f"✅ Job submitted successfully, task_id: {task_id}")

# Wait for webhook (with timeout)
print("⏳ Waiting for webhook notification...")
if webhook_received.wait(timeout=60):
    print("✅ Webhook received!")
    print(f" Task ID: {webhook_data.get('task_id')}")
    print(f" Status: {webhook_data.get('status')}")
    print(f" URLs: {webhook_data.get('urls')}")

    if webhook_data.get('status') == 'completed':
        if 'data' in webhook_data:
            print(" ✅ Data included in webhook payload")
            results = webhook_data['data'].get('results', [])
            if results:
                print(f" 📄 Crawled {len(results)} URL(s)")
                for result in results:
                    print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
        print("\n🎉 Webhook test PASSED!")
        sys.exit(0)
    else:
        print(f" ❌ Job failed: {webhook_data.get('error')}")
        sys.exit(1)
else:
    print("❌ Webhook not received within 60 seconds")
    # Poll the job status as a diagnostic aid only: the test has already
    # failed, because the webhook is the thing being tested.
    print("⏳ Trying to poll job status...")
    for _ in range(10):
        try:
            status_response = requests.get(
                f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}",
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print(f" Status poll failed: {exc}")
        else:
            if status_response.ok:
                status = status_response.json()
                print(f" Status: {status.get('status')}")
                if status.get('status') in ['completed', 'failed']:
                    break
        time.sleep(2)
    sys.exit(1)
PYTHON_SCRIPT
|
|
||||||
|
|
||||||
# Install Flask for webhook receiver
pip install -q flask

# Run the webhook test in the background so its PID is known to cleanup
log_info "Running webhook test..."
python3 /tmp/test_webhook.py &
WEBHOOK_PID=$!

# Wait for test to complete.
# `wait` returns the test's exit status. Under `set -e` a bare failing
# `wait` would end the script on the spot, before the status is recorded
# and before the failure branch below can print the server logs. Capturing
# the status through `||` keeps the script running.
TEST_EXIT_CODE=0
wait "$WEBHOOK_PID" || TEST_EXIT_CODE=$?

# Step 7: Verify results
log_info "Step 7: Verifying test results..."
if [ "$TEST_EXIT_CODE" -eq 0 ]; then
    log_success "✅ Webhook test PASSED!"
else
    log_error "❌ Webhook test FAILED (exit code: $TEST_EXIT_CODE)"
    log_info "Server logs:"
    tail -100 /tmp/crawl4ai_server.log
    exit 1
fi

# Step 8: Cleanup happens automatically via trap
log_success "All tests completed successfully! 🎉"
log_info "Cleanup will happen automatically..."
|
|
||||||
Reference in New Issue
Block a user