Compare commits
19 Commits
feature/ag
...
release/v0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f6a02c4358 | ||
|
|
6d1a398419 | ||
|
|
c107617920 | ||
|
|
69d0ef89dd | ||
|
|
1bf85bcb1a | ||
|
|
749232ba1a | ||
|
|
c7288dd2f1 | ||
|
|
c91b235cb7 | ||
|
|
8fc1747225 | ||
|
|
aadab30c3d | ||
|
|
4a04b8506a | ||
|
|
7dadb65b80 | ||
|
|
a3f057e19f | ||
|
|
611d48f93b | ||
|
|
936397ee0e | ||
|
|
9900f63f97 | ||
|
|
9292b265fc | ||
|
|
70af81d9d7 | ||
|
|
361499d291 |
81
.github/workflows/docker-release.yml
vendored
Normal file
81
.github/workflows/docker-release.yml
vendored
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
name: Docker Release
|
||||||
|
on:
|
||||||
|
release:
|
||||||
|
types: [published]
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'docker-rebuild-v*' # Allow manual Docker rebuilds via tags
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
docker:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Extract version from release or tag
|
||||||
|
id: get_version
|
||||||
|
run: |
|
||||||
|
if [ "${{ github.event_name }}" == "release" ]; then
|
||||||
|
# Triggered by release event
|
||||||
|
VERSION="${{ github.event.release.tag_name }}"
|
||||||
|
VERSION=${VERSION#v} # Remove 'v' prefix
|
||||||
|
else
|
||||||
|
# Triggered by docker-rebuild-v* tag
|
||||||
|
VERSION=${GITHUB_REF#refs/tags/docker-rebuild-v}
|
||||||
|
fi
|
||||||
|
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
||||||
|
echo "Building Docker images for version: $VERSION"
|
||||||
|
|
||||||
|
- name: Extract major and minor versions
|
||||||
|
id: versions
|
||||||
|
run: |
|
||||||
|
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
|
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||||
|
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||||
|
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||||
|
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||||
|
echo "Semantic versions - Major: $MAJOR, Minor: $MINOR"
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Log in to Docker Hub
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKER_TOKEN }}
|
||||||
|
|
||||||
|
- name: Build and push Docker images
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||||
|
unclecode/crawl4ai:latest
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
|
|
||||||
|
- name: Summary
|
||||||
|
run: |
|
||||||
|
echo "## 🐳 Docker Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### Published Images" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### Platforms" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- linux/amd64" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- linux/arm64" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 🚀 Pull Command" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||||
917
.github/workflows/docs/ARCHITECTURE.md
vendored
Normal file
917
.github/workflows/docs/ARCHITECTURE.md
vendored
Normal file
@@ -0,0 +1,917 @@
|
|||||||
|
# Workflow Architecture Documentation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This document describes the technical architecture of the split release pipeline for Crawl4AI.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture Diagram
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ Developer │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ git tag v1.2.3 │
|
||||||
|
│ git push --tags │
|
||||||
|
└──────────────────────────────┬──────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ GitHub Repository │
|
||||||
|
│ │
|
||||||
|
│ ┌────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Tag Event: v1.2.3 │ │
|
||||||
|
│ └────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ release.yml (Release Pipeline) │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 1. Extract Version │ │ │
|
||||||
|
│ │ │ v1.2.3 → 1.2.3 │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 2. Validate Version │ │ │
|
||||||
|
│ │ │ Tag == __version__.py │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 3. Build Python Package │ │ │
|
||||||
|
│ │ │ - Source dist (.tar.gz) │ │ │
|
||||||
|
│ │ │ - Wheel (.whl) │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 4. Upload to PyPI │ │ │
|
||||||
|
│ │ │ - Authenticate with token │ │ │
|
||||||
|
│ │ │ - Upload dist/* │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 5. Create GitHub Release │ │ │
|
||||||
|
│ │ │ - Tag: v1.2.3 │ │ │
|
||||||
|
│ │ │ - Body: Install instructions │ │ │
|
||||||
|
│ │ │ - Status: Published │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ └────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Release Event: published (v1.2.3) │ │
|
||||||
|
│ └────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ ┌────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ docker-release.yml (Docker Pipeline) │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 1. Extract Version from Release │ │ │
|
||||||
|
│ │ │ github.event.release.tag_name → 1.2.3 │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 2. Parse Semantic Versions │ │ │
|
||||||
|
│ │ │ 1.2.3 → Major: 1, Minor: 1.2 │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 3. Setup Multi-Arch Build │ │ │
|
||||||
|
│ │ │ - Docker Buildx │ │ │
|
||||||
|
│ │ │ - QEMU emulation │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 4. Authenticate Docker Hub │ │ │
|
||||||
|
│ │ │ - Username: DOCKER_USERNAME │ │ │
|
||||||
|
│ │ │ - Token: DOCKER_TOKEN │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 5. Build Multi-Arch Images │ │ │
|
||||||
|
│ │ │ ┌────────────────┬────────────────┐ │ │ │
|
||||||
|
│ │ │ │ linux/amd64 │ linux/arm64 │ │ │ │
|
||||||
|
│ │ │ └────────────────┴────────────────┘ │ │ │
|
||||||
|
│ │ │ Cache: GitHub Actions (type=gha) │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ 6. Push to Docker Hub │ │ │
|
||||||
|
│ │ │ Tags: │ │ │
|
||||||
|
│ │ │ - unclecode/crawl4ai:1.2.3 │ │ │
|
||||||
|
│ │ │ - unclecode/crawl4ai:1.2 │ │ │
|
||||||
|
│ │ │ - unclecode/crawl4ai:1 │ │ │
|
||||||
|
│ │ │ - unclecode/crawl4ai:latest │ │ │
|
||||||
|
│ │ └──────────────────────────────────────────────┘ │ │
|
||||||
|
│ └────────────────────────────────────────────────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌─────────────────────────────────────────────────────────────────┐
|
||||||
|
│ External Services │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||||
|
│ │ PyPI │ │ Docker Hub │ │ GitHub │ │
|
||||||
|
│ │ │ │ │ │ │ │
|
||||||
|
│ │ crawl4ai │ │ unclecode/ │ │ Releases │ │
|
||||||
|
│ │ 1.2.3 │ │ crawl4ai │ │ v1.2.3 │ │
|
||||||
|
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||||
|
└─────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Component Details
|
||||||
|
|
||||||
|
### 1. Release Pipeline (release.yml)
|
||||||
|
|
||||||
|
#### Purpose
|
||||||
|
Fast publication of Python package and GitHub release.
|
||||||
|
|
||||||
|
#### Input
|
||||||
|
- **Trigger**: Git tag matching `v*` (excluding `test-v*`)
|
||||||
|
- **Example**: `v1.2.3`
|
||||||
|
|
||||||
|
#### Processing Stages
|
||||||
|
|
||||||
|
##### Stage 1: Version Extraction
|
||||||
|
```bash
|
||||||
|
Input: refs/tags/v1.2.3
|
||||||
|
Output: VERSION=1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```bash
|
||||||
|
TAG_VERSION=${GITHUB_REF#refs/tags/v} # Remove 'refs/tags/v' prefix
|
||||||
|
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 2: Version Validation
|
||||||
|
```bash
|
||||||
|
Input: TAG_VERSION=1.2.3
|
||||||
|
Check: crawl4ai/__version__.py contains __version__ = "1.2.3"
|
||||||
|
Output: Pass/Fail
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```bash
|
||||||
|
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||||
|
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 3: Package Build
|
||||||
|
```bash
|
||||||
|
Input: Source code + pyproject.toml
|
||||||
|
Output: dist/crawl4ai-1.2.3.tar.gz
|
||||||
|
dist/crawl4ai-1.2.3-py3-none-any.whl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```bash
|
||||||
|
python -m build
|
||||||
|
# Uses build backend defined in pyproject.toml
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 4: PyPI Upload
|
||||||
|
```bash
|
||||||
|
Input: dist/*.{tar.gz,whl}
|
||||||
|
Auth: PYPI_TOKEN
|
||||||
|
Output: Package published to PyPI
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```bash
|
||||||
|
twine upload dist/*
|
||||||
|
# Environment:
|
||||||
|
# TWINE_USERNAME: __token__
|
||||||
|
# TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 5: GitHub Release Creation
|
||||||
|
```bash
|
||||||
|
Input: Tag: v1.2.3
|
||||||
|
Body: Markdown content
|
||||||
|
Output: Published GitHub release
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```yaml
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
tag_name: v1.2.3
|
||||||
|
name: Release v1.2.3
|
||||||
|
body: |
|
||||||
|
Installation instructions and changelog
|
||||||
|
draft: false
|
||||||
|
prerelease: false
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Output
|
||||||
|
- **PyPI Package**: https://pypi.org/project/crawl4ai/1.2.3/
|
||||||
|
- **GitHub Release**: Published release on repository
|
||||||
|
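- **Event**: `release.published` (triggers Docker workflow; note that GitHub does not start new workflow runs for events created with the default `GITHUB_TOKEN`, so the release step must authenticate with a PAT or GitHub App token for this hand-off to fire — verify against release.yml)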
|
||||||
|
|
||||||
|
#### Timeline
|
||||||
|
```
|
||||||
|
0:00 - Tag pushed
|
||||||
|
0:01 - Checkout + Python setup
|
||||||
|
0:02 - Version validation
|
||||||
|
0:03 - Package build
|
||||||
|
0:04 - PyPI upload starts
|
||||||
|
0:06 - PyPI upload complete
|
||||||
|
0:07 - GitHub release created
|
||||||
|
0:08 - Workflow complete
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### 2. Docker Release Pipeline (docker-release.yml)
|
||||||
|
|
||||||
|
#### Purpose
|
||||||
|
Build and publish multi-architecture Docker images.
|
||||||
|
|
||||||
|
#### Inputs
|
||||||
|
|
||||||
|
##### Input 1: Release Event (Automatic)
|
||||||
|
```yaml
|
||||||
|
Event: release.published
|
||||||
|
Data: github.event.release.tag_name = "v1.2.3"
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Input 2: Docker Rebuild Tag (Manual)
|
||||||
|
```yaml
|
||||||
|
Tag: docker-rebuild-v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Processing Stages
|
||||||
|
|
||||||
|
##### Stage 1: Version Detection
|
||||||
|
```bash
|
||||||
|
# From release event:
|
||||||
|
VERSION = github.event.release.tag_name.removeprefix("v")  # only the leading 'v' is removed (${VERSION#v})
|
||||||
|
# Result: "1.2.3"
|
||||||
|
|
||||||
|
# From rebuild tag:
|
||||||
|
VERSION = GITHUB_REF.replace("refs/tags/docker-rebuild-v", "")
|
||||||
|
# Result: "1.2.3"
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 2: Semantic Version Parsing
|
||||||
|
```bash
|
||||||
|
Input: VERSION=1.2.3
|
||||||
|
Output: MAJOR=1
|
||||||
|
MINOR=1.2
|
||||||
|
PATCH=3 (implicit)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Implementation**:
|
||||||
|
```bash
|
||||||
|
MAJOR=$(echo $VERSION | cut -d. -f1) # Extract first component
|
||||||
|
MINOR=$(echo $VERSION | cut -d. -f1-2) # Extract first two components
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 3: Multi-Architecture Setup
|
||||||
|
```yaml
|
||||||
|
Setup:
|
||||||
|
- Docker Buildx (multi-platform builder)
|
||||||
|
- QEMU (ARM emulation on x86)
|
||||||
|
|
||||||
|
Platforms:
|
||||||
|
- linux/amd64 (x86_64)
|
||||||
|
- linux/arm64 (aarch64)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Architecture**:
|
||||||
|
```
|
||||||
|
GitHub Runner (linux/amd64)
|
||||||
|
├─ Buildx Builder
|
||||||
|
│ ├─ Native: Build linux/amd64 image
|
||||||
|
│ └─ QEMU: Emulate ARM to build linux/arm64 image
|
||||||
|
└─ Generate manifest list (points to both images)
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 4: Docker Hub Authentication
|
||||||
|
```bash
|
||||||
|
Input: DOCKER_USERNAME
|
||||||
|
DOCKER_TOKEN
|
||||||
|
Output: Authenticated Docker client
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 5: Build with Cache
|
||||||
|
```yaml
|
||||||
|
Cache Configuration:
|
||||||
|
cache-from: type=gha # Read from GitHub Actions cache
|
||||||
|
cache-to: type=gha,mode=max # Write all layers
|
||||||
|
|
||||||
|
Cache Key Components:
|
||||||
|
- Workflow file path
|
||||||
|
- Branch name
|
||||||
|
- Architecture (amd64/arm64)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Cache Hierarchy**:
|
||||||
|
```
|
||||||
|
Cache Entry: main/docker-release.yml/linux-amd64
|
||||||
|
├─ Layer: sha256:abc123... (FROM python:3.12)
|
||||||
|
├─ Layer: sha256:def456... (RUN apt-get update)
|
||||||
|
├─ Layer: sha256:ghi789... (COPY requirements.txt)
|
||||||
|
├─ Layer: sha256:jkl012... (RUN pip install)
|
||||||
|
└─ Layer: sha256:mno345... (COPY . /app)
|
||||||
|
|
||||||
|
Cache Hit/Miss Logic:
|
||||||
|
- If layer input unchanged → cache hit → skip build
|
||||||
|
- If layer input changed → cache miss → rebuild + all subsequent layers
|
||||||
|
```
|
||||||
|
|
||||||
|
##### Stage 6: Tag Generation
|
||||||
|
```bash
|
||||||
|
Input: VERSION=1.2.3, MAJOR=1, MINOR=1.2
|
||||||
|
|
||||||
|
Output Tags:
|
||||||
|
- unclecode/crawl4ai:1.2.3 (exact version)
|
||||||
|
- unclecode/crawl4ai:1.2 (minor version)
|
||||||
|
- unclecode/crawl4ai:1 (major version)
|
||||||
|
- unclecode/crawl4ai:latest (latest stable)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Tag Strategy**:
|
||||||
|
- All tags point to same image SHA
|
||||||
|
- Users can pin to desired stability level
|
||||||
|
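- Pushing new version updates `1`, `1.2`, and `latest` automatically (the workflow always pushes all four tags, so a `docker-rebuild-v*` run for an older version also moves the matching major/minor tags and `latest`)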
|
||||||
|
|
||||||
|
##### Stage 7: Push to Registry
|
||||||
|
```bash
|
||||||
|
For each tag:
|
||||||
|
For each platform (amd64, arm64):
|
||||||
|
Push image to Docker Hub
|
||||||
|
|
||||||
|
Create manifest list:
|
||||||
|
Manifest: unclecode/crawl4ai:1.2.3
|
||||||
|
├─ linux/amd64: sha256:abc...
|
||||||
|
└─ linux/arm64: sha256:def...
|
||||||
|
|
||||||
|
Docker CLI automatically selects correct platform on pull
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Output
|
||||||
|
- **Docker Images**: 2 platform images (linux/amd64, linux/arm64) behind 1 manifest list, published under 4 tags
|
||||||
|
- **Docker Hub**: https://hub.docker.com/r/unclecode/crawl4ai/tags
|
||||||
|
|
||||||
|
#### Timeline
|
||||||
|
|
||||||
|
**Cold Cache (First Build)**:
|
||||||
|
```
|
||||||
|
0:00 - Release event received
|
||||||
|
0:01 - Checkout + Buildx setup
|
||||||
|
0:02 - Docker Hub auth
|
||||||
|
0:03 - Start build (amd64)
|
||||||
|
0:08 - Complete amd64 build
|
||||||
|
0:09 - Start build (arm64)
|
||||||
|
0:14 - Complete arm64 build
|
||||||
|
0:15 - Generate manifests
|
||||||
|
0:16 - Push all tags
|
||||||
|
0:17 - Workflow complete
|
||||||
|
```
|
||||||
|
|
||||||
|
**Warm Cache (Code Change Only)**:
|
||||||
|
```
|
||||||
|
0:00 - Release event received
|
||||||
|
0:01 - Checkout + Buildx setup
|
||||||
|
0:02 - Docker Hub auth
|
||||||
|
0:03 - Start build (amd64) - cache hit for layers 1-4
|
||||||
|
0:04 - Complete amd64 build (only layer 5 rebuilt)
|
||||||
|
0:05 - Start build (arm64) - cache hit for layers 1-4
|
||||||
|
0:06 - Complete arm64 build (only layer 5 rebuilt)
|
||||||
|
0:07 - Generate manifests
|
||||||
|
0:08 - Push all tags
|
||||||
|
0:09 - Workflow complete
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow
|
||||||
|
|
||||||
|
### Version Information Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
Developer
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
crawl4ai/__version__.py
|
||||||
|
__version__ = "1.2.3"
|
||||||
|
│
|
||||||
|
├─► Git Tag
|
||||||
|
│ v1.2.3
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ release.yml
|
||||||
|
│ │
|
||||||
|
│ ├─► Validation
|
||||||
|
│ │ ✓ Match
|
||||||
|
│ │
|
||||||
|
│ ├─► PyPI Package
|
||||||
|
│ │ crawl4ai==1.2.3
|
||||||
|
│ │
|
||||||
|
│ └─► GitHub Release
|
||||||
|
│ v1.2.3
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ docker-release.yml
|
||||||
|
│ │
|
||||||
|
│ └─► Docker Tags
|
||||||
|
│ 1.2.3, 1.2, 1, latest
|
||||||
|
│
|
||||||
|
└─► Package Metadata
|
||||||
|
pyproject.toml
|
||||||
|
version = "1.2.3"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Secrets Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
GitHub Secrets (Encrypted at Rest)
|
||||||
|
│
|
||||||
|
├─► PYPI_TOKEN
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ release.yml
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ TWINE_PASSWORD env var (masked in logs)
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ PyPI API (HTTPS)
|
||||||
|
│
|
||||||
|
├─► DOCKER_USERNAME
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ docker-release.yml
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ docker/login-action (masked in logs)
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ Docker Hub API (HTTPS)
|
||||||
|
│
|
||||||
|
└─► DOCKER_TOKEN
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
docker-release.yml
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
docker/login-action (masked in logs)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Docker Hub API (HTTPS)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Artifact Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
Source Code
|
||||||
|
│
|
||||||
|
├─► release.yml
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ python -m build
|
||||||
|
│ │
|
||||||
|
│ ├─► crawl4ai-1.2.3.tar.gz
|
||||||
|
│ │ │
|
||||||
|
│ │ ▼
|
||||||
|
│ │ PyPI Storage
|
||||||
|
│ │ │
|
||||||
|
│ │ ▼
|
||||||
|
│ │ pip install crawl4ai
|
||||||
|
│ │
|
||||||
|
│ └─► crawl4ai-1.2.3-py3-none-any.whl
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ PyPI Storage
|
||||||
|
│ │
|
||||||
|
│ ▼
|
||||||
|
│ pip install crawl4ai
|
||||||
|
│
|
||||||
|
└─► docker-release.yml
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
docker build
|
||||||
|
│
|
||||||
|
├─► Image: linux/amd64
|
||||||
|
│ │
|
||||||
|
│ └─► Docker Hub
|
||||||
|
│ unclecode/crawl4ai:1.2.3 (linux/amd64 entry in manifest list)
|
||||||
|
│
|
||||||
|
└─► Image: linux/arm64
|
||||||
|
│
|
||||||
|
└─► Docker Hub
|
||||||
|
unclecode/crawl4ai:1.2.3 (linux/arm64 entry in manifest list)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## State Machines
|
||||||
|
|
||||||
|
### Release Pipeline State Machine
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────┐
|
||||||
|
│ START │
|
||||||
|
└────┬────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Extract │
|
||||||
|
│ Version │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌─────────┐
|
||||||
|
│ Validate │─────►│ FAILED │
|
||||||
|
│ Version │ No │ (Exit 1)│
|
||||||
|
└──────┬───────┘ └─────────┘
|
||||||
|
│ Yes
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Build │
|
||||||
|
│ Package │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌─────────┐
|
||||||
|
│ Upload │─────►│ FAILED │
|
||||||
|
│ to PyPI │ Error│ (Exit 1)│
|
||||||
|
└──────┬───────┘ └─────────┘
|
||||||
|
│ Success
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Create │
|
||||||
|
│ GH Release │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ SUCCESS │
|
||||||
|
│ (Emit Event) │
|
||||||
|
└──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
### Docker Pipeline State Machine
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────┐
|
||||||
|
│ START │
|
||||||
|
│ (Event) │
|
||||||
|
└────┬────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Detect │
|
||||||
|
│ Version │
|
||||||
|
│ Source │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Parse │
|
||||||
|
│ Semantic │
|
||||||
|
│ Versions │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌─────────┐
|
||||||
|
│ Authenticate │─────►│ FAILED │
|
||||||
|
│ Docker Hub │ Error│ (Exit 1)│
|
||||||
|
└──────┬───────┘ └─────────┘
|
||||||
|
│ Success
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Build │
|
||||||
|
│ amd64 │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌─────────┐
|
||||||
|
│ Build │─────►│ FAILED │
|
||||||
|
│ arm64 │ Error│ (Exit 1)│
|
||||||
|
└──────┬───────┘ └─────────┘
|
||||||
|
│ Success
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ Push All │
|
||||||
|
│ Tags │
|
||||||
|
└──────┬───────┘
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
┌──────────────┐
|
||||||
|
│ SUCCESS │
|
||||||
|
└──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security Architecture
|
||||||
|
|
||||||
|
### Threat Model
|
||||||
|
|
||||||
|
#### Threats Mitigated
|
||||||
|
|
||||||
|
1. **Secret Exposure**
|
||||||
|
- Mitigation: GitHub Actions secret masking
|
||||||
|
- Evidence: Secrets never appear in logs
|
||||||
|
|
||||||
|
2. **Unauthorized Package Upload**
|
||||||
|
- Mitigation: Scoped PyPI tokens
|
||||||
|
- Evidence: Token limited to `crawl4ai` project
|
||||||
|
|
||||||
|
3. **Man-in-the-Middle**
|
||||||
|
- Mitigation: HTTPS for all API calls
|
||||||
|
- Evidence: PyPI, Docker Hub, GitHub all use TLS
|
||||||
|
|
||||||
|
4. **Supply Chain Tampering**
|
||||||
|
- Mitigation: Immutable artifacts, content checksums
|
||||||
|
- Evidence: PyPI stores SHA256, Docker uses content-addressable storage
|
||||||
|
|
||||||
|
#### Trust Boundaries
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────────────────────────────┐
|
||||||
|
│ Trusted Zone │
|
||||||
|
│ ┌────────────────────────────────┐ │
|
||||||
|
│ │ GitHub Actions Runner │ │
|
||||||
|
│ │ - Ephemeral VM │ │
|
||||||
|
│ │ - Isolated environment │ │
|
||||||
|
│ │ - Access to secrets │ │
|
||||||
|
│ └────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ │ HTTPS (TLS 1.2+) │
|
||||||
|
│ ▼ │
|
||||||
|
└─────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌────────────┼────────────┐
|
||||||
|
│ │ │
|
||||||
|
▼ ▼ ▼
|
||||||
|
┌────────┐ ┌─────────┐ ┌──────────┐
|
||||||
|
│ PyPI │ │ Docker │ │ GitHub │
|
||||||
|
│ API │ │ Hub │ │ API │
|
||||||
|
└────────┘ └─────────┘ └──────────┘
|
||||||
|
External External External
|
||||||
|
Service Service Service
|
||||||
|
```
|
||||||
|
|
||||||
|
### Secret Management
|
||||||
|
|
||||||
|
#### Secret Lifecycle
|
||||||
|
|
||||||
|
```
|
||||||
|
Creation (Developer)
|
||||||
|
│
|
||||||
|
├─► PyPI: Create API token (scoped to project)
|
||||||
|
├─► Docker Hub: Create access token (read/write)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Storage (GitHub)
|
||||||
|
│
|
||||||
|
├─► Encrypted at rest (AES-256)
|
||||||
|
├─► Access controlled (repo-scoped)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Usage (Workflow)
|
||||||
|
│
|
||||||
|
├─► Injected as env vars
|
||||||
|
├─► Masked in logs (GitHub redacts on output)
|
||||||
|
├─► Never persisted to disk (in-memory only)
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Transmission (API Call)
|
||||||
|
│
|
||||||
|
├─► HTTPS only
|
||||||
|
├─► TLS 1.2+ with strong ciphers
|
||||||
|
│
|
||||||
|
▼
|
||||||
|
Rotation (Manual)
|
||||||
|
│
|
||||||
|
└─► Regenerate on PyPI/Docker Hub
|
||||||
|
Update GitHub secret
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Characteristics
|
||||||
|
|
||||||
|
### Release Pipeline Performance
|
||||||
|
|
||||||
|
| Metric | Value | Notes |
|
||||||
|
|--------|-------|-------|
|
||||||
|
| Cold start | ~2-3 min | First run on new runner |
|
||||||
|
| Warm start | ~2-3 min | Minimal caching benefit |
|
||||||
|
| PyPI upload | ~30-60 sec | Network-bound |
|
||||||
|
| Package build | ~30 sec | CPU-bound |
|
||||||
|
| Parallelization | None | Sequential by design |
|
||||||
|
|
||||||
|
### Docker Pipeline Performance
|
||||||
|
|
||||||
|
| Metric | Cold Cache | Warm Cache (code) | Warm Cache (deps) |
|
||||||
|
|--------|-----------|-------------------|-------------------|
|
||||||
|
| Total time | 10-15 min | 1-2 min | 3-5 min |
|
||||||
|
| amd64 build | 5-7 min | 30-60 sec | 1-2 min |
|
||||||
|
| arm64 build | 5-7 min | 30-60 sec | 1-2 min |
|
||||||
|
| Push time | 1-2 min | 30 sec | 30 sec |
|
||||||
|
| Cache hit rate | 0% | 85% | 60% |
|
||||||
|
|
||||||
|
### Cache Performance Model
|
||||||
|
|
||||||
|
```python
|
||||||
|
def estimate_build_time(changes):
|
||||||
|
base_time = 60 # seconds (setup + push)
|
||||||
|
|
||||||
|
if "Dockerfile" in changes:
|
||||||
|
return base_time + (10 * 60) # Full rebuild: ~11 min
|
||||||
|
elif "requirements.txt" in changes:
|
||||||
|
return base_time + (3 * 60) # Deps rebuild: ~4 min
|
||||||
|
elif any(f.endswith(".py") for f in changes):
|
||||||
|
return base_time + 60 # Code only: ~2 min
|
||||||
|
else:
|
||||||
|
return base_time # No changes: ~1 min
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Scalability Considerations
|
||||||
|
|
||||||
|
### Current Limits
|
||||||
|
|
||||||
|
| Resource | Limit | Impact |
|
||||||
|
|----------|-------|--------|
|
||||||
|
| Workflow concurrency | 20 (default) | Max 20 releases in parallel |
|
||||||
|
| Artifact storage | 500 MB/artifact | PyPI packages small (<10 MB) |
|
||||||
|
| Cache storage | 10 GB/repo | Docker layers fit comfortably |
|
||||||
|
| Workflow run time | 6 hours | Plenty of headroom |
|
||||||
|
|
||||||
|
### Scaling Strategies
|
||||||
|
|
||||||
|
#### Horizontal Scaling (Multiple Repos)
|
||||||
|
```
|
||||||
|
crawl4ai (main)
|
||||||
|
├─ release.yml
|
||||||
|
└─ docker-release.yml
|
||||||
|
|
||||||
|
crawl4ai-plugins (separate)
|
||||||
|
├─ release.yml
|
||||||
|
└─ docker-release.yml
|
||||||
|
|
||||||
|
Each repo has independent:
|
||||||
|
- Secrets
|
||||||
|
- Cache (10 GB each)
|
||||||
|
- Concurrency limits (20 each)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Vertical Scaling (Larger Runners)
|
||||||
|
```yaml
|
||||||
|
jobs:
|
||||||
|
docker:
|
||||||
|
runs-on: ubuntu-latest-8-cores # GitHub-hosted larger runner
|
||||||
|
# 4x faster builds for CPU-bound layers
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Disaster Recovery
|
||||||
|
|
||||||
|
### Failure Scenarios
|
||||||
|
|
||||||
|
#### Scenario 1: Release Pipeline Fails
|
||||||
|
|
||||||
|
**Failure Point**: PyPI upload fails (network error)
|
||||||
|
|
||||||
|
**State**:
|
||||||
|
- ✓ Version validated
|
||||||
|
- ✓ Package built
|
||||||
|
- ✗ PyPI upload
|
||||||
|
- ✗ GitHub release
|
||||||
|
|
||||||
|
**Recovery**:
|
||||||
|
```bash
|
||||||
|
# Manual upload
|
||||||
|
twine upload dist/*
|
||||||
|
|
||||||
|
# Retry workflow (re-run from GitHub Actions UI)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Prevention**: Add retry logic to PyPI upload
|
||||||
|
|
||||||
|
#### Scenario 2: Docker Pipeline Fails
|
||||||
|
|
||||||
|
**Failure Point**: ARM build fails (dependency issue)
|
||||||
|
|
||||||
|
**State**:
|
||||||
|
- ✓ PyPI published
|
||||||
|
- ✓ GitHub release created
|
||||||
|
- ✓ amd64 image built
|
||||||
|
- ✗ arm64 image build
|
||||||
|
|
||||||
|
**Recovery**:
|
||||||
|
```bash
|
||||||
|
# Fix Dockerfile
|
||||||
|
git commit -am "fix: ARM build dependency"
|
||||||
|
|
||||||
|
# Trigger rebuild
|
||||||
|
git tag docker-rebuild-v1.2.3
|
||||||
|
git push origin docker-rebuild-v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
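**Impact**: PyPI package available; because both platforms are built and pushed by a single multi-platform step, no Docker tags for this version are published (for any platform) until the rebuild succeeds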
|
||||||
|
|
||||||
|
#### Scenario 3: Partial Release
|
||||||
|
|
||||||
|
**Failure Point**: GitHub release creation fails
|
||||||
|
|
||||||
|
**State**:
|
||||||
|
- ✓ PyPI published
|
||||||
|
- ✗ GitHub release
|
||||||
|
- ✗ Docker images
|
||||||
|
|
||||||
|
**Recovery**:
|
||||||
|
```bash
|
||||||
|
# Create release manually
|
||||||
|
gh release create v1.2.3 \
|
||||||
|
--title "Release v1.2.3" \
|
||||||
|
--notes "..."
|
||||||
|
|
||||||
|
# This triggers docker-release.yml automatically
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Monitoring and Observability
|
||||||
|
|
||||||
|
### Metrics to Track
|
||||||
|
|
||||||
|
#### Release Pipeline
|
||||||
|
- Success rate (target: >99%)
|
||||||
|
- Duration (target: <3 min)
|
||||||
|
- PyPI upload time (target: <60 sec)
|
||||||
|
|
||||||
|
#### Docker Pipeline
|
||||||
|
- Success rate (target: >95%)
|
||||||
|
- Duration (target: <15 min cold, <2 min warm)
|
||||||
|
- Cache hit rate (target: >80% for code changes)
|
||||||
|
|
||||||
|
### Alerting
|
||||||
|
|
||||||
|
**Critical Alerts**:
|
||||||
|
- Release pipeline failure (blocks release)
|
||||||
|
- PyPI authentication failure (expired token)
|
||||||
|
|
||||||
|
**Warning Alerts**:
|
||||||
|
- Docker build >15 min (performance degradation)
|
||||||
|
- Cache hit rate <50% (cache issue)
|
||||||
|
|
||||||
|
### Logging
|
||||||
|
|
||||||
|
**GitHub Actions Logs**:
|
||||||
|
- Retention: 90 days
|
||||||
|
- Downloadable: Yes
|
||||||
|
- Searchable: Limited
|
||||||
|
|
||||||
|
**Recommended External Logging**:
|
||||||
|
```yaml
|
||||||
|
- name: Send logs to external service
|
||||||
|
if: failure()
|
||||||
|
run: |
|
||||||
|
curl -X POST https://logs.example.com/api/v1/logs \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d "{\"workflow\": \"${{ github.workflow }}\", \"status\": \"failed\"}"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Enhancements
|
||||||
|
|
||||||
|
### Planned Improvements
|
||||||
|
|
||||||
|
1. **Automated Changelog Generation**
|
||||||
|
- Use conventional commits
|
||||||
|
- Generate CHANGELOG.md automatically
|
||||||
|
|
||||||
|
2. **Pre-release Testing**
|
||||||
|
- Test builds on `test-v*` tags
|
||||||
|
- Upload to TestPyPI
|
||||||
|
|
||||||
|
3. **Notification System**
|
||||||
|
- Slack/Discord notifications on release
|
||||||
|
- Email on failure
|
||||||
|
|
||||||
|
4. **Performance Optimization**
|
||||||
|
- Parallel Docker builds (amd64 + arm64 simultaneously)
|
||||||
|
- Persistent runners for better caching
|
||||||
|
|
||||||
|
5. **Enhanced Validation**
|
||||||
|
- Smoke tests after PyPI upload
|
||||||
|
- Container security scanning
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [GitHub Actions Architecture](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions)
|
||||||
|
- [Docker Build Cache](https://docs.docker.com/build/cache/)
|
||||||
|
- [PyPI API Documentation](https://warehouse.pypa.io/api-reference/)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Last Updated**: 2025-01-21
|
||||||
|
**Version**: 2.0
|
||||||
1029
.github/workflows/docs/README.md
vendored
Normal file
1029
.github/workflows/docs/README.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
Normal file
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
Normal file
@@ -0,0 +1,287 @@
|
|||||||
|
# Workflow Quick Reference
|
||||||
|
|
||||||
|
## Quick Commands
|
||||||
|
|
||||||
|
### Standard Release
|
||||||
|
```bash
|
||||||
|
# 1. Update version
|
||||||
|
vim crawl4ai/__version__.py # Set to "1.2.3"
|
||||||
|
|
||||||
|
# 2. Commit and tag
|
||||||
|
git add crawl4ai/__version__.py
|
||||||
|
git commit -m "chore: bump version to 1.2.3"
|
||||||
|
git tag v1.2.3
|
||||||
|
git push origin main
|
||||||
|
git push origin v1.2.3
|
||||||
|
|
||||||
|
# 3. Monitor
|
||||||
|
# - PyPI: ~2-3 minutes
|
||||||
|
# - Docker: ~1-15 minutes
|
||||||
|
```
|
||||||
|
|
||||||
|
### Docker Rebuild Only
|
||||||
|
```bash
|
||||||
|
git tag docker-rebuild-v1.2.3
|
||||||
|
git push origin docker-rebuild-v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Delete Tag (Undo Release)
|
||||||
|
```bash
|
||||||
|
# Local
|
||||||
|
git tag -d v1.2.3
|
||||||
|
|
||||||
|
# Remote
|
||||||
|
git push --delete origin v1.2.3
|
||||||
|
|
||||||
|
# GitHub Release
|
||||||
|
gh release delete v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Workflow Triggers
|
||||||
|
|
||||||
|
### release.yml
|
||||||
|
| Event | Pattern | Example |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| Tag push | `v*` | `v1.2.3` |
|
||||||
|
| Excludes | `test-v*` | `test-v1.2.3` |
|
||||||
|
|
||||||
|
### docker-release.yml
|
||||||
|
| Event | Pattern | Example |
|
||||||
|
|-------|---------|---------|
|
||||||
|
| Release published | `release.published` | Automatic |
|
||||||
|
| Tag push | `docker-rebuild-v*` | `docker-rebuild-v1.2.3` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
### release.yml
|
||||||
|
| Variable | Source | Example |
|
||||||
|
|----------|--------|---------|
|
||||||
|
| `VERSION` | Git tag | `1.2.3` |
|
||||||
|
| `TWINE_USERNAME` | Static | `__token__` |
|
||||||
|
| `TWINE_PASSWORD` | Secret | `pypi-Ag...` |
|
||||||
|
| `GITHUB_TOKEN` | Auto | `ghs_...` |
|
||||||
|
|
||||||
|
### docker-release.yml
|
||||||
|
| Variable | Source | Example |
|
||||||
|
|----------|--------|---------|
|
||||||
|
| `VERSION` | Release/Tag | `1.2.3` |
|
||||||
|
| `MAJOR` | Computed | `1` |
|
||||||
|
| `MINOR` | Computed | `1.2` |
|
||||||
|
| `DOCKER_USERNAME` | Secret | `unclecode` |
|
||||||
|
| `DOCKER_TOKEN` | Secret | `dckr_pat_...` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Docker Tags Generated
|
||||||
|
|
||||||
|
| Version | Tags Created |
|
||||||
|
|---------|-------------|
|
||||||
|
| v1.0.0 | `1.0.0`, `1.0`, `1`, `latest` |
|
||||||
|
| v1.1.0 | `1.1.0`, `1.1`, `1`, `latest` |
|
||||||
|
| v1.2.3 | `1.2.3`, `1.2`, `1`, `latest` |
|
||||||
|
| v2.0.0 | `2.0.0`, `2.0`, `2`, `latest` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Workflow Outputs
|
||||||
|
|
||||||
|
### release.yml
|
||||||
|
| Output | Location | Time |
|
||||||
|
|--------|----------|------|
|
||||||
|
| PyPI Package | https://pypi.org/project/crawl4ai/ | ~2-3 min |
|
||||||
|
| GitHub Release | Repository → Releases | ~2-3 min |
|
||||||
|
| Workflow Summary | Actions → Run → Summary | Immediate |
|
||||||
|
|
||||||
|
### docker-release.yml
|
||||||
|
| Output | Location | Time |
|
||||||
|
|--------|----------|------|
|
||||||
|
| Docker Images | https://hub.docker.com/r/unclecode/crawl4ai | ~1-15 min |
|
||||||
|
| Workflow Summary | Actions → Run → Summary | Immediate |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Common Issues
|
||||||
|
|
||||||
|
| Issue | Solution |
|
||||||
|
|-------|----------|
|
||||||
|
| Version mismatch | Update `crawl4ai/__version__.py` to match tag |
|
||||||
|
| PyPI 403 Forbidden | Check `PYPI_TOKEN` secret |
|
||||||
|
| PyPI 400 File exists | Version already published, increment version |
|
||||||
|
| Docker auth failed | Regenerate `DOCKER_TOKEN` |
|
||||||
|
| Docker build timeout | Check Dockerfile, review build logs |
|
||||||
|
| Cache not working | First build on branch always cold |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Secrets Checklist
|
||||||
|
|
||||||
|
- [ ] `PYPI_TOKEN` - PyPI API token (project or account scope)
|
||||||
|
- [ ] `DOCKER_USERNAME` - Docker Hub username
|
||||||
|
- [ ] `DOCKER_TOKEN` - Docker Hub access token (read/write)
|
||||||
|
- [ ] `GITHUB_TOKEN` - Auto-provided (no action needed)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Workflow Dependencies
|
||||||
|
|
||||||
|
### release.yml Dependencies
|
||||||
|
```yaml
|
||||||
|
Python: 3.12
|
||||||
|
Actions:
|
||||||
|
- actions/checkout@v4
|
||||||
|
- actions/setup-python@v5
|
||||||
|
- softprops/action-gh-release@v2
|
||||||
|
PyPI Packages:
|
||||||
|
- build
|
||||||
|
- twine
|
||||||
|
```
|
||||||
|
|
||||||
|
### docker-release.yml Dependencies
|
||||||
|
```yaml
|
||||||
|
Actions:
|
||||||
|
- actions/checkout@v4
|
||||||
|
- docker/setup-buildx-action@v3
|
||||||
|
- docker/login-action@v3
|
||||||
|
- docker/build-push-action@v5
|
||||||
|
Docker:
|
||||||
|
- Buildx
|
||||||
|
- QEMU (for multi-arch)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Cache Information
|
||||||
|
|
||||||
|
### Type
|
||||||
|
- GitHub Actions Cache (`type=gha`)
|
||||||
|
|
||||||
|
### Storage
|
||||||
|
- **Limit**: 10GB per repository
|
||||||
|
- **Retention**: 7 days for unused entries
|
||||||
|
- **Cleanup**: Automatic LRU eviction
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
| Scenario | Cache Hit | Build Time |
|
||||||
|
|----------|-----------|------------|
|
||||||
|
| First build | 0% | 10-15 min |
|
||||||
|
| Code change only | 85% | 1-2 min |
|
||||||
|
| Dependency update | 60% | 3-5 min |
|
||||||
|
| No changes | 100% | 30-60 sec |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Build Platforms
|
||||||
|
|
||||||
|
| Platform | Architecture | Devices |
|
||||||
|
|----------|--------------|---------|
|
||||||
|
| linux/amd64 | x86_64 | Intel/AMD servers, AWS EC2, GCP |
|
||||||
|
| linux/arm64 | aarch64 | Apple Silicon, AWS Graviton, Raspberry Pi |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Version Validation
|
||||||
|
|
||||||
|
### Pre-Tag Checklist
|
||||||
|
```bash
|
||||||
|
# Check current version
|
||||||
|
python -c "from crawl4ai.__version__ import __version__; print(__version__)"
|
||||||
|
|
||||||
|
# Verify it matches intended tag
|
||||||
|
# If tag is v1.2.3, version should be "1.2.3"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Post-Release Verification
|
||||||
|
```bash
|
||||||
|
# PyPI
|
||||||
|
pip install crawl4ai==1.2.3
|
||||||
|
python -c "import crawl4ai; print(crawl4ai.__version__)"
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
docker pull unclecode/crawl4ai:1.2.3
|
||||||
|
docker run unclecode/crawl4ai:1.2.3 python -c "import crawl4ai; print(crawl4ai.__version__)"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Monitoring URLs
|
||||||
|
|
||||||
|
| Service | URL |
|
||||||
|
|---------|-----|
|
||||||
|
| GitHub Actions | `https://github.com/{owner}/{repo}/actions` |
|
||||||
|
| PyPI Project | `https://pypi.org/project/crawl4ai/` |
|
||||||
|
| Docker Hub | `https://hub.docker.com/r/unclecode/crawl4ai` |
|
||||||
|
| GitHub Releases | `https://github.com/{owner}/{repo}/releases` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Rollback Strategy
|
||||||
|
|
||||||
|
### PyPI (Cannot Delete)
|
||||||
|
```bash
|
||||||
|
# Increment patch version (first set crawl4ai/__version__.py to "1.2.4" and commit, or the version check fails)
|
||||||
|
git tag v1.2.4
|
||||||
|
git push origin v1.2.4
|
||||||
|
```
|
||||||
|
|
||||||
|
### Docker (Can Overwrite)
|
||||||
|
```bash
|
||||||
|
# Rebuild with fix
|
||||||
|
git tag docker-rebuild-v1.2.3
|
||||||
|
git push origin docker-rebuild-v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
### GitHub Release
|
||||||
|
```bash
|
||||||
|
# Delete release
|
||||||
|
gh release delete v1.2.3
|
||||||
|
|
||||||
|
# Delete tag
|
||||||
|
git push --delete origin v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Status Badge Markdown
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
[![Release Pipeline](https://github.com/{owner}/{repo}/actions/workflows/release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/release.yml)
|
||||||
|
|
||||||
|
[![Docker Release](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Timeline Example
|
||||||
|
|
||||||
|
```
|
||||||
|
0:00 - Push tag v1.2.3
|
||||||
|
0:01 - release.yml starts
|
||||||
|
0:02 - Version validation passes
|
||||||
|
0:03 - Package built
|
||||||
|
0:04 - PyPI upload starts
|
||||||
|
0:06 - PyPI upload complete ✓
|
||||||
|
0:07 - GitHub release created ✓
|
||||||
|
0:08 - release.yml complete
|
||||||
|
0:08 - docker-release.yml triggered
|
||||||
|
0:10 - Docker build starts
|
||||||
|
0:12 - amd64 image built (cache hit)
|
||||||
|
0:14 - arm64 image built (cache hit)
|
||||||
|
0:15 - Images pushed to Docker Hub ✓
|
||||||
|
0:16 - docker-release.yml complete
|
||||||
|
|
||||||
|
Total: ~16 minutes
|
||||||
|
Critical path (PyPI + GitHub): ~8 minutes
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contact
|
||||||
|
|
||||||
|
For workflow issues:
|
||||||
|
1. Check Actions tab for logs
|
||||||
|
2. Review this reference
|
||||||
|
3. See [README.md](./README.md) for detailed docs
|
||||||
79
.github/workflows/release.yml
vendored
79
.github/workflows/release.yml
vendored
@@ -10,53 +10,53 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
permissions:
|
permissions:
|
||||||
contents: write # Required for creating releases
|
contents: write # Required for creating releases
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout code
|
- name: Checkout code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set up Python
|
- name: Set up Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: '3.12'
|
python-version: '3.12'
|
||||||
|
|
||||||
- name: Extract version from tag
|
- name: Extract version from tag
|
||||||
id: get_version
|
id: get_version
|
||||||
run: |
|
run: |
|
||||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||||
echo "Releasing version: $TAG_VERSION"
|
echo "Releasing version: $TAG_VERSION"
|
||||||
|
|
||||||
- name: Install package dependencies
|
- name: Install package dependencies
|
||||||
run: |
|
run: |
|
||||||
pip install -e .
|
pip install -e .
|
||||||
|
|
||||||
- name: Check version consistency
|
- name: Check version consistency
|
||||||
run: |
|
run: |
|
||||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||||
|
|
||||||
echo "Tag version: $TAG_VERSION"
|
echo "Tag version: $TAG_VERSION"
|
||||||
echo "Package version: $PACKAGE_VERSION"
|
echo "Package version: $PACKAGE_VERSION"
|
||||||
|
|
||||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
echo "✅ Version check passed: $TAG_VERSION"
|
echo "✅ Version check passed: $TAG_VERSION"
|
||||||
|
|
||||||
- name: Install build dependencies
|
- name: Install build dependencies
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
pip install build twine
|
pip install build twine
|
||||||
|
|
||||||
- name: Build package
|
- name: Build package
|
||||||
run: python -m build
|
run: python -m build
|
||||||
|
|
||||||
- name: Check package
|
- name: Check package
|
||||||
run: twine check dist/*
|
run: twine check dist/*
|
||||||
|
|
||||||
- name: Upload to PyPI
|
- name: Upload to PyPI
|
||||||
env:
|
env:
|
||||||
TWINE_USERNAME: __token__
|
TWINE_USERNAME: __token__
|
||||||
@@ -65,37 +65,7 @@ jobs:
|
|||||||
echo "📦 Uploading to PyPI..."
|
echo "📦 Uploading to PyPI..."
|
||||||
twine upload dist/*
|
twine upload dist/*
|
||||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||||
|
|
||||||
- name: Set up Docker Buildx
|
|
||||||
uses: docker/setup-buildx-action@v3
|
|
||||||
|
|
||||||
- name: Log in to Docker Hub
|
|
||||||
uses: docker/login-action@v3
|
|
||||||
with:
|
|
||||||
username: ${{ secrets.DOCKER_USERNAME }}
|
|
||||||
password: ${{ secrets.DOCKER_TOKEN }}
|
|
||||||
|
|
||||||
- name: Extract major and minor versions
|
|
||||||
id: versions
|
|
||||||
run: |
|
|
||||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
|
||||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
|
||||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
|
||||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
|
||||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Build and push Docker images
|
|
||||||
uses: docker/build-push-action@v5
|
|
||||||
with:
|
|
||||||
context: .
|
|
||||||
push: true
|
|
||||||
tags: |
|
|
||||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
|
||||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
|
||||||
unclecode/crawl4ai:latest
|
|
||||||
platforms: linux/amd64,linux/arm64
|
|
||||||
|
|
||||||
- name: Create GitHub Release
|
- name: Create GitHub Release
|
||||||
uses: softprops/action-gh-release@v2
|
uses: softprops/action-gh-release@v2
|
||||||
with:
|
with:
|
||||||
@@ -103,26 +73,29 @@ jobs:
|
|||||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||||
body: |
|
body: |
|
||||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||||
|
|
||||||
### 📦 Installation
|
### 📦 Installation
|
||||||
|
|
||||||
**PyPI:**
|
**PyPI:**
|
||||||
```bash
|
```bash
|
||||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Docker:**
|
**Docker:**
|
||||||
```bash
|
```bash
|
||||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:latest
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Note:** Docker images are being built and will be available shortly.
|
||||||
|
Check the [Docker Release workflow](https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml) for build status.
|
||||||
|
|
||||||
### 📝 What's Changed
|
### 📝 What's Changed
|
||||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||||
draft: false
|
draft: false
|
||||||
prerelease: false
|
prerelease: false
|
||||||
token: ${{ secrets.GITHUB_TOKEN }}
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
- name: Summary
|
- name: Summary
|
||||||
run: |
|
run: |
|
||||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||||
@@ -132,11 +105,9 @@ jobs:
|
|||||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "" >> $GITHUB_STEP_SUMMARY
|
|
||||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
echo "- https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "Docker images are being built in a separate workflow." >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "Check: https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml" >> $GITHUB_STEP_SUMMARY
|
||||||
|
|||||||
142
.github/workflows/release.yml.backup
vendored
Normal file
142
.github/workflows/release.yml.backup
vendored
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
name: Release Pipeline
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
- '!test-v*' # Exclude test tags
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
release:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
permissions:
|
||||||
|
contents: write # Required for creating releases
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
|
||||||
|
- name: Extract version from tag
|
||||||
|
id: get_version
|
||||||
|
run: |
|
||||||
|
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||||
|
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||||
|
echo "Releasing version: $TAG_VERSION"
|
||||||
|
|
||||||
|
- name: Install package dependencies
|
||||||
|
run: |
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
- name: Check version consistency
|
||||||
|
run: |
|
||||||
|
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
|
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||||
|
|
||||||
|
echo "Tag version: $TAG_VERSION"
|
||||||
|
echo "Package version: $PACKAGE_VERSION"
|
||||||
|
|
||||||
|
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||||
|
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||||
|
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "✅ Version check passed: $TAG_VERSION"
|
||||||
|
|
||||||
|
- name: Install build dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install build twine
|
||||||
|
|
||||||
|
- name: Build package
|
||||||
|
run: python -m build
|
||||||
|
|
||||||
|
- name: Check package
|
||||||
|
run: twine check dist/*
|
||||||
|
|
||||||
|
- name: Upload to PyPI
|
||||||
|
env:
|
||||||
|
TWINE_USERNAME: __token__
|
||||||
|
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||||
|
run: |
|
||||||
|
echo "📦 Uploading to PyPI..."
|
||||||
|
twine upload dist/*
|
||||||
|
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Log in to Docker Hub
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKER_TOKEN }}
|
||||||
|
|
||||||
|
- name: Extract major and minor versions
|
||||||
|
id: versions
|
||||||
|
run: |
|
||||||
|
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||||
|
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||||
|
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||||
|
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||||
|
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Build and push Docker images
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||||
|
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||||
|
unclecode/crawl4ai:latest
|
||||||
|
platforms: linux/amd64,linux/arm64
|
||||||
|
|
||||||
|
- name: Create GitHub Release
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||||
|
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||||
|
body: |
|
||||||
|
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||||
|
|
||||||
|
### 📦 Installation
|
||||||
|
|
||||||
|
**PyPI:**
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Docker:**
|
||||||
|
```bash
|
||||||
|
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
### 📝 What's Changed
|
||||||
|
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||||
|
draft: false
|
||||||
|
prerelease: false
|
||||||
|
token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
- name: Summary
|
||||||
|
run: |
|
||||||
|
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||||
|
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -266,6 +266,8 @@ continue_config.json
|
|||||||
.llm.env
|
.llm.env
|
||||||
.private/
|
.private/
|
||||||
|
|
||||||
|
.claude/
|
||||||
|
|
||||||
CLAUDE_MONITOR.md
|
CLAUDE_MONITOR.md
|
||||||
CLAUDE.md
|
CLAUDE.md
|
||||||
|
|
||||||
|
|||||||
58
README.md
58
README.md
@@ -27,11 +27,13 @@
|
|||||||
|
|
||||||
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
Crawl4AI turns the web into clean, LLM ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle tested by a 50k+ star community.
|
||||||
|
|
||||||
[✨ Check out latest update v0.7.4](#-recent-updates)
|
[✨ Check out latest update v0.7.5](#-recent-updates)
|
||||||
|
|
||||||
✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
✨ New in v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||||
|
|
||||||
✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
|
✨ Recent v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
||||||
|
|
||||||
|
✨ Previous v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||||
@@ -177,7 +179,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g
|
|||||||
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
||||||
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
||||||
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
||||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string and function-based APIs).
|
||||||
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
||||||
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
||||||
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
||||||
@@ -544,6 +546,54 @@ async def test_news_crawl():
|
|||||||
|
|
||||||
## ✨ Recent Updates
|
## ✨ Recent Updates
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary><strong>Version 0.7.5 Release Highlights - The Docker Hooks & Security Update</strong></summary>
|
||||||
|
|
||||||
|
- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points
|
||||||
|
- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support:
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Option 1: Use hooks_to_string() utility for REST API
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Option 2: Docker client with automatic conversion (Recommended)
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
}
|
||||||
|
)
|
||||||
|
# ✓ Full IDE support, type checking, and reusability!
|
||||||
|
```
|
||||||
|
|
||||||
|
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||||
|
- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True`
|
||||||
|
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||||
|
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||||
|
|
||||||
|
[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
||||||
|
|
||||||
|
|||||||
@@ -103,7 +103,8 @@ from .browser_adapter import (
|
|||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
start_colab_display_server,
|
start_colab_display_server,
|
||||||
setup_colab_environment
|
setup_colab_environment,
|
||||||
|
hooks_to_string
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -183,6 +184,7 @@ __all__ = [
|
|||||||
"ProxyConfig",
|
"ProxyConfig",
|
||||||
"start_colab_display_server",
|
"start_colab_display_server",
|
||||||
"setup_colab_environment",
|
"setup_colab_environment",
|
||||||
|
"hooks_to_string",
|
||||||
# C4A Script additions
|
# C4A Script additions
|
||||||
"c4a_compile",
|
"c4a_compile",
|
||||||
"c4a_validate",
|
"c4a_validate",
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.4"
|
__version__ = "0.7.5"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
||||||
import httpx
|
import httpx
|
||||||
import json
|
import json
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
@@ -7,6 +7,7 @@ import asyncio
|
|||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .models import CrawlResult
|
from .models import CrawlResult
|
||||||
from .async_logger import AsyncLogger, LogLevel
|
from .async_logger import AsyncLogger, LogLevel
|
||||||
|
from .utils import hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
class Crawl4aiClientError(Exception):
|
class Crawl4aiClientError(Exception):
|
||||||
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
|
|||||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||||
|
|
||||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
def _prepare_request(
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
self,
|
||||||
|
urls: List[str],
|
||||||
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""Prepare request data from configs."""
|
"""Prepare request data from configs."""
|
||||||
if self._token:
|
if self._token:
|
||||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||||
return {
|
|
||||||
|
request_data = {
|
||||||
"urls": urls,
|
"urls": urls,
|
||||||
"browser_config": browser_config.dump() if browser_config else {},
|
"browser_config": browser_config.dump() if browser_config else {},
|
||||||
"crawler_config": crawler_config.dump() if crawler_config else {}
|
"crawler_config": crawler_config.dump() if crawler_config else {}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Handle hooks if provided
|
||||||
|
if hooks:
|
||||||
|
# Check if hooks are already strings or need conversion
|
||||||
|
if any(callable(v) for v in hooks.values()):
|
||||||
|
# Convert function objects to strings
|
||||||
|
hooks_code = hooks_to_string(hooks)
|
||||||
|
else:
|
||||||
|
# Already in string format
|
||||||
|
hooks_code = hooks
|
||||||
|
|
||||||
|
request_data["hooks"] = {
|
||||||
|
"code": hooks_code,
|
||||||
|
"timeout": hooks_timeout
|
||||||
|
}
|
||||||
|
|
||||||
|
return request_data
|
||||||
|
|
||||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||||
"""Make an HTTP request with error handling."""
|
"""Make an HTTP request with error handling."""
|
||||||
url = urljoin(self.base_url, endpoint)
|
url = urljoin(self.base_url, endpoint)
|
||||||
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
crawler_config: Optional[CrawlerRunConfig] = None
|
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||||
|
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||||
|
hooks_timeout: int = 30
|
||||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""Execute a crawl operation."""
|
"""
|
||||||
|
Execute a crawl operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: List of URLs to crawl
|
||||||
|
browser_config: Browser configuration
|
||||||
|
crawler_config: Crawler configuration
|
||||||
|
hooks: Optional hooks - can be either:
|
||||||
|
- Dict[str, Callable]: Function objects that will be converted to strings
|
||||||
|
- Dict[str, str]: Already stringified hook code
|
||||||
|
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Single CrawlResult, list of results, or async generator for streaming
|
||||||
|
|
||||||
|
Example with function hooks:
|
||||||
|
>>> async def my_hook(page, context, **kwargs):
|
||||||
|
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
... return page
|
||||||
|
>>>
|
||||||
|
>>> result = await client.crawl(
|
||||||
|
... ["https://example.com"],
|
||||||
|
... hooks={"on_page_context_created": my_hook}
|
||||||
|
... )
|
||||||
|
"""
|
||||||
await self._check_server()
|
await self._check_server()
|
||||||
|
|
||||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
||||||
is_streaming = crawler_config and crawler_config.stream
|
is_streaming = crawler_config and crawler_config.stream
|
||||||
|
|
||||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||||
|
|
||||||
if is_streaming:
|
if is_streaming:
|
||||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||||
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
|
|||||||
else:
|
else:
|
||||||
yield CrawlResult(**result)
|
yield CrawlResult(**result)
|
||||||
return stream_results()
|
return stream_results()
|
||||||
|
|
||||||
response = await self._request("POST", "/crawl", json=data)
|
response = await self._request("POST", "/crawl", json=data)
|
||||||
result_data = response.json()
|
result_data = response.json()
|
||||||
if not result_data.get("success", False):
|
if not result_data.get("success", False):
|
||||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||||
|
|
||||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||||
return results[0] if len(results) == 1 else results
|
return results[0] if len(results) == 1 else results
|
||||||
|
|||||||
@@ -47,6 +47,7 @@ from urllib.parse import (
|
|||||||
urljoin, urlparse, urlunparse,
|
urljoin, urlparse, urlunparse,
|
||||||
parse_qsl, urlencode, quote, unquote
|
parse_qsl, urlencode, quote, unquote
|
||||||
)
|
)
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||||
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
|||||||
available_gb = get_true_available_memory_gb()
|
available_gb = get_true_available_memory_gb()
|
||||||
used_percent = get_true_memory_usage_percent()
|
used_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
return used_percent, available_gb, total_gb
|
return used_percent, available_gb, total_gb
|
||||||
|
|
||||||
|
|
||||||
|
# Hook utilities for Docker API
|
||||||
|
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
    """
    Convert hook function objects to source-code strings for the Docker API.

    Each callable's source is read with ``inspect.getsource`` and dedented, so
    hooks defined inside a class or another function still yield source that
    starts at column 0.

    Args:
        hooks: Dictionary mapping hook point names to Python function objects.
            Functions should be async and follow hook signature requirements.

    Returns:
        Dictionary mapping hook point names to the dedented source text of
        the corresponding function. An empty input yields an empty dict.

    Raises:
        ValueError: If a hook is not callable, or if its source cannot be
            extracted (for example a builtin, or a function that was not
            defined in a file). In the second case the underlying
            ``OSError``/``TypeError`` is attached as ``__cause__``.

    Example:
        >>> async def my_hook(page, context, **kwargs):
        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
        ...     return page
        >>>
        >>> hooks_dict = {"on_page_context_created": my_hook}
        >>> api_hooks = hooks_to_string(hooks_dict)
        >>> # api_hooks is now ready to use with Docker API
    """
    # Imported locally so the helper does not depend on the module-level
    # import block providing both names.
    import inspect
    import textwrap

    result = {}

    for hook_name, hook_func in hooks.items():
        if not callable(hook_func):
            raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")

        try:
            # Only source extraction can raise OSError/TypeError, so keep the
            # try body limited to it.
            source = inspect.getsource(hook_func)
        except (OSError, TypeError) as e:
            # Chain the original error so callers can see why extraction failed.
            raise ValueError(
                f"Cannot extract source code for hook '{hook_name}'. "
                f"Make sure the function is defined in a file (not interactively). Error: {e}"
            ) from e

        # Remove any common leading indentation to get clean source.
        result[hook_name] = textwrap.dedent(source)

    return result
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ Today I'm releasing Crawl4AI v0.7.4—the Intelligent Table Extraction & Perform
|
|||||||
|
|
||||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||||
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
||||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
|
||||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||||
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
||||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||||
@@ -158,40 +157,6 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
- **Monitoring Systems**: Faster health checks and status page monitoring
|
- **Monitoring Systems**: Faster health checks and status page monitoring
|
||||||
- **Data Aggregation**: Improved performance for real-time data collection
|
- **Data Aggregation**: Improved performance for real-time data collection
|
||||||
|
|
||||||
## 🧹 Memory Management Refactor: Cleaner Architecture
|
|
||||||
|
|
||||||
**The Problem:** Memory utilities were scattered and difficult to maintain, with potential import conflicts and unclear organization.
|
|
||||||
|
|
||||||
**My Solution:** I consolidated all memory-related utilities into the main `utils.py` module, creating a cleaner, more maintainable architecture.
|
|
||||||
|
|
||||||
### Improved Memory Handling
|
|
||||||
|
|
||||||
```python
|
|
||||||
# All memory utilities now consolidated
|
|
||||||
from crawl4ai.utils import get_true_memory_usage_percent, MemoryMonitor
|
|
||||||
|
|
||||||
# Enhanced memory monitoring
|
|
||||||
monitor = MemoryMonitor()
|
|
||||||
monitor.start_monitoring()
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
# Memory-efficient batch processing
|
|
||||||
results = await crawler.arun_many(large_url_list)
|
|
||||||
|
|
||||||
# Get accurate memory metrics
|
|
||||||
memory_usage = get_true_memory_usage_percent()
|
|
||||||
memory_report = monitor.get_report()
|
|
||||||
|
|
||||||
print(f"Memory efficiency: {memory_report['efficiency']:.1f}%")
|
|
||||||
print(f"Peak usage: {memory_report['peak_mb']:.1f} MB")
|
|
||||||
```
|
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
|
||||||
- **Production Stability**: More reliable memory tracking and management
|
|
||||||
- **Code Maintainability**: Cleaner architecture for easier debugging
|
|
||||||
- **Import Clarity**: Resolved potential conflicts and import issues
|
|
||||||
- **Developer Experience**: Simpler API for memory monitoring
|
|
||||||
|
|
||||||
## 🔧 Critical Stability Fixes
|
## 🔧 Critical Stability Fixes
|
||||||
|
|
||||||
### Browser Manager Race Condition Resolution
|
### Browser Manager Race Condition Resolution
|
||||||
|
|||||||
318
docs/blog/release-v0.7.5.md
Normal file
318
docs/blog/release-v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||||
|
|
||||||
|
*September 29, 2025 • 8 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||||
|
|
||||||
|
## 🎯 What's New at a Glance
|
||||||
|
|
||||||
|
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||||
|
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||||
|
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||||
|
- **HTTPS Preservation**: Secure internal link handling
|
||||||
|
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||||
|
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||||
|
|
||||||
|
## 🔧 Docker Hooks System: Pipeline Customization
|
||||||
|
|
||||||
|
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||||
|
|
||||||
|
### Real Example: Authentication & Performance
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Real working hooks for httpbin.org
|
||||||
|
hooks_config = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Setting up page context")
|
||||||
|
# Block images to speed up crawling
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
print("Hook: Images blocked")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Before retrieving HTML")
|
||||||
|
# Scroll to bottom to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print("Hook: Scrolled to bottom")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_goto": """
|
||||||
|
async def hook(page, context, url, **kwargs):
|
||||||
|
print(f"Hook: About to navigate to {url}")
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test with Docker API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
result = response.json()
|
||||||
|
|
||||||
|
if result.get('success'):
|
||||||
|
print("✅ Hooks executed successfully!")
|
||||||
|
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available Hook Points:**
|
||||||
|
- `on_browser_created`: Browser setup
|
||||||
|
- `on_page_context_created`: Page context configuration
|
||||||
|
- `before_goto`: Pre-navigation setup
|
||||||
|
- `after_goto`: Post-navigation processing
|
||||||
|
- `on_user_agent_updated`: User agent changes
|
||||||
|
- `on_execution_started`: Crawl initialization
|
||||||
|
- `before_retrieve_html`: Pre-extraction processing
|
||||||
|
- `before_return_html`: Final HTML processing
|
||||||
|
|
||||||
|
### Function-Based Hooks API
|
||||||
|
|
||||||
|
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||||
|
|
||||||
|
**Option 1: Using the `hooks_to_string()` Utility**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions (with full IDE support!)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5',
|
||||||
|
'X-Custom-Header': 'my-value'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert functions to strings
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Use with REST API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {"code": hooks_code, "timeout": 30}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as functions (same as above)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html(page, context, **kwargs):
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use Docker client - conversion happens automatically!
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_retrieve_html": before_retrieve_html
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits of Function-Based Hooks:**
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||||
|
- ✅ Type checking and linting
|
||||||
|
- ✅ Easier to test and debug
|
||||||
|
- ✅ Reusable across projects
|
||||||
|
- ✅ Automatic conversion in Docker client
|
||||||
|
- ✅ No breaking changes - string hooks still work!
|
||||||
|
|
||||||
|
## 🤖 Enhanced LLM Integration
|
||||||
|
|
||||||
|
v0.7.5 extends the LLM integration with custom providers, temperature control, and base URL configuration.
|
||||||
|
|
||||||
|
### Multi-Provider Support
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
# Test with different providers
|
||||||
|
async def test_llm_providers():
|
||||||
|
# Gemini with custom temperature
|
||||||
|
openai_strategy = LLMExtractionStrategy(
|
||||||
|
provider="gemini/gemini-2.5-flash-lite",
|
||||||
|
api_token="your-api-token",
|
||||||
|
temperature=0.7, # New in v0.7.5
|
||||||
|
instruction="Summarize this page in one sentence"
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://example.com",
|
||||||
|
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print("✅ LLM extraction completed")
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
# Docker API with enhanced LLM config
|
||||||
|
llm_payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": "gemini/gemini-2.5-flash-lite",
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**New Features:**
|
||||||
|
- Custom `temperature` parameter for creativity control
|
||||||
|
- `base_url` for custom API endpoints
|
||||||
|
- Multi-provider environment variable support
|
||||||
|
- Docker API integration
|
||||||
|
|
||||||
|
## 🔒 HTTPS Preservation
|
||||||
|
|
||||||
|
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||||
|
|
||||||
|
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||||
|
|
||||||
|
async def test_https_preservation():
|
||||||
|
# Enable HTTPS preservation
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
exclude_external_links=True,
|
||||||
|
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
max_pages=5,
|
||||||
|
filter_chain=FilterChain([url_filter])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(
|
||||||
|
url="https://quotes.toscrape.com",
|
||||||
|
config=config
|
||||||
|
):
|
||||||
|
# All internal links maintain HTTPS
|
||||||
|
internal_links = [link['href'] for link in result.links['internal']]
|
||||||
|
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||||
|
|
||||||
|
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||||
|
for link in https_links[:3]:
|
||||||
|
print(f" → {link}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🛠️ Bug Fixes and Improvements
|
||||||
|
|
||||||
|
### Major Fixes
|
||||||
|
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||||
|
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||||
|
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||||
|
- **Memory Management**: Fixed leaks in long-running sessions
|
||||||
|
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||||
|
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||||
|
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||||
|
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||||
|
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||||
|
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||||
|
|
||||||
|
### Community-Reported Issues Fixed
|
||||||
|
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||||
|
- Fixed browser configuration reference errors
|
||||||
|
- Resolved dependency conflicts with cssselect
|
||||||
|
- Improved error messaging for failed authentications
|
||||||
|
- Enhanced compatibility with various proxy configurations
|
||||||
|
- Fixed edge cases in URL normalization
|
||||||
|
|
||||||
|
### Configuration Updates
|
||||||
|
```python
|
||||||
|
# Old proxy config (deprecated)
|
||||||
|
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||||
|
|
||||||
|
# New enhanced proxy config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
proxy_config={
|
||||||
|
"server": "http://proxy:8080",
|
||||||
|
"username": "optional-user",
|
||||||
|
"password": "optional-pass"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
1. **Python 3.10+ Required**: Python 3.9 is no longer supported; upgrade to Python 3.10 or newer
|
||||||
|
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||||
|
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||||
|
|
||||||
|
## 🚀 Get Started
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install latest version
|
||||||
|
pip install crawl4ai==0.7.5
|
||||||
|
|
||||||
|
# Docker deployment
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Try the Demo:**
|
||||||
|
```bash
|
||||||
|
# Run working examples
|
||||||
|
python docs/releases_review/demo_v0.7.5.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Resources:**
|
||||||
|
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||||
|
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
|
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
522
docs/examples/docker_client_hooks_example.py
Normal file
522
docs/examples/docker_client_hooks_example.py
Normal file
@@ -0,0 +1,522 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Comprehensive hooks examples using Docker Client with function objects.
|
||||||
|
|
||||||
|
This approach is recommended because:
|
||||||
|
- Write hooks as regular Python functions
|
||||||
|
- Full IDE support (autocomplete, type checking)
|
||||||
|
- Automatic conversion to API format
|
||||||
|
- Reusable and testable code
|
||||||
|
- Clean, readable syntax
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Base URL of the Crawl4AI Docker server. 11235 is the host port published by
# the documented `docker run -p 11235:11235 unclecode/crawl4ai:latest` command;
# change it here if the container is mapped to a different port.
API_BASE_URL = "http://localhost:11235"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Hook Function Definitions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# --- All Hooks Demo ---
|
||||||
|
async def browser_created_hook(browser, **kwargs):
|
||||||
|
"""Called after browser is created"""
|
||||||
|
print("[HOOK] Browser created and ready")
|
||||||
|
return browser
|
||||||
|
|
||||||
|
|
||||||
|
async def page_context_hook(page, context, **kwargs):
|
||||||
|
"""Setup page environment"""
|
||||||
|
print("[HOOK] Setting up page environment")
|
||||||
|
|
||||||
|
# Set viewport
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
|
||||||
|
# Add cookies
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "test_session",
|
||||||
|
"value": "abc123xyz",
|
||||||
|
"domain": ".httpbin.org",
|
||||||
|
"path": "/"
|
||||||
|
}])
|
||||||
|
|
||||||
|
# Block resources
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
|
||||||
|
await context.route("**/analytics/*", lambda route: route.abort())
|
||||||
|
|
||||||
|
print("[HOOK] Environment configured")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def user_agent_hook(page, context, user_agent, **kwargs):
|
||||||
|
"""Called when user agent is updated"""
|
||||||
|
print(f"[HOOK] User agent: {user_agent[:50]}...")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_goto_hook(page, context, url, **kwargs):
|
||||||
|
"""Called before navigating to URL"""
|
||||||
|
print(f"[HOOK] Navigating to: {url}")
|
||||||
|
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
"X-Custom-Header": "crawl4ai-test",
|
||||||
|
"Accept-Language": "en-US"
|
||||||
|
})
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def after_goto_hook(page, context, url, response, **kwargs):
    """Called after page loads.

    Pauses for one second, then waits up to two seconds for the ``body``
    element. A failure of that wait is reported and otherwise ignored, so the
    crawl continues either way.

    Args:
        page: Page object; must provide awaitable ``wait_for_timeout`` and
            ``wait_for_selector`` (presumably a Playwright page; confirm
            against the hook runner).
        context: Browser context (unused here).
        url: URL that was navigated to; only used for logging.
        response: Navigation response (unused here).
        **kwargs: Extra arguments supplied by the hook runner (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print(f"[HOOK] Page loaded: {url}")

    await page.wait_for_timeout(1000)

    try:
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element ready")
    except Exception:
        # Best-effort wait. Catch Exception rather than using a bare except so
        # asyncio.CancelledError / KeyboardInterrupt still propagate and the
        # hook can be cancelled by its caller.
        print("[HOOK] Timeout, continuing")

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def execution_started_hook(page, context, **kwargs):
|
||||||
|
"""Called when custom JS execution starts"""
|
||||||
|
print("[HOOK] JS execution started")
|
||||||
|
await page.evaluate("console.log('[HOOK] Custom JS');")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_retrieve_hook(page, context, **kwargs):
|
||||||
|
"""Called before retrieving HTML"""
|
||||||
|
print("[HOOK] Preparing HTML retrieval")
|
||||||
|
|
||||||
|
# Scroll for lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
|
await page.wait_for_timeout(500)
|
||||||
|
await page.evaluate("window.scrollTo(0, 0);")
|
||||||
|
|
||||||
|
print("[HOOK] Scrolling complete")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def before_return_hook(page, context, html, **kwargs):
|
||||||
|
"""Called before returning HTML"""
|
||||||
|
print(f"[HOOK] HTML ready: {len(html)} chars")
|
||||||
|
|
||||||
|
metrics = await page.evaluate('''() => ({
|
||||||
|
images: document.images.length,
|
||||||
|
links: document.links.length,
|
||||||
|
scripts: document.scripts.length
|
||||||
|
})''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Authentication Hooks ---
|
||||||
|
async def auth_context_hook(page, context, **kwargs):
|
||||||
|
"""Setup authentication context"""
|
||||||
|
print("[HOOK] Setting up authentication")
|
||||||
|
|
||||||
|
# Add auth cookies
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "auth_token",
|
||||||
|
"value": "fake_jwt_token",
|
||||||
|
"domain": ".httpbin.org",
|
||||||
|
"path": "/",
|
||||||
|
"httpOnly": True
|
||||||
|
}])
|
||||||
|
|
||||||
|
# Set localStorage
|
||||||
|
await page.evaluate('''
|
||||||
|
localStorage.setItem('user_id', '12345');
|
||||||
|
localStorage.setItem('auth_time', new Date().toISOString());
|
||||||
|
''')
|
||||||
|
|
||||||
|
print("[HOOK] Auth context ready")
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
async def auth_headers_hook(page, context, url, **kwargs):
|
||||||
|
"""Add authentication headers"""
|
||||||
|
print(f"[HOOK] Adding auth headers for {url}")
|
||||||
|
|
||||||
|
import base64
|
||||||
|
credentials = base64.b64encode(b"user:passwd").decode('ascii')
|
||||||
|
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'Authorization': f'Basic {credentials}',
|
||||||
|
'X-API-Key': 'test-key-123'
|
||||||
|
})
|
||||||
|
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Performance Optimization Hooks ---
|
||||||
|
async def performance_hook(page, context, **kwargs):
    """Abort heavy or tracking requests and neutralise CSS animations.

    Args:
        page: Page object that receives the animation-disabling style tag.
        context: Browser context on which the blocking routes are registered.
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print("[HOOK] Optimizing for performance")

    # Images, fonts and media first, then tracker hosts; registration order
    # is the same as listing each route call one by one.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg}",
        "**/*.{woff,woff2,ttf}",
        "**/*.{mp4,webm,ogg}",
        "**/googletagmanager.com/*",
        "**/google-analytics.com/*",
        "**/facebook.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    # Zero-length animations and transitions on every element and pseudo-element.
    no_motion_css = '''
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    '''
    await page.add_style_tag(content=no_motion_css)

    print("[HOOK] Optimizations applied")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def cleanup_hook(page, context, **kwargs):
    """Strip ads, overlays, scripts and styles from the DOM before extraction.

    Args:
        page: Page object in which the clean-up script is evaluated.
        context: Browser context (not used by this hook).
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print("[HOOK] Cleaning page")

    # Runs in the browser: remove elements matching the listed class
    # selectors, then remove every <script> and <style> element.
    cleanup_script = '''() => {
        const selectors = [
            '.ad', '.ads', '.advertisement',
            '.popup', '.modal', '.overlay',
            '.cookie-banner', '.newsletter'
        ];

        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => el.remove());
        });

        document.querySelectorAll('script, style').forEach(el => el.remove());
    }'''
    await page.evaluate(cleanup_script)

    print("[HOOK] Page cleaned")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Content Extraction Hooks ---
|
||||||
|
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
    """Pause for late-loading content and press a "Load More" control if present.

    Args:
        page: Page object used for waiting, querying and clicking.
        context: Browser context (not used by this hook).
        url: URL that was loaded; only used in the log line.
        response: Navigation response (not used by this hook).
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print(f"[HOOK] Waiting for dynamic content on {url}")

    # Fixed 2 s grace period for scripts that populate the page after load.
    await page.wait_for_timeout(2000)

    # Best effort: a missing or unclickable button must not fail the crawl.
    load_more_selector = '[class*="load-more"], button:has-text("Load More")'
    try:
        button = await page.query_selector(load_more_selector)
        if button:
            await button.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More'")
    # NOTE(review): the bare except is kept as-is; confirm that `Exception`
    # resolves where the serialized hook runs before narrowing it.
    except:
        pass

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def extract_metadata_hook(page, context, **kwargs):
    """Log the page's title and meta tags, then scroll to the bottom three times.

    Args:
        page: Page object used to evaluate scripts and to wait between scrolls.
        context: Browser context (not used by this hook).
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print("[HOOK] Extracting metadata")

    # getMeta looks a tag up by either its `name` or its `property` attribute
    # and yields null when no such tag exists.
    metadata_script = '''() => {
        const getMeta = (name) => {
            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return el ? el.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
        };
    }'''
    metadata = await page.evaluate(metadata_script)
    print(f"[HOOK] Metadata: {metadata}")

    # Infinite scroll: jump to the bottom, wait 1 s, and repeat.
    total_scrolls = 3
    for step in range(1, total_scrolls + 1):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll {step}/{total_scrolls}")

    return page
|
||||||
|
|
||||||
|
|
||||||
|
# --- Multi-URL Hooks ---
|
||||||
|
async def url_specific_hook(page, context, url, **kwargs):
    """Set an ``X-Type`` request header chosen from a substring of the URL.

    URLs containing ``html`` get ``X-Type: HTML``; otherwise URLs containing
    ``json`` get ``X-Type: JSON``; any other URL leaves the headers untouched.

    Args:
        page: Page object whose extra HTTP headers may be set.
        context: Browser context (not used by this hook).
        url: URL about to be visited.
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    print(f"[HOOK] Processing URL: {url}")

    # First matching marker wins, so 'html' takes precedence over 'json'.
    for marker, header_value in (('html', "HTML"), ('json', "JSON")):
        if marker in url:
            await page.set_extra_http_headers({"X-Type": header_value})
            break

    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def track_progress_hook(page, context, url, response, **kwargs):
    """Log the URL that was loaded together with its response status.

    Args:
        page: Page object; returned unchanged.
        context: Browser context (not used by this hook).
        url: URL that was loaded.
        response: Navigation response; when falsy the status is reported
            as ``'unknown'``.
        **kwargs: Any additional hook arguments (ignored).

    Returns:
        The same ``page`` object that was passed in.
    """
    if response:
        status = response.status
    else:
        status = 'unknown'

    print(f"[HOOK] Loaded {url} - Status: {status}")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Test Functions
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def test_all_hooks_comprehensive():
    """Crawl one page with a hook registered at each of the 8 hook points.

    Prints the resulting URL, success flag and HTML length.
    """
    separator = "=" * 70
    print(separator)
    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
    print(separator)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling with all 8 hooks...")

        # Plain function objects, keyed by the hook point they attach to.
        all_hooks = {
            "on_browser_created": browser_created_hook,
            "on_page_context_created": page_context_hook,
            "on_user_agent_updated": user_agent_hook,
            "before_goto": before_goto_hook,
            "after_goto": after_goto_hook,
            "on_execution_started": execution_started_hook,
            "before_retrieve_html": before_retrieve_hook,
            "before_return_html": before_return_hook
        }
        target_urls = ["https://httpbin.org/html"]

        result = await client.crawl(target_urls, hooks=all_hooks, hooks_timeout=30)

        print("\n✅ Success!")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        html_length = len(result.html)
        print(f" HTML: {html_length} chars")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_authentication_workflow():
    """Crawl httpbin's basic-auth endpoint using the two authentication hooks.

    Reports whether the response body indicates a successful login.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("Test 2: Authentication Workflow (Docker Client)")
    print(separator)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting authentication...")

        auth_hooks = {
            "on_page_context_created": auth_context_hook,
            "before_goto": auth_headers_hook
        }
        protected_url = "https://httpbin.org/basic-auth/user/passwd"

        result = await client.crawl([protected_url], hooks=auth_hooks, hooks_timeout=15)

        print("\n✅ Authentication completed")

        if not result.success:
            print(f" ❌ Failed: {result.error_message}")
        elif '"authenticated"' in result.html and 'true' in result.html:
            # Both the key and a literal `true` appear in the returned HTML.
            print(" ✅ Basic auth successful!")
        else:
            print(" ⚠️ Auth status unclear")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_performance_optimization():
    """Crawl one page with the resource-blocking and clean-up hooks enabled.

    Prints the size of the resulting HTML.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("Test 3: Performance Optimization (Docker Client)")
    print(separator)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting performance hooks...")

        perf_hooks = {
            "on_page_context_created": performance_hook,
            "before_retrieve_html": cleanup_hook
        }
        target_urls = ["https://httpbin.org/html"]

        result = await client.crawl(target_urls, hooks=perf_hooks, hooks_timeout=10)

        print("\n✅ Optimization completed")
        html_length = len(result.html)
        print(f" HTML size: {html_length:,} chars")
        print(" Resources blocked, ads removed")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_content_extraction():
    """Crawl one site with the dynamic-content and metadata hooks enabled.

    Prints the resulting URL, success flag and metadata.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("Test 4: Content Extraction (Docker Client)")
    print(separator)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting extraction hooks...")

        extraction_hooks = {
            "after_goto": wait_dynamic_content_hook,
            "before_retrieve_html": extract_metadata_hook
        }
        target_urls = ["https://www.kidocode.com/"]

        result = await client.crawl(target_urls, hooks=extraction_hooks, hooks_timeout=20)

        print("\n✅ Extraction completed")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" Metadata: {result.metadata}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_multi_url_crawl():
    """Crawl three httpbin URLs in one call with per-URL hooks attached.

    Prints one status line per returned result.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("Test 5: Multi-URL Crawl (Docker Client)")
    print(separator)

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling multiple URLs...")

        multi_hooks = {
            "before_goto": url_specific_hook,
            "after_goto": track_progress_hook
        }
        target_urls = [
            "https://httpbin.org/html",
            "https://httpbin.org/json",
            "https://httpbin.org/xml"
        ]

        results = await client.crawl(target_urls, hooks=multi_hooks, hooks_timeout=15)

        print("\n✅ Multi-URL crawl completed")
        print(f"\n Crawled {len(results)} URLs:")

        # Results are numbered from 1 in the order they were returned.
        for position, page_result in enumerate(results, 1):
            if page_result.success:
                marker = "✅"
            else:
                marker = "❌"
            print(f" {marker} {position}. {page_result.url}")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_reusable_hook_library():
    """Crawl one site using hooks taken from a small local hook library class.

    Prints the success flag of the crawl.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("Test 6: Reusable Hook Library (Docker Client)")
    print(separator)

    # A library of reusable hooks, grouped as static methods on one class.
    class HookLibrary:
        @staticmethod
        async def block_images(page, context, **kwargs):
            """Abort requests for common image file types."""
            image_pattern = "**/*.{png,jpg,jpeg,gif}"
            await context.route(image_pattern, lambda route: route.abort())
            print("[LIBRARY] Images blocked")
            return page

        @staticmethod
        async def block_analytics(page, context, **kwargs):
            """Abort requests to analytics paths and to google-analytics.com."""
            for pattern in ("**/analytics/*", "**/google-analytics.com/*"):
                await context.route(pattern, lambda route: route.abort())
            print("[LIBRARY] Analytics blocked")
            return page

        @staticmethod
        async def scroll_infinite(page, context, **kwargs):
            """Scroll to the bottom up to five times, stopping once the height is stable."""
            for _ in range(5):
                height_before = await page.evaluate("document.body.scrollHeight")
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                await page.wait_for_timeout(1000)
                height_after = await page.evaluate("document.body.scrollHeight")
                # No growth after scrolling means nothing more was loaded.
                if height_after == height_before:
                    break
            print("[LIBRARY] Infinite scroll complete")
            return page

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nUsing hook library...")

        library_hooks = {
            "on_page_context_created": HookLibrary.block_images,
            "before_retrieve_html": HookLibrary.scroll_infinite
        }
        target_urls = ["https://www.kidocode.com/"]

        result = await client.crawl(target_urls, hooks=library_hooks, hooks_timeout=20)

        print("\n✅ Library hooks completed")
        print(f" Success: {result.success}")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Main
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def main():
    """Run every Docker client hook example in sequence.

    A failing example is reported with its traceback and does not stop the
    remaining examples from running.
    """
    import traceback

    separator = "=" * 70
    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
    print("Using Python function objects with automatic conversion")
    print(separator)

    # (display name, coroutine function) pairs, run in this order.
    tests = (
        ("All Hooks Demo", test_all_hooks_comprehensive),
        ("Authentication", test_authentication_workflow),
        ("Performance", test_performance_optimization),
        ("Extraction", test_content_extraction),
        ("Multi-URL", test_multi_url_crawl),
        ("Hook Library", test_reusable_hook_library),
    )
    total = len(tests)

    for number, (name, test_func) in enumerate(tests, 1):
        try:
            await test_func()
            print(f"\n✅ Test {number}/{total}: {name} completed\n")
        except Exception as e:
            print(f"\n❌ Test {number}/{total}: {name} failed: {e}\n")
            traceback.print_exc()

    print(separator)
    print("🎉 All Docker client hook examples completed!")
    print("\n💡 Key Benefits of Function-Based Hooks:")
    benefit_lines = (
        " • Write as regular Python functions",
        " • Full IDE support (autocomplete, types)",
        " • Automatic conversion to API format",
        " • Reusable across projects",
        " • Clean, readable code",
        " • Easy to test and debug",
    )
    for line in benefit_lines:
        print(line)
    print(separator)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run all hook examples via main() when executed directly.
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
Binary file not shown.
@@ -20,17 +20,26 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
|||||||
|
|
||||||
## Latest Release
|
## Latest Release
|
||||||
|
|
||||||
|
### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md)
|
||||||
|
*September 29, 2025*
|
||||||
|
|
||||||
|
Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues.
|
||||||
|
|
||||||
|
Key highlights:
|
||||||
|
- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization
|
||||||
|
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||||
|
- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications
|
||||||
|
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||||
|
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||||
|
|
||||||
|
[Read full release notes →](../blog/release-v0.7.5.md)
|
||||||
|
|
||||||
|
## Recent Releases
|
||||||
|
|
||||||
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
||||||
*August 17, 2025*
|
*August 17, 2025*
|
||||||
|
|
||||||
Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads.
|
Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes.
|
||||||
|
|
||||||
Key highlights:
|
|
||||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
|
||||||
- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks
|
|
||||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
|
||||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
|
||||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
|
||||||
|
|
||||||
[Read full release notes →](../blog/release-v0.7.4.md)
|
[Read full release notes →](../blog/release-v0.7.4.md)
|
||||||
|
|
||||||
|
|||||||
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
|||||||
|
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||||
|
|
||||||
|
*September 29, 2025 • 8 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||||
|
|
||||||
|
## 🎯 What's New at a Glance
|
||||||
|
|
||||||
|
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||||
|
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||||
|
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||||
|
- **HTTPS Preservation**: Secure internal link handling
|
||||||
|
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||||
|
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||||
|
|
||||||
|
## 🔧 Docker Hooks System: Pipeline Customization
|
||||||
|
|
||||||
|
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||||
|
|
||||||
|
### Real Example: Authentication & Performance
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Real working hooks for httpbin.org
|
||||||
|
hooks_config = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Setting up page context")
|
||||||
|
# Block images to speed up crawling
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
print("Hook: Images blocked")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("Hook: Before retrieving HTML")
|
||||||
|
# Scroll to bottom to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
print("Hook: Scrolled to bottom")
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
|
||||||
|
"before_goto": """
|
||||||
|
async def hook(page, context, url, **kwargs):
|
||||||
|
print(f"Hook: About to navigate to {url}")
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
# Test with Docker API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
result = response.json()
|
||||||
|
|
||||||
|
if result.get('success'):
|
||||||
|
print("✅ Hooks executed successfully!")
|
||||||
|
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available Hook Points:**
|
||||||
|
- `on_browser_created`: Browser setup
|
||||||
|
- `on_page_context_created`: Page context configuration
|
||||||
|
- `before_goto`: Pre-navigation setup
|
||||||
|
- `after_goto`: Post-navigation processing
|
||||||
|
- `on_user_agent_updated`: User agent changes
|
||||||
|
- `on_execution_started`: Crawl initialization
|
||||||
|
- `before_retrieve_html`: Pre-extraction processing
|
||||||
|
- `before_return_html`: Final HTML processing
|
||||||
|
|
||||||
|
### Function-Based Hooks API
|
||||||
|
|
||||||
|
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||||
|
|
||||||
|
**Option 1: Using the `hooks_to_string()` Utility**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions (with full IDE support!)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers"""
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5',
|
||||||
|
'X-Custom-Header': 'my-value'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert functions to strings
|
||||||
|
hooks_code = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_goto": before_goto
|
||||||
|
})
|
||||||
|
|
||||||
|
# Use with REST API
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {"code": hooks_code, "timeout": 30}
|
||||||
|
}
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Define hooks as functions (same as above)
|
||||||
|
async def on_page_context_created(page, context, **kwargs):
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html(page, context, **kwargs):
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use Docker client - conversion happens automatically!
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created,
|
||||||
|
"before_retrieve_html": before_retrieve_html
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits of Function-Based Hooks:**
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||||
|
- ✅ Type checking and linting
|
||||||
|
- ✅ Easier to test and debug
|
||||||
|
- ✅ Reusable across projects
|
||||||
|
- ✅ Automatic conversion in Docker client
|
||||||
|
- ✅ No breaking changes - string hooks still work!
|
||||||
|
|
||||||
|
## 🤖 Enhanced LLM Integration
|
||||||
|
|
||||||
|
Enhanced LLM integration with custom providers, temperature control, and base URL configuration.
|
||||||
|
|
||||||
|
### Multi-Provider Support
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
# Test with different providers
|
||||||
|
async def test_llm_providers():
|
||||||
|
# Gemini provider with custom temperature
|
||||||
|
openai_strategy = LLMExtractionStrategy(
|
||||||
|
provider="gemini/gemini-2.5-flash-lite",
|
||||||
|
api_token="your-api-token",
|
||||||
|
temperature=0.7, # New in v0.7.5
|
||||||
|
instruction="Summarize this page in one sentence"
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://example.com",
|
||||||
|
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print("✅ LLM extraction completed")
|
||||||
|
print(result.extracted_content)
|
||||||
|
|
||||||
|
# Docker API with enhanced LLM config
|
||||||
|
llm_payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": "gemini/gemini-2.5-flash-lite",
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||||
|
```
|
||||||
|
|
||||||
|
**New Features:**
|
||||||
|
- Custom `temperature` parameter for creativity control
|
||||||
|
- `base_url` for custom API endpoints
|
||||||
|
- Multi-provider environment variable support
|
||||||
|
- Docker API integration
|
||||||
|
|
||||||
|
## 🔒 HTTPS Preservation
|
||||||
|
|
||||||
|
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||||
|
|
||||||
|
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||||
|
|
||||||
|
async def test_https_preservation():
|
||||||
|
# Enable HTTPS preservation
|
||||||
|
url_filter = URLPatternFilter(
|
||||||
|
patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
||||||
|
)
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
exclude_external_links=True,
|
||||||
|
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2,
|
||||||
|
max_pages=5,
|
||||||
|
filter_chain=FilterChain([url_filter])
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
async for result in await crawler.arun(
|
||||||
|
url="https://quotes.toscrape.com",
|
||||||
|
config=config
|
||||||
|
):
|
||||||
|
# All internal links maintain HTTPS
|
||||||
|
internal_links = [link['href'] for link in result.links['internal']]
|
||||||
|
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||||
|
|
||||||
|
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||||
|
for link in https_links[:3]:
|
||||||
|
print(f" → {link}")
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🛠️ Bug Fixes and Improvements
|
||||||
|
|
||||||
|
### Major Fixes
|
||||||
|
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||||
|
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||||
|
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||||
|
- **Memory Management**: Fixed leaks in long-running sessions
|
||||||
|
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||||
|
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||||
|
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||||
|
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||||
|
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||||
|
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||||
|
|
||||||
|
### Community-Reported Issues Fixed
|
||||||
|
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||||
|
- Fixed browser configuration reference errors
|
||||||
|
- Resolved dependency conflicts with cssselect
|
||||||
|
- Improved error messaging for failed authentications
|
||||||
|
- Enhanced compatibility with various proxy configurations
|
||||||
|
- Fixed edge cases in URL normalization
|
||||||
|
|
||||||
|
### Configuration Updates
|
||||||
|
```python
|
||||||
|
# Old proxy config (deprecated)
|
||||||
|
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||||
|
|
||||||
|
# New enhanced proxy config
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
proxy_config={
|
||||||
|
"server": "http://proxy:8080",
|
||||||
|
"username": "optional-user",
|
||||||
|
"password": "optional-pass"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔄 Breaking Changes
|
||||||
|
|
||||||
|
1. **Python 3.10+ Required**: Upgrade from Python 3.9
|
||||||
|
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||||
|
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||||
|
|
||||||
|
## 🚀 Get Started
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Install latest version
|
||||||
|
pip install crawl4ai==0.7.5
|
||||||
|
|
||||||
|
# Docker deployment
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
**Try the Demo:**
|
||||||
|
```bash
|
||||||
|
# Run working examples
|
||||||
|
python docs/releases_review/demo_v0.7.5.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**Resources:**
|
||||||
|
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||||
|
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
|
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
5196
docs/md_v2/complete-sdk-reference.md
Normal file
5196
docs/md_v2/complete-sdk-reference.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -6,18 +6,6 @@
|
|||||||
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
- [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
|
||||||
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
- [Option 2: Using Docker Compose](#option-2-using-docker-compose)
|
||||||
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
- [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
|
||||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
|
||||||
- [Using the API](#using-the-api)
|
|
||||||
- [Playground Interface](#playground-interface)
|
|
||||||
- [Python SDK](#python-sdk)
|
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
|
||||||
- [REST API Examples](#rest-api-examples)
|
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
|
||||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
|
||||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
|
||||||
- [Library Context Endpoint](#library-context-endpoint)
|
|
||||||
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
||||||
- [What is MCP?](#what-is-mcp)
|
- [What is MCP?](#what-is-mcp)
|
||||||
- [Connecting via MCP](#connecting-via-mcp)
|
- [Connecting via MCP](#connecting-via-mcp)
|
||||||
@@ -25,9 +13,28 @@
|
|||||||
- [Available MCP Tools](#available-mcp-tools)
|
- [Available MCP Tools](#available-mcp-tools)
|
||||||
- [Testing MCP Connections](#testing-mcp-connections)
|
- [Testing MCP Connections](#testing-mcp-connections)
|
||||||
- [MCP Schemas](#mcp-schemas)
|
- [MCP Schemas](#mcp-schemas)
|
||||||
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
|
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||||
|
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||||
|
- [User-Provided Hooks API](#user-provided-hooks-api)
|
||||||
|
- [Hook Information Endpoint](#hook-information-endpoint)
|
||||||
|
- [Available Hook Points](#available-hook-points)
|
||||||
|
- [Using Hooks in Requests](#using-hooks-in-requests)
|
||||||
|
- [Hook Examples with Real URLs](#hook-examples-with-real-urls)
|
||||||
|
- [Security Best Practices](#security-best-practices)
|
||||||
|
- [Hook Response Information](#hook-response-information)
|
||||||
|
- [Error Handling](#error-handling)
|
||||||
|
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
||||||
|
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||||
|
- [Using the API](#using-the-api)
|
||||||
|
- [Playground Interface](#playground-interface)
|
||||||
|
- [Python SDK](#python-sdk)
|
||||||
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [LLM Configuration Examples](#llm-configuration-examples)
|
||||||
- [Metrics & Monitoring](#metrics--monitoring)
|
- [Metrics & Monitoring](#metrics--monitoring)
|
||||||
- [Deployment Scenarios](#deployment-scenarios)
|
|
||||||
- [Complete Examples](#complete-examples)
|
|
||||||
- [Server Configuration](#server-configuration)
|
- [Server Configuration](#server-configuration)
|
||||||
- [Understanding config.yml](#understanding-configyml)
|
- [Understanding config.yml](#understanding-configyml)
|
||||||
- [JWT Authentication](#jwt-authentication)
|
- [JWT Authentication](#jwt-authentication)
|
||||||
@@ -832,6 +839,275 @@ else:
|
|||||||
|
|
||||||
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
> 💡 **Remember**: Always test your hooks on safe, known websites first before using them on production sites. Never crawl sites that you don't have permission to access or that might be malicious.
|
||||||
|
|
||||||
|
### Hooks Utility: Function-Based Approach (Python)
|
||||||
|
|
||||||
|
For Python developers, Crawl4AI provides a more convenient way to work with hooks using the `hooks_to_string()` utility function and Docker client integration.
|
||||||
|
|
||||||
|
#### Why Use Function-Based Hooks?
|
||||||
|
|
||||||
|
**String-Based Approach (shown above)**:
|
||||||
|
```python
|
||||||
|
hooks_code = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Function-Based Approach (recommended for Python)**:
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={"on_page_context_created": my_hook}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Benefits**:
|
||||||
|
- ✅ Write hooks as regular Python functions
|
||||||
|
- ✅ Full IDE support (autocomplete, syntax highlighting, type checking)
|
||||||
|
- ✅ Easy to test and debug
|
||||||
|
- ✅ Reusable hook libraries
|
||||||
|
- ✅ Automatic conversion to API format
|
||||||
|
|
||||||
|
#### Using the Hooks Utility
|
||||||
|
|
||||||
|
The `hooks_to_string()` utility converts Python function objects to the string format required by the API:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
|
||||||
|
# Define your hooks as functions
|
||||||
|
async def setup_hook(page, context, **kwargs):
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "session",
|
||||||
|
"value": "token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Convert to string format
|
||||||
|
hooks_dict = {
|
||||||
|
"on_page_context_created": setup_hook,
|
||||||
|
"before_retrieve_html": scroll_hook
|
||||||
|
}
|
||||||
|
hooks_string = hooks_to_string(hooks_dict)
|
||||||
|
|
||||||
|
# Now use with REST API or Docker client
|
||||||
|
# hooks_string contains the string representations
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Docker Client with Automatic Conversion
|
||||||
|
|
||||||
|
The Docker client automatically detects and converts function objects:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def auth_hook(page, context, **kwargs):
|
||||||
|
"""Add authentication cookies"""
|
||||||
|
await context.add_cookies([{
|
||||||
|
"name": "auth_token",
|
||||||
|
"value": "your_token",
|
||||||
|
"domain": ".example.com"
|
||||||
|
}])
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def performance_hook(page, context, **kwargs):
|
||||||
|
"""Block unnecessary resources"""
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
await context.route("**/analytics/*", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||||
|
# Pass functions directly - automatic conversion!
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://example.com"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": performance_hook,
|
||||||
|
"before_goto": auth_hook
|
||||||
|
},
|
||||||
|
hooks_timeout=30 # Optional timeout in seconds (1-120)
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"Success: {result.success}")
|
||||||
|
print(f"HTML: {len(result.html)} chars")
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Creating Reusable Hook Libraries
|
||||||
|
|
||||||
|
Build collections of reusable hooks:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# hooks_library.py
|
||||||
|
class CrawlHooks:
|
||||||
|
"""Reusable hook collection for common crawling tasks"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_images(page, context, **kwargs):
|
||||||
|
"""Block all images to speed up crawling"""
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def block_analytics(page, context, **kwargs):
|
||||||
|
"""Block analytics and tracking scripts"""
|
||||||
|
tracking_domains = [
|
||||||
|
"**/google-analytics.com/*",
|
||||||
|
"**/googletagmanager.com/*",
|
||||||
|
"**/facebook.com/tr/*",
|
||||||
|
"**/doubleclick.net/*"
|
||||||
|
]
|
||||||
|
for domain in tracking_domains:
|
||||||
|
await context.route(domain, lambda r: r.abort())
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def scroll_infinite(page, context, **kwargs):
|
||||||
|
"""Handle infinite scroll to load more content"""
|
||||||
|
previous_height = 0
|
||||||
|
for i in range(5): # Max 5 scrolls
|
||||||
|
current_height = await page.evaluate("document.body.scrollHeight")
|
||||||
|
if current_height == previous_height:
|
||||||
|
break
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
previous_height = current_height
|
||||||
|
return page
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
async def wait_for_dynamic_content(page, context, url, response, **kwargs):
|
||||||
|
"""Wait for dynamic content to load"""
|
||||||
|
await page.wait_for_timeout(2000)
|
||||||
|
try:
|
||||||
|
# Click "Load More" if present
|
||||||
|
load_more = await page.query_selector('[class*="load-more"]')
|
||||||
|
if load_more:
|
||||||
|
await load_more.click()
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use in your application
|
||||||
|
from hooks_library import CrawlHooks
|
||||||
|
from crawl4ai import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
async def crawl_with_optimizations(url):
|
||||||
|
async with Crawl4aiDockerClient() as client:
|
||||||
|
result = await client.crawl(
|
||||||
|
[url],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": CrawlHooks.block_images,
|
||||||
|
"before_retrieve_html": CrawlHooks.scroll_infinite
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return result
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Choosing the Right Approach
|
||||||
|
|
||||||
|
| Approach | Best For | IDE Support | Language |
|
||||||
|
|----------|----------|-------------|----------|
|
||||||
|
| **String-based** | Non-Python clients, REST APIs, other languages | ❌ None | Any |
|
||||||
|
| **Function-based** | Python applications, local development | ✅ Full | Python only |
|
||||||
|
| **Docker Client** | Python apps with automatic conversion | ✅ Full | Python only |
|
||||||
|
|
||||||
|
**Recommendation**:
|
||||||
|
- **Python applications**: Use Docker client with function objects (easiest)
|
||||||
|
- **Non-Python or REST API**: Use string-based hooks (most flexible)
|
||||||
|
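- **Manual control**: Use the `hooks_to_string()` utility when you want to convert function hooks yourself and pass the resulting strings to the REST API or the Docker client (middle ground)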
|
||||||
|
|
||||||
|
#### Complete Example with Function Hooks
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def setup_environment(page, context, **kwargs):
|
||||||
|
"""Setup crawling environment"""
|
||||||
|
# Set viewport
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
|
||||||
|
# Block resources for speed
|
||||||
|
await context.route("**/*.{png,jpg,gif}", lambda r: r.abort())
|
||||||
|
|
||||||
|
# Add custom headers
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
"Accept-Language": "en-US",
|
||||||
|
"X-Custom-Header": "Crawl4AI"
|
||||||
|
})
|
||||||
|
|
||||||
|
print("[HOOK] Environment configured")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def extract_content(page, context, **kwargs):
|
||||||
|
"""Extract and prepare content"""
|
||||||
|
# Scroll to load lazy content
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
|
||||||
|
# Extract metadata
|
||||||
|
metadata = await page.evaluate('''() => ({
|
||||||
|
title: document.title,
|
||||||
|
links: document.links.length,
|
||||||
|
images: document.images.length
|
||||||
|
})''')
|
||||||
|
|
||||||
|
print(f"[HOOK] Page metadata: {metadata}")
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||||
|
# Configure crawl
|
||||||
|
browser_config = BrowserConfig(headless=True)
|
||||||
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
# Crawl with hooks
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=browser_config,
|
||||||
|
crawler_config=crawler_config,
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": setup_environment,
|
||||||
|
"before_retrieve_html": extract_content
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.success:
|
||||||
|
print(f"✅ Crawl successful!")
|
||||||
|
print(f" URL: {result.url}")
|
||||||
|
print(f" HTML: {len(result.html)} chars")
|
||||||
|
print(f" Markdown: {len(result.markdown)} chars")
|
||||||
|
else:
|
||||||
|
print(f"❌ Crawl failed: {result.error_message}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Additional Resources
|
||||||
|
|
||||||
|
- **Comprehensive Examples**: See `/docs/examples/hooks_docker_client_example.py` for Python function-based examples
|
||||||
|
- **REST API Examples**: See `/docs/examples/hooks_rest_api_example.py` for string-based examples
|
||||||
|
- **Comparison Guide**: See `/docs/examples/README_HOOKS.md` for detailed comparison
|
||||||
|
- **Utility Documentation**: See `/docs/hooks-utility-guide.md` for complete guide
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Dockerfile Parameters
|
## Dockerfile Parameters
|
||||||
@@ -892,10 +1168,12 @@ This is the easiest way to translate Python configuration to JSON requests when
|
|||||||
|
|
||||||
Install the SDK: `pip install crawl4ai`
|
Install the SDK: `pip install crawl4ai`
|
||||||
|
|
||||||
|
The Python SDK provides a convenient way to interact with the Docker API, including **automatic hook conversion** when using function objects.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# Point to the correct server port
|
# Point to the correct server port
|
||||||
@@ -907,23 +1185,22 @@ async def main():
|
|||||||
print("--- Running Non-Streaming Crawl ---")
|
print("--- Running Non-Streaming Crawl ---")
|
||||||
results = await client.crawl(
|
results = await client.crawl(
|
||||||
["https://httpbin.org/html"],
|
["https://httpbin.org/html"],
|
||||||
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
)
|
)
|
||||||
if results: # client.crawl returns None on failure
|
if results:
|
||||||
print(f"Non-streaming results success: {results.success}")
|
print(f"Non-streaming results success: {results.success}")
|
||||||
if results.success:
|
if results.success:
|
||||||
for result in results: # Iterate through the CrawlResultContainer
|
for result in results:
|
||||||
print(f"URL: {result.url}, Success: {result.success}")
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
else:
|
else:
|
||||||
print("Non-streaming crawl failed.")
|
print("Non-streaming crawl failed.")
|
||||||
|
|
||||||
|
|
||||||
# Example Streaming crawl
|
# Example Streaming crawl
|
||||||
print("\n--- Running Streaming Crawl ---")
|
print("\n--- Running Streaming Crawl ---")
|
||||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||||
try:
|
try:
|
||||||
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
async for result in await client.crawl(
|
||||||
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||||
browser_config=BrowserConfig(headless=True),
|
browser_config=BrowserConfig(headless=True),
|
||||||
crawler_config=stream_config
|
crawler_config=stream_config
|
||||||
@@ -932,17 +1209,56 @@ async def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Streaming crawl failed: {e}")
|
print(f"Streaming crawl failed: {e}")
|
||||||
|
|
||||||
|
# Example with hooks (Python function objects)
|
||||||
|
print("\n--- Crawl with Hooks ---")
|
||||||
|
|
||||||
|
async def my_hook(page, context, **kwargs):
|
||||||
|
"""Custom hook to optimize performance"""
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
await context.route("**/*.{png,jpg}", lambda r: r.abort())
|
||||||
|
print("[HOOK] Page optimized")
|
||||||
|
return page
|
||||||
|
|
||||||
|
result = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=BrowserConfig(headless=True),
|
||||||
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
|
||||||
|
hooks={"on_page_context_created": my_hook}, # Pass function directly!
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
print(f"Crawl with hooks success: {result.success}")
|
||||||
|
|
||||||
# Example Get schema
|
# Example Get schema
|
||||||
print("\n--- Getting Schema ---")
|
print("\n--- Getting Schema ---")
|
||||||
schema = await client.get_schema()
|
schema = await client.get_schema()
|
||||||
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
print(f"Schema received: {bool(schema)}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
#### SDK Parameters
|
||||||
|
|
||||||
|
The Docker client supports the following parameters:
|
||||||
|
|
||||||
|
**Client Initialization**:
|
||||||
|
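- `base_url` (str): URL of the Docker server (default: `http://localhost:8000`; the examples in this guide pass `http://localhost:11235` explicitly, so set it to match the port your server listens on)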
|
||||||
|
- `timeout` (float): Request timeout in seconds (default: 30.0)
|
||||||
|
- `verify_ssl` (bool): Verify SSL certificates (default: True)
|
||||||
|
- `verbose` (bool): Enable verbose logging (default: True)
|
||||||
|
- `log_file` (Optional[str]): Path to log file (default: None)
|
||||||
|
|
||||||
|
**crawl() Method**:
|
||||||
|
- `urls` (List[str]): List of URLs to crawl
|
||||||
|
- `browser_config` (Optional[BrowserConfig]): Browser configuration
|
||||||
|
- `crawler_config` (Optional[CrawlerRunConfig]): Crawler configuration
|
||||||
|
- `hooks` (Optional[Dict]): Hook functions or strings - **automatically converts function objects!**
|
||||||
|
- `hooks_timeout` (int): Timeout for each hook execution in seconds (default: 30, allowed range: 1-120)
|
||||||
|
|
||||||
|
**Returns**:
|
||||||
|
- Single URL: `CrawlResult` object
|
||||||
|
- Multiple URLs: `List[CrawlResult]`
|
||||||
|
- Streaming: `AsyncGenerator[CrawlResult]`
|
||||||
|
|
||||||
### Second Approach: Direct API Calls
|
### Second Approach: Direct API Calls
|
||||||
|
|
||||||
@@ -1352,19 +1668,40 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK with **automatic hook conversion**
|
||||||
|
- **Working with hooks** - both string-based (REST API) and function-based (Python SDK)
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|
||||||
The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
|
### Key Features
|
||||||
|
|
||||||
For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
**Hooks Support**: Crawl4AI offers two approaches for working with hooks:
|
||||||
|
- **String-based** (REST API): Works with any language, requires manual string formatting
|
||||||
|
- **Function-based** (Python SDK): Write hooks as regular Python functions with full IDE support and automatic conversion
|
||||||
|
|
||||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
**Playground Interface**: The built-in playground at `http://localhost:11235/playground` makes it easy to test configurations and generate corresponding JSON for API requests.
|
||||||
|
|
||||||
|
**MCP Integration**: For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
|
||||||
|
|
||||||
|
### Next Steps
|
||||||
|
|
||||||
|
1. **Explore Examples**: Check out the comprehensive examples in:
|
||||||
|
- `/docs/examples/hooks_docker_client_example.py` - Python function-based hooks
|
||||||
|
- `/docs/examples/hooks_rest_api_example.py` - REST API string-based hooks
|
||||||
|
- `/docs/examples/README_HOOKS.md` - Comparison and guide
|
||||||
|
|
||||||
|
2. **Read Documentation**:
|
||||||
|
- `/docs/hooks-utility-guide.md` - Complete hooks utility guide
|
||||||
|
- API documentation for detailed configuration options
|
||||||
|
|
||||||
|
3. **Join the Community**:
|
||||||
|
- GitHub: Report issues and contribute
|
||||||
|
- Discord: Get help and share your experiences
|
||||||
|
- Documentation: Comprehensive guides and tutorials
|
||||||
|
|
||||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||||
|
|
||||||
|
|||||||
@@ -59,6 +59,27 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
|
|
||||||
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
||||||
|
|
||||||
|
## 🆕 AI Assistant Skill Now Available!
|
||||||
|
|
||||||
|
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; margin: 20px 0; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
|
||||||
|
<h3 style="color: white; margin: 0 0 10px 0;">🤖 Crawl4AI Skill for Claude & AI Assistants</h3>
|
||||||
|
<p style="color: white; margin: 10px 0;">Supercharge your AI coding assistant with complete Crawl4AI knowledge! Download our comprehensive skill package that includes:</p>
|
||||||
|
<ul style="color: white; margin: 10px 0;">
|
||||||
|
<li>📚 Complete SDK reference (23K+ words)</li>
|
||||||
|
<li>🚀 Ready-to-use extraction scripts</li>
|
||||||
|
<li>⚡ Schema generation for efficient scraping</li>
|
||||||
|
<li>🔧 Version 0.7.4 compatible</li>
|
||||||
|
</ul>
|
||||||
|
<div style="text-align: center; margin-top: 15px;">
|
||||||
|
<a href="assets/crawl4ai-skill.zip" download style="background: white; color: #667eea; padding: 12px 30px; border-radius: 5px; text-decoration: none; font-weight: bold; display: inline-block; transition: transform 0.2s;">
|
||||||
|
📦 Download Skill Package
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<p style="color: white; margin: 15px 0 0 0; font-size: 0.9em; text-align: center;">
|
||||||
|
Works with Claude, Cursor, Windsurf, and other AI coding assistants. Import the .zip file into your AI assistant's skill/knowledge system.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
## 🎯 New: Adaptive Web Crawling
|
## 🎯 New: Adaptive Web Crawling
|
||||||
|
|
||||||
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
||||||
|
|||||||
338
docs/releases_review/demo_v0.7.5.py
Normal file
338
docs/releases_review/demo_v0.7.5.py
Normal file
@@ -0,0 +1,338 @@
|
|||||||
|
"""
|
||||||
|
🚀 Crawl4AI v0.7.5 Release Demo - Working Examples
|
||||||
|
==================================================
|
||||||
|
This demo showcases key features introduced in v0.7.5 with real, executable examples.
|
||||||
|
|
||||||
|
Featured Demos:
|
||||||
|
1. ✅ Docker Hooks System - Real API calls with custom hooks (string & function-based)
|
||||||
|
2. ✅ Enhanced LLM Integration - Working LLM configurations
|
||||||
|
3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- crawl4ai v0.7.5 installed
|
||||||
|
- Docker running with crawl4ai image (optional; only needed for the Docker demos)
|
||||||
|
- Valid API keys for LLM demos (optional)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig,
|
||||||
|
CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy,
|
||||||
|
hooks_to_string)
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title: str, description: str = ""):
|
||||||
|
"""Print a section header"""
|
||||||
|
print(f"\n{'=' * 60}")
|
||||||
|
print(f"{title}")
|
||||||
|
if description:
|
||||||
|
print(f"{description}")
|
||||||
|
print(f"{'=' * 60}\n")
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_1_docker_hooks_system():
|
||||||
|
"""Demo 1: Docker Hooks System - Real API calls with custom hooks"""
|
||||||
|
print_section(
|
||||||
|
"Demo 1: Docker Hooks System",
|
||||||
|
"Testing both string-based and function-based hooks (NEW in v0.7.5!)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check Docker service availability
|
||||||
|
def check_docker_service():
|
||||||
|
try:
|
||||||
|
response = requests.get("http://localhost:11235/", timeout=3)
|
||||||
|
return response.status_code == 200
|
||||||
|
except:
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("Checking Docker service...")
|
||||||
|
docker_running = check_docker_service()
|
||||||
|
|
||||||
|
if not docker_running:
|
||||||
|
print("⚠️ Docker service not running on localhost:11235")
|
||||||
|
print("To test Docker hooks:")
|
||||||
|
print("1. Run: docker run -p 11235:11235 unclecode/crawl4ai:latest")
|
||||||
|
print("2. Wait for service to start")
|
||||||
|
print("3. Re-run this demo\n")
|
||||||
|
return
|
||||||
|
|
||||||
|
print("✓ Docker service detected!")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PART 1: Traditional String-Based Hooks (Works with REST API)
|
||||||
|
# ============================================================================
|
||||||
|
print("\n" + "─" * 60)
|
||||||
|
print("Part 1: String-Based Hooks (REST API)")
|
||||||
|
print("─" * 60)
|
||||||
|
|
||||||
|
hooks_config_string = {
|
||||||
|
"on_page_context_created": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("[String Hook] Setting up page context")
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
return page
|
||||||
|
""",
|
||||||
|
"before_retrieve_html": """
|
||||||
|
async def hook(page, context, **kwargs):
|
||||||
|
print("[String Hook] Before retrieving HTML")
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(1000)
|
||||||
|
return page
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"hooks": {
|
||||||
|
"code": hooks_config_string,
|
||||||
|
"timeout": 30
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print("🔧 Using string-based hooks for REST API...")
|
||||||
|
try:
|
||||||
|
start_time = time.time()
|
||||||
|
response = requests.post("http://localhost:11235/crawl", json=payload, timeout=60)
|
||||||
|
execution_time = time.time() - start_time
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
result = response.json()
|
||||||
|
print(f"✅ String-based hooks executed in {execution_time:.2f}s")
|
||||||
|
if result.get('results') and result['results'][0].get('success'):
|
||||||
|
html_length = len(result['results'][0].get('html', ''))
|
||||||
|
print(f" 📄 HTML length: {html_length} characters")
|
||||||
|
else:
|
||||||
|
print(f"❌ Request failed: {response.status_code}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Error: {str(e)}")
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# PART 2: NEW Function-Based Hooks with Docker Client (v0.7.5)
|
||||||
|
# ============================================================================
|
||||||
|
print("\n" + "─" * 60)
|
||||||
|
print("Part 2: Function-Based Hooks with Docker Client (✨ NEW!)")
|
||||||
|
print("─" * 60)
|
||||||
|
|
||||||
|
# Define hooks as regular Python functions
|
||||||
|
async def on_page_context_created_func(page, context, **kwargs):
|
||||||
|
"""Block images to speed up crawling"""
|
||||||
|
print("[Function Hook] Setting up page context")
|
||||||
|
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||||
|
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_goto_func(page, context, url, **kwargs):
|
||||||
|
"""Add custom headers before navigation"""
|
||||||
|
print(f"[Function Hook] About to navigate to {url}")
|
||||||
|
await page.set_extra_http_headers({
|
||||||
|
'X-Crawl4AI': 'v0.7.5-function-hooks',
|
||||||
|
'X-Test-Header': 'demo'
|
||||||
|
})
|
||||||
|
return page
|
||||||
|
|
||||||
|
async def before_retrieve_html_func(page, context, **kwargs):
|
||||||
|
"""Scroll to load lazy content"""
|
||||||
|
print("[Function Hook] Scrolling page for lazy-loaded content")
|
||||||
|
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||||
|
await page.wait_for_timeout(500)
|
||||||
|
await page.evaluate("window.scrollTo(0, 0)")
|
||||||
|
return page
|
||||||
|
|
||||||
|
# Use the hooks_to_string utility (can be used standalone)
|
||||||
|
print("\n📦 Converting functions to strings with hooks_to_string()...")
|
||||||
|
hooks_as_strings = hooks_to_string({
|
||||||
|
"on_page_context_created": on_page_context_created_func,
|
||||||
|
"before_goto": before_goto_func,
|
||||||
|
"before_retrieve_html": before_retrieve_html_func
|
||||||
|
})
|
||||||
|
print(f" ✓ Converted {len(hooks_as_strings)} hooks to string format")
|
||||||
|
|
||||||
|
# OR use Docker Client which does conversion automatically!
|
||||||
|
print("\n🐳 Using Docker Client with automatic conversion...")
|
||||||
|
try:
|
||||||
|
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||||
|
|
||||||
|
# Pass function objects directly - conversion happens automatically!
|
||||||
|
results = await client.crawl(
|
||||||
|
urls=["https://httpbin.org/html"],
|
||||||
|
hooks={
|
||||||
|
"on_page_context_created": on_page_context_created_func,
|
||||||
|
"before_goto": before_goto_func,
|
||||||
|
"before_retrieve_html": before_retrieve_html_func
|
||||||
|
},
|
||||||
|
hooks_timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if results and results.success:
|
||||||
|
print(f"✅ Function-based hooks executed successfully!")
|
||||||
|
print(f" 📄 HTML length: {len(results.html)} characters")
|
||||||
|
print(f" 🎯 URL: {results.url}")
|
||||||
|
else:
|
||||||
|
print("⚠️ Crawl completed but may have warnings")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Docker client error: {str(e)}")
|
||||||
|
|
||||||
|
# Show the benefits
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("✨ Benefits of Function-Based Hooks:")
|
||||||
|
print("=" * 60)
|
||||||
|
print("✓ Full IDE support (autocomplete, syntax highlighting)")
|
||||||
|
print("✓ Type checking and linting")
|
||||||
|
print("✓ Easier to test and debug")
|
||||||
|
print("✓ Reusable across projects")
|
||||||
|
print("✓ Automatic conversion in Docker client")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_2_enhanced_llm_integration():
    """Demo 2: Enhanced LLM Integration - Working LLM configurations

    Posts a single request to the local server's ``/md`` endpoint asking for an
    LLM-filtered summary of https://example.com with an explicitly chosen
    provider, then prints the response keys and the markdown length.

    Failures are reported on stdout and never raised, so the remaining demos
    can still run.
    """
    print_section(
        "Demo 2: Enhanced LLM Integration",
        "Testing custom LLM providers and configurations"
    )

    print("🤖 Testing Enhanced LLM Integration Features")

    provider = "gemini/gemini-2.5-flash-lite"
    payload = {
        "url": "https://example.com",
        "f": "llm",
        "q": "Summarize this page in one sentence.",
        "provider": provider,  # Explicitly set provider
        "temperature": 0.7,
    }

    try:
        response = requests.post(
            "http://localhost:11235/md",
            json=payload,
            timeout=60,
        )
        if response.status_code == 200:
            result = response.json()
            # ``or ''`` keeps len() safe when the key is present but null.
            markdown = result.get('markdown') or ''
            print(f"✓ Request successful with provider: {provider}")
            print(f" - Response keys: {list(result.keys())}")
            print(f" - Content length: {len(markdown)} characters")
            print(" - Note: Actual LLM call may fail without valid API key")
        else:
            print(f"❌ Request failed: {response.status_code}")
            print(f" - Response: {response.text[:500]}")

    except Exception as e:
        # Plain print() does not interpret Rich-style markup, so the message
        # uses the same "❌ Error:" form as the other demos in this file.
        print(f"❌ Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def demo_3_https_preservation():
    """Demo 3: HTTPS Preservation - Live crawling with HTTPS maintenance

    Runs a small streamed BFS deep crawl (max depth 2, max 5 pages) of
    https://quotes.toscrape.com with ``preserve_https_for_internal_links``
    enabled and prints the internal links reported for every crawled page.
    """
    print_section(
        "Demo 3: HTTPS Preservation",
        "Testing HTTPS preservation for internal links"
    )

    print("🔒 Testing HTTPS Preservation Feature")

    # Test with HTTPS preservation enabled
    print("\nTest 1: HTTPS Preservation ENABLED")

    # Raw string: the pattern value is unchanged, but "\/" and "\." are no
    # longer invalid string escapes (a SyntaxWarning on Python 3.12+).
    url_filter = URLPatternFilter(
        patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
    )
    config = CrawlerRunConfig(
        exclude_external_links=True,
        stream=True,
        verbose=False,
        preserve_https_for_internal_links=True,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            filter_chain=FilterChain([url_filter])
        )
    )

    test_url = "https://quotes.toscrape.com"
    print(f"🎯 Testing URL: {test_url}")

    async with AsyncWebCrawler() as crawler:
        # With stream=True the awaited call is iterated as an async stream.
        async for result in await crawler.arun(url=test_url, config=config):
            print("✓ HTTPS Preservation Test Completed")
            # A page without an "internal" entry simply prints no links.
            for entry in result.links.get('internal', []):
                print(f" → {entry['href']}")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run every demo in order, pausing for Enter between them.

    Prints a banner and a short environment check, then awaits each demo.
    Ctrl+C stops the sequence; any other exception is reported and the next
    demo is started. A closing summary is always printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🚀 Crawl4AI v0.7.5 Working Demo")
    print(banner)

    # Check system requirements
    print("🔍 System Requirements Check:")
    py_version = sys.version.split()[0]
    py_status = '✓' if sys.version_info >= (3, 10) else '❌ (3.10+ required)'
    print(f" - Python version: {py_version} {py_status}")

    try:
        import requests  # noqa: F401 - imported only to probe availability
    except ImportError:
        requests_status = "❌"
    else:
        requests_status = "✓"
    print(f" - Requests library: {requests_status}")

    print()

    demos = [
        ("Docker Hooks System", demo_1_docker_hooks_system),
        ("Enhanced LLM Integration", demo_2_enhanced_llm_integration),
        ("HTTPS Preservation", demo_3_https_preservation),
    ]
    total = len(demos)

    for index, (label, run_demo) in enumerate(demos, start=1):
        try:
            print(f"\n📍 Starting Demo {index}/{total}: {label}")
            await run_demo()

            # No pause after the final demo.
            if index < total:
                print(f"\n✨ Demo {index} complete! Press Enter for next demo...")
                input()

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as exc:
            print(f"❌ Demo {index} error: {exc}")
            print("Continuing to next demo...")

    closing_lines = (
        "\n" + banner,
        "🎉 Demo Complete!",
        banner,
        "You've experienced the power of Crawl4AI v0.7.5!",
        "",
        "Key Features Demonstrated:",
        "🔧 Docker Hooks - String-based & function-based (NEW!)",
        " • hooks_to_string() utility for function conversion",
        " • Docker client with automatic conversion",
        " • Full IDE support and type checking",
        "🤖 Enhanced LLM - Better AI integration",
        "🔒 HTTPS Preservation - Secure link handling",
        "",
        "Ready to build something amazing? 🚀",
        "",
        "📖 Docs: https://docs.crawl4ai.com/",
        "🐙 GitHub: https://github.com/unclecode/crawl4ai",
        banner,
    )
    for line in closing_lines:
        print(line)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: announce the demo, then drive main() to completion.
    print("🚀 Crawl4AI v0.7.5 Live Demo Starting...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C outside a demo's own handler ends the run quietly.
        print("\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.5!")
    except Exception as exc:
        print(f"\n❌ Demo error: {exc}")
        print("Make sure you have the required dependencies installed.")
|
||||||
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
@@ -0,0 +1,655 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
🚀 Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration
|
||||||
|
================================================================
|
||||||
|
|
||||||
|
This file demonstrates the NEW Docker Hooks System introduced in v0.7.5.
|
||||||
|
|
||||||
|
The Docker Hooks System is a completely NEW feature that provides pipeline
|
||||||
|
customization through user-provided Python functions. It offers three approaches:
|
||||||
|
|
||||||
|
1. String-based hooks for REST API
|
||||||
|
2. hooks_to_string() utility to convert functions
|
||||||
|
3. Docker Client with automatic conversion (most convenient)
|
||||||
|
|
||||||
|
All three approaches are part of this NEW v0.7.5 feature!
|
||||||
|
|
||||||
|
Perfect for video recording and demonstration purposes.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||||
|
- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
# Import Crawl4AI components
|
||||||
|
from crawl4ai import hooks_to_string
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
# Configuration
# Base URL of the Crawl4AI Docker service used by every demo in this file.
# Alternate endpoint (disabled): http://localhost:11234
DOCKER_URL = "http://localhost:11235"

# Pages the demos crawl. The demos address entries by index, so order matters.
# Disabled entry: https://httpbin.org/html
TEST_URLS = [
    "https://www.kidocode.com",
    "https://quotes.toscrape.com",
]
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title: str, description: str = ""):
    """Print a section header framed by two 70-character rules.

    Args:
        title: Heading shown on the first line inside the frame.
        description: Optional second line; omitted when empty.
    """
    rule = "=" * 70
    print("\n" + rule)
    print(f" {title}")
    if description:
        print(f" {description}")
    print(rule + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def check_docker_service() -> bool:
    """Check if Docker service is running.

    Sends a GET request to ``{DOCKER_URL}/health`` with a 3-second timeout.

    Returns:
        True when the endpoint answers with HTTP 200. False for any other
        status code or when the request itself fails (connection refused,
        timeout, invalid URL, ...).
    """
    try:
        response = requests.get(f"{DOCKER_URL}/health", timeout=3)
    except requests.exceptions.RequestException:
        # Only request-level failures mean "service not reachable".
        # KeyboardInterrupt and SystemExit are deliberately left to propagate.
        return False
    return response.status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# REUSABLE HOOK LIBRARY (NEW in v0.7.5)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def performance_optimization_hook(page, context, **kwargs):
    """
    Performance Hook: abort requests for images, ads and analytics.

    Registers one aborting route per URL pattern on ``context`` and
    returns ``page`` unchanged.
    """
    print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")

    blocked_patterns = (
        # Images
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",
        # Ads and analytics
        "**/analytics/*",
        "**/ads/*",
        "**/google-analytics.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print(" [Hook] ✓ Performance optimization applied")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def viewport_setup_hook(page, context, **kwargs):
    """
    Viewport Hook: resize the page to a fixed 1920x1080 viewport.

    Returns ``page`` after the resize.
    """
    viewport = {"width": 1920, "height": 1080}
    print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
    await page.set_viewport_size(viewport)
    print(" [Hook] ✓ Viewport configured")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def authentication_headers_hook(page, context, url, **kwargs):
    """
    Headers Hook: attach the demo's custom HTTP headers to the page.

    ``url`` is only used for the log line (first 50 characters).
    Returns ``page`` once the headers are set.
    """
    extra_headers = {
        'X-Crawl4AI-Version': '0.7.5',
        'X-Custom-Hook': 'function-based-demo',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)',
    }

    print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...")
    await page.set_extra_http_headers(extra_headers)
    print(" [Hook] ✓ Custom headers added")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def lazy_loading_handler_hook(page, context, **kwargs):
    """
    Content Hook: scroll through the page to trigger lazy-loaded content.

    Scrolls to the bottom, the middle and back to the top, waiting after
    each step, then returns ``page``.
    """
    print(" [Hook] 📜 Scrolling to load lazy content...")

    # (script to run, milliseconds to wait afterwards)
    scroll_steps = (
        ("window.scrollTo(0, document.body.scrollHeight)", 1000),      # bottom
        ("window.scrollTo(0, document.body.scrollHeight / 2)", 500),   # middle
        ("window.scrollTo(0, 0)", 500),                                # top
    )
    for script, pause_ms in scroll_steps:
        await page.evaluate(script)
        await page.wait_for_timeout(pause_ms)

    print(" [Hook] ✓ Lazy content loaded")
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def page_analytics_hook(page, context, **kwargs):
    """
    Analytics Hook: print a few element counts for the current page.

    Evaluates a script in the page that returns the title plus counts of
    images, links, scripts, headings (h1-h3) and paragraphs, logs them,
    and returns ``page``.
    """
    print(" [Hook] 📊 Collecting page analytics...")

    metrics = await page.evaluate('''
        () => ({
            title: document.title,
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length,
            headings: document.querySelectorAll('h1, h2, h3').length,
            paragraphs: document.querySelectorAll('p').length
        })
    ''')

    short_title = metrics['title'][:50]
    print(f" [Hook] 📈 Page: {short_title}...")
    counts_line = (
        f" Links: {metrics['links']}, Images: {metrics['images']}, "
        f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}"
    )
    print(counts_line)

    return page
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 1: String-Based Hooks (NEW Docker Hooks System)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def _markdown_text(markdown) -> str:
    """Return a crawl result's markdown as plain text, whatever shape it has."""
    if isinstance(markdown, str):
        return markdown
    if isinstance(markdown, dict):
        # NOTE(review): assumes an object-shaped markdown payload keeps its
        # text under "raw_markdown" - confirm against the server's schema.
        return markdown.get("raw_markdown") or ""
    return ""


def demo_1_string_based_hooks():
    """
    Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System)

    Posts a crawl request for ``TEST_URLS[0]`` to ``{DOCKER_URL}/crawl`` with
    three hooks supplied as Python source strings, then prints the size of the
    returned HTML/markdown and the hook-execution report from the response.
    All failures are reported on stdout; nothing is raised.
    """
    print_section(
        "DEMO 1: String-Based Hooks (REST API)",
        "Part of the NEW Docker Hooks System - hooks as strings"
    )

    # Define hooks as strings
    hooks_config = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Setting up page context...")
    # Block images for performance
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f" [String Hook] Navigating to {url[:50]}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI': 'string-based-hooks',
        'X-Demo': 'v0.7.5'
    })
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Scrolling page...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    # Prepare request payload
    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_config,
            "timeout": 30
        },
        "crawler_config": {
            "cache_mode": "bypass"
        }
    }

    print(f"🎯 Target URL: {TEST_URLS[0]}")
    print(f"🔧 Configured {len(hooks_config)} string-based hooks")
    print("📡 Sending request to Docker API...\n")

    try:
        start_time = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()

            print(f"\n✅ Request successful! (took {execution_time:.2f}s)")

            # Display results
            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                # ``or ''`` guards against a key that is present but null.
                html_length = len(crawl_result.get('html') or '')
                # Measure the markdown text itself; len() of an object-shaped
                # payload would count its keys, not characters.
                markdown_length = len(_markdown_text(crawl_result.get('markdown')))

                print("\n📊 Results:")
                print(f" • HTML length: {html_length:,} characters")
                print(f" • Markdown length: {markdown_length:,} characters")
                print(f" • URL: {crawl_result.get('url')}")

                # Check hooks execution
                if 'hooks' in result:
                    hooks_info = result['hooks']
                    print("\n🎣 Hooks Execution:")
                    print(f" • Status: {hooks_info['status']['status']}")
                    print(f" • Attached hooks: {len(hooks_info['status']['attached_hooks'])}")

                    if 'summary' in hooks_info:
                        summary = hooks_info['summary']
                        print(f" • Total executions: {summary['total_executions']}")
                        print(f" • Successful: {summary['successful']}")
                        print(f" • Success rate: {summary['success_rate']:.1f}%")
            else:
                print("⚠️ Crawl completed but no results")

        else:
            print(f"❌ Request failed with status {response.status_code}")
            print(f" Error: {response.text[:200]}")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n" + "─" * 70)
    print("✓ String-based hooks demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 2: Function-Based Hooks with hooks_to_string() Utility
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def demo_2_hooks_to_string_utility():
    """
    Demonstrate the new hooks_to_string() utility for converting functions

    Converts three functions from the reusable hook library to source strings
    with ``hooks_to_string()``, previews the first converted hook, and sends
    the strings to ``{DOCKER_URL}/crawl`` for ``TEST_URLS[0]``.
    All failures are reported on stdout; nothing is raised.
    """
    print_section(
        "DEMO 2: hooks_to_string() Utility (NEW! ✨)",
        "Convert Python functions to strings for REST API"
    )

    hooks_dict = {
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
    }

    # List exactly the functions that get converted below, so the listing
    # cannot drift from the hooks actually used.
    print("📦 Creating hook functions...")
    for hook_func in hooks_dict.values():
        print(f" • {hook_func.__name__}")

    # Convert function objects to strings using the NEW utility
    print("\n🔄 Converting functions to strings with hooks_to_string()...")
    hooks_as_strings = hooks_to_string(hooks_dict)
    print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")

    # Show a preview
    print("\n📝 Sample converted hook (first 250 characters):")
    print("─" * 70)
    sample_hook = next(iter(hooks_as_strings.values()))
    print(sample_hook[:250] + "...")
    print("─" * 70)

    # Use the converted hooks with REST API
    print("\n📡 Using converted hooks with REST API...")

    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_as_strings,
            "timeout": 30
        }
    }

    try:
        start_time = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()
            print(f"\n✅ Request successful! (took {execution_time:.2f}s)")

            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                print(f" • HTML length: {len(crawl_result.get('html') or ''):,} characters")
                print(" • Hooks executed successfully!")
            else:
                # An HTTP 200 can still carry an unsuccessful crawl; say so
                # instead of ending the demo silently.
                print("⚠️ Crawl completed but no results")
        else:
            print(f"❌ Request failed: {response.status_code}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n💡 Benefits of hooks_to_string():")
    print(" ✓ Write hooks as regular Python functions")
    print(" ✓ Full IDE support (autocomplete, syntax highlighting)")
    print(" ✓ Type checking and linting")
    print(" ✓ Easy to test and debug")
    print(" ✓ Reusable across projects")
    print(" ✓ Works with any REST API client")

    print("\n" + "─" * 70)
    print("✓ hooks_to_string() utility demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟)
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def demo_3_docker_client_auto_conversion():
    """
    Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)

    Passes four functions from the reusable hook library straight to
    ``Crawl4aiDockerClient.crawl()`` for ``TEST_URLS[0]`` and prints the
    result's sizes, metadata and link counts. Errors are printed together
    with a traceback; nothing is raised.
    """
    print_section(
        "DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)",
        "Pass function objects directly - conversion happens automatically!"
    )

    print("🐳 Initializing Crawl4AI Docker Client...")
    client = Crawl4aiDockerClient(base_url=DOCKER_URL)
    print("✅ Client ready!\n")

    # One source of truth for both the announcement and the crawl itself, so
    # the printed hook list and target URL always match what is executed.
    hooks = {
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
        "before_return_html": page_analytics_hook,
    }
    target_url = TEST_URLS[0]

    # Use our reusable hook library - just pass the function objects!
    print("📚 Using reusable hook library:")
    for hook_func in hooks.values():
        print(f" • {hook_func.__name__}")

    print("\n🎯 Target URL: " + target_url)
    print("🚀 Starting crawl with automatic hook conversion...\n")

    try:
        start_time = time.time()

        # Pass function objects directly - NO manual conversion needed! ✨
        results = await client.crawl(
            urls=[target_url],
            hooks=hooks,
            hooks_timeout=30
        )

        execution_time = time.time() - start_time

        print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")

        # Display results
        if results and results.success:
            result = results
            print("📊 Results:")
            print(f" • URL: {result.url}")
            print(f" • Success: {result.success}")
            # ``or ''`` keeps len() safe if a field comes back empty/None.
            print(f" • HTML length: {len(result.html or ''):,} characters")
            print(f" • Markdown length: {len(result.markdown or ''):,} characters")

            # Show metadata
            if result.metadata:
                print("\n📋 Metadata:")
                print(f" • Title: {result.metadata.get('title', 'N/A')}")
                print(f" • Description: {result.metadata.get('description', 'N/A')}")

            # Show links
            if result.links:
                internal_count = len(result.links.get('internal', []))
                external_count = len(result.links.get('external', []))
                print("\n🔗 Links Found:")
                print(f" • Internal: {internal_count}")
                print(f" • External: {external_count}")
        else:
            print("⚠️ Crawl completed but no successful results")
            if results:
                print(f" Error: {results.error_message}")

    except Exception as e:
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()

    print("\n🌟 Why Docker Client is RECOMMENDED:")
    print(" ✓ Automatic function-to-string conversion")
    print(" ✓ No manual hooks_to_string() calls needed")
    print(" ✓ Cleaner, more Pythonic code")
    print(" ✓ Full type hints and IDE support")
    print(" ✓ Built-in error handling")
    print(" ✓ Async/await support")

    print("\n" + "─" * 70)
    print("✓ Docker Client auto-conversion demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# DEMO 4: Advanced Use Case - Complete Hook Pipeline
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def demo_4_complete_hook_pipeline():
    """
    Run one crawl of ``TEST_URLS[0]`` with a hook attached to each of the
    8 hook points, then print the list of available hook points.

    Every hook only logs its position in the pipeline (a few also resize,
    wait or scroll) and hands the object it received back to the caller.
    """
    print_section(
        "DEMO 4: Complete Hook Pipeline",
        "Using all 8 available hook points for comprehensive control"
    )

    # The eight hooks are kept as nested functions; they are handed to the
    # Docker client as function objects below.
    async def on_browser_created_hook(browser, **kwargs):
        """Hook 1: runs once the browser exists"""
        print(" [Pipeline] 1/8 Browser created")
        return browser

    async def on_page_context_created_hook(page, context, **kwargs):
        """Hook 2: runs once the page context exists"""
        print(" [Pipeline] 2/8 Page context created - setting up...")
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def on_user_agent_updated_hook(page, context, user_agent, **kwargs):
        """Hook 3: runs when the user agent changes"""
        print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...")
        return page

    async def before_goto_hook(page, context, url, **kwargs):
        """Hook 4: runs right before navigation"""
        print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...")
        return page

    async def after_goto_hook(page, context, url, response, **kwargs):
        """Hook 5: runs right after navigation"""
        print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}")
        await page.wait_for_timeout(1000)
        return page

    async def on_execution_started_hook(page, context, **kwargs):
        """Hook 6: runs when JavaScript execution begins"""
        print(" [Pipeline] 6/8 JavaScript execution started")
        return page

    async def before_retrieve_html_hook(page, context, **kwargs):
        """Hook 7: runs before the HTML is read"""
        print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return page

    async def before_return_html_hook(page, context, html, **kwargs):
        """Hook 8: runs before the HTML is handed back"""
        print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars")
        return page

    # Hook point name -> function, in pipeline order.
    pipeline_hooks = {
        "on_browser_created": on_browser_created_hook,
        "on_page_context_created": on_page_context_created_hook,
        "on_user_agent_updated": on_user_agent_updated_hook,
        "before_goto": before_goto_hook,
        "after_goto": after_goto_hook,
        "on_execution_started": on_execution_started_hook,
        "before_retrieve_html": before_retrieve_html_hook,
        "before_return_html": before_return_html_hook,
    }

    print("🎯 Target URL: " + TEST_URLS[0])
    print("🔧 Configured ALL 8 hook points for complete pipeline control\n")

    client = Crawl4aiDockerClient(base_url=DOCKER_URL)

    try:
        print("🚀 Starting complete pipeline crawl...\n")
        started = time.time()

        results = await client.crawl(
            urls=[TEST_URLS[0]],
            hooks=pipeline_hooks,
            hooks_timeout=45,
        )

        elapsed = time.time() - started

        if not (results and results.success):
            print("⚠️ Pipeline completed with warnings")
        else:
            print(f"\n✅ Complete pipeline executed successfully! (took {elapsed:.2f}s)")
            print(" • All 8 hooks executed in sequence")
            print(f" • HTML length: {len(results.html):,} characters")

    except Exception as exc:
        print(f"❌ Error: {exc}")

    print("\n📚 Available Hook Points:")
    hook_point_lines = (
        " 1. on_browser_created - Browser initialization",
        " 2. on_page_context_created - Page context setup",
        " 3. on_user_agent_updated - User agent configuration",
        " 4. before_goto - Pre-navigation setup",
        " 5. after_goto - Post-navigation processing",
        " 6. on_execution_started - JavaScript execution start",
        " 7. before_retrieve_html - Pre-extraction processing",
        " 8. before_return_html - Final HTML processing",
    )
    for line in hook_point_lines:
        print(line)

    print("\n" + "─" * 70)
    print("✓ Complete hook pipeline demo complete\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# MAIN EXECUTION
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
async def main():
    """
    Drive the Docker hooks demonstrations from start to finish.

    Prints a title banner, then calls ``check_docker_service()``; when that
    returns a falsy value, start-up instructions are printed and the function
    returns early. Otherwise every enabled demo runs in order and a feature
    summary is printed at the end.

    A demo that raises ``Exception`` is reported with its traceback and the
    loop moves on; ``KeyboardInterrupt`` stops the loop.

    NOTE: the closing summary is printed even when a demo failed or the loop
    was interrupted, and the "Press Enter" prompt does not actually wait
    because the ``input()`` call is disabled.
    """
    rule = "=" * 70

    print("\n" + rule)
    print(" 🚀 Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration")
    print(rule)

    # Nothing below can work without the service, so bail out early.
    print("\n🔍 Checking Docker service status...")
    if not check_docker_service():
        for line in (
            "❌ Docker service is not running!",
            "\n📋 To start the Docker service:",
            " docker run -p 11235:11235 unclecode/crawl4ai:latest",
            "\nPlease start the service and run this demo again.",
        ):
            print(line)
        return

    print("✅ Docker service is running!\n")

    # Each entry: (title, callable, whether the callable is a coroutine function).
    demos = [
        ("String-Based Hooks (REST API)", demo_1_string_based_hooks, False),
        ("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False),
        ("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True),
        # Disabled: ("Complete Hook Pipeline", demo_4_complete_hook_pipeline, True),
    ]
    total = len(demos)
    divider = "🔷" * 35

    for number, (title, run_demo, is_async) in enumerate(demos, start=1):
        print(f"\n{divider}")
        print(f"Starting Demo {number}/{total}: {title}")
        print(f"{divider}\n")

        try:
            # Calling a coroutine function only creates the coroutine; it is
            # awaited below when the entry is flagged as async.
            pending = run_demo()
            if is_async:
                await pending

            print(f"✅ Demo {number} completed successfully!")

            # Prompt between demos (not after the last one). The blocking
            # input() call is intentionally left out, so this only prints.
            if number < total:
                print("\n⏸️ Press Enter to continue to next demo...")

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as exc:
            print(f"\n❌ Demo {number} failed: {exc!s}")
            import traceback
            traceback.print_exc()
            print("\nContinuing to next demo...\n")

    # Final summary
    summary_lines = (
        "\n" + rule,
        " 🎉 All Demonstrations Complete!",
        rule,
        "\n📊 Summary of v0.7.5 Docker Hooks System:",
        "\n🆕 COMPLETELY NEW FEATURE in v0.7.5:",
        " The Docker Hooks System lets you customize the crawling pipeline",
        " with user-provided Python functions at 8 strategic points.",
        "\n✨ Three Ways to Use Docker Hooks (All NEW!):",
        " 1. String-based - Write hooks as strings for REST API",
        " 2. hooks_to_string() - Convert Python functions to strings",
        " 3. Docker Client - Automatic conversion (RECOMMENDED)",
        "\n💡 Key Benefits:",
        " ✓ Full IDE support (autocomplete, syntax highlighting)",
        " ✓ Type checking and linting",
        " ✓ Easy to test and debug",
        " ✓ Reusable across projects",
        " ✓ Complete pipeline control",
        "\n🎯 8 Hook Points Available:",
        " • on_browser_created, on_page_context_created",
        " • on_user_agent_updated, before_goto, after_goto",
        " • on_execution_started, before_retrieve_html, before_return_html",
        "\n📚 Resources:",
        " • Docs: https://docs.crawl4ai.com",
        " • GitHub: https://github.com/unclecode/crawl4ai",
        " • Discord: https://discord.gg/jP8KfhDhyN",
        "\n" + rule,
        " Happy Crawling with v0.7.5! 🕷️",
        rule + "\n",
    )
    for line in summary_lines:
        print(line)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: announce the demo, then run the async driver.
    print("\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is an expected way to leave the demo; exit with a friendly note.
        print("\n\n👋 Demo stopped by user. Thanks for exploring Crawl4AI v0.7.5!")
    except Exception as exc:
        # Last-resort handler: show the message and the full traceback.
        import traceback

        print(f"\n\n❌ Demo error: {exc!s}")
        traceback.print_exc()
|
||||||
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,6 +7,7 @@ docs_dir: docs/md_v2
|
|||||||
|
|
||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
|
- "📚 Complete SDK Reference": "complete-sdk-reference.md"
|
||||||
- "Ask AI": "core/ask-ai.md"
|
- "Ask AI": "core/ask-ai.md"
|
||||||
- "Quick Start": "core/quickstart.md"
|
- "Quick Start": "core/quickstart.md"
|
||||||
- "Code Examples": "core/examples.md"
|
- "Code Examples": "core/examples.md"
|
||||||
|
|||||||
193
tests/docker/test_hooks_utility.py
Normal file
193
tests/docker/test_hooks_utility.py
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
"""
|
||||||
|
Test script demonstrating the hooks_to_string utility and Docker client integration.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import Crawl4aiDockerClient, hooks_to_string
|
||||||
|
|
||||||
|
|
||||||
|
# Define hook functions as regular Python functions
|
||||||
|
async def auth_hook(page, context, **kwargs):
    """Add a fixed test cookie to ``context`` and return ``page`` unchanged.

    The cookie is scoped to ``.httpbin.org`` with path ``/``. Extra keyword
    arguments are accepted and ignored.
    """
    test_cookie = {
        'name': 'test_cookie',
        'value': 'test_value',
        'domain': '.httpbin.org',
        'path': '/',
    }
    await context.add_cookies([test_cookie])
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def scroll_hook(page, context, **kwargs):
    """Scroll the page to the bottom, pause, and return ``page``.

    Intended to give lazily loaded content a chance to appear. ``context``
    and any extra keyword arguments are accepted but not used.
    """
    scroll_to_bottom_js = "window.scrollTo(0, document.body.scrollHeight)"
    # Presumably milliseconds, as in Playwright's wait_for_timeout - confirm.
    pause = 1000

    await page.evaluate(scroll_to_bottom_js)
    await page.wait_for_timeout(pause)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def viewport_hook(page, context, **kwargs):
    """Resize the page viewport to 1920x1080 and return ``page``.

    ``context`` and any extra keyword arguments are accepted but not used.
    """
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)
    return page
|
||||||
|
|
||||||
|
|
||||||
|
async def test_hooks_utility():
    """Convert two hook functions with ``hooks_to_string`` and print a report.

    The report lists the converted hook names and shows the first 200
    characters of the ``on_page_context_created`` entry.

    Returns:
        Whatever ``hooks_to_string`` returned for the two hooks.
    """
    heading_rule = "=" * 60
    preview_rule = "-" * 60

    print(heading_rule)
    print("Testing hooks_to_string utility")
    print(heading_rule)

    # Hook point name -> plain Python function object.
    hook_functions = {
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    }
    hooks_string = hooks_to_string(hook_functions)

    print("\n✓ Successfully converted function objects to strings")
    print(f"\n✓ Converted {len(hooks_string)} hooks:")
    for hook_name in hooks_string:
        print(f" - {hook_name}")

    print("\n✓ Preview of converted hook:")
    print(preview_rule)
    preview = hooks_string["on_page_context_created"][:200]
    print(preview + "...")
    print(preview_rule)

    return hooks_string
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_functions():
    """Crawl one page, handing hook *function objects* straight to the client.

    Opens a ``Crawl4aiDockerClient`` against ``http://localhost:11234``, so a
    Crawl4AI Docker server must be listening there for this to succeed.
    Prints the crawl outcome followed by a short summary.
    """
    heading_rule = "=" * 60
    print("\n" + heading_rule)
    print("Testing Docker Client with Function Objects")
    print(heading_rule)

    # Plain functions; the client is expected to convert them itself.
    hook_functions = {
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    }

    # NOTE(review): other examples in this change use port 11235 - confirm
    # that 11234 is the intended port for this test.
    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hook_functions,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")
        print(f"✓ URL: {result.url}")

    for line in (
        "\n✓ Docker client accepts function objects directly",
        "✓ Automatic conversion happens internally",
        "✓ No manual string formatting needed!",
    ):
        print(line)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_docker_client_with_strings():
    """Crawl one page using hooks converted up front by ``hooks_to_string``.

    Opens a ``Crawl4aiDockerClient`` against ``http://localhost:11234``, so a
    Crawl4AI Docker server must be listening there for this to succeed.
    Prints whether the crawl succeeded followed by a short summary.
    """
    heading_rule = "=" * 60
    print("\n" + heading_rule)
    print("Testing Docker Client with String Hooks")
    print(heading_rule)

    # Convert the function objects first, then pass the converted form along.
    hooks_string = hooks_to_string(
        {
            "on_page_context_created": viewport_hook,
            "before_retrieve_html": scroll_hook,
        }
    )

    # NOTE(review): other examples in this change use port 11235 - confirm
    # that 11234 is the intended port for this test.
    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks_string,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")

    print("\n✓ Docker client also accepts pre-converted strings")
    print("✓ Backward compatible with existing code")
|
||||||
|
|
||||||
|
|
||||||
|
async def show_usage_patterns():
    """Print three example snippets showing how hooks can be given to the client.

    The patterns are: passing a function object directly, converting with
    ``hooks_to_string`` first, and writing the hook source by hand as a
    string. Nothing is executed; the snippets are only printed.

    The snippets are written indented alongside this function's body and
    passed through ``textwrap.dedent`` so that the printed text is correctly
    indented Python that can be copied as-is.
    """
    # Local import keeps this function self-contained.
    import textwrap

    heading_rule = "=" * 60
    section_rule = "-" * 60

    print("\n" + heading_rule)
    print("Usage Patterns")
    print(heading_rule)

    print("\n1. Direct function usage (simplest):")
    print(section_rule)
    print(textwrap.dedent("""
        async def my_hook(page, context, **kwargs):
            await page.set_viewport_size({"width": 1920, "height": 1080})
            return page

        result = await client.crawl(
            ["https://example.com"],
            hooks={"on_page_context_created": my_hook}
        )
        """))

    print("\n2. Convert then use:")
    print(section_rule)
    print(textwrap.dedent("""
        hooks_dict = {"on_page_context_created": my_hook}
        hooks_string = hooks_to_string(hooks_dict)

        result = await client.crawl(
            ["https://example.com"],
            hooks=hooks_string
        )
        """))

    print("\n3. Manual string (backward compatible):")
    print(section_rule)
    # The hook source inside the example's ''' ... ''' literal stays at
    # column 0 after dedent, so the example string holds top-level code.
    print(textwrap.dedent("""
        hooks_string = {
            "on_page_context_created": '''
        async def hook(page, context, **kwargs):
            await page.set_viewport_size({"width": 1920, "height": 1080})
            return page
        '''
        }

        result = await client.crawl(
            ["https://example.com"],
            hooks=hooks_string
        )
        """))
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run the currently enabled hook checks.

    Only ``test_docker_client_with_strings`` is active. The other steps
    (``test_hooks_utility``, ``test_docker_client_with_functions``,
    ``show_usage_patterns`` and the closing "Key Benefits" summary) are
    switched off and kept below as comments for reference.
    """
    suite_title = "\n🚀 Crawl4AI Hooks Utility Test Suite\n"
    print(suite_title)

    # Disabled: await test_hooks_utility()
    # Disabled: await test_docker_client_with_functions()

    # Exercise the Docker client with pre-converted string hooks.
    await test_docker_client_with_strings()

    # Disabled: await show_usage_patterns()
    # Disabled: closing banner ("✓ All tests completed successfully!") and
    # the "Key Benefits" bullet list.
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the async test driver to completion.
    suite = main()
    asyncio.run(suite)
|
||||||
Reference in New Issue
Block a user