diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml
new file mode 100644
index 00000000..5ff1b78a
--- /dev/null
+++ b/.github/workflows/docker-release.yml
@@ -0,0 +1,81 @@
+name: Docker Release
+on:
+  release:
+    types: [published]
+  push:
+    tags:
+      - 'docker-rebuild-v*' # Allow manual Docker rebuilds via tags
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Extract version from release or tag
+        id: get_version
+        run: |
+          if [ "${{ github.event_name }}" == "release" ]; then
+            # Triggered by release event
+            VERSION="${{ github.event.release.tag_name }}"
+            VERSION=${VERSION#v} # Remove 'v' prefix
+          else
+            # Triggered by docker-rebuild-v* tag
+            VERSION=${GITHUB_REF#refs/tags/docker-rebuild-v}
+          fi
+          echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
+          echo "Building Docker images for version: $VERSION"
+
+      - name: Extract major and minor versions
+        id: versions
+        run: |
+          VERSION=${{ steps.get_version.outputs.VERSION }}
+          MAJOR=$(echo $VERSION | cut -d. -f1)
+          MINOR=$(echo $VERSION | cut -d. -f1-2)
+          echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
+          echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
+          echo "Semantic versions - Major: $MAJOR, Minor: $MINOR"
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      - name: Build and push Docker images
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          tags: |
+            unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
+            unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
+            unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
+            unclecode/crawl4ai:latest
+          platforms: linux/amd64,linux/arm64
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Summary
+        run: |
+          echo "## 🐳 Docker Release Complete!"
>> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Published Images" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Platforms" >> $GITHUB_STEP_SUMMARY + echo "- linux/amd64" >> $GITHUB_STEP_SUMMARY + echo "- linux/arm64" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### π Pull Command" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY + echo "docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "\`\`\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/docs/ARCHITECTURE.md b/.github/workflows/docs/ARCHITECTURE.md new file mode 100644 index 00000000..aab2e8c1 --- /dev/null +++ b/.github/workflows/docs/ARCHITECTURE.md @@ -0,0 +1,917 @@ +# Workflow Architecture Documentation + +## Overview + +This document describes the technical architecture of the split release pipeline for Crawl4AI. + +--- + +## Architecture Diagram + +``` +βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +β Developer β +β β β +β βΌ β +β git tag v1.2.3 β +β git push --tags β +ββββββββββββββββββββββββββββββββ¬βββββββββββββββββββββββββββββββββββ + β + βΌ +βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +β GitHub Repository β +β β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β Tag Event: v1.2.3 β β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β β +β βΌ β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β release.yml (Release Pipeline) β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 1. 
Extract Version β β β +β β β v1.2.3 β 1.2.3 β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 2. Validate Version β β β +β β β Tag == __version__.py β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 3. Build Python Package β β β +β β β - Source dist (.tar.gz) β β β +β β β - Wheel (.whl) β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 4. Upload to PyPI β β β +β β β - Authenticate with token β β β +β β β - Upload dist/* β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 5. Create GitHub Release β β β +β β β - Tag: v1.2.3 β β β +β β β - Body: Install instructions β β β +β β β - Status: Published β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β β +β βΌ β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β Release Event: published (v1.2.3) β β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β β +β βΌ β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +β β docker-release.yml (Docker Pipeline) β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 1. Extract Version from Release β β β +β β β github.event.release.tag_name β 1.2.3 β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 2. Parse Semantic Versions β β β +β β β 1.2.3 β Major: 1, Minor: 1.2 β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 3. 
Setup Multi-Arch Build β β β +β β β - Docker Buildx β β β +β β β - QEMU emulation β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 4. Authenticate Docker Hub β β β +β β β - Username: DOCKER_USERNAME β β β +β β β - Token: DOCKER_TOKEN β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 5. Build Multi-Arch Images β β β +β β β ββββββββββββββββββ¬βββββββββββββββββ β β β +β β β β linux/amd64 β linux/arm64 β β β β +β β β ββββββββββββββββββ΄βββββββββββββββββ β β β +β β β Cache: GitHub Actions (type=gha) β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β β β 6. Push to Docker Hub β β β +β β β Tags: β β β +β β β - unclecode/crawl4ai:1.2.3 β β β +β β β - unclecode/crawl4ai:1.2 β β β +β β β - unclecode/crawl4ai:1 β β β +β β β - unclecode/crawl4ai:latest β β β +β β ββββββββββββββββββββββββββββββββββββββββββββββββ β β +β ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ β +βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ + β + βΌ +βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +β External Services β +β β +β ββββββββββββββββ ββββββββββββββββ ββββββββββββββββ β +β β PyPI β β Docker Hub β β GitHub β β +β β β β β β β β +β β crawl4ai β β unclecode/ β β Releases β β +β β 1.2.3 β β crawl4ai β β v1.2.3 β β +β ββββββββββββββββ ββββββββββββββββ ββββββββββββββββ β +βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ +``` + +--- + +## Component Details + +### 1. Release Pipeline (release.yml) + +#### Purpose +Fast publication of Python package and GitHub release. 
+ +#### Input +- **Trigger**: Git tag matching `v*` (excluding `test-v*`) +- **Example**: `v1.2.3` + +#### Processing Stages + +##### Stage 1: Version Extraction +```bash +Input: refs/tags/v1.2.3 +Output: VERSION=1.2.3 +``` + +**Implementation**: +```bash +TAG_VERSION=${GITHUB_REF#refs/tags/v} # Remove 'refs/tags/v' prefix +echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT +``` + +##### Stage 2: Version Validation +```bash +Input: TAG_VERSION=1.2.3 +Check: crawl4ai/__version__.py contains __version__ = "1.2.3" +Output: Pass/Fail +``` + +**Implementation**: +```bash +PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)") +if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then + exit 1 +fi +``` + +##### Stage 3: Package Build +```bash +Input: Source code + pyproject.toml +Output: dist/crawl4ai-1.2.3.tar.gz + dist/crawl4ai-1.2.3-py3-none-any.whl +``` + +**Implementation**: +```bash +python -m build +# Uses build backend defined in pyproject.toml +``` + +##### Stage 4: PyPI Upload +```bash +Input: dist/*.{tar.gz,whl} +Auth: PYPI_TOKEN +Output: Package published to PyPI +``` + +**Implementation**: +```bash +twine upload dist/* +# Environment: +# TWINE_USERNAME: __token__ +# TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} +``` + +##### Stage 5: GitHub Release Creation +```bash +Input: Tag: v1.2.3 + Body: Markdown content +Output: Published GitHub release +``` + +**Implementation**: +```yaml +uses: softprops/action-gh-release@v2 +with: + tag_name: v1.2.3 + name: Release v1.2.3 + body: | + Installation instructions and changelog + draft: false + prerelease: false +``` + +#### Output +- **PyPI Package**: https://pypi.org/project/crawl4ai/1.2.3/ +- **GitHub Release**: Published release on repository +- **Event**: `release.published` (triggers Docker workflow) + +#### Timeline +``` +0:00 - Tag pushed +0:01 - Checkout + Python setup +0:02 - Version validation +0:03 - Package build +0:04 - PyPI upload starts +0:06 - PyPI upload complete +0:07 - GitHub 
release created +0:08 - Workflow complete +``` + +--- + +### 2. Docker Release Pipeline (docker-release.yml) + +#### Purpose +Build and publish multi-architecture Docker images. + +#### Inputs + +##### Input 1: Release Event (Automatic) +```yaml +Event: release.published +Data: github.event.release.tag_name = "v1.2.3" +``` + +##### Input 2: Docker Rebuild Tag (Manual) +```yaml +Tag: docker-rebuild-v1.2.3 +``` + +#### Processing Stages + +##### Stage 1: Version Detection +```bash +# From release event: +VERSION = github.event.release.tag_name.strip("v") +# Result: "1.2.3" + +# From rebuild tag: +VERSION = GITHUB_REF.replace("refs/tags/docker-rebuild-v", "") +# Result: "1.2.3" +``` + +##### Stage 2: Semantic Version Parsing +```bash +Input: VERSION=1.2.3 +Output: MAJOR=1 + MINOR=1.2 + PATCH=3 (implicit) +``` + +**Implementation**: +```bash +MAJOR=$(echo $VERSION | cut -d. -f1) # Extract first component +MINOR=$(echo $VERSION | cut -d. -f1-2) # Extract first two components +``` + +##### Stage 3: Multi-Architecture Setup +```yaml +Setup: + - Docker Buildx (multi-platform builder) + - QEMU (ARM emulation on x86) + +Platforms: + - linux/amd64 (x86_64) + - linux/arm64 (aarch64) +``` + +**Architecture**: +``` +GitHub Runner (linux/amd64) + ββ Buildx Builder + β ββ Native: Build linux/amd64 image + β ββ QEMU: Emulate ARM to build linux/arm64 image + ββ Generate manifest list (points to both images) +``` + +##### Stage 4: Docker Hub Authentication +```bash +Input: DOCKER_USERNAME + DOCKER_TOKEN +Output: Authenticated Docker client +``` + +##### Stage 5: Build with Cache +```yaml +Cache Configuration: + cache-from: type=gha # Read from GitHub Actions cache + cache-to: type=gha,mode=max # Write all layers + +Cache Key Components: + - Workflow file path + - Branch name + - Architecture (amd64/arm64) +``` + +**Cache Hierarchy**: +``` +Cache Entry: main/docker-release.yml/linux-amd64 + ββ Layer: sha256:abc123... (FROM python:3.12) + ββ Layer: sha256:def456... 
(RUN apt-get update) + ββ Layer: sha256:ghi789... (COPY requirements.txt) + ββ Layer: sha256:jkl012... (RUN pip install) + ββ Layer: sha256:mno345... (COPY . /app) + +Cache Hit/Miss Logic: + - If layer input unchanged β cache hit β skip build + - If layer input changed β cache miss β rebuild + all subsequent layers +``` + +##### Stage 6: Tag Generation +```bash +Input: VERSION=1.2.3, MAJOR=1, MINOR=1.2 + +Output Tags: + - unclecode/crawl4ai:1.2.3 (exact version) + - unclecode/crawl4ai:1.2 (minor version) + - unclecode/crawl4ai:1 (major version) + - unclecode/crawl4ai:latest (latest stable) +``` + +**Tag Strategy**: +- All tags point to same image SHA +- Users can pin to desired stability level +- Pushing new version updates `1`, `1.2`, and `latest` automatically + +##### Stage 7: Push to Registry +```bash +For each tag: + For each platform (amd64, arm64): + Push image to Docker Hub + +Create manifest list: + Manifest: unclecode/crawl4ai:1.2.3 + ββ linux/amd64: sha256:abc... + ββ linux/arm64: sha256:def... 
+ +Docker CLI automatically selects correct platform on pull +``` + +#### Output +- **Docker Images**: 4 tags Γ 2 platforms = 8 image variants + 4 manifests +- **Docker Hub**: https://hub.docker.com/r/unclecode/crawl4ai/tags + +#### Timeline + +**Cold Cache (First Build)**: +``` +0:00 - Release event received +0:01 - Checkout + Buildx setup +0:02 - Docker Hub auth +0:03 - Start build (amd64) +0:08 - Complete amd64 build +0:09 - Start build (arm64) +0:14 - Complete arm64 build +0:15 - Generate manifests +0:16 - Push all tags +0:17 - Workflow complete +``` + +**Warm Cache (Code Change Only)**: +``` +0:00 - Release event received +0:01 - Checkout + Buildx setup +0:02 - Docker Hub auth +0:03 - Start build (amd64) - cache hit for layers 1-4 +0:04 - Complete amd64 build (only layer 5 rebuilt) +0:05 - Start build (arm64) - cache hit for layers 1-4 +0:06 - Complete arm64 build (only layer 5 rebuilt) +0:07 - Generate manifests +0:08 - Push all tags +0:09 - Workflow complete +``` + +--- + +## Data Flow + +### Version Information Flow + +``` +Developer + β + βΌ +crawl4ai/__version__.py + __version__ = "1.2.3" + β + βββΊ Git Tag + β v1.2.3 + β β + β βΌ + β release.yml + β β + β βββΊ Validation + β β β Match + β β + β βββΊ PyPI Package + β β crawl4ai==1.2.3 + β β + β βββΊ GitHub Release + β v1.2.3 + β β + β βΌ + β docker-release.yml + β β + β βββΊ Docker Tags + β 1.2.3, 1.2, 1, latest + β + βββΊ Package Metadata + pyproject.toml + version = "1.2.3" +``` + +### Secrets Flow + +``` +GitHub Secrets (Encrypted at Rest) + β + βββΊ PYPI_TOKEN + β β + β βΌ + β release.yml + β β + β βΌ + β TWINE_PASSWORD env var (masked in logs) + β β + β βΌ + β PyPI API (HTTPS) + β + βββΊ DOCKER_USERNAME + β β + β βΌ + β docker-release.yml + β β + β βΌ + β docker/login-action (masked in logs) + β β + β βΌ + β Docker Hub API (HTTPS) + β + βββΊ DOCKER_TOKEN + β + βΌ + docker-release.yml + β + βΌ + docker/login-action (masked in logs) + β + βΌ + Docker Hub API (HTTPS) +``` + +### Artifact Flow + +``` 
+Source Code + β + βββΊ release.yml + β β + β βΌ + β python -m build + β β + β βββΊ crawl4ai-1.2.3.tar.gz + β β β + β β βΌ + β β PyPI Storage + β β β + β β βΌ + β β pip install crawl4ai + β β + β βββΊ crawl4ai-1.2.3-py3-none-any.whl + β β + β βΌ + β PyPI Storage + β β + β βΌ + β pip install crawl4ai + β + βββΊ docker-release.yml + β + βΌ + docker build + β + βββΊ Image: linux/amd64 + β β + β βββΊ Docker Hub + β unclecode/crawl4ai:1.2.3-amd64 + β + βββΊ Image: linux/arm64 + β + βββΊ Docker Hub + unclecode/crawl4ai:1.2.3-arm64 +``` + +--- + +## State Machines + +### Release Pipeline State Machine + +``` +βββββββββββ +β START β +ββββββ¬βββββ + β + βΌ +ββββββββββββββββ +β Extract β +β Version β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ βββββββββββ +β Validate βββββββΊβ FAILED β +β Version β No β (Exit 1)β +ββββββββ¬ββββββββ βββββββββββ + β Yes + βΌ +ββββββββββββββββ +β Build β +β Package β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ βββββββββββ +β Upload βββββββΊβ FAILED β +β to PyPI β Errorβ (Exit 1)β +ββββββββ¬ββββββββ βββββββββββ + β Success + βΌ +ββββββββββββββββ +β Create β +β GH Release β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ +β SUCCESS β +β (Emit Event) β +ββββββββββββββββ +``` + +### Docker Pipeline State Machine + +``` +βββββββββββ +β START β +β (Event) β +ββββββ¬βββββ + β + βΌ +ββββββββββββββββ +β Detect β +β Version β +β Source β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ +β Parse β +β Semantic β +β Versions β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ βββββββββββ +β Authenticate βββββββΊβ FAILED β +β Docker Hub β Errorβ (Exit 1)β +ββββββββ¬ββββββββ βββββββββββ + β Success + βΌ +ββββββββββββββββ +β Build β +β amd64 β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ βββββββββββ +β Build βββββββΊβ FAILED β +β arm64 β Errorβ (Exit 1)β +ββββββββ¬ββββββββ βββββββββββ + β Success + βΌ +ββββββββββββββββ +β Push All β +β Tags β +ββββββββ¬ββββββββ + β + βΌ +ββββββββββββββββ +β SUCCESS β +ββββββββββββββββ +``` + +--- + +## Security Architecture + +### 
Threat Model + +#### Threats Mitigated + +1. **Secret Exposure** + - Mitigation: GitHub Actions secret masking + - Evidence: Secrets never appear in logs + +2. **Unauthorized Package Upload** + - Mitigation: Scoped PyPI tokens + - Evidence: Token limited to `crawl4ai` project + +3. **Man-in-the-Middle** + - Mitigation: HTTPS for all API calls + - Evidence: PyPI, Docker Hub, GitHub all use TLS + +4. **Supply Chain Tampering** + - Mitigation: Immutable artifacts, content checksums + - Evidence: PyPI stores SHA256, Docker uses content-addressable storage + +#### Trust Boundaries + +``` +βββββββββββββββββββββββββββββββββββββββββββ +β Trusted Zone β +β ββββββββββββββββββββββββββββββββββ β +β β GitHub Actions Runner β β +β β - Ephemeral VM β β +β β - Isolated environment β β +β β - Access to secrets β β +β ββββββββββββββββββββββββββββββββββ β +β β β +β β HTTPS (TLS 1.2+) β +β βΌ β +βββββββββββββββββββββββββββββββββββββββββββ + β + ββββββββββββββΌβββββββββββββ + β β β + βΌ βΌ βΌ +ββββββββββ βββββββββββ ββββββββββββ +β PyPI β β Docker β β GitHub β +β API β β Hub β β API β +ββββββββββ βββββββββββ ββββββββββββ + External External External + Service Service Service +``` + +### Secret Management + +#### Secret Lifecycle + +``` +Creation (Developer) + β + βββΊ PyPI: Create API token (scoped to project) + βββΊ Docker Hub: Create access token (read/write) + β + βΌ +Storage (GitHub) + β + βββΊ Encrypted at rest (AES-256) + βββΊ Access controlled (repo-scoped) + β + βΌ +Usage (Workflow) + β + βββΊ Injected as env vars + βββΊ Masked in logs (GitHub redacts on output) + βββΊ Never persisted to disk (in-memory only) + β + βΌ +Transmission (API Call) + β + βββΊ HTTPS only + βββΊ TLS 1.2+ with strong ciphers + β + βΌ +Rotation (Manual) + β + βββΊ Regenerate on PyPI/Docker Hub + Update GitHub secret +``` + +--- + +## Performance Characteristics + +### Release Pipeline Performance + +| Metric | Value | Notes | +|--------|-------|-------| +| Cold start | ~2-3 min | First run on new runner 
| +| Warm start | ~2-3 min | Minimal caching benefit | +| PyPI upload | ~30-60 sec | Network-bound | +| Package build | ~30 sec | CPU-bound | +| Parallelization | None | Sequential by design | + +### Docker Pipeline Performance + +| Metric | Cold Cache | Warm Cache (code) | Warm Cache (deps) | +|--------|-----------|-------------------|-------------------| +| Total time | 10-15 min | 1-2 min | 3-5 min | +| amd64 build | 5-7 min | 30-60 sec | 1-2 min | +| arm64 build | 5-7 min | 30-60 sec | 1-2 min | +| Push time | 1-2 min | 30 sec | 30 sec | +| Cache hit rate | 0% | 85% | 60% | + +### Cache Performance Model + +```python +def estimate_build_time(changes): + base_time = 60 # seconds (setup + push) + + if "Dockerfile" in changes: + return base_time + (10 * 60) # Full rebuild: ~11 min + elif "requirements.txt" in changes: + return base_time + (3 * 60) # Deps rebuild: ~4 min + elif any(f.endswith(".py") for f in changes): + return base_time + 60 # Code only: ~2 min + else: + return base_time # No changes: ~1 min +``` + +--- + +## Scalability Considerations + +### Current Limits + +| Resource | Limit | Impact | +|----------|-------|--------| +| Workflow concurrency | 20 (default) | Max 20 releases in parallel | +| Artifact storage | 500 MB/artifact | PyPI packages small (<10 MB) | +| Cache storage | 10 GB/repo | Docker layers fit comfortably | +| Workflow run time | 6 hours | Plenty of headroom | + +### Scaling Strategies + +#### Horizontal Scaling (Multiple Repos) +``` +crawl4ai (main) + ββ release.yml + ββ docker-release.yml + +crawl4ai-plugins (separate) + ββ release.yml + ββ docker-release.yml + +Each repo has independent: + - Secrets + - Cache (10 GB each) + - Concurrency limits (20 each) +``` + +#### Vertical Scaling (Larger Runners) +```yaml +jobs: + docker: + runs-on: ubuntu-latest-8-cores # GitHub-hosted larger runner + # 4x faster builds for CPU-bound layers +``` + +--- + +## Disaster Recovery + +### Failure Scenarios + +#### Scenario 1: Release Pipeline Fails 
+
+**Failure Point**: PyPI upload fails (network error)
+
+**State**:
+- ✅ Version validated
+- ✅ Package built
+- ❌ PyPI upload
+- ❌ GitHub release
+
+**Recovery**:
+```bash
+# Manual upload
+twine upload dist/*
+
+# Retry workflow (re-run from GitHub Actions UI)
+```
+
+**Prevention**: Add retry logic to PyPI upload
+
+#### Scenario 2: Docker Pipeline Fails
+
+**Failure Point**: ARM build fails (dependency issue)
+
+**State**:
+- ✅ PyPI published
+- ✅ GitHub release created
+- ✅ amd64 image built
+- ❌ arm64 image build
+
+**Recovery**:
+```bash
+# Fix Dockerfile
+git commit -am "fix: ARM build dependency"
+
+# Trigger rebuild
+git tag docker-rebuild-v1.2.3
+git push origin docker-rebuild-v1.2.3
+```
+
+**Impact**: PyPI package available, only Docker ARM users affected
+
+#### Scenario 3: Partial Release
+
+**Failure Point**: GitHub release creation fails
+
+**State**:
+- ✅ PyPI published
+- ❌ GitHub release
+- ❌ Docker images
+
+**Recovery**:
+```bash
+# Create release manually
+gh release create v1.2.3 \
+  --title "Release v1.2.3" \
+  --notes "..."
+ +# This triggers docker-release.yml automatically +``` + +--- + +## Monitoring and Observability + +### Metrics to Track + +#### Release Pipeline +- Success rate (target: >99%) +- Duration (target: <3 min) +- PyPI upload time (target: <60 sec) + +#### Docker Pipeline +- Success rate (target: >95%) +- Duration (target: <15 min cold, <2 min warm) +- Cache hit rate (target: >80% for code changes) + +### Alerting + +**Critical Alerts**: +- Release pipeline failure (blocks release) +- PyPI authentication failure (expired token) + +**Warning Alerts**: +- Docker build >15 min (performance degradation) +- Cache hit rate <50% (cache issue) + +### Logging + +**GitHub Actions Logs**: +- Retention: 90 days +- Downloadable: Yes +- Searchable: Limited + +**Recommended External Logging**: +```yaml +- name: Send logs to external service + if: failure() + run: | + curl -X POST https://logs.example.com/api/v1/logs \ + -H "Content-Type: application/json" \ + -d "{\"workflow\": \"${{ github.workflow }}\", \"status\": \"failed\"}" +``` + +--- + +## Future Enhancements + +### Planned Improvements + +1. **Automated Changelog Generation** + - Use conventional commits + - Generate CHANGELOG.md automatically + +2. **Pre-release Testing** + - Test builds on `test-v*` tags + - Upload to TestPyPI + +3. **Notification System** + - Slack/Discord notifications on release + - Email on failure + +4. **Performance Optimization** + - Parallel Docker builds (amd64 + arm64 simultaneously) + - Persistent runners for better caching + +5. 
**Enhanced Validation** + - Smoke tests after PyPI upload + - Container security scanning + +--- + +## References + +- [GitHub Actions Architecture](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions) +- [Docker Build Cache](https://docs.docker.com/build/cache/) +- [PyPI API Documentation](https://warehouse.pypa.io/api-reference/) + +--- + +**Last Updated**: 2025-01-21 +**Version**: 2.0 diff --git a/.github/workflows/docs/README.md b/.github/workflows/docs/README.md new file mode 100644 index 00000000..e96a4c5e --- /dev/null +++ b/.github/workflows/docs/README.md @@ -0,0 +1,1029 @@ +# GitHub Actions Workflows Documentation + +## Table of Contents + +1. [Overview](#overview) +2. [Workflow Architecture](#workflow-architecture) +3. [Workflows](#workflows) + - [Release Pipeline](#release-pipeline) + - [Docker Release](#docker-release) +4. [Usage Guide](#usage-guide) +5. [Secrets Configuration](#secrets-configuration) +6. [Troubleshooting](#troubleshooting) +7. [Advanced Topics](#advanced-topics) + +--- + +## Overview + +This repository uses a **split release pipeline** architecture to optimize release times and provide flexibility. The release process is divided into two independent workflows: + +1. **Release Pipeline** (`release.yml`) - Fast PyPI and GitHub release publication +2. **Docker Release** (`docker-release.yml`) - Multi-architecture Docker image builds with caching + +### Why Split Workflows? + +**Problem**: Docker multi-architecture builds take 10-15 minutes, blocking quick package releases. + +**Solution**: Separate Docker builds into an independent workflow that runs in parallel. 
+ +**Benefits**: +- β PyPI package available in ~2-3 minutes +- β GitHub release published immediately +- β Docker images build in parallel (non-blocking) +- β Can rebuild Docker images independently +- β Faster subsequent builds with layer caching + +--- + +## Workflow Architecture + +``` +Tag Push (v1.2.3) + β + βββΊ Release Pipeline (release.yml) + β ββ Version validation + β ββ Build Python package + β ββ Upload to PyPI β + β ββ Create GitHub Release β + β β + β βββΊ Triggers Docker Release (docker-release.yml) + β ββ Build multi-arch images + β ββ Use GitHub Actions cache + β ββ Push to Docker Hub β + β + βββΊ Total Time: + - PyPI/GitHub: 2-3 minutes + - Docker: 1-15 minutes (parallel) +``` + +### Event Flow + +```mermaid +graph TD + A[Push tag v1.2.3] --> B[release.yml triggered] + B --> C{Version Check} + C -->|Match| D[Build Package] + C -->|Mismatch| E[β Fail - Update __version__.py] + D --> F[Upload to PyPI] + F --> G[Create GitHub Release] + G --> H[docker-release.yml triggered] + H --> I[Build Docker Images] + I --> J[Push to Docker Hub] + + K[Push tag docker-rebuild-v1.2.3] --> H +``` + +--- + +## Workflows + +### Release Pipeline + +**File**: `.github/workflows/release.yml` + +#### Trigger + +```yaml +on: + push: + tags: + - 'v*' # Matches: v1.2.3, v2.0.0, etc. + - '!test-v*' # Excludes: test-v1.2.3 +``` + +#### Jobs & Steps + +##### 1. Version Extraction +```bash +# Extracts version from tag +v1.2.3 β 1.2.3 +``` + +##### 2. Version Consistency Check +Validates that the git tag matches `crawl4ai/__version__.py`: + +```python +# crawl4ai/__version__.py must contain: +__version__ = "1.2.3" # Must match tag v1.2.3 +``` + +**Failure Example**: +``` +Tag version: 1.2.3 +Package version: 1.2.2 +β Version mismatch! Please update crawl4ai/__version__.py +``` + +##### 3. Package Build +- Installs build dependencies (`build`, `twine`) +- Builds source distribution and wheel: `python -m build` +- Validates package: `twine check dist/*` + +##### 4. 
PyPI Upload +```bash +twine upload dist/* +# Uploads to: https://pypi.org/project/crawl4ai/ +``` + +**Environment Variables**: +- `TWINE_USERNAME`: `__token__` (PyPI API token authentication) +- `TWINE_PASSWORD`: `${{ secrets.PYPI_TOKEN }}` + +##### 5. GitHub Release Creation +Creates a release with: +- Tag: `v1.2.3` +- Title: `Release v1.2.3` +- Body: Installation instructions + changelog link +- Status: Published (not draft) + +**Note**: The release body includes a link to the Docker workflow status, informing users that Docker images are building. + +##### 6. Summary Report +Generates a GitHub Actions summary with: +- PyPI package URL and version +- GitHub release URL +- Link to Docker workflow status + +#### Output Artifacts + +| Artifact | Location | Time | +|----------|----------|------| +| PyPI Package | https://pypi.org/project/crawl4ai/ | ~2-3 min | +| GitHub Release | Repository releases page | ~2-3 min | + +--- + +### Docker Release + +**File**: `.github/workflows/docker-release.yml` + +#### Triggers + +This workflow has **two independent triggers**: + +##### 1. Automatic Trigger (Release Event) +```yaml +on: + release: + types: [published] +``` + +Triggers when `release.yml` publishes a GitHub release. + +##### 2. Manual Trigger (Docker Rebuild Tag) +```yaml +on: + push: + tags: + - 'docker-rebuild-v*' +``` + +Allows rebuilding Docker images without creating a new release. + +**Use case**: Fix Dockerfile, rebuild images for existing version. + +#### Jobs & Steps + +##### 1. Version Detection +Intelligently detects version from either trigger: + +```bash +# From release event: +github.event.release.tag_name β v1.2.3 β 1.2.3 + +# From docker-rebuild tag: +docker-rebuild-v1.2.3 β 1.2.3 +``` + +##### 2. Semantic Version Extraction +```bash +VERSION=1.2.3 +MAJOR=1 # First component +MINOR=1.2 # First two components +``` + +Used for Docker tag variations. + +##### 3. 
Docker Buildx Setup +Configures multi-architecture build support: +- Platform: linux/amd64, linux/arm64 +- Builder: Buildx with QEMU emulation + +##### 4. Docker Hub Authentication +```yaml +username: ${{ secrets.DOCKER_USERNAME }} +password: ${{ secrets.DOCKER_TOKEN }} +``` + +##### 5. Multi-Architecture Build & Push + +**Docker Tags Created**: +``` +unclecode/crawl4ai:1.2.3 # Exact version +unclecode/crawl4ai:1.2 # Minor version +unclecode/crawl4ai:1 # Major version +unclecode/crawl4ai:latest # Latest stable +``` + +**Platforms**: +- `linux/amd64` (x86_64 - Intel/AMD processors) +- `linux/arm64` (ARM processors - Apple Silicon, AWS Graviton) + +**Caching Configuration**: +```yaml +cache-from: type=gha # Read from GitHub Actions cache +cache-to: type=gha,mode=max # Write all layers to cache +``` + +##### 6. Summary Report +Generates a summary with: +- Published image tags +- Supported platforms +- Pull command example + +#### Docker Layer Caching + +**How It Works**: + +Docker builds images in layers: +```dockerfile +FROM python:3.12 # Layer 1 (base image) +RUN apt-get update # Layer 2 (system packages) +COPY requirements.txt . # Layer 3 (dependency file) +RUN pip install -r ... # Layer 4 (Python packages) +COPY . . 
# Layer 5 (application code) +``` + +**Cache Behavior**: + +| Change Type | Cached Layers | Rebuild Time | +|-------------|---------------|--------------| +| No changes | 1-5 | ~30-60 sec | +| Code only | 1-4 | ~1-2 min | +| Dependencies | 1-3 | ~3-5 min | +| Dockerfile | None | ~10-15 min | + +**Cache Storage**: +- Location: GitHub Actions cache +- Limit: 10GB per repository +- Retention: 7 days for unused cache +- Cleanup: Automatic (LRU eviction) + +**Cache Efficiency Example**: + +```bash +# First build (v1.0.0) +Build time: 12m 34s +Cache: 0% (cold start) + +# Second build (v1.0.1 - code change only) +Build time: 1m 47s +Cache: 85% hit rate +Cached: Base image, system packages, Python dependencies + +# Third build (v1.0.2 - dependency update) +Build time: 4m 12s +Cache: 60% hit rate +Cached: Base image, system packages +``` + +#### Output Artifacts + +| Artifact | Location | Tags | Time | +|----------|----------|------|------| +| Docker Images | Docker Hub | 4 tags | 1-15 min | + +**Docker Hub URL**: https://hub.docker.com/r/unclecode/crawl4ai + +--- + +## Usage Guide + +### Standard Release Process + +#### Step 1: Update Version + +Edit `crawl4ai/__version__.py`: +```python +__version__ = "1.2.3" +``` + +#### Step 2: Commit and Tag + +```bash +git add crawl4ai/__version__.py +git commit -m "chore: bump version to 1.2.3" +git tag v1.2.3 +git push origin main +git push origin v1.2.3 +``` + +#### Step 3: Monitor Workflows + +**Release Pipeline** (~2-3 minutes): +``` +β Version check passed +β Package built +β Uploaded to PyPI +β GitHub release created +``` + +**Docker Release** (~1-15 minutes, runs in parallel): +``` +β Images built for amd64, arm64 +β Pushed 4 tags to Docker Hub +β Cache updated +``` + +#### Step 4: Verify Deployment + +```bash +# Check PyPI +pip install crawl4ai==1.2.3 + +# Check Docker +docker pull unclecode/crawl4ai:1.2.3 +docker run unclecode/crawl4ai:1.2.3 --version +``` + +### Manual Docker Rebuild + +**When to Use**: +- Dockerfile fixed 
after release +- Security patch in base image +- Rebuild needed without new version + +**Process**: + +```bash +# Rebuild Docker images for existing version 1.2.3 +git tag docker-rebuild-v1.2.3 +git push origin docker-rebuild-v1.2.3 +``` + +This triggers **only** `docker-release.yml`, not `release.yml`. + +**Result**: +- Docker images rebuilt with same version tag +- PyPI package unchanged +- GitHub release unchanged + +### Rollback Procedure + +#### Rollback PyPI Package +PyPI does not allow re-uploading the same version. Instead: + +```bash +# Publish a patch version +git tag v1.2.4 +git push origin v1.2.4 +``` + +Then update documentation to recommend the new version. + +#### Rollback Docker Images + +```bash +# Option 1: Rebuild with fixed code +git tag docker-rebuild-v1.2.3 +git push origin docker-rebuild-v1.2.3 + +# Option 2: Manually retag in Docker Hub (advanced) +# Not recommended - use git tags for traceability +``` + +--- + +## Secrets Configuration + +### Required Secrets + +Configure these in: **Repository Settings β Secrets and variables β Actions** + +#### 1. PYPI_TOKEN + +**Purpose**: Authenticate with PyPI for package uploads + +**How to Create**: +1. Go to https://pypi.org/manage/account/token/ +2. Create token with scope: "Entire account" or "Project: crawl4ai" +3. Copy token (starts with `pypi-`) +4. Add to GitHub secrets as `PYPI_TOKEN` + +**Format**: +``` +pypi-AgEIcHlwaS5vcmcCJGQ4M2Y5YTM5LWRjMzUtNGY3MS04ZmMwLWVhNzA5MjkzMjk5YQACKl... +``` + +#### 2. DOCKER_USERNAME + +**Purpose**: Docker Hub username for authentication + +**Value**: Your Docker Hub username (e.g., `unclecode`) + +#### 3. DOCKER_TOKEN + +**Purpose**: Docker Hub access token for authentication + +**How to Create**: +1. Go to https://hub.docker.com/settings/security +2. Click "New Access Token" +3. Name: `github-actions-crawl4ai` +4. Permissions: Read, Write, Delete +5. Copy token +6. 
Add to GitHub secrets as `DOCKER_TOKEN` + +**Format**: +``` +dckr_pat_1a2b3c4d5e6f7g8h9i0j +``` + +### Built-in Secrets + +#### GITHUB_TOKEN + +**Purpose**: Create GitHub releases + +**Note**: Automatically provided by GitHub Actions. No configuration needed. + +**Permissions**: Configured in workflow file: +```yaml +permissions: + contents: write # Required for creating releases +``` + +--- + +## Troubleshooting + +### Version Mismatch Error + +**Error**: +``` +β Version mismatch! Tag: 1.2.3, Package: 1.2.2 +Please update crawl4ai/__version__.py to match the tag version +``` + +**Cause**: Git tag doesn't match `__version__` in `crawl4ai/__version__.py` + +**Fix**: +```bash +# Option 1: Update __version__.py and re-tag +vim crawl4ai/__version__.py # Change to 1.2.3 +git add crawl4ai/__version__.py +git commit -m "fix: update version to 1.2.3" +git tag -d v1.2.3 # Delete local tag +git push --delete origin v1.2.3 # Delete remote tag +git tag v1.2.3 # Create new tag +git push origin main +git push origin v1.2.3 + +# Option 2: Use correct tag +git tag v1.2.2 # Match existing __version__ +git push origin v1.2.2 +``` + +### PyPI Upload Failure + +**Error**: +``` +HTTPError: 403 Forbidden +``` + +**Causes & Fixes**: + +1. **Invalid Token**: + - Verify `PYPI_TOKEN` in GitHub secrets + - Ensure token hasn't expired + - Regenerate token on PyPI + +2. **Version Already Exists**: + ``` + HTTPError: 400 File already exists + ``` + - PyPI doesn't allow re-uploading same version + - Increment version number and retry + +3. **Package Name Conflict**: + - Ensure you own the `crawl4ai` package on PyPI + - Check token scope includes this project + +### Docker Build Failure + +**Error**: +``` +failed to solve: process "/bin/sh -c ..." did not complete successfully +``` + +**Debug Steps**: + +1. **Check Build Logs**: + - Go to Actions tab β Docker Release workflow + - Expand "Build and push Docker images" step + - Look for specific error + +2. 
**Test Locally**: + ```bash + docker build -t crawl4ai:test . + ``` + +3. **Common Issues**: + + **Dependency installation fails**: + ```dockerfile + # Check requirements.txt is valid + # Ensure all packages are available + ``` + + **Architecture-specific issues**: + ```bash + # Test both platforms locally (if on Mac with Apple Silicon) + docker buildx build --platform linux/amd64,linux/arm64 -t test . + ``` + +4. **Cache Issues**: + ```bash + # Clear cache by pushing a tag with different content + # Or wait 7 days for automatic cache eviction + ``` + +### Docker Authentication Failure + +**Error**: +``` +Error: Cannot perform an interactive login from a non TTY device +``` + +**Cause**: Docker Hub credentials invalid + +**Fix**: +1. Verify `DOCKER_USERNAME` is correct +2. Regenerate `DOCKER_TOKEN` on Docker Hub +3. Update secret in GitHub + +### Docker Release Not Triggering + +**Issue**: Pushed tag `v1.2.3`, but `docker-release.yml` didn't run + +**Causes**: + +1. **Release Not Published**: + - Check if `release.yml` completed successfully + - Verify GitHub release is published (not draft) + +2. **Workflow File Syntax Error**: + ```bash + # Validate YAML syntax + yamllint .github/workflows/docker-release.yml + ``` + +3. **Workflow Not on Default Branch**: + - Workflow files must be on `main` branch + - Check if `.github/workflows/docker-release.yml` exists on `main` + +**Debug**: +```bash +# Check workflow files +git ls-tree main .github/workflows/ + +# Check GitHub Actions tab for workflow runs +``` + +### Cache Not Working + +**Issue**: Every build takes 10-15 minutes despite using cache + +**Causes**: + +1. **Cache Scope**: + - Cache is per-branch and per-workflow + - First build on new branch is always cold + +2. **Dockerfile Changes**: + - Any change invalidates subsequent layers + - Optimize Dockerfile layer order (stable β volatile) + +3. 
**Base Image Updates**: + - `FROM python:3.12` pulls latest monthly + - Pin to specific digest for stable cache + +**Optimization**: +```dockerfile +# Good: Stable layers first +FROM python:3.12 +RUN apt-get update && apt-get install -y ... +COPY requirements.txt . +RUN pip install -r requirements.txt +COPY . . + +# Bad: Volatile layers first (breaks cache often) +FROM python:3.12 +COPY . . +RUN pip install -r requirements.txt +``` + +--- + +## Advanced Topics + +### Multi-Architecture Build Details + +#### Platform Support + +| Platform | Architecture | Use Cases | +|----------|-------------|-----------| +| linux/amd64 | x86_64 | AWS EC2, GCP, Azure, Traditional servers | +| linux/arm64 | aarch64 | Apple Silicon, AWS Graviton, Raspberry Pi | + +#### Build Process + +```bash +# Buildx uses QEMU to emulate different architectures +docker buildx create --use # Create builder +docker buildx build --platform linux/amd64,linux/arm64 ... +``` + +**Under the Hood**: +1. For each platform: + - Spawn QEMU emulator + - Execute Dockerfile instructions + - Generate platform-specific image +2. Create manifest list (multi-arch index) +3. 
Push all variants + manifest to registry + +**Pull Behavior**: +```bash +# Docker automatically selects correct platform +docker pull unclecode/crawl4ai:latest + +# On M1 Mac: Pulls arm64 variant +# On Intel Linux: Pulls amd64 variant + +# Force specific platform +docker pull --platform linux/amd64 unclecode/crawl4ai:latest +``` + +### Semantic Versioning Strategy + +#### Tag Scheme + +``` +v1.2.3 + β β β + β β ββ Patch: Bug fixes, no API changes + β ββββ Minor: New features, backward compatible + ββββββ Major: Breaking changes +``` + +#### Docker Tag Mapping + +| Git Tag | Docker Tags Created | Use Case | +|---------|---------------------|----------| +| v1.2.3 | 1.2.3, 1.2, 1, latest | Full version chain | +| v2.0.0 | 2.0.0, 2.0, 2, latest | Major version bump | + +**Example Evolution**: + +```bash +# Release v1.0.0 +Tags: 1.0.0, 1.0, 1, latest + +# Release v1.1.0 +Tags: 1.1.0, 1.1, 1, latest +# Note: 1.0 still exists, but 1 and latest now point to 1.1.0 + +# Release v1.2.0 +Tags: 1.2.0, 1.2, 1, latest +# Note: 1.0 and 1.1 still exist, but 1 and latest now point to 1.2.0 + +# Release v2.0.0 +Tags: 2.0.0, 2.0, 2, latest +# Note: All v1.x tags still exist, but latest now points to 2.0.0 +``` + +**User Pinning Strategies**: + +```bash +# Maximum stability (never updates) +docker pull unclecode/crawl4ai:1.2.3 + +# Get patch updates only +docker pull unclecode/crawl4ai:1.2 + +# Get minor updates (features, bug fixes) +docker pull unclecode/crawl4ai:1 + +# Always get latest (potentially breaking) +docker pull unclecode/crawl4ai:latest +``` + +### Cache Optimization Strategies + +#### 1. Layer Order Optimization + +```dockerfile +# BEFORE (cache breaks often) +FROM python:3.12 +COPY . 
/app # Changes every commit +RUN pip install -r requirements.txt +RUN apt-get install -y ffmpeg + +# AFTER (cache-optimized) +FROM python:3.12 +RUN apt-get update && apt-get install -y ffmpeg # Rarely changes +COPY requirements.txt /app/requirements.txt # Changes occasionally +RUN pip install -r /app/requirements.txt +COPY . /app # Changes every commit +``` + +#### 2. Multi-Stage Builds + +```dockerfile +# Build stage (cached separately) +FROM python:3.12 as builder +COPY requirements.txt . +RUN pip install --user -r requirements.txt + +# Runtime stage +FROM python:3.12-slim +COPY --from=builder /root/.local /root/.local +COPY . /app +ENV PATH=/root/.local/bin:$PATH +``` + +**Benefits**: +- Builder stage cached independently +- Runtime image smaller +- Faster rebuilds + +#### 3. Dependency Caching + +```dockerfile +# Cache pip packages +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -r requirements.txt + +# Cache apt packages +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update && apt-get install -y ... +``` + +**Note**: Requires BuildKit (enabled by default in GitHub Actions) + +#### 4. Base Image Pinning + +```dockerfile +# VOLATILE (updates monthly, breaks cache) +FROM python:3.12 + +# STABLE (fixed digest, cache preserved) +FROM python:3.12@sha256:8c5e5c77e7b9e44a6f0e3b9e8f5e5c77e7b9e44a6f0e3b9e8f5e5c77e7b9e44a +``` + +Find digest: +```bash +docker pull python:3.12 +docker inspect python:3.12 | grep -A 2 RepoDigests +``` + +### Workflow Security Best Practices + +#### 1. Secret Handling + +**Never**: +```yaml +# DON'T: Hardcode secrets +run: echo "my-secret-token" | docker login + +# DON'T: Log secrets +run: echo "Token is ${{ secrets.PYPI_TOKEN }}" +``` + +**Always**: +```yaml +# DO: Use environment variables +env: + PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} +run: twine upload dist/* + +# DO: Use action inputs (masked automatically) +uses: docker/login-action@v3 +with: + password: ${{ secrets.DOCKER_TOKEN }} +``` + +#### 2. 
Permission Minimization + +```yaml +# Specific permissions only +permissions: + contents: write # Only what's needed + # NOT: permissions: write-all +``` + +#### 3. Dependency Pinning + +```yaml +# DON'T: Use floating versions +uses: actions/checkout@v4 + +# DO: Pin to SHA (immutable) +uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 +``` + +#### 4. Token Scoping + +**PyPI Token**: +- Scope: Project-specific (`crawl4ai` only) +- Not: Account-wide access + +**Docker Token**: +- Permissions: Read, Write (not Delete unless needed) +- Expiration: Set to 1 year, rotate regularly + +### Monitoring and Observability + +#### GitHub Actions Metrics + +**Available in Actions tab**: +- Workflow run duration +- Success/failure rates +- Cache hit rates +- Artifact sizes + +#### Custom Metrics + +Add to workflow summary: +```yaml +- name: Build Metrics + run: | + echo "## Build Metrics" >> $GITHUB_STEP_SUMMARY + echo "- Duration: $(date -u -d @$SECONDS +%T)" >> $GITHUB_STEP_SUMMARY + echo "- Cache hit rate: 85%" >> $GITHUB_STEP_SUMMARY +``` + +#### External Monitoring + +**Webhooks**: Configure in Settings β Webhooks +```json +{ + "events": ["workflow_run"], + "url": "https://your-monitoring-service.com/webhook" +} +``` + +**Status Badges**: +```markdown +[](https://github.com/user/repo/actions/workflows/release.yml) + +[](https://github.com/user/repo/actions/workflows/docker-release.yml) +``` + +### Disaster Recovery + +#### Backup Workflow Files + +**Current Backup**: +- `.github/workflows/release.yml.backup` + +**Recommended**: +```bash +# Automatic backup before modifications +cp .github/workflows/release.yml .github/workflows/release.yml.backup-$(date +%Y%m%d) +git add .github/workflows/*.backup* +git commit -m "backup: workflow before modification" +``` + +#### Recovery from Failed Release + +**Scenario**: v1.2.3 release failed mid-way + +**Steps**: +1. 
**Identify what succeeded**: + - Check PyPI: `pip index versions crawl4ai` (note: `pip search` is disabled on PyPI) + - Check Docker Hub: https://hub.docker.com/r/unclecode/crawl4ai/tags + - Check GitHub Releases + +2. **Clean up partial release**: + ```bash + # Delete tag + git tag -d v1.2.3 + git push --delete origin v1.2.3 + + # Delete GitHub release (if created) + gh release delete v1.2.3 + ``` + +3. **Fix issue and retry**: + ```bash + # Fix the issue + # Re-tag and push + git tag v1.2.3 + git push origin v1.2.3 + ``` + +**Note**: Cannot delete PyPI uploads. If PyPI succeeded, increment to v1.2.4. + +### CI/CD Best Practices + +#### 1. Version Validation + +Add pre-commit hook: +```bash +#!/bin/bash +# .git/hooks/pre-commit +VERSION_FILE="crawl4ai/__version__.py" +VERSION=$(python -c "exec(open('$VERSION_FILE').read()); print(__version__)") +echo "Current version: $VERSION" +``` + +#### 2. Changelog Automation + +Use conventional commits: +```bash +git commit -m "feat: add new scraping mode" +git commit -m "fix: handle timeout errors" +git commit -m "docs: update API reference" +``` + +Generate changelog: +```bash +# Use git-cliff or similar +git cliff --tag v1.2.3 > CHANGELOG.md +``` + +#### 3. Pre-Release Testing + +Add test workflow: +```yaml +# .github/workflows/test.yml +on: + push: + tags: + - 'test-v*' + +jobs: + test-release: + runs-on: ubuntu-latest + steps: + - name: Build package + run: python -m build + - name: Upload to TestPyPI + run: twine upload --repository testpypi dist/* +``` + +#### 4. 
Release Checklist + +Create issue template: +```markdown +## Release Checklist + +- [ ] Update version in `crawl4ai/__version__.py` +- [ ] Update CHANGELOG.md +- [ ] Run tests locally: `pytest` +- [ ] Build package locally: `python -m build` +- [ ] Create and push tag: `git tag v1.2.3 && git push origin v1.2.3` +- [ ] Monitor Release Pipeline workflow +- [ ] Monitor Docker Release workflow +- [ ] Verify PyPI: `pip install crawl4ai==1.2.3` +- [ ] Verify Docker: `docker pull unclecode/crawl4ai:1.2.3` +- [ ] Announce release +``` + +--- + +## References + +### Official Documentation + +- [GitHub Actions Documentation](https://docs.github.com/en/actions) +- [Docker Build Push Action](https://github.com/docker/build-push-action) +- [PyPI Publishing Guide](https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/) + +### Related Files + +- [`release.yml`](../release.yml) - Main release workflow +- [`docker-release.yml`](../docker-release.yml) - Docker build workflow +- [`release.yml.backup`](../release.yml.backup) - Original combined workflow + +### Changelog + +| Date | Version | Changes | +|------|---------|---------| +| 2025-01-XX | 2.0 | Split workflows, added Docker caching | +| 2024-XX-XX | 1.0 | Initial combined workflow | + +--- + +## Support + +For issues or questions: +1. Check [Troubleshooting](#troubleshooting) section +2. Review [GitHub Actions logs](../../actions) +3. Create issue in repository + +--- + +**Last Updated**: 2025-01-21 +**Maintainer**: Crawl4AI Team diff --git a/.github/workflows/docs/WORKFLOW_REFERENCE.md b/.github/workflows/docs/WORKFLOW_REFERENCE.md new file mode 100644 index 00000000..208b4d62 --- /dev/null +++ b/.github/workflows/docs/WORKFLOW_REFERENCE.md @@ -0,0 +1,287 @@ +# Workflow Quick Reference + +## Quick Commands + +### Standard Release +```bash +# 1. Update version +vim crawl4ai/__version__.py # Set to "1.2.3" + +# 2. 
Commit and tag +git add crawl4ai/__version__.py +git commit -m "chore: bump version to 1.2.3" +git tag v1.2.3 +git push origin main +git push origin v1.2.3 + +# 3. Monitor +# - PyPI: ~2-3 minutes +# - Docker: ~1-15 minutes +``` + +### Docker Rebuild Only +```bash +git tag docker-rebuild-v1.2.3 +git push origin docker-rebuild-v1.2.3 +``` + +### Delete Tag (Undo Release) +```bash +# Local +git tag -d v1.2.3 + +# Remote +git push --delete origin v1.2.3 + +# GitHub Release +gh release delete v1.2.3 +``` + +--- + +## Workflow Triggers + +### release.yml +| Event | Pattern | Example | +|-------|---------|---------| +| Tag push | `v*` | `v1.2.3` | +| Excludes | `test-v*` | `test-v1.2.3` | + +### docker-release.yml +| Event | Pattern | Example | +|-------|---------|---------| +| Release published | `release.published` | Automatic | +| Tag push | `docker-rebuild-v*` | `docker-rebuild-v1.2.3` | + +--- + +## Environment Variables + +### release.yml +| Variable | Source | Example | +|----------|--------|---------| +| `VERSION` | Git tag | `1.2.3` | +| `TWINE_USERNAME` | Static | `__token__` | +| `TWINE_PASSWORD` | Secret | `pypi-Ag...` | +| `GITHUB_TOKEN` | Auto | `ghp_...` | + +### docker-release.yml +| Variable | Source | Example | +|----------|--------|---------| +| `VERSION` | Release/Tag | `1.2.3` | +| `MAJOR` | Computed | `1` | +| `MINOR` | Computed | `1.2` | +| `DOCKER_USERNAME` | Secret | `unclecode` | +| `DOCKER_TOKEN` | Secret | `dckr_pat_...` | + +--- + +## Docker Tags Generated + +| Version | Tags Created | +|---------|-------------| +| v1.0.0 | `1.0.0`, `1.0`, `1`, `latest` | +| v1.1.0 | `1.1.0`, `1.1`, `1`, `latest` | +| v1.2.3 | `1.2.3`, `1.2`, `1`, `latest` | +| v2.0.0 | `2.0.0`, `2.0`, `2`, `latest` | + +--- + +## Workflow Outputs + +### release.yml +| Output | Location | Time | +|--------|----------|------| +| PyPI Package | https://pypi.org/project/crawl4ai/ | ~2-3 min | +| GitHub Release | Repository β Releases | ~2-3 min | +| Workflow Summary | Actions β 
Run β Summary | Immediate | + +### docker-release.yml +| Output | Location | Time | +|--------|----------|------| +| Docker Images | https://hub.docker.com/r/unclecode/crawl4ai | ~1-15 min | +| Workflow Summary | Actions β Run β Summary | Immediate | + +--- + +## Common Issues + +| Issue | Solution | +|-------|----------| +| Version mismatch | Update `crawl4ai/__version__.py` to match tag | +| PyPI 403 Forbidden | Check `PYPI_TOKEN` secret | +| PyPI 400 File exists | Version already published, increment version | +| Docker auth failed | Regenerate `DOCKER_TOKEN` | +| Docker build timeout | Check Dockerfile, review build logs | +| Cache not working | First build on branch always cold | + +--- + +## Secrets Checklist + +- [ ] `PYPI_TOKEN` - PyPI API token (project or account scope) +- [ ] `DOCKER_USERNAME` - Docker Hub username +- [ ] `DOCKER_TOKEN` - Docker Hub access token (read/write) +- [ ] `GITHUB_TOKEN` - Auto-provided (no action needed) + +--- + +## Workflow Dependencies + +### release.yml Dependencies +```yaml +Python: 3.12 +Actions: + - actions/checkout@v4 + - actions/setup-python@v5 + - softprops/action-gh-release@v2 +PyPI Packages: + - build + - twine +``` + +### docker-release.yml Dependencies +```yaml +Actions: + - actions/checkout@v4 + - docker/setup-buildx-action@v3 + - docker/login-action@v3 + - docker/build-push-action@v5 +Docker: + - Buildx + - QEMU (for multi-arch) +``` + +--- + +## Cache Information + +### Type +- GitHub Actions Cache (`type=gha`) + +### Storage +- **Limit**: 10GB per repository +- **Retention**: 7 days for unused entries +- **Cleanup**: Automatic LRU eviction + +### Performance +| Scenario | Cache Hit | Build Time | +|----------|-----------|------------| +| First build | 0% | 10-15 min | +| Code change only | 85% | 1-2 min | +| Dependency update | 60% | 3-5 min | +| No changes | 100% | 30-60 sec | + +--- + +## Build Platforms + +| Platform | Architecture | Devices | +|----------|--------------|---------| +| linux/amd64 | x86_64 | 
Intel/AMD servers, AWS EC2, GCP | +| linux/arm64 | aarch64 | Apple Silicon, AWS Graviton, Raspberry Pi | + +--- + +## Version Validation + +### Pre-Tag Checklist +```bash +# Check current version +python -c "from crawl4ai.__version__ import __version__; print(__version__)" + +# Verify it matches intended tag +# If tag is v1.2.3, version should be "1.2.3" +``` + +### Post-Release Verification +```bash +# PyPI +pip install crawl4ai==1.2.3 +python -c "import crawl4ai; print(crawl4ai.__version__)" + +# Docker +docker pull unclecode/crawl4ai:1.2.3 +docker run unclecode/crawl4ai:1.2.3 python -c "import crawl4ai; print(crawl4ai.__version__)" +``` + +--- + +## Monitoring URLs + +| Service | URL | +|---------|-----| +| GitHub Actions | `https://github.com/{owner}/{repo}/actions` | +| PyPI Project | `https://pypi.org/project/crawl4ai/` | +| Docker Hub | `https://hub.docker.com/r/unclecode/crawl4ai` | +| GitHub Releases | `https://github.com/{owner}/{repo}/releases` | + +--- + +## Rollback Strategy + +### PyPI (Cannot Delete) +```bash +# Increment patch version +git tag v1.2.4 +git push origin v1.2.4 +``` + +### Docker (Can Overwrite) +```bash +# Rebuild with fix +git tag docker-rebuild-v1.2.3 +git push origin docker-rebuild-v1.2.3 +``` + +### GitHub Release +```bash +# Delete release +gh release delete v1.2.3 + +# Delete tag +git push --delete origin v1.2.3 +``` + +--- + +## Status Badge Markdown + +```markdown +[](https://github.com/{owner}/{repo}/actions/workflows/release.yml) + +[](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml) +``` + +--- + +## Timeline Example + +``` +0:00 - Push tag v1.2.3 +0:01 - release.yml starts +0:02 - Version validation passes +0:03 - Package built +0:04 - PyPI upload starts +0:06 - PyPI upload complete β +0:07 - GitHub release created β +0:08 - release.yml complete +0:08 - docker-release.yml triggered +0:10 - Docker build starts +0:12 - amd64 image built (cache hit) +0:14 - arm64 image built (cache hit) +0:15 - Images 
pushed to Docker Hub β +0:16 - docker-release.yml complete + +Total: ~16 minutes +Critical path (PyPI + GitHub): ~8 minutes +``` + +--- + +## Contact + +For workflow issues: +1. Check Actions tab for logs +2. Review this reference +3. See [README.md](./README.md) for detailed docs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 3ee9042c..bf1ad7dc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -10,53 +10,53 @@ jobs: runs-on: ubuntu-latest permissions: contents: write # Required for creating releases - + steps: - name: Checkout code uses: actions/checkout@v4 - + - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.12' - + - name: Extract version from tag id: get_version run: | TAG_VERSION=${GITHUB_REF#refs/tags/v} echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT echo "Releasing version: $TAG_VERSION" - + - name: Install package dependencies run: | pip install -e . - + - name: Check version consistency run: | TAG_VERSION=${{ steps.get_version.outputs.VERSION }} PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)") - + echo "Tag version: $TAG_VERSION" echo "Package version: $PACKAGE_VERSION" - + if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then echo "β Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION" echo "Please update crawl4ai/__version__.py to match the tag version" exit 1 fi echo "β Version check passed: $TAG_VERSION" - + - name: Install build dependencies run: | python -m pip install --upgrade pip pip install build twine - + - name: Build package run: python -m build - + - name: Check package run: twine check dist/* - + - name: Upload to PyPI env: TWINE_USERNAME: __token__ @@ -65,37 +65,7 @@ jobs: echo "π¦ Uploading to PyPI..." 
twine upload dist/* echo "β Package uploaded to https://pypi.org/project/crawl4ai/" - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Log in to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Extract major and minor versions - id: versions - run: | - VERSION=${{ steps.get_version.outputs.VERSION }} - MAJOR=$(echo $VERSION | cut -d. -f1) - MINOR=$(echo $VERSION | cut -d. -f1-2) - echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT - echo "MINOR=$MINOR" >> $GITHUB_OUTPUT - - - name: Build and push Docker images - uses: docker/build-push-action@v5 - with: - context: . - push: true - tags: | - unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} - unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }} - unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }} - unclecode/crawl4ai:latest - platforms: linux/amd64,linux/arm64 - + - name: Create GitHub Release uses: softprops/action-gh-release@v2 with: @@ -103,26 +73,29 @@ jobs: name: Release v${{ steps.get_version.outputs.VERSION }} body: | ## π Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released! - + ### π¦ Installation - + **PyPI:** ```bash pip install crawl4ai==${{ steps.get_version.outputs.VERSION }} ``` - + **Docker:** ```bash docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} docker pull unclecode/crawl4ai:latest ``` - + + **Note:** Docker images are being built and will be available shortly. + Check the [Docker Release workflow](https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml) for build status. + ### π What's Changed See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details. draft: false prerelease: false token: ${{ secrets.GITHUB_TOKEN }} - + - name: Summary run: | echo "## π Release Complete!" 
>> $GITHUB_STEP_SUMMARY @@ -132,11 +105,9 @@ jobs: echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "### π³ Docker Images" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY - echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY echo "### π GitHub Release" >> $GITHUB_STEP_SUMMARY - echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY \ No newline at end of file + echo "- https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### π³ Docker Images" >> $GITHUB_STEP_SUMMARY + echo "Docker images are being built in a separate workflow." 
>> $GITHUB_STEP_SUMMARY + echo "Check: https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/release.yml.backup b/.github/workflows/release.yml.backup new file mode 100644 index 00000000..3ee9042c --- /dev/null +++ b/.github/workflows/release.yml.backup @@ -0,0 +1,142 @@ +name: Release Pipeline +on: + push: + tags: + - 'v*' + - '!test-v*' # Exclude test tags + +jobs: + release: + runs-on: ubuntu-latest + permissions: + contents: write # Required for creating releases + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Extract version from tag + id: get_version + run: | + TAG_VERSION=${GITHUB_REF#refs/tags/v} + echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT + echo "Releasing version: $TAG_VERSION" + + - name: Install package dependencies + run: | + pip install -e . + + - name: Check version consistency + run: | + TAG_VERSION=${{ steps.get_version.outputs.VERSION }} + PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)") + + echo "Tag version: $TAG_VERSION" + echo "Package version: $PACKAGE_VERSION" + + if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then + echo "β Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION" + echo "Please update crawl4ai/__version__.py to match the tag version" + exit 1 + fi + echo "β Version check passed: $TAG_VERSION" + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install build twine + + - name: Build package + run: python -m build + + - name: Check package + run: twine check dist/* + + - name: Upload to PyPI + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: | + echo "π¦ Uploading to PyPI..." 
+ twine upload dist/* + echo "β Package uploaded to https://pypi.org/project/crawl4ai/" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Extract major and minor versions + id: versions + run: | + VERSION=${{ steps.get_version.outputs.VERSION }} + MAJOR=$(echo $VERSION | cut -d. -f1) + MINOR=$(echo $VERSION | cut -d. -f1-2) + echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT + echo "MINOR=$MINOR" >> $GITHUB_OUTPUT + + - name: Build and push Docker images + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} + unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }} + unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }} + unclecode/crawl4ai:latest + platforms: linux/amd64,linux/arm64 + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: v${{ steps.get_version.outputs.VERSION }} + name: Release v${{ steps.get_version.outputs.VERSION }} + body: | + ## π Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released! + + ### π¦ Installation + + **PyPI:** + ```bash + pip install crawl4ai==${{ steps.get_version.outputs.VERSION }} + ``` + + **Docker:** + ```bash + docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }} + docker pull unclecode/crawl4ai:latest + ``` + + ### π What's Changed + See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details. + draft: false + prerelease: false + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Summary + run: | + echo "## π Release Complete!" 
>> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### π¦ PyPI Package" >> $GITHUB_STEP_SUMMARY + echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY + echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### π³ Docker Images" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY + echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### π GitHub Release" >> $GITHUB_STEP_SUMMARY + echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.gitignore b/.gitignore index a5389a3e..4679b80c 100644 --- a/.gitignore +++ b/.gitignore @@ -266,6 +266,8 @@ continue_config.json .llm.env .private/ +.claude/ + CLAUDE_MONITOR.md CLAUDE.md diff --git a/docs/md_v2/assets/crawl4ai-skill.zip b/docs/md_v2/assets/crawl4ai-skill.zip new file mode 100644 index 00000000..21785b02 Binary files /dev/null and b/docs/md_v2/assets/crawl4ai-skill.zip differ diff --git a/docs/md_v2/complete-sdk-reference.md b/docs/md_v2/complete-sdk-reference.md new file mode 100644 index 00000000..d4a5ba65 --- /dev/null +++ b/docs/md_v2/complete-sdk-reference.md @@ -0,0 +1,5196 @@ +# Crawl4AI Complete SDK Documentation + +**Generated:** 2025-10-19 12:56 +**Format:** Ultra-Dense Reference (Optimized for AI Assistants) +**Crawl4AI Version:** 0.7.4 + +--- + +## Navigation + + +- [Installation & Setup](#installation--setup) +- [Quick Start](#quick-start) +- [Core API](#core-api) +- 
[Configuration](#configuration) +- [Crawling Patterns](#crawling-patterns) +- [Content Processing](#content-processing) +- [Extraction Strategies](#extraction-strategies) +- [Advanced Features](#advanced-features) + +--- + + +# Installation & Setup + +# Installation & Setup (2023 Edition) +## 1. Basic Installation +```bash +pip install crawl4ai +``` +## 2. Initial Setup & Diagnostics +### 2.1 Run the Setup Command +```bash +crawl4ai-setup +``` +- Performs OS-level checks (e.g., missing libs on Linux) +- Confirms your environment is ready to crawl +### 2.2 Diagnostics +```bash +crawl4ai-doctor +``` +- Check Python version compatibility +- Verify Playwright installation +- Inspect environment variables or library conflicts +If any issues arise, follow its suggestions (e.g., installing additional system packages) and re-run `crawl4ai-setup`. +## 3. Verifying Installation: A Simple Crawl (Skip this step if you already run `crawl4ai-doctor`) +Below is a minimal Python script demonstrating a **basic** crawl. It uses our new **`BrowserConfig`** and **`CrawlerRunConfig`** for clarity, though no custom settings are passed in this example: +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.example.com", + ) + print(result.markdown[:300]) # Show the first 300 characters of extracted text + +if __name__ == "__main__": + asyncio.run(main()) +``` +- A headless browser session loads `example.com` +- Crawl4AI returns ~300 characters of markdown. +If errors occur, rerun `crawl4ai-doctor` or manually ensure Playwright is installed correctly. +## 4. 
Advanced Installation (Optional)
+### 4.1 Torch, Transformers, or All
+- **Text Clustering (Torch)**
+  ```bash
+  pip install crawl4ai[torch]
+  crawl4ai-setup
+  ```
+- **Transformers**
+  ```bash
+  pip install crawl4ai[transformer]
+  crawl4ai-setup
+  ```
+- **All Features**
+  ```bash
+  pip install crawl4ai[all]
+  crawl4ai-setup
+  ```
+```bash
+crawl4ai-download-models
+```
+## 5. Docker (Experimental)
+```bash
+docker pull unclecode/crawl4ai:basic
+docker run -p 11235:11235 unclecode/crawl4ai:basic
+```
+You can then make POST requests to `http://localhost:11235/crawl` to perform crawls. **Production usage** is discouraged until our new Docker approach is ready (planned in Jan or Feb 2025).
+## 6. Local Server Mode (Legacy)
+## Summary
+1. **Install** with `pip install crawl4ai` and run `crawl4ai-setup`.
+2. **Diagnose** with `crawl4ai-doctor` if you see errors.
+3. **Verify** by crawling `example.com` with minimal `BrowserConfig` + `CrawlerRunConfig`.
+
+
+
+# Quick Start
+
+# Getting Started with Crawl4AI
+1. Run your **first crawl** using minimal configuration.
+2. Experiment with a simple **CSS-based extraction** strategy.
+3. Crawl a **dynamic** page that loads content via JavaScript.
+## 1. Introduction
+- An asynchronous crawler, **`AsyncWebCrawler`**.
+- Configurable browser and run settings via **`BrowserConfig`** and **`CrawlerRunConfig`**.
+- Automatic HTML-to-Markdown conversion via **`DefaultMarkdownGenerator`** (supports optional filters).
+- Multiple extraction strategies (LLM-based or "traditional" CSS/XPath-based).
+## 2. 
Your First Crawl
+Here's a minimal Python script that creates an **`AsyncWebCrawler`**, fetches a webpage, and prints the first 300 characters of its Markdown output:
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com")
+        print(result.markdown[:300])  # Print first 300 chars
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+- **`AsyncWebCrawler`** launches a headless browser (Chromium by default).
+- It fetches `https://example.com`.
+- Crawl4AI automatically converts the HTML into Markdown.
+## 3. Basic Configuration (Light Introduction)
+1. **`BrowserConfig`**: Controls browser behavior (headless or full UI, user agent, JavaScript toggles, etc.).
+2. **`CrawlerRunConfig`**: Controls how each crawl runs (caching, extraction, timeouts, hooking, etc.).
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+
+async def main():
+    browser_conf = BrowserConfig(headless=True)  # or False to see the browser
+    run_conf = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS
+    )
+
+    async with AsyncWebCrawler(config=browser_conf) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=run_conf
+        )
+        print(result.markdown)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+> IMPORTANT: By default cache mode is set to `CacheMode.BYPASS` to have fresh content. Set `CacheMode.ENABLED` to enable caching.
+## 4. Generating Markdown Output
+- **`result.markdown`**:
+- **`result.markdown.fit_markdown`**:
+  The same content after applying any configured **content filter** (e.g., `PruningContentFilter`). 
+### Example: Using a Filter with `DefaultMarkdownGenerator`
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+md_generator = DefaultMarkdownGenerator(
+    content_filter=PruningContentFilter(threshold=0.4, threshold_type="fixed")
+)
+
+config = CrawlerRunConfig(
+    cache_mode=CacheMode.BYPASS,
+    markdown_generator=md_generator
+)
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun("https://news.ycombinator.com", config=config)
+    print("Raw Markdown length:", len(result.markdown.raw_markdown))
+    print("Fit Markdown length:", len(result.markdown.fit_markdown))
+```
+**Note**: If you do **not** specify a content filter or markdown generator, you'll typically see only the raw Markdown. `PruningContentFilter` may add around `50ms` in processing time. We'll dive deeper into these strategies in a dedicated **Markdown Generation** tutorial.
+## 5. Simple Data Extraction (CSS-based)
+```python
+from crawl4ai import JsonCssExtractionStrategy
+from crawl4ai import LLMConfig
+
+# Generate a schema (one-time cost)
+html = "