Compare commits
93 Commits
fix/relati
...
bugfix/aru
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c003cb6e4f | ||
|
|
2c918155aa | ||
|
|
854694ef33 | ||
|
|
6534ece026 | ||
|
|
89e28d4eee | ||
|
|
c0f1865287 | ||
|
|
46ef1116c4 | ||
|
|
4df83893ac | ||
|
|
13e116610d | ||
|
|
613097d121 | ||
|
|
44ef0682b0 | ||
|
|
b74524fdfb | ||
|
|
bcac486921 | ||
|
|
6aef5a120f | ||
|
|
7cac008c10 | ||
|
|
7e8fb3a8f3 | ||
|
|
3efb59fb9a | ||
|
|
c7b7475b92 | ||
|
|
b71d624168 | ||
|
|
d670dcde0a | ||
|
|
f8606f6865 | ||
|
|
52da8d72bc | ||
|
|
8b7e67566e | ||
|
|
7388baa205 | ||
|
|
897bc3a493 | ||
|
|
8a37710313 | ||
|
|
97c92c4f62 | ||
|
|
f6a02c4358 | ||
|
|
6d1a398419 | ||
|
|
c107617920 | ||
|
|
69d0ef89dd | ||
|
|
1bf85bcb1a | ||
|
|
749232ba1a | ||
|
|
c7288dd2f1 | ||
|
|
fdbcddbf1a | ||
|
|
564d437d97 | ||
|
|
9cd06ea7eb | ||
|
|
c91b235cb7 | ||
|
|
eb257c2ba3 | ||
|
|
8d364a0731 | ||
|
|
6aff0e55aa | ||
|
|
38a0742708 | ||
|
|
a720a3a9fe | ||
|
|
017144c2dd | ||
|
|
32887ea40d | ||
|
|
eea41bf1ca | ||
|
|
21c302f439 | ||
|
|
8fc1747225 | ||
|
|
aadab30c3d | ||
|
|
4a04b8506a | ||
|
|
7dadb65b80 | ||
|
|
a3f057e19f | ||
|
|
216019f29a | ||
|
|
abe8a92561 | ||
|
|
5a4f21fad9 | ||
|
|
611d48f93b | ||
|
|
936397ee0e | ||
|
|
2c373f0642 | ||
|
|
d2c7f345ab | ||
|
|
8c62277718 | ||
|
|
46e1a67f61 | ||
|
|
7dfe528d43 | ||
|
|
5145d42df7 | ||
|
|
9900f63f97 | ||
|
|
9292b265fc | ||
|
|
80aa6c11d9 | ||
|
|
749d200866 | ||
|
|
408ad1b750 | ||
|
|
35dd206925 | ||
|
|
8d30662647 | ||
|
|
ef46df10da | ||
|
|
0d8d043109 | ||
|
|
70af81d9d7 | ||
|
|
2dc6588573 | ||
|
|
361499d291 | ||
|
|
3fe49a766c | ||
|
|
fef715a891 | ||
|
|
69e8ca3d0d | ||
|
|
a1950afd98 | ||
|
|
d0eb5a6ffe | ||
|
|
77559f3373 | ||
|
|
3899ac3d3b | ||
|
|
23431d8109 | ||
|
|
1717827732 | ||
|
|
f8eaf01ed1 | ||
|
|
14b42b1f9a | ||
|
|
3bc56dd028 | ||
|
|
1874a7b8d2 | ||
|
|
6a3b3e9d38 | ||
|
|
4ed33fce9e | ||
|
|
f7a3366f72 | ||
|
|
88a9fbbb7e | ||
|
|
be63c98db3 |
81
.github/workflows/docker-release.yml
vendored
Normal file
81
.github/workflows/docker-release.yml
vendored
Normal file
@@ -0,0 +1,81 @@
|
||||
name: Docker Release
|
||||
on:
|
||||
release:
|
||||
types: [published]
|
||||
push:
|
||||
tags:
|
||||
- 'docker-rebuild-v*' # Allow manual Docker rebuilds via tags
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Extract version from release or tag
|
||||
id: get_version
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "release" ]; then
|
||||
# Triggered by release event
|
||||
VERSION="${{ github.event.release.tag_name }}"
|
||||
VERSION=${VERSION#v} # Remove 'v' prefix
|
||||
else
|
||||
# Triggered by docker-rebuild-v* tag
|
||||
VERSION=${GITHUB_REF#refs/tags/docker-rebuild-v}
|
||||
fi
|
||||
echo "VERSION=$VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Building Docker images for version: $VERSION"
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
echo "Semantic versions - Major: $MAJOR, Minor: $MINOR"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🐳 Docker Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### Published Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### Platforms" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- linux/amd64" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- linux/arm64" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🚀 Pull Command" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||
917
.github/workflows/docs/ARCHITECTURE.md
vendored
Normal file
917
.github/workflows/docs/ARCHITECTURE.md
vendored
Normal file
@@ -0,0 +1,917 @@
|
||||
# Workflow Architecture Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
This document describes the technical architecture of the split release pipeline for Crawl4AI.
|
||||
|
||||
---
|
||||
|
||||
## Architecture Diagram
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Developer │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ git tag v1.2.3 │
|
||||
│ git push --tags │
|
||||
└──────────────────────────────┬──────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ GitHub Repository │
|
||||
│ │
|
||||
│ ┌────────────────────────────────────────────────────────┐ │
|
||||
│ │ Tag Event: v1.2.3 │ │
|
||||
│ └────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌────────────────────────────────────────────────────────┐ │
|
||||
│ │ release.yml (Release Pipeline) │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 1. Extract Version │ │ │
|
||||
│ │ │ v1.2.3 → 1.2.3 │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 2. Validate Version │ │ │
|
||||
│ │ │ Tag == __version__.py │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 3. Build Python Package │ │ │
|
||||
│ │ │ - Source dist (.tar.gz) │ │ │
|
||||
│ │ │ - Wheel (.whl) │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 4. Upload to PyPI │ │ │
|
||||
│ │ │ - Authenticate with token │ │ │
|
||||
│ │ │ - Upload dist/* │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 5. Create GitHub Release │ │ │
|
||||
│ │ │ - Tag: v1.2.3 │ │ │
|
||||
│ │ │ - Body: Install instructions │ │ │
|
||||
│ │ │ - Status: Published │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ └────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌────────────────────────────────────────────────────────┐ │
|
||||
│ │ Release Event: published (v1.2.3) │ │
|
||||
│ └────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌────────────────────────────────────────────────────────┐ │
|
||||
│ │ docker-release.yml (Docker Pipeline) │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 1. Extract Version from Release │ │ │
|
||||
│ │ │ github.event.release.tag_name → 1.2.3 │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 2. Parse Semantic Versions │ │ │
|
||||
│ │ │ 1.2.3 → Major: 1, Minor: 1.2 │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 3. Setup Multi-Arch Build │ │ │
|
||||
│ │ │ - Docker Buildx │ │ │
|
||||
│ │ │ - QEMU emulation │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 4. Authenticate Docker Hub │ │ │
|
||||
│ │ │ - Username: DOCKER_USERNAME │ │ │
|
||||
│ │ │ - Token: DOCKER_TOKEN │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 5. Build Multi-Arch Images │ │ │
|
||||
│ │ │ ┌────────────────┬────────────────┐ │ │ │
|
||||
│ │ │ │ linux/amd64 │ linux/arm64 │ │ │ │
|
||||
│ │ │ └────────────────┴────────────────┘ │ │ │
|
||||
│ │ │ Cache: GitHub Actions (type=gha) │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ │ ┌──────────────────────────────────────────────┐ │ │
|
||||
│ │ │ 6. Push to Docker Hub │ │ │
|
||||
│ │ │ Tags: │ │ │
|
||||
│ │ │ - unclecode/crawl4ai:1.2.3 │ │ │
|
||||
│ │ │ - unclecode/crawl4ai:1.2 │ │ │
|
||||
│ │ │ - unclecode/crawl4ai:1 │ │ │
|
||||
│ │ │ - unclecode/crawl4ai:latest │ │ │
|
||||
│ │ └──────────────────────────────────────────────┘ │ │
|
||||
│ └────────────────────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ External Services │
|
||||
│ │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │
|
||||
│ │ PyPI │ │ Docker Hub │ │ GitHub │ │
|
||||
│ │ │ │ │ │ │ │
|
||||
│ │ crawl4ai │ │ unclecode/ │ │ Releases │ │
|
||||
│ │ 1.2.3 │ │ crawl4ai │ │ v1.2.3 │ │
|
||||
│ └──────────────┘ └──────────────┘ └──────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Component Details
|
||||
|
||||
### 1. Release Pipeline (release.yml)
|
||||
|
||||
#### Purpose
|
||||
Fast publication of Python package and GitHub release.
|
||||
|
||||
#### Input
|
||||
- **Trigger**: Git tag matching `v*` (excluding `test-v*`)
|
||||
- **Example**: `v1.2.3`
|
||||
|
||||
#### Processing Stages
|
||||
|
||||
##### Stage 1: Version Extraction
|
||||
```bash
|
||||
Input: refs/tags/v1.2.3
|
||||
Output: VERSION=1.2.3
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```bash
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v} # Remove 'refs/tags/v' prefix
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
```
|
||||
|
||||
##### Stage 2: Version Validation
|
||||
```bash
|
||||
Input: TAG_VERSION=1.2.3
|
||||
Check: crawl4ai/__version__.py contains __version__ = "1.2.3"
|
||||
Output: Pass/Fail
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```bash
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
exit 1
|
||||
fi
|
||||
```
|
||||
|
||||
##### Stage 3: Package Build
|
||||
```bash
|
||||
Input: Source code + pyproject.toml
|
||||
Output: dist/crawl4ai-1.2.3.tar.gz
|
||||
dist/crawl4ai-1.2.3-py3-none-any.whl
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```bash
|
||||
python -m build
|
||||
# Uses build backend defined in pyproject.toml
|
||||
```
|
||||
|
||||
##### Stage 4: PyPI Upload
|
||||
```bash
|
||||
Input: dist/*.{tar.gz,whl}
|
||||
Auth: PYPI_TOKEN
|
||||
Output: Package published to PyPI
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```bash
|
||||
twine upload dist/*
|
||||
# Environment:
|
||||
# TWINE_USERNAME: __token__
|
||||
# TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
```
|
||||
|
||||
##### Stage 5: GitHub Release Creation
|
||||
```bash
|
||||
Input: Tag: v1.2.3
|
||||
Body: Markdown content
|
||||
Output: Published GitHub release
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```yaml
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v1.2.3
|
||||
name: Release v1.2.3
|
||||
body: |
|
||||
Installation instructions and changelog
|
||||
draft: false
|
||||
prerelease: false
|
||||
```
|
||||
|
||||
#### Output
|
||||
- **PyPI Package**: https://pypi.org/project/crawl4ai/1.2.3/
|
||||
- **GitHub Release**: Published release on repository
|
||||
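- **Event**: `release.published` (triggers Docker workflow; note that a release created with the default `GITHUB_TOKEN` does not trigger other workflows, so the release step must authenticate with a PAT or GitHub App token for this hand-off to fire)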
|
||||
|
||||
#### Timeline
|
||||
```
|
||||
0:00 - Tag pushed
|
||||
0:01 - Checkout + Python setup
|
||||
0:02 - Version validation
|
||||
0:03 - Package build
|
||||
0:04 - PyPI upload starts
|
||||
0:06 - PyPI upload complete
|
||||
0:07 - GitHub release created
|
||||
0:08 - Workflow complete
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 2. Docker Release Pipeline (docker-release.yml)
|
||||
|
||||
#### Purpose
|
||||
Build and publish multi-architecture Docker images.
|
||||
|
||||
#### Inputs
|
||||
|
||||
##### Input 1: Release Event (Automatic)
|
||||
```yaml
|
||||
Event: release.published
|
||||
Data: github.event.release.tag_name = "v1.2.3"
|
||||
```
|
||||
|
||||
##### Input 2: Docker Rebuild Tag (Manual)
|
||||
```yaml
|
||||
Tag: docker-rebuild-v1.2.3
|
||||
```
|
||||
|
||||
#### Processing Stages
|
||||
|
||||
##### Stage 1: Version Detection
|
||||
```bash
|
||||
# From release event:
|
||||
VERSION = github.event.release.tag_name.removeprefix("v")
|
||||
# Result: "1.2.3"
|
||||
|
||||
# From rebuild tag:
|
||||
VERSION = GITHUB_REF.replace("refs/tags/docker-rebuild-v", "")
|
||||
# Result: "1.2.3"
|
||||
```
|
||||
|
||||
##### Stage 2: Semantic Version Parsing
|
||||
```bash
|
||||
Input: VERSION=1.2.3
|
||||
Output: MAJOR=1
|
||||
MINOR=1.2
|
||||
PATCH=3 (implicit)
|
||||
```
|
||||
|
||||
**Implementation**:
|
||||
```bash
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1) # Extract first component
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2) # Extract first two components
|
||||
```
|
||||
|
||||
##### Stage 3: Multi-Architecture Setup
|
||||
```yaml
|
||||
Setup:
|
||||
- Docker Buildx (multi-platform builder)
|
||||
- QEMU (ARM emulation on x86; requires a docker/setup-qemu-action step before the build)
|
||||
|
||||
Platforms:
|
||||
- linux/amd64 (x86_64)
|
||||
- linux/arm64 (aarch64)
|
||||
```
|
||||
|
||||
**Architecture**:
|
||||
```
|
||||
GitHub Runner (linux/amd64)
|
||||
├─ Buildx Builder
|
||||
│ ├─ Native: Build linux/amd64 image
|
||||
│ └─ QEMU: Emulate ARM to build linux/arm64 image
|
||||
└─ Generate manifest list (points to both images)
|
||||
```
|
||||
|
||||
##### Stage 4: Docker Hub Authentication
|
||||
```bash
|
||||
Input: DOCKER_USERNAME
|
||||
DOCKER_TOKEN
|
||||
Output: Authenticated Docker client
|
||||
```
|
||||
|
||||
##### Stage 5: Build with Cache
|
||||
```yaml
|
||||
Cache Configuration:
|
||||
cache-from: type=gha # Read from GitHub Actions cache
|
||||
cache-to: type=gha,mode=max # Write all layers
|
||||
|
||||
Cache Key Components:
|
||||
- Workflow file path
|
||||
- Branch name
|
||||
- Architecture (amd64/arm64)
|
||||
```
|
||||
|
||||
**Cache Hierarchy**:
|
||||
```
|
||||
Cache Entry: main/docker-release.yml/linux-amd64
|
||||
├─ Layer: sha256:abc123... (FROM python:3.12)
|
||||
├─ Layer: sha256:def456... (RUN apt-get update)
|
||||
├─ Layer: sha256:ghi789... (COPY requirements.txt)
|
||||
├─ Layer: sha256:jkl012... (RUN pip install)
|
||||
└─ Layer: sha256:mno345... (COPY . /app)
|
||||
|
||||
Cache Hit/Miss Logic:
|
||||
- If layer input unchanged → cache hit → skip build
|
||||
- If layer input changed → cache miss → rebuild that layer and all subsequent layers
|
||||
```
|
||||
|
||||
##### Stage 6: Tag Generation
|
||||
```bash
|
||||
Input: VERSION=1.2.3, MAJOR=1, MINOR=1.2
|
||||
|
||||
Output Tags:
|
||||
- unclecode/crawl4ai:1.2.3 (exact version)
|
||||
- unclecode/crawl4ai:1.2 (minor version)
|
||||
- unclecode/crawl4ai:1 (major version)
|
||||
- unclecode/crawl4ai:latest (latest stable)
|
||||
```
|
||||
|
||||
**Tag Strategy**:
|
||||
- All tags point to same image SHA
|
||||
- Users can pin to desired stability level
|
||||
- Pushing new version updates `1`, `1.2`, and `latest` automatically
|
||||
|
||||
##### Stage 7: Push to Registry
|
||||
```bash
|
||||
For each tag:
|
||||
For each platform (amd64, arm64):
|
||||
Push image to Docker Hub
|
||||
|
||||
Create manifest list:
|
||||
Manifest: unclecode/crawl4ai:1.2.3
|
||||
├─ linux/amd64: sha256:abc...
|
||||
└─ linux/arm64: sha256:def...
|
||||
|
||||
Docker CLI automatically selects correct platform on pull
|
||||
```
|
||||
|
||||
#### Output
|
||||
- **Docker Images**: 4 tags, all pointing to the same multi-arch manifest list (2 platform images: linux/amd64 and linux/arm64)
|
||||
- **Docker Hub**: https://hub.docker.com/r/unclecode/crawl4ai/tags
|
||||
|
||||
#### Timeline
|
||||
|
||||
**Cold Cache (First Build)**:
|
||||
```
|
||||
0:00 - Release event received
|
||||
0:01 - Checkout + Buildx setup
|
||||
0:02 - Docker Hub auth
|
||||
0:03 - Start build (amd64)
|
||||
0:08 - Complete amd64 build
|
||||
0:09 - Start build (arm64)
|
||||
0:14 - Complete arm64 build
|
||||
0:15 - Generate manifests
|
||||
0:16 - Push all tags
|
||||
0:17 - Workflow complete
|
||||
```
|
||||
|
||||
**Warm Cache (Code Change Only)**:
|
||||
```
|
||||
0:00 - Release event received
|
||||
0:01 - Checkout + Buildx setup
|
||||
0:02 - Docker Hub auth
|
||||
0:03 - Start build (amd64) - cache hit for layers 1-4
|
||||
0:04 - Complete amd64 build (only layer 5 rebuilt)
|
||||
0:05 - Start build (arm64) - cache hit for layers 1-4
|
||||
0:06 - Complete arm64 build (only layer 5 rebuilt)
|
||||
0:07 - Generate manifests
|
||||
0:08 - Push all tags
|
||||
0:09 - Workflow complete
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Flow
|
||||
|
||||
### Version Information Flow
|
||||
|
||||
```
|
||||
Developer
|
||||
│
|
||||
▼
|
||||
crawl4ai/__version__.py
|
||||
__version__ = "1.2.3"
|
||||
│
|
||||
├─► Git Tag
|
||||
│ v1.2.3
|
||||
│ │
|
||||
│ ▼
|
||||
│ release.yml
|
||||
│ │
|
||||
│ ├─► Validation
|
||||
│ │ ✓ Match
|
||||
│ │
|
||||
│ ├─► PyPI Package
|
||||
│ │ crawl4ai==1.2.3
|
||||
│ │
|
||||
│ └─► GitHub Release
|
||||
│ v1.2.3
|
||||
│ │
|
||||
│ ▼
|
||||
│ docker-release.yml
|
||||
│ │
|
||||
│ └─► Docker Tags
|
||||
│ 1.2.3, 1.2, 1, latest
|
||||
│
|
||||
└─► Package Metadata
|
||||
pyproject.toml
|
||||
version = "1.2.3"
|
||||
```
|
||||
|
||||
### Secrets Flow
|
||||
|
||||
```
|
||||
GitHub Secrets (Encrypted at Rest)
|
||||
│
|
||||
├─► PYPI_TOKEN
|
||||
│ │
|
||||
│ ▼
|
||||
│ release.yml
|
||||
│ │
|
||||
│ ▼
|
||||
│ TWINE_PASSWORD env var (masked in logs)
|
||||
│ │
|
||||
│ ▼
|
||||
│ PyPI API (HTTPS)
|
||||
│
|
||||
├─► DOCKER_USERNAME
|
||||
│ │
|
||||
│ ▼
|
||||
│ docker-release.yml
|
||||
│ │
|
||||
│ ▼
|
||||
│ docker/login-action (masked in logs)
|
||||
│ │
|
||||
│ ▼
|
||||
│ Docker Hub API (HTTPS)
|
||||
│
|
||||
└─► DOCKER_TOKEN
|
||||
│
|
||||
▼
|
||||
docker-release.yml
|
||||
│
|
||||
▼
|
||||
docker/login-action (masked in logs)
|
||||
│
|
||||
▼
|
||||
Docker Hub API (HTTPS)
|
||||
```
|
||||
|
||||
### Artifact Flow
|
||||
|
||||
```
|
||||
Source Code
|
||||
│
|
||||
├─► release.yml
|
||||
│ │
|
||||
│ ▼
|
||||
│ python -m build
|
||||
│ │
|
||||
│ ├─► crawl4ai-1.2.3.tar.gz
|
||||
│ │ │
|
||||
│ │ ▼
|
||||
│ │ PyPI Storage
|
||||
│ │ │
|
||||
│ │ ▼
|
||||
│ │ pip install crawl4ai
|
||||
│ │
|
||||
│ └─► crawl4ai-1.2.3-py3-none-any.whl
|
||||
│ │
|
||||
│ ▼
|
||||
│ PyPI Storage
|
||||
│ │
|
||||
│ ▼
|
||||
│ pip install crawl4ai
|
||||
│
|
||||
└─► docker-release.yml
|
||||
│
|
||||
▼
|
||||
docker build
|
||||
│
|
||||
├─► Image: linux/amd64
|
||||
│ │
|
||||
│ └─► Docker Hub
|
||||
│ unclecode/crawl4ai:1.2.3-amd64
|
||||
│
|
||||
└─► Image: linux/arm64
|
||||
│
|
||||
└─► Docker Hub
|
||||
unclecode/crawl4ai:1.2.3-arm64
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## State Machines
|
||||
|
||||
### Release Pipeline State Machine
|
||||
|
||||
```
|
||||
┌─────────┐
|
||||
│ START │
|
||||
└────┬────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Extract │
|
||||
│ Version │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐ ┌─────────┐
|
||||
│ Validate │─────►│ FAILED │
|
||||
│ Version │ No │ (Exit 1)│
|
||||
└──────┬───────┘ └─────────┘
|
||||
│ Yes
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Build │
|
||||
│ Package │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐ ┌─────────┐
|
||||
│ Upload │─────►│ FAILED │
|
||||
│ to PyPI │ Error│ (Exit 1)│
|
||||
└──────┬───────┘ └─────────┘
|
||||
│ Success
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Create │
|
||||
│ GH Release │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ SUCCESS │
|
||||
│ (Emit Event) │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
### Docker Pipeline State Machine
|
||||
|
||||
```
|
||||
┌─────────┐
|
||||
│ START │
|
||||
│ (Event) │
|
||||
└────┬────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Detect │
|
||||
│ Version │
|
||||
│ Source │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Parse │
|
||||
│ Semantic │
|
||||
│ Versions │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐ ┌─────────┐
|
||||
│ Authenticate │─────►│ FAILED │
|
||||
│ Docker Hub │ Error│ (Exit 1)│
|
||||
└──────┬───────┘ └─────────┘
|
||||
│ Success
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Build │
|
||||
│ amd64 │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐ ┌─────────┐
|
||||
│ Build │─────►│ FAILED │
|
||||
│ arm64 │ Error│ (Exit 1)│
|
||||
└──────┬───────┘ └─────────┘
|
||||
│ Success
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ Push All │
|
||||
│ Tags │
|
||||
└──────┬───────┘
|
||||
│
|
||||
▼
|
||||
┌──────────────┐
|
||||
│ SUCCESS │
|
||||
└──────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Security Architecture
|
||||
|
||||
### Threat Model
|
||||
|
||||
#### Threats Mitigated
|
||||
|
||||
1. **Secret Exposure**
|
||||
- Mitigation: GitHub Actions secret masking
|
||||
- Evidence: Secrets never appear in logs
|
||||
|
||||
2. **Unauthorized Package Upload**
|
||||
- Mitigation: Scoped PyPI tokens
|
||||
- Evidence: Token limited to `crawl4ai` project
|
||||
|
||||
3. **Man-in-the-Middle**
|
||||
- Mitigation: HTTPS for all API calls
|
||||
- Evidence: PyPI, Docker Hub, GitHub all use TLS
|
||||
|
||||
4. **Supply Chain Tampering**
|
||||
- Mitigation: Immutable artifacts, content checksums
|
||||
- Evidence: PyPI stores SHA256, Docker uses content-addressable storage
|
||||
|
||||
#### Trust Boundaries
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────┐
|
||||
│ Trusted Zone │
|
||||
│ ┌────────────────────────────────┐ │
|
||||
│ │ GitHub Actions Runner │ │
|
||||
│ │ - Ephemeral VM │ │
|
||||
│ │ - Isolated environment │ │
|
||||
│ │ - Access to secrets │ │
|
||||
│ └────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ │ HTTPS (TLS 1.2+) │
|
||||
│ ▼ │
|
||||
└─────────────────────────────────────────┘
|
||||
│
|
||||
┌────────────┼────────────┐
|
||||
│ │ │
|
||||
▼ ▼ ▼
|
||||
┌────────┐ ┌─────────┐ ┌──────────┐
|
||||
│ PyPI │ │ Docker │ │ GitHub │
|
||||
│ API │ │ Hub │ │ API │
|
||||
└────────┘ └─────────┘ └──────────┘
|
||||
External External External
|
||||
Service Service Service
|
||||
```
|
||||
|
||||
### Secret Management
|
||||
|
||||
#### Secret Lifecycle
|
||||
|
||||
```
|
||||
Creation (Developer)
|
||||
│
|
||||
├─► PyPI: Create API token (scoped to project)
|
||||
├─► Docker Hub: Create access token (read/write)
|
||||
│
|
||||
▼
|
||||
Storage (GitHub)
|
||||
│
|
||||
├─► Encrypted at rest (AES-256)
|
||||
├─► Access controlled (repo-scoped)
|
||||
│
|
||||
▼
|
||||
Usage (Workflow)
|
||||
│
|
||||
├─► Injected as env vars
|
||||
├─► Masked in logs (GitHub redacts on output)
|
||||
├─► Never persisted to disk (in-memory only)
|
||||
│
|
||||
▼
|
||||
Transmission (API Call)
|
||||
│
|
||||
├─► HTTPS only
|
||||
├─► TLS 1.2+ with strong ciphers
|
||||
│
|
||||
▼
|
||||
Rotation (Manual)
|
||||
│
|
||||
└─► Regenerate on PyPI/Docker Hub
|
||||
Update GitHub secret
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Characteristics
|
||||
|
||||
### Release Pipeline Performance
|
||||
|
||||
| Metric | Value | Notes |
|
||||
|--------|-------|-------|
|
||||
| Cold start | ~2-3 min | First run on new runner |
|
||||
| Warm start | ~2-3 min | Minimal caching benefit |
|
||||
| PyPI upload | ~30-60 sec | Network-bound |
|
||||
| Package build | ~30 sec | CPU-bound |
|
||||
| Parallelization | None | Sequential by design |
|
||||
|
||||
### Docker Pipeline Performance
|
||||
|
||||
| Metric | Cold Cache | Warm Cache (code) | Warm Cache (deps) |
|
||||
|--------|-----------|-------------------|-------------------|
|
||||
| Total time | 10-15 min | 1-2 min | 3-5 min |
|
||||
| amd64 build | 5-7 min | 30-60 sec | 1-2 min |
|
||||
| arm64 build | 5-7 min | 30-60 sec | 1-2 min |
|
||||
| Push time | 1-2 min | 30 sec | 30 sec |
|
||||
| Cache hit rate | 0% | 85% | 60% |
|
||||
|
||||
### Cache Performance Model
|
||||
|
||||
```python
|
||||
def estimate_build_time(changes):
|
||||
base_time = 60 # seconds (setup + push)
|
||||
|
||||
if "Dockerfile" in changes:
|
||||
return base_time + (10 * 60) # Full rebuild: ~11 min
|
||||
elif "requirements.txt" in changes:
|
||||
return base_time + (3 * 60) # Deps rebuild: ~4 min
|
||||
elif any(f.endswith(".py") for f in changes):
|
||||
return base_time + 60 # Code only: ~2 min
|
||||
else:
|
||||
return base_time # No changes: ~1 min
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Scalability Considerations
|
||||
|
||||
### Current Limits
|
||||
|
||||
| Resource | Limit | Impact |
|
||||
|----------|-------|--------|
|
||||
| Workflow concurrency | 20 (default) | Max 20 releases in parallel |
|
||||
| Artifact storage | 500 MB/artifact | PyPI packages small (<10 MB) |
|
||||
| Cache storage | 10 GB/repo | Docker layers fit comfortably |
|
||||
| Workflow run time | 6 hours | Plenty of headroom |
|
||||
|
||||
### Scaling Strategies
|
||||
|
||||
#### Horizontal Scaling (Multiple Repos)
|
||||
```
|
||||
crawl4ai (main)
|
||||
├─ release.yml
|
||||
└─ docker-release.yml
|
||||
|
||||
crawl4ai-plugins (separate)
|
||||
├─ release.yml
|
||||
└─ docker-release.yml
|
||||
|
||||
Each repo has independent:
|
||||
- Secrets
|
||||
- Cache (10 GB each)
|
||||
- Concurrency limits (20 each)
|
||||
```
|
||||
|
||||
#### Vertical Scaling (Larger Runners)
|
||||
```yaml
|
||||
jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest-8-cores # GitHub-hosted larger runner
|
||||
# 4x faster builds for CPU-bound layers
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Disaster Recovery
|
||||
|
||||
### Failure Scenarios
|
||||
|
||||
#### Scenario 1: Release Pipeline Fails
|
||||
|
||||
**Failure Point**: PyPI upload fails (network error)
|
||||
|
||||
**State**:
|
||||
- ✓ Version validated
|
||||
- ✓ Package built
|
||||
- ✗ PyPI upload
|
||||
- ✗ GitHub release
|
||||
|
||||
**Recovery**:
|
||||
```bash
|
||||
# Manual upload
|
||||
twine upload dist/*
|
||||
|
||||
# Retry workflow (re-run from GitHub Actions UI)
|
||||
```
|
||||
|
||||
**Prevention**: Add retry logic to PyPI upload
|
||||
|
||||
#### Scenario 2: Docker Pipeline Fails
|
||||
|
||||
**Failure Point**: ARM build fails (dependency issue)
|
||||
|
||||
**State**:
|
||||
- ✓ PyPI published
|
||||
- ✓ GitHub release created
|
||||
- ✓ amd64 image built
|
||||
- ✗ arm64 image build
|
||||
|
||||
**Recovery**:
|
||||
```bash
|
||||
# Fix Dockerfile
|
||||
git commit -am "fix: ARM build dependency"
|
||||
|
||||
# Trigger rebuild
|
||||
git tag docker-rebuild-v1.2.3
|
||||
git push origin docker-rebuild-v1.2.3
|
||||
```
|
||||
|
||||
**Impact**: PyPI package available, only Docker ARM users affected
|
||||
|
||||
#### Scenario 3: Partial Release
|
||||
|
||||
**Failure Point**: GitHub release creation fails
|
||||
|
||||
**State**:
|
||||
- ✓ PyPI published
|
||||
- ✗ GitHub release
|
||||
- ✗ Docker images
|
||||
|
||||
**Recovery**:
|
||||
```bash
|
||||
# Create release manually
|
||||
gh release create v1.2.3 \
|
||||
--title "Release v1.2.3" \
|
||||
--notes "..."
|
||||
|
||||
# This triggers docker-release.yml automatically
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitoring and Observability
|
||||
|
||||
### Metrics to Track
|
||||
|
||||
#### Release Pipeline
|
||||
- Success rate (target: >99%)
|
||||
- Duration (target: <3 min)
|
||||
- PyPI upload time (target: <60 sec)
|
||||
|
||||
#### Docker Pipeline
|
||||
- Success rate (target: >95%)
|
||||
- Duration (target: <15 min cold, <2 min warm)
|
||||
- Cache hit rate (target: >80% for code changes)
|
||||
|
||||
### Alerting
|
||||
|
||||
**Critical Alerts**:
|
||||
- Release pipeline failure (blocks release)
|
||||
- PyPI authentication failure (expired token)
|
||||
|
||||
**Warning Alerts**:
|
||||
- Docker build >15 min (performance degradation)
|
||||
- Cache hit rate <50% (cache issue)
|
||||
|
||||
### Logging
|
||||
|
||||
**GitHub Actions Logs**:
|
||||
- Retention: 90 days
|
||||
- Downloadable: Yes
|
||||
- Searchable: Limited
|
||||
|
||||
**Recommended External Logging**:
|
||||
```yaml
|
||||
- name: Send logs to external service
|
||||
if: failure()
|
||||
run: |
|
||||
curl -X POST https://logs.example.com/api/v1/logs \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"workflow\": \"${{ github.workflow }}\", \"status\": \"failed\"}"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
### Planned Improvements
|
||||
|
||||
1. **Automated Changelog Generation**
|
||||
- Use conventional commits
|
||||
- Generate CHANGELOG.md automatically
|
||||
|
||||
2. **Pre-release Testing**
|
||||
- Test builds on `test-v*` tags
|
||||
- Upload to TestPyPI
|
||||
|
||||
3. **Notification System**
|
||||
- Slack/Discord notifications on release
|
||||
- Email on failure
|
||||
|
||||
4. **Performance Optimization**
|
||||
- Parallel Docker builds (amd64 + arm64 simultaneously)
|
||||
- Persistent runners for better caching
|
||||
|
||||
5. **Enhanced Validation**
|
||||
- Smoke tests after PyPI upload
|
||||
- Container security scanning
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [GitHub Actions Architecture](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions)
|
||||
- [Docker Build Cache](https://docs.docker.com/build/cache/)
|
||||
- [PyPI API Documentation](https://warehouse.pypa.io/api-reference/)
|
||||
|
||||
---
|
||||
|
||||
**Last Updated**: 2025-01-21
|
||||
**Version**: 2.0
|
||||
1029
.github/workflows/docs/README.md
vendored
Normal file
1029
.github/workflows/docs/README.md
vendored
Normal file
File diff suppressed because it is too large
Load Diff
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
Normal file
287
.github/workflows/docs/WORKFLOW_REFERENCE.md
vendored
Normal file
@@ -0,0 +1,287 @@
|
||||
# Workflow Quick Reference
|
||||
|
||||
## Quick Commands
|
||||
|
||||
### Standard Release
|
||||
```bash
|
||||
# 1. Update version
|
||||
vim crawl4ai/__version__.py # Set to "1.2.3"
|
||||
|
||||
# 2. Commit and tag
|
||||
git add crawl4ai/__version__.py
|
||||
git commit -m "chore: bump version to 1.2.3"
|
||||
git tag v1.2.3
|
||||
git push origin main
|
||||
git push origin v1.2.3
|
||||
|
||||
# 3. Monitor
|
||||
# - PyPI: ~2-3 minutes
|
||||
# - Docker: ~1-15 minutes
|
||||
```
|
||||
|
||||
### Docker Rebuild Only
|
||||
```bash
|
||||
git tag docker-rebuild-v1.2.3
|
||||
git push origin docker-rebuild-v1.2.3
|
||||
```
|
||||
|
||||
### Delete Tag (Undo Release)
|
||||
```bash
|
||||
# Local
|
||||
git tag -d v1.2.3
|
||||
|
||||
# Remote
|
||||
git push --delete origin v1.2.3
|
||||
|
||||
# GitHub Release
|
||||
gh release delete v1.2.3
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Workflow Triggers
|
||||
|
||||
### release.yml
|
||||
| Event | Pattern | Example |
|
||||
|-------|---------|---------|
|
||||
| Tag push | `v*` | `v1.2.3` |
|
||||
| Excludes | `test-v*` | `test-v1.2.3` |
|
||||
|
||||
### docker-release.yml
|
||||
| Event | Pattern | Example |
|
||||
|-------|---------|---------|
|
||||
| Release published | `release.published` | Automatic |
|
||||
| Tag push | `docker-rebuild-v*` | `docker-rebuild-v1.2.3` |
|
||||
|
||||
---
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### release.yml
|
||||
| Variable | Source | Example |
|
||||
|----------|--------|---------|
|
||||
| `VERSION` | Git tag | `1.2.3` |
|
||||
| `TWINE_USERNAME` | Static | `__token__` |
|
||||
| `TWINE_PASSWORD` | Secret | `pypi-Ag...` |
|
||||
| `GITHUB_TOKEN` | Auto | `ghs_...` |
|
||||
|
||||
### docker-release.yml
|
||||
| Variable | Source | Example |
|
||||
|----------|--------|---------|
|
||||
| `VERSION` | Release/Tag | `1.2.3` |
|
||||
| `MAJOR` | Computed | `1` |
|
||||
| `MINOR` | Computed | `1.2` |
|
||||
| `DOCKER_USERNAME` | Secret | `unclecode` |
|
||||
| `DOCKER_TOKEN` | Secret | `dckr_pat_...` |
|
||||
|
||||
---
|
||||
|
||||
## Docker Tags Generated
|
||||
|
||||
| Version | Tags Created |
|
||||
|---------|-------------|
|
||||
| v1.0.0 | `1.0.0`, `1.0`, `1`, `latest` |
|
||||
| v1.1.0 | `1.1.0`, `1.1`, `1`, `latest` |
|
||||
| v1.2.3 | `1.2.3`, `1.2`, `1`, `latest` |
|
||||
| v2.0.0 | `2.0.0`, `2.0`, `2`, `latest` |
|
||||
|
||||
---
|
||||
|
||||
## Workflow Outputs
|
||||
|
||||
### release.yml
|
||||
| Output | Location | Time |
|
||||
|--------|----------|------|
|
||||
| PyPI Package | https://pypi.org/project/crawl4ai/ | ~2-3 min |
|
||||
| GitHub Release | Repository → Releases | ~2-3 min |
|
||||
| Workflow Summary | Actions → Run → Summary | Immediate |
|
||||
|
||||
### docker-release.yml
|
||||
| Output | Location | Time |
|
||||
|--------|----------|------|
|
||||
| Docker Images | https://hub.docker.com/r/unclecode/crawl4ai | ~1-15 min |
|
||||
| Workflow Summary | Actions → Run → Summary | Immediate |
|
||||
|
||||
---
|
||||
|
||||
## Common Issues
|
||||
|
||||
| Issue | Solution |
|
||||
|-------|----------|
|
||||
| Version mismatch | Update `crawl4ai/__version__.py` to match tag |
|
||||
| PyPI 403 Forbidden | Check `PYPI_TOKEN` secret |
|
||||
| PyPI 400 File exists | Version already published, increment version |
|
||||
| Docker auth failed | Regenerate `DOCKER_TOKEN` |
|
||||
| Docker build timeout | Check Dockerfile, review build logs |
|
||||
| Cache not working | First build on branch always cold |
|
||||
|
||||
---
|
||||
|
||||
## Secrets Checklist
|
||||
|
||||
- [ ] `PYPI_TOKEN` - PyPI API token (project or account scope)
|
||||
- [ ] `DOCKER_USERNAME` - Docker Hub username
|
||||
- [ ] `DOCKER_TOKEN` - Docker Hub access token (read/write)
|
||||
- [ ] `GITHUB_TOKEN` - Auto-provided (no action needed)
|
||||
|
||||
---
|
||||
|
||||
## Workflow Dependencies
|
||||
|
||||
### release.yml Dependencies
|
||||
```yaml
|
||||
Python: 3.12
|
||||
Actions:
|
||||
- actions/checkout@v4
|
||||
- actions/setup-python@v5
|
||||
- softprops/action-gh-release@v2
|
||||
PyPI Packages:
|
||||
- build
|
||||
- twine
|
||||
```
|
||||
|
||||
### docker-release.yml Dependencies
|
||||
```yaml
|
||||
Actions:
|
||||
- actions/checkout@v4
|
||||
- docker/setup-buildx-action@v3
|
||||
- docker/login-action@v3
|
||||
- docker/build-push-action@v5
|
||||
Docker:
|
||||
- Buildx
|
||||
- QEMU (for multi-arch)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Cache Information
|
||||
|
||||
### Type
|
||||
- GitHub Actions Cache (`type=gha`)
|
||||
|
||||
### Storage
|
||||
- **Limit**: 10GB per repository
|
||||
- **Retention**: 7 days for unused entries
|
||||
- **Cleanup**: Automatic LRU eviction
|
||||
|
||||
### Performance
|
||||
| Scenario | Cache Hit | Build Time |
|
||||
|----------|-----------|------------|
|
||||
| First build | 0% | 10-15 min |
|
||||
| Code change only | 85% | 1-2 min |
|
||||
| Dependency update | 60% | 3-5 min |
|
||||
| No changes | 100% | 30-60 sec |
|
||||
|
||||
---
|
||||
|
||||
## Build Platforms
|
||||
|
||||
| Platform | Architecture | Devices |
|
||||
|----------|--------------|---------|
|
||||
| linux/amd64 | x86_64 | Intel/AMD servers, AWS EC2, GCP |
|
||||
| linux/arm64 | aarch64 | Apple Silicon, AWS Graviton, Raspberry Pi |
|
||||
|
||||
---
|
||||
|
||||
## Version Validation
|
||||
|
||||
### Pre-Tag Checklist
|
||||
```bash
|
||||
# Check current version
|
||||
python -c "from crawl4ai.__version__ import __version__; print(__version__)"
|
||||
|
||||
# Verify it matches intended tag
|
||||
# If tag is v1.2.3, version should be "1.2.3"
|
||||
```
|
||||
|
||||
### Post-Release Verification
|
||||
```bash
|
||||
# PyPI
|
||||
pip install crawl4ai==1.2.3
|
||||
python -c "import crawl4ai; print(crawl4ai.__version__)"
|
||||
|
||||
# Docker
|
||||
docker pull unclecode/crawl4ai:1.2.3
|
||||
docker run unclecode/crawl4ai:1.2.3 python -c "import crawl4ai; print(crawl4ai.__version__)"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitoring URLs
|
||||
|
||||
| Service | URL |
|
||||
|---------|-----|
|
||||
| GitHub Actions | `https://github.com/{owner}/{repo}/actions` |
|
||||
| PyPI Project | `https://pypi.org/project/crawl4ai/` |
|
||||
| Docker Hub | `https://hub.docker.com/r/unclecode/crawl4ai` |
|
||||
| GitHub Releases | `https://github.com/{owner}/{repo}/releases` |
|
||||
|
||||
---
|
||||
|
||||
## Rollback Strategy
|
||||
|
||||
### PyPI (Cannot Delete)
|
||||
```bash
|
||||
# Increment patch version
|
||||
git tag v1.2.4
|
||||
git push origin v1.2.4
|
||||
```
|
||||
|
||||
### Docker (Can Overwrite)
|
||||
```bash
|
||||
# Rebuild with fix
|
||||
git tag docker-rebuild-v1.2.3
|
||||
git push origin docker-rebuild-v1.2.3
|
||||
```
|
||||
|
||||
### GitHub Release
|
||||
```bash
|
||||
# Delete release
|
||||
gh release delete v1.2.3
|
||||
|
||||
# Delete tag
|
||||
git push --delete origin v1.2.3
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Status Badge Markdown
|
||||
|
||||
```markdown
|
||||
[![Release Pipeline](https://github.com/{owner}/{repo}/actions/workflows/release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/release.yml)
|
||||
|
||||
[![Docker Release](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml/badge.svg)](https://github.com/{owner}/{repo}/actions/workflows/docker-release.yml)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Timeline Example
|
||||
|
||||
```
|
||||
0:00 - Push tag v1.2.3
|
||||
0:01 - release.yml starts
|
||||
0:02 - Version validation passes
|
||||
0:03 - Package built
|
||||
0:04 - PyPI upload starts
|
||||
0:06 - PyPI upload complete ✓
|
||||
0:07 - GitHub release created ✓
|
||||
0:08 - release.yml complete
|
||||
0:08 - docker-release.yml triggered
|
||||
0:10 - Docker build starts
|
||||
0:12 - amd64 image built (cache hit)
|
||||
0:14 - arm64 image built (cache hit)
|
||||
0:15 - Images pushed to Docker Hub ✓
|
||||
0:16 - docker-release.yml complete
|
||||
|
||||
Total: ~16 minutes
|
||||
Critical path (PyPI + GitHub): ~8 minutes
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Contact
|
||||
|
||||
For workflow issues:
|
||||
1. Check Actions tab for logs
|
||||
2. Review this reference
|
||||
3. See [README.md](./README.md) for detailed docs
|
||||
79
.github/workflows/release.yml
vendored
79
.github/workflows/release.yml
vendored
@@ -10,53 +10,53 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # Required for creating releases
|
||||
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
@@ -65,37 +65,7 @@ jobs:
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
@@ -103,26 +73,29 @@ jobs:
|
||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
|
||||
**Note:** Docker images are being built and will be available shortly.
|
||||
Check the [Docker Release workflow](https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml) for build status.
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
@@ -132,11 +105,9 @@ jobs:
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "Docker images are being built in a separate workflow." >> $GITHUB_STEP_SUMMARY
|
||||
echo "Check: https://github.com/${{ github.repository }}/actions/workflows/docker-release.yml" >> $GITHUB_STEP_SUMMARY
|
||||
|
||||
142
.github/workflows/release.yml.backup
vendored
Normal file
142
.github/workflows/release.yml.backup
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
name: Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
- '!test-v*' # Exclude test tags
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # Required for creating releases
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
15
.gitignore
vendored
15
.gitignore
vendored
@@ -1,6 +1,13 @@
|
||||
# Scripts folder (private tools)
|
||||
.scripts/
|
||||
|
||||
# Database files
|
||||
*.db
|
||||
|
||||
# Environment files
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
@@ -259,15 +266,19 @@ continue_config.json
|
||||
.llm.env
|
||||
.private/
|
||||
|
||||
.claude/
|
||||
|
||||
CLAUDE_MONITOR.md
|
||||
CLAUDE.md
|
||||
|
||||
tests/**/test_site
|
||||
tests/**/reports
|
||||
tests/**/benchmark_reports
|
||||
|
||||
test_scripts/
|
||||
docs/**/data
|
||||
.codecat/
|
||||
|
||||
docs/apps/linkdin/debug*/
|
||||
docs/apps/linkdin/samples/insights/*
|
||||
docs/apps/linkdin/samples/insights/*
|
||||
|
||||
scripts/
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
FROM python:3.12-slim-bookworm AS build
|
||||
|
||||
# C4ai version
|
||||
ARG C4AI_VER=0.7.0-r1
|
||||
ARG C4AI_VER=0.7.6
|
||||
ENV C4AI_VERSION=$C4AI_VER
|
||||
LABEL c4ai.version=$C4AI_VER
|
||||
|
||||
|
||||
88
README.md
88
README.md
@@ -27,11 +27,13 @@
|
||||
|
||||
Crawl4AI turns the web into clean, LLM-ready Markdown for RAG, agents, and data pipelines. Fast, controllable, battle-tested by a 50k+ star community.
|
||||
|
||||
[✨ Check out latest update v0.7.4](#-recent-updates)
|
||||
[✨ Check out latest update v0.7.6](#-recent-updates)
|
||||
|
||||
✨ New in v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
||||
✨ **New in v0.7.6**: Complete Webhook Infrastructure for Docker Job Queue API! Real-time notifications for both `/crawl/job` and `/llm/job` endpoints with exponential backoff retry, custom headers, and flexible delivery modes. No more polling! [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.6.md)
|
||||
|
||||
✨ Recent v0.7.3: Undetected Browser Support, Multi-URL Configurations, Memory Monitoring, Enhanced Table Extraction, GitHub Sponsors. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.3.md)
|
||||
✨ Recent v0.7.5: Docker Hooks System with function-based API for pipeline customization, Enhanced LLM Integration with custom providers, HTTPS Preservation, and multiple community-reported bug fixes. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||
|
||||
✨ Previous v0.7.4: Revolutionary LLM Table Extraction with intelligent chunking, enhanced concurrency fixes, memory management refactor, and critical stability improvements. [Release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.4.md)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
@@ -177,7 +179,7 @@ No rate-limited APIs. No lock-in. Build and own your data pipeline with direct g
|
||||
- 📸 **Screenshots**: Capture page screenshots during crawling for debugging or analysis.
|
||||
- 📂 **Raw Data Crawling**: Directly process raw HTML (`raw:`) or local files (`file://`).
|
||||
- 🔗 **Comprehensive Link Extraction**: Extracts internal, external links, and embedded iframe content.
|
||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior.
|
||||
- 🛠️ **Customizable Hooks**: Define hooks at every step to customize crawling behavior (supports both string and function-based APIs).
|
||||
- 💾 **Caching**: Cache data for improved speed and to avoid redundant fetches.
|
||||
- 📄 **Metadata Extraction**: Retrieve structured metadata from web pages.
|
||||
- 📡 **IFrame Content Extraction**: Seamless extraction from embedded iframe content.
|
||||
@@ -544,6 +546,54 @@ async def test_news_crawl():
|
||||
|
||||
## ✨ Recent Updates
|
||||
|
||||
<details>
|
||||
<summary><strong>Version 0.7.5 Release Highlights - The Docker Hooks & Security Update</strong></summary>
|
||||
|
||||
- **🔧 Docker Hooks System**: Complete pipeline customization with user-provided Python functions at 8 key points
|
||||
- **✨ Function-Based Hooks API (NEW)**: Write hooks as regular Python functions with full IDE support:
|
||||
```python
|
||||
from crawl4ai import hooks_to_string
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
# Define hooks as regular Python functions
|
||||
async def on_page_context_created(page, context, **kwargs):
|
||||
"""Block images to speed up crawling"""
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
return page
|
||||
|
||||
async def before_goto(page, context, url, **kwargs):
|
||||
"""Add custom headers"""
|
||||
await page.set_extra_http_headers({'X-Crawl4AI': 'v0.7.5'})
|
||||
return page
|
||||
|
||||
# Option 1: Use hooks_to_string() utility for REST API
|
||||
hooks_code = hooks_to_string({
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_goto": before_goto
|
||||
})
|
||||
|
||||
# Option 2: Docker client with automatic conversion (Recommended)
|
||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||
results = await client.crawl(
|
||||
urls=["https://httpbin.org/html"],
|
||||
hooks={
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_goto": before_goto
|
||||
}
|
||||
)
|
||||
# ✓ Full IDE support, type checking, and reusability!
|
||||
```
|
||||
|
||||
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||
- **🔒 HTTPS Preservation**: Secure internal link handling with `preserve_https_for_internal_links=True`
|
||||
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||
|
||||
[Full v0.7.5 Release Notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.5.md)
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>Version 0.7.4 Release Highlights - The Intelligent Table Extraction & Performance Update</strong></summary>
|
||||
|
||||
@@ -919,6 +969,36 @@ We envision a future where AI is powered by real human knowledge, ensuring data
|
||||
For more details, see our [full mission statement](./MISSION.md).
|
||||
</details>
|
||||
|
||||
## 🌟 Current Sponsors
|
||||
|
||||
### 🏢 Enterprise Sponsors & Partners
|
||||
|
||||
Our enterprise sponsors and technology partners help scale Crawl4AI to power production-grade data pipelines.
|
||||
|
||||
| Company | About | Sponsorship Tier |
|
||||
|------|------|----------------------------|
|
||||
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥈 Silver |
|
||||
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
|
||||
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
|
||||
| <a href="https://www.alephnull.sg/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013050323_a9e8e8c4c3650421.svg" alt="Aleph null" width="120"/></a> | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education—empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
|
||||
|
||||
### 🧑🤝 Individual Sponsors
|
||||
|
||||
A heartfelt thanks to our individual supporters! Every contribution helps us keep our open-source mission alive and thriving!
|
||||
|
||||
<p align="left">
|
||||
<a href="https://github.com/hafezparast"><img src="https://avatars.githubusercontent.com/u/14273305?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/ntohidi"><img src="https://avatars.githubusercontent.com/u/17140097?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/Sjoeborg"><img src="https://avatars.githubusercontent.com/u/17451310?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/romek-rozen"><img src="https://avatars.githubusercontent.com/u/30595969?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/Kourosh-Kiyani"><img src="https://avatars.githubusercontent.com/u/34105600?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/Etherdrake"><img src="https://avatars.githubusercontent.com/u/67021215?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/shaman247"><img src="https://avatars.githubusercontent.com/u/211010067?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
<a href="https://github.com/work-flow-manager"><img src="https://avatars.githubusercontent.com/u/217665461?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||
</p>
|
||||
|
||||
> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode)
|
||||
|
||||
## Star History
|
||||
|
||||
[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||
|
||||
@@ -103,7 +103,8 @@ from .browser_adapter import (
|
||||
|
||||
from .utils import (
|
||||
start_colab_display_server,
|
||||
setup_colab_environment
|
||||
setup_colab_environment,
|
||||
hooks_to_string
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@@ -183,6 +184,7 @@ __all__ = [
|
||||
"ProxyConfig",
|
||||
"start_colab_display_server",
|
||||
"setup_colab_environment",
|
||||
"hooks_to_string",
|
||||
# C4A Script additions
|
||||
"c4a_compile",
|
||||
"c4a_validate",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# crawl4ai/__version__.py
|
||||
|
||||
# This is the version that will be used for stable releases
|
||||
__version__ = "0.7.4"
|
||||
__version__ = "0.7.6"
|
||||
|
||||
# For nightly builds, this gets set during build process
|
||||
__nightly_version__ = None
|
||||
|
||||
@@ -19,7 +19,7 @@ import re
|
||||
from pathlib import Path
|
||||
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
|
||||
from crawl4ai.models import Link, CrawlResult
|
||||
import numpy as np
|
||||
|
||||
@@ -178,7 +178,7 @@ class AdaptiveConfig:
|
||||
|
||||
# Embedding strategy parameters
|
||||
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
|
||||
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
|
||||
n_query_variations: int = 10
|
||||
coverage_threshold: float = 0.85
|
||||
alpha_shape_alpha: float = 0.5
|
||||
@@ -250,6 +250,30 @@ class AdaptiveConfig:
|
||||
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
||||
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
||||
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
||||
|
||||
@property
|
||||
def _embedding_llm_config_dict(self) -> Optional[Dict]:
|
||||
"""Convert LLMConfig to dict format for backward compatibility."""
|
||||
if self.embedding_llm_config is None:
|
||||
return None
|
||||
|
||||
if isinstance(self.embedding_llm_config, dict):
|
||||
# Already a dict - return as-is for backward compatibility
|
||||
return self.embedding_llm_config
|
||||
|
||||
# Convert LLMConfig object to dict format
|
||||
return {
|
||||
'provider': self.embedding_llm_config.provider,
|
||||
'api_token': self.embedding_llm_config.api_token,
|
||||
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
|
||||
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
|
||||
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
|
||||
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
|
||||
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
|
||||
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
|
||||
'stop': getattr(self.embedding_llm_config, 'stop', None),
|
||||
'n': getattr(self.embedding_llm_config, 'n', None),
|
||||
}
|
||||
|
||||
|
||||
class CrawlStrategy(ABC):
|
||||
@@ -593,7 +617,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
class EmbeddingStrategy(CrawlStrategy):
|
||||
"""Embedding-based adaptive crawling using semantic space coverage"""
|
||||
|
||||
def __init__(self, embedding_model: str = None, llm_config: Dict = None):
|
||||
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
|
||||
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
||||
self.llm_config = llm_config
|
||||
self._embedding_cache = {}
|
||||
@@ -605,14 +629,24 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
self._kb_embeddings_hash = None # Track KB changes
|
||||
self._validation_embeddings_cache = None # Cache validation query embeddings
|
||||
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
||||
|
||||
def _get_embedding_llm_config_dict(self) -> Dict:
    """Get the embedding LLM config as a dict, with a fallback default.

    If a ``config`` attribute has been attached to this strategy and its
    ``_embedding_llm_config_dict`` is non-empty, that dict is returned.
    Otherwise a default config is returned that uses the
    ``openai/text-embedding-3-small`` provider with the API token read from
    the ``OPENAI_API_KEY`` environment variable (``None`` if it is not set).
    """
    # ``config`` is attached after construction, so it may be missing entirely.
    config = getattr(self, 'config', None)
    if config:
        config_dict = config._embedding_llm_config_dict
        if config_dict:
            return config_dict

    # Fallback to default if no config provided
    return {
        'provider': 'openai/text-embedding-3-small',
        'api_token': os.getenv('OPENAI_API_KEY'),
    }
|
||||
|
||||
async def _get_embeddings(self, texts: List[str]) -> Any:
|
||||
"""Get embeddings using configured method"""
|
||||
from .utils import get_text_embeddings
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
return await get_text_embeddings(
|
||||
texts,
|
||||
embedding_llm_config,
|
||||
@@ -679,8 +713,20 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
Return as a JSON array of strings."""
|
||||
|
||||
# Use the LLM for query generation
|
||||
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
|
||||
api_token = self.llm_config.get('api_token') if self.llm_config else None
|
||||
# Convert LLMConfig to dict if needed
|
||||
llm_config_dict = None
|
||||
if self.llm_config:
|
||||
if isinstance(self.llm_config, dict):
|
||||
llm_config_dict = self.llm_config
|
||||
else:
|
||||
# Convert LLMConfig object to dict
|
||||
llm_config_dict = {
|
||||
'provider': self.llm_config.provider,
|
||||
'api_token': self.llm_config.api_token
|
||||
}
|
||||
|
||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||
|
||||
# response = perform_completion_with_backoff(
|
||||
# provider=provider,
|
||||
@@ -843,10 +889,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
# Batch embed only uncached links
|
||||
if texts_to_embed:
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
||||
|
||||
# Cache the new embeddings
|
||||
@@ -1184,10 +1227,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
return
|
||||
|
||||
# Get embeddings for new texts
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
||||
|
||||
# Deduplicate embeddings before adding to KB
|
||||
@@ -1256,10 +1296,12 @@ class AdaptiveCrawler:
|
||||
if strategy_name == "statistical":
|
||||
return StatisticalStrategy()
|
||||
elif strategy_name == "embedding":
|
||||
return EmbeddingStrategy(
|
||||
strategy = EmbeddingStrategy(
|
||||
embedding_model=self.config.embedding_model,
|
||||
llm_config=self.config.embedding_llm_config
|
||||
)
|
||||
strategy.config = self.config # Pass config to strategy
|
||||
return strategy
|
||||
else:
|
||||
raise ValueError(f"Unknown strategy: {strategy_name}")
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
from typing import Union
|
||||
import warnings
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER,
|
||||
DEFAULT_PROVIDER_API_KEY,
|
||||
@@ -257,24 +258,39 @@ class ProxyConfig:
|
||||
|
||||
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
    """Create a ProxyConfig from a string.

    Supported formats:
    - 'http://username:password@ip:port'
    - 'http://ip:port'
    - 'socks5://ip:port'
    - 'ip:port:username:password'
    - 'ip:port'

    Surrounding whitespace is ignored. Scheme-less forms are given an
    ``http://`` scheme.

    Raises:
        ValueError: If ``proxy_str`` is empty or matches none of the
            supported formats.
    """
    s = (proxy_str or "").strip()

    if "://" in s:
        if "@" not in s:
            # URL without credentials (keep scheme)
            return ProxyConfig(server=s)

        # URL with credentials. Split on the LAST '@' so that a password
        # containing '@' stays in the credentials part.
        auth_part, server_part = s.rsplit("@", 1)
        protocol, scheme_sep, credentials = auth_part.partition("://")
        username, cred_sep, password = credentials.partition(":")
        if not (scheme_sep and cred_sep):
            # e.g. 'http://user@host': no 'username:password' pair. Reject it
            # here rather than letting it be misread as a colon-separated form.
            raise ValueError(f"Invalid proxy string format: {proxy_str}")
        return ProxyConfig(
            server=f"{protocol}://{server_part}",
            username=username,
            password=password,
        )

    # Colon separated forms
    parts = s.split(":")
    if len(parts) == 4:
        ip, port, username, password = parts
        return ProxyConfig(
            server=f"http://{ip}:{port}",
            username=username,
            password=password,
            ip=ip,
        )
    if len(parts) == 2:
        ip, port = parts
        return ProxyConfig(
            server=f"http://{ip}:{port}",
            ip=ip,
        )

    raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||
@@ -438,6 +454,7 @@ class BrowserConfig:
|
||||
host: str = "localhost",
|
||||
enable_stealth: bool = False,
|
||||
):
|
||||
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
self.browser_mode = browser_mode
|
||||
@@ -450,13 +467,22 @@ class BrowserConfig:
|
||||
if self.browser_type in ["firefox", "webkit"]:
|
||||
self.channel = ""
|
||||
self.chrome_channel = ""
|
||||
if proxy:
|
||||
warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", UserWarning)
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
if isinstance(self.proxy_config, dict):
|
||||
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
||||
if isinstance(self.proxy_config, str):
|
||||
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
||||
|
||||
|
||||
if self.proxy and self.proxy_config:
|
||||
warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
|
||||
self.proxy = None
|
||||
elif self.proxy:
|
||||
# Convert proxy string to ProxyConfig if proxy_config is not provided
|
||||
self.proxy_config = ProxyConfig.from_string(self.proxy)
|
||||
self.proxy = None
|
||||
|
||||
self.viewport_width = viewport_width
|
||||
self.viewport_height = viewport_height
|
||||
|
||||
@@ -1047,14 +1047,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
raise e
|
||||
|
||||
finally:
|
||||
# If no session_id is given we should close the page
|
||||
# Clean up page after crawl completes
|
||||
# For managed CDP browsers, close pages that are not part of a session to prevent memory leaks
|
||||
all_contexts = page.context.browser.contexts
|
||||
total_pages = sum(len(context.pages) for context in all_contexts)
|
||||
total_pages = sum(len(context.pages) for context in all_contexts)
|
||||
|
||||
should_close_page = False
|
||||
|
||||
if config.session_id:
|
||||
# Session pages are kept alive for reuse
|
||||
pass
|
||||
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
|
||||
elif self.browser_config.use_managed_browser:
|
||||
# For managed browsers (CDP), close non-session pages to prevent tab accumulation
|
||||
# This is especially important for arun_many() with multiple concurrent crawls
|
||||
should_close_page = True
|
||||
elif total_pages <= 1 and self.browser_config.headless:
|
||||
# Keep the last page in headless mode to avoid closing the browser
|
||||
pass
|
||||
else:
|
||||
# For non-managed browsers, close the page
|
||||
should_close_page = True
|
||||
|
||||
if should_close_page:
|
||||
# Detach listeners before closing to prevent potential errors during close
|
||||
if config.capture_network_requests:
|
||||
page.remove_listener("request", handle_request_capture)
|
||||
@@ -1383,9 +1397,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
try:
|
||||
await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(() => {{
|
||||
(async () => {{
|
||||
try {{
|
||||
{remove_overlays_js}
|
||||
const removeOverlays = {remove_overlays_js};
|
||||
await removeOverlays();
|
||||
return {{ success: true }};
|
||||
}} catch (error) {{
|
||||
return {{
|
||||
|
||||
@@ -455,8 +455,6 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
|
||||
# Update priorities for waiting tasks if needed
|
||||
await self._update_queue_priorities()
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
if self.monitor:
|
||||
@@ -467,6 +465,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
memory_monitor.cancel()
|
||||
if self.monitor:
|
||||
self.monitor.stop()
|
||||
return results
|
||||
|
||||
async def _update_queue_priorities(self):
|
||||
"""Periodically update priorities of items in the queue to prevent starvation"""
|
||||
|
||||
@@ -148,6 +148,134 @@ class PlaywrightAdapter(BrowserAdapter):
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class StealthAdapter(BrowserAdapter):
    """Adapter for Playwright with stealth features using playwright_stealth.

    Stealth is applied best-effort: when ``playwright_stealth`` is missing or
    applying it fails, the adapter behaves like a plain Playwright adapter.
    """

    def __init__(self):
        self._console_script_injected = {}
        self._stealth_function = None
        self._stealth_available = self._check_stealth_availability()

    def _check_stealth_availability(self) -> bool:
        """Locate a usable stealth entry point in ``playwright_stealth``.

        Stores the callable in ``self._stealth_function`` (``None`` when
        nothing usable is found) and returns whether one was found.
        """
        self._stealth_function = None
        try:
            import playwright_stealth
        except ImportError:
            return False

        # Preferred: module-level ``stealth_async(page)``.
        stealth_async = getattr(playwright_stealth, "stealth_async", None)
        if stealth_async is not None:
            self._stealth_function = stealth_async
            return True

        # NOTE(review): releases that expose a ``Stealth`` class are expected
        # to offer ``apply_stealth_async`` on its instances - confirm against
        # the installed playwright_stealth version. Looked up defensively so
        # a mismatch just falls through.
        stealth_cls = getattr(playwright_stealth, "Stealth", None)
        if stealth_cls is not None:
            try:
                apply_async = getattr(stealth_cls(), "apply_stealth_async", None)
            except Exception:
                apply_async = None
            if apply_async is not None:
                self._stealth_function = apply_async
                return True

        # Last resort, kept for backward compatibility: the sync variant.
        stealth_sync = getattr(playwright_stealth, "stealth_sync", None)
        if stealth_sync is not None:
            self._stealth_function = stealth_sync
            return True

        return False

    async def apply_stealth(self, page: Page):
        """Apply stealth to a page if available.

        Never raises: a failure is logged as a warning and the page is left
        as it was.
        """
        if not (self._stealth_available and self._stealth_function):
            return

        import inspect
        import logging

        try:
            outcome = self._stealth_function(page)
            # Decide by the returned value rather than by the function's
            # name, so any coroutine-returning callable is awaited.
            if inspect.isawaitable(outcome):
                await outcome
        except Exception as exc:
            logging.getLogger(__name__).warning(
                "Failed to apply stealth to page: %s", exc
            )

    async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
        """Standard Playwright evaluate; ``arg`` is forwarded only when not None."""
        if arg is not None:
            return await page.evaluate(expression, arg)
        return await page.evaluate(expression)

    @staticmethod
    def _safe_attr(obj: Any, name: str, default: Any) -> Any:
        """Read ``obj.<name>``, returning ``default`` if the access raises."""
        try:
            return getattr(obj, name)
        except Exception:
            return default

    async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
        """Setup console capture using Playwright's event system with stealth.

        Applies stealth to the page first, then registers a ``console``
        listener that appends one dict per message to ``captured_console``.
        Returns the listener so it can be removed later.
        """
        # Apply stealth to the page first
        await self.apply_stealth(page)

        def handle_console_capture(msg):
            try:
                captured_console.append({
                    "type": self._safe_attr(msg, "type", "unknown"),
                    "text": self._safe_attr(msg, "text", "unknown"),
                    "timestamp": time.time()
                })
            except Exception as e:
                captured_console.append({
                    "type": "console_capture_error",
                    "error": str(e),
                    "timestamp": time.time()
                })

        page.on("console", handle_console_capture)
        return handle_console_capture

    async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
        """Setup error capture using Playwright's event system.

        Registers a ``pageerror`` listener that appends one dict per error to
        ``captured_console``. Returns the listener so it can be removed later.
        """
        def handle_pageerror_capture(err):
            try:
                captured_console.append({
                    "type": "error",
                    "text": self._safe_attr(err, "message", "Unknown error"),
                    "stack": self._safe_attr(err, "stack", ""),
                    "timestamp": time.time()
                })
            except Exception as e:
                captured_console.append({
                    "type": "pageerror_capture_error",
                    "error": str(e),
                    "timestamp": time.time()
                })

        page.on("pageerror", handle_pageerror_capture)
        return handle_pageerror_capture

    async def retrieve_console_messages(self, page: Page) -> List[Dict]:
        """Not needed for Playwright - messages are captured via events"""
        return []

    async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
        """Remove the event listeners registered by the setup methods."""
        if handle_console:
            page.remove_listener("console", handle_console)
        if handle_error:
            page.remove_listener("pageerror", handle_error)

    def get_imports(self) -> tuple:
        """Return Playwright imports as ``(Page, Error, PlaywrightTimeoutError)``."""
        from playwright.async_api import Page, Error
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError
        return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class UndetectedAdapter(BrowserAdapter):
|
||||
"""Adapter for undetected browser automation with stealth features"""
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .utils import get_chromium_path
|
||||
import warnings
|
||||
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
@@ -613,9 +614,11 @@ class BrowserManager:
|
||||
# for all racers). Prevents 'Target page/context closed' errors.
|
||||
self._page_lock = asyncio.Lock()
|
||||
|
||||
# Stealth-related attributes
|
||||
self._stealth_instance = None
|
||||
self._stealth_cm = None
|
||||
# Stealth adapter for stealth mode
|
||||
self._stealth_adapter = None
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
from .browser_adapter import StealthAdapter
|
||||
self._stealth_adapter = StealthAdapter()
|
||||
|
||||
# Initialize ManagedBrowser if needed
|
||||
if self.config.use_managed_browser:
|
||||
@@ -649,16 +652,8 @@ class BrowserManager:
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Initialize playwright with or without stealth
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
# Import stealth only when needed
|
||||
from playwright_stealth import Stealth
|
||||
# Use the recommended stealth wrapper approach
|
||||
self._stealth_instance = Stealth()
|
||||
self._stealth_cm = self._stealth_instance.use_async(async_playwright())
|
||||
self.playwright = await self._stealth_cm.__aenter__()
|
||||
else:
|
||||
self.playwright = await async_playwright().start()
|
||||
# Initialize playwright
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
@@ -741,17 +736,18 @@ class BrowserManager:
|
||||
)
|
||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||
|
||||
if self.config.proxy or self.config.proxy_config:
|
||||
if self.config.proxy:
|
||||
warnings.warn(
|
||||
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if self.config.proxy_config:
|
||||
from playwright.async_api import ProxySettings
|
||||
|
||||
proxy_settings = (
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
proxy_settings = ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
@@ -1007,6 +1003,19 @@ class BrowserManager:
|
||||
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
||||
return signature_hash
|
||||
|
||||
async def _apply_stealth_to_page(self, page):
    """Apply stealth to a page if stealth mode is enabled.

    Does nothing when no stealth adapter is set. A failure while applying
    stealth never propagates: it is reported through ``self.logger`` as a
    warning when a logger is set, and otherwise dropped.
    """
    if not self._stealth_adapter:
        return

    try:
        await self._stealth_adapter.apply_stealth(page)
    except Exception as e:
        # Best-effort: a page without stealth is still usable.
        if self.logger:
            self.logger.warning(
                message="Failed to apply stealth to page: {error}",
                tag="STEALTH",
                params={"error": str(e)}
            )
|
||||
|
||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
||||
"""
|
||||
Get a page for the given session ID, creating a new one if needed.
|
||||
@@ -1026,32 +1035,20 @@ class BrowserManager:
|
||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||
return page, context
|
||||
|
||||
# If using a managed browser, just grab the shared default_context
|
||||
# If using a managed browser, reuse the default context and create new pages
|
||||
if self.config.use_managed_browser:
|
||||
context = self.default_context
|
||||
if self.config.storage_state:
|
||||
context = await self.create_browser_context(crawlerRunConfig)
|
||||
ctx = self.default_context # default context, one window only
|
||||
# Clone runtime state from storage to the shared context
|
||||
ctx = self.default_context
|
||||
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
||||
# Avoid concurrent new_page on shared persistent context
|
||||
# See GH-1198: context.pages can be empty under races
|
||||
async with self._page_lock:
|
||||
page = await ctx.new_page()
|
||||
else:
|
||||
context = self.default_context
|
||||
pages = context.pages
|
||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||
if not page:
|
||||
if pages:
|
||||
page = pages[0]
|
||||
else:
|
||||
# Double-check under lock to avoid TOCTOU and ensure only
|
||||
# one task calls new_page when pages=[] concurrently
|
||||
async with self._page_lock:
|
||||
pages = context.pages
|
||||
if pages:
|
||||
page = pages[0]
|
||||
else:
|
||||
page = await context.new_page()
|
||||
|
||||
# Always create a new page for concurrent safety
|
||||
# The page-level isolation prevents race conditions while sharing the same context
|
||||
async with self._page_lock:
|
||||
page = await context.new_page()
|
||||
|
||||
await self._apply_stealth_to_page(page)
|
||||
else:
|
||||
# Otherwise, check if we have an existing context for this config
|
||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||
@@ -1067,6 +1064,7 @@ class BrowserManager:
|
||||
|
||||
# Create a new page from the chosen context
|
||||
page = await context.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
|
||||
# If a session_id is specified, store this session so we can reuse later
|
||||
if crawlerRunConfig.session_id:
|
||||
@@ -1133,19 +1131,5 @@ class BrowserManager:
|
||||
self.managed_browser = None
|
||||
|
||||
if self.playwright:
|
||||
# Handle stealth context manager cleanup if it exists
|
||||
if hasattr(self, '_stealth_cm') and self._stealth_cm is not None:
|
||||
try:
|
||||
await self._stealth_cm.__aexit__(None, None, None)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error closing stealth context: {error}",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self._stealth_cm = None
|
||||
self._stealth_instance = None
|
||||
else:
|
||||
await self.playwright.stop()
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
@@ -122,11 +122,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
|
||||
valid_links.append(base_url)
|
||||
|
||||
# If we have more valid links than capacity, limit them
|
||||
if len(valid_links) > remaining_capacity:
|
||||
valid_links = valid_links[:remaining_capacity]
|
||||
self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
|
||||
|
||||
# Record the new depths and add to next_links
|
||||
for url in valid_links:
|
||||
depths[url] = new_depth
|
||||
@@ -146,7 +141,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
"""
|
||||
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
|
||||
# Push the initial URL with score 0 and depth 0.
|
||||
await queue.put((0, 0, start_url, None))
|
||||
initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
|
||||
await queue.put((-initial_score, 0, start_url, None))
|
||||
visited: Set[str] = set()
|
||||
depths: Dict[str, int] = {start_url: 0}
|
||||
|
||||
@@ -193,7 +189,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
result.metadata = result.metadata or {}
|
||||
result.metadata["depth"] = depth
|
||||
result.metadata["parent_url"] = parent_url
|
||||
result.metadata["score"] = score
|
||||
result.metadata["score"] = -score
|
||||
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
@@ -214,7 +210,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
for new_url, new_parent in new_links:
|
||||
new_depth = depths.get(new_url, depth + 1)
|
||||
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
|
||||
await queue.put((new_score, new_depth, new_url, new_parent))
|
||||
await queue.put((-new_score, new_depth, new_url, new_parent))
|
||||
|
||||
# End of crawl.
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
|
||||
from typing import List, Optional, Union, AsyncGenerator, Dict, Any, Callable
|
||||
import httpx
|
||||
import json
|
||||
from urllib.parse import urljoin
|
||||
@@ -7,6 +7,7 @@ import asyncio
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .models import CrawlResult
|
||||
from .async_logger import AsyncLogger, LogLevel
|
||||
from .utils import hooks_to_string
|
||||
|
||||
|
||||
class Crawl4aiClientError(Exception):
|
||||
@@ -70,17 +71,41 @@ class Crawl4aiDockerClient:
|
||||
self.logger.error(f"Server unreachable: {str(e)}", tag="ERROR")
|
||||
raise ConnectionError(f"Cannot connect to server: {str(e)}")
|
||||
|
||||
def _prepare_request(
    self,
    urls: List[str],
    browser_config: Optional[BrowserConfig] = None,
    crawler_config: Optional[CrawlerRunConfig] = None,
    hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
    hooks_timeout: int = 30
) -> Dict[str, Any]:
    """Prepare request data from configs.

    Args:
        urls: URLs to crawl.
        browser_config: Optional browser configuration; serialized with
            ``dump()``, or ``{}`` when omitted.
        crawler_config: Optional crawler configuration; serialized with
            ``dump()``, or ``{}`` when omitted.
        hooks: Optional mapping of hook point name to either a function
            object or already-stringified hook code. Callables are converted
            with ``hooks_to_string``; strings are passed through. The two
            forms may be mixed in one mapping.
        hooks_timeout: Timeout value sent alongside the hooks.

    Returns:
        The JSON-serializable request payload. A ``"hooks"`` entry is only
        present when ``hooks`` is non-empty.

    Side effects:
        When a token is set, stores it as the ``Authorization`` bearer
        header on the HTTP client.
    """
    if self._token:
        self._http_client.headers["Authorization"] = f"Bearer {self._token}"

    request_data = {
        "urls": urls,
        "browser_config": browser_config.dump() if browser_config else {},
        "crawler_config": crawler_config.dump() if crawler_config else {}
    }

    # Handle hooks if provided
    if hooks:
        # Convert entry by entry so a mapping that mixes function objects
        # and code strings is accepted.
        hooks_code = {
            name: hooks_to_string({name: hook})[name] if callable(hook) else hook
            for name, hook in hooks.items()
        }
        request_data["hooks"] = {
            "code": hooks_code,
            "timeout": hooks_timeout
        }

    return request_data
|
||||
|
||||
async def _request(self, method: str, endpoint: str, **kwargs) -> httpx.Response:
|
||||
"""Make an HTTP request with error handling."""
|
||||
url = urljoin(self.base_url, endpoint)
|
||||
@@ -102,16 +127,42 @@ class Crawl4aiDockerClient:
|
||||
self,
|
||||
urls: List[str],
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None
|
||||
crawler_config: Optional[CrawlerRunConfig] = None,
|
||||
hooks: Optional[Union[Dict[str, Callable], Dict[str, str]]] = None,
|
||||
hooks_timeout: int = 30
|
||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
"""Execute a crawl operation."""
|
||||
"""
|
||||
Execute a crawl operation.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to crawl
|
||||
browser_config: Browser configuration
|
||||
crawler_config: Crawler configuration
|
||||
hooks: Optional hooks - can be either:
|
||||
- Dict[str, Callable]: Function objects that will be converted to strings
|
||||
- Dict[str, str]: Already stringified hook code
|
||||
hooks_timeout: Timeout in seconds for each hook execution (1-120)
|
||||
|
||||
Returns:
|
||||
Single CrawlResult, list of results, or async generator for streaming
|
||||
|
||||
Example with function hooks:
|
||||
>>> async def my_hook(page, context, **kwargs):
|
||||
... await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
... return page
|
||||
>>>
|
||||
>>> result = await client.crawl(
|
||||
... ["https://example.com"],
|
||||
... hooks={"on_page_context_created": my_hook}
|
||||
... )
|
||||
"""
|
||||
await self._check_server()
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config, hooks, hooks_timeout)
|
||||
is_streaming = crawler_config and crawler_config.stream
|
||||
|
||||
|
||||
self.logger.info(f"Crawling {len(urls)} URLs {'(streaming)' if is_streaming else ''}", tag="CRAWL")
|
||||
|
||||
|
||||
if is_streaming:
|
||||
async def stream_results() -> AsyncGenerator[CrawlResult, None]:
|
||||
async with self._http_client.stream("POST", f"{self.base_url}/crawl/stream", json=data) as response:
|
||||
@@ -128,12 +179,12 @@ class Crawl4aiDockerClient:
|
||||
else:
|
||||
yield CrawlResult(**result)
|
||||
return stream_results()
|
||||
|
||||
|
||||
response = await self._request("POST", "/crawl", json=data)
|
||||
result_data = response.json()
|
||||
if not result_data.get("success", False):
|
||||
raise RequestError(f"Crawl failed: {result_data.get('msg', 'Unknown error')}")
|
||||
|
||||
|
||||
results = [CrawlResult(**r) for r in result_data.get("results", [])]
|
||||
self.logger.success(f"Crawl completed with {len(results)} results", tag="CRAWL")
|
||||
return results[0] if len(results) == 1 else results
|
||||
|
||||
@@ -47,6 +47,7 @@ from urllib.parse import (
|
||||
urljoin, urlparse, urlunparse,
|
||||
parse_qsl, urlencode, quote, unquote
|
||||
)
|
||||
import inspect
|
||||
|
||||
|
||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||
@@ -3529,4 +3530,52 @@ def get_memory_stats() -> Tuple[float, float, float]:
|
||||
available_gb = get_true_available_memory_gb()
|
||||
used_percent = get_true_memory_usage_percent()
|
||||
|
||||
return used_percent, available_gb, total_gb
|
||||
return used_percent, available_gb, total_gb
|
||||
|
||||
|
||||
# Hook utilities for Docker API
|
||||
def hooks_to_string(hooks: Dict[str, Callable]) -> Dict[str, str]:
    """
    Convert hook function objects to string representations for Docker API.

    This utility simplifies the process of using hooks with the Docker API by converting
    Python function objects into the string format required by the API.

    Args:
        hooks: Dictionary mapping hook point names to Python function objects.
               Functions should be async and follow hook signature requirements.

    Returns:
        Dictionary mapping hook point names to the dedented source code of
        the corresponding function.

    Example:
        >>> async def my_hook(page, context, **kwargs):
        ...     await page.set_viewport_size({"width": 1920, "height": 1080})
        ...     return page
        >>>
        >>> hooks_dict = {"on_page_context_created": my_hook}
        >>> api_hooks = hooks_to_string(hooks_dict)
        >>> # api_hooks is now ready to use with Docker API

    Raises:
        ValueError: If a hook is not callable or source cannot be extracted.
            In the latter case the underlying ``OSError``/``TypeError`` is
            attached as ``__cause__``.
    """
    result = {}

    for hook_name, hook_func in hooks.items():
        if not callable(hook_func):
            raise ValueError(f"Hook '{hook_name}' must be a callable function, got {type(hook_func)}")

        try:
            # Get the source code of the function
            source = inspect.getsource(hook_func)
        except (OSError, TypeError) as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError(
                f"Cannot extract source code for hook '{hook_name}'. "
                f"Make sure the function is defined in a file (not interactively). Error: {e}"
            ) from e

        # Remove any leading indentation to get clean source
        result[hook_name] = textwrap.dedent(source)

    return result
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
- [Python SDK](#python-sdk)
|
||||
- [Understanding Request Schema](#understanding-request-schema)
|
||||
- [REST API Examples](#rest-api-examples)
|
||||
- [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks)
|
||||
- [Additional API Endpoints](#additional-api-endpoints)
|
||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||
@@ -58,15 +59,13 @@ Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||
Our latest stable release is `0.7.6`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
```bash
|
||||
# Pull the release candidate (for testing new features)
|
||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||
# Pull the latest stable version (0.7.6)
|
||||
docker pull unclecode/crawl4ai:0.7.6
|
||||
|
||||
# Or pull the current stable version (0.6.0)
|
||||
# Or use the latest tag (points to 0.7.6)
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
@@ -101,7 +100,7 @@ EOL
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
unclecode/crawl4ai:0.7.6
|
||||
```
|
||||
|
||||
* **With LLM support:**
|
||||
@@ -112,7 +111,7 @@ EOL
|
||||
--name crawl4ai \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
unclecode/crawl4ai:0.7.6
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
||||
@@ -185,7 +184,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
||||
```bash
|
||||
# Pulls and runs the stable release from Docker Hub
|
||||
# Automatically selects the correct architecture
|
||||
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
||||
IMAGE=unclecode/crawl4ai:0.7.6 docker compose up -d
|
||||
```
|
||||
|
||||
* **Build and Run Locally:**
|
||||
@@ -648,6 +647,194 @@ async def test_stream_crawl(token: str = None): # Made token optional
|
||||
# asyncio.run(test_stream_crawl())
|
||||
```
|
||||
|
||||
### Asynchronous Jobs with Webhooks
|
||||
|
||||
For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete.
|
||||
|
||||
#### Why Use Jobs & Webhooks?
|
||||
|
||||
- **No Polling Required** - Get notified when crawls complete instead of constantly checking status
|
||||
- **Better Resource Usage** - Free up client connections while jobs run in the background
|
||||
- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices
|
||||
- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||
|
||||
#### How It Works
|
||||
|
||||
1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config`
|
||||
2. **Get Task ID** → Receive a `task_id` immediately
|
||||
3. **Job Runs** → Crawl executes in the background
|
||||
4. **Webhook Fired** → Server POSTs completion notification to your webhook URL
|
||||
5. **Fetch Results** → If the data wasn't included in the webhook payload, GET `/crawl/job/{task_id}`
|
||||
|
||||
#### Quick Example
|
||||
|
||||
```bash
|
||||
# Submit a crawl job with webhook notification
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": false
|
||||
}
|
||||
}'
|
||||
|
||||
# Response: {"task_id": "crawl_a1b2c3d4"}
|
||||
```
|
||||
|
||||
**Your webhook receives:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4",
|
||||
"task_type": "crawl",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"]
|
||||
}
|
||||
```
|
||||
|
||||
Then fetch the results:
|
||||
```bash
|
||||
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||
```
|
||||
|
||||
#### Include Data in Webhook
|
||||
|
||||
Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook:
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Your webhook receives the complete data:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4",
|
||||
"task_type": "crawl",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"data": {
|
||||
"markdown": "...",
|
||||
"html": "...",
|
||||
"links": {...},
|
||||
"metadata": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Webhook Authentication
|
||||
|
||||
Add custom headers for authentication:
|
||||
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl",
|
||||
"webhook_data_in_payload": false,
|
||||
"webhook_headers": {
|
||||
"X-Webhook-Secret": "your-secret-token",
|
||||
"X-Service-ID": "crawl4ai-prod"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Global Default Webhook
|
||||
|
||||
Configure a default webhook URL in `config.yml` for all jobs:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: "https://myapp.com/webhooks/default"
|
||||
data_in_payload: false
|
||||
retry:
|
||||
max_attempts: 5
|
||||
initial_delay_ms: 1000
|
||||
max_delay_ms: 32000
|
||||
timeout_ms: 30000
|
||||
```
|
||||
|
||||
Now jobs without `webhook_config` automatically use the default webhook.
|
||||
|
||||
#### Job Status Polling (Without Webhooks)
|
||||
|
||||
If you prefer polling instead of webhooks, just omit `webhook_config`:
|
||||
|
||||
```bash
|
||||
# Submit job
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"urls": ["https://example.com"]}'
|
||||
# Response: {"task_id": "crawl_xyz"}
|
||||
|
||||
# Poll for status
|
||||
curl http://localhost:11235/crawl/job/crawl_xyz
|
||||
```
|
||||
|
||||
The response includes a `status` field whose value is `"processing"`, `"completed"`, or `"failed"`.
|
||||
|
||||
#### LLM Extraction Jobs with Webhooks
|
||||
|
||||
The same webhook system works for LLM extraction jobs via `/llm/job`:
|
||||
|
||||
```bash
|
||||
# Submit LLM extraction job with webhook
|
||||
curl -X POST http://localhost:11235/llm/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/article",
|
||||
"q": "Extract the article title, author, and main points",
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||
"webhook_data_in_payload": true,
|
||||
"webhook_headers": {
|
||||
"X-Webhook-Secret": "your-secret-token"
|
||||
}
|
||||
}
|
||||
}'
|
||||
|
||||
# Response: {"task_id": "llm_1234567890"}
|
||||
```
|
||||
|
||||
**Your webhook receives:**
|
||||
```json
|
||||
{
|
||||
"task_id": "llm_1234567890",
|
||||
"task_type": "llm_extraction",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
||||
"urls": ["https://example.com/article"],
|
||||
"data": {
|
||||
"extracted_content": {
|
||||
"title": "Understanding Web Scraping",
|
||||
"author": "John Doe",
|
||||
"main_points": ["Point 1", "Point 2", "Point 3"]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Key Differences for LLM Jobs:**
|
||||
- Task type is `"llm_extraction"` instead of `"crawl"`
|
||||
- Extracted data is in `data.extracted_content`
|
||||
- The request takes a single `url` string (not a `urls` array); the webhook payload still reports it as a one-element `urls` list
|
||||
- Supports schema-based extraction with `schema` parameter
|
||||
|
||||
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
||||
|
||||
---
|
||||
|
||||
## Metrics & Monitoring
|
||||
@@ -826,10 +1013,11 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
||||
|
||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||
- Building and running the Docker container
|
||||
- Configuring the environment
|
||||
- Configuring the environment
|
||||
- Using the interactive playground for testing
|
||||
- Making API requests with proper typing
|
||||
- Using the Python SDK
|
||||
- Asynchronous job queues with webhook notifications
|
||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||
- Connecting via the Model Context Protocol (MCP)
|
||||
- Monitoring your deployment
|
||||
|
||||
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
@@ -0,0 +1,378 @@
|
||||
# Webhook Feature Examples
|
||||
|
||||
This document provides examples of how to use the webhook feature for crawl jobs and LLM extraction jobs in Crawl4AI.
|
||||
|
||||
## Overview
|
||||
|
||||
The webhook feature allows you to receive a notification when a job completes or fails, eliminating the need for polling. Webhooks are sent with exponential backoff retry logic to make delivery resilient to transient failures.
|
||||
|
||||
## Configuration
|
||||
|
||||
### Global Configuration (config.yml)
|
||||
|
||||
You can configure default webhook settings in `config.yml`:
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: null # Optional: default webhook URL for all jobs
|
||||
data_in_payload: false # Optional: default behavior for including data
|
||||
retry:
|
||||
max_attempts: 5
|
||||
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||
max_delay_ms: 32000
|
||||
timeout_ms: 30000 # 30s timeout per webhook call
|
||||
headers: # Optional: default headers to include
|
||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||
```
|
||||
|
||||
## API Usage Examples
|
||||
|
||||
### Example 1: Basic Webhook (Notification Only)
|
||||
|
||||
Send a webhook notification without including the crawl data in the payload.
|
||||
|
||||
**Request:**
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": false
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4"
|
||||
}
|
||||
```
|
||||
|
||||
**Webhook Payload Received:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4",
|
||||
"task_type": "crawl",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"]
|
||||
}
|
||||
```
|
||||
|
||||
Your webhook handler should then fetch the results:
|
||||
```bash
|
||||
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||
```
|
||||
|
||||
### Example 2: Webhook with Data Included
|
||||
|
||||
Include the full crawl results in the webhook payload.
|
||||
|
||||
**Request:**
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Webhook Payload Received:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4",
|
||||
"task_type": "crawl",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"data": {
|
||||
"markdown": "...",
|
||||
"html": "...",
|
||||
"links": {...},
|
||||
"metadata": {...}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Example 3: Webhook with Custom Headers
|
||||
|
||||
Include custom headers for authentication or identification.
|
||||
|
||||
**Request:**
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": false,
|
||||
"webhook_headers": {
|
||||
"X-Webhook-Secret": "my-secret-token",
|
||||
"X-Service-ID": "crawl4ai-production"
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
The webhook will be sent with these additional headers plus the default headers from config.
|
||||
|
||||
### Example 4: Failure Notification
|
||||
|
||||
When a crawl job fails, a webhook is sent with error details.
|
||||
|
||||
**Webhook Payload on Failure:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_a1b2c3d4",
|
||||
"task_type": "crawl",
|
||||
"status": "failed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"error": "Connection timeout after 30s"
|
||||
}
|
||||
```
|
||||
|
||||
### Example 5: Using Global Default Webhook
|
||||
|
||||
If you set a `default_url` in `config.yml`, jobs submitted without a `webhook_config` will use it:
|
||||
|
||||
**config.yml:**
|
||||
```yaml
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: "https://myapp.com/webhooks/default"
|
||||
data_in_payload: false
|
||||
```
|
||||
|
||||
**Request (no webhook_config needed):**
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"]
|
||||
}'
|
||||
```
|
||||
|
||||
The webhook will be sent to the default URL configured in config.yml.
|
||||
|
||||
### Example 6: LLM Extraction Job with Webhook
|
||||
|
||||
Use webhooks with the LLM extraction endpoint for asynchronous processing.
|
||||
|
||||
**Request:**
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/llm/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/article",
|
||||
"q": "Extract the article title, author, and publication date",
|
||||
"schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
|
||||
"cache": false,
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||
"webhook_data_in_payload": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
**Response:**
|
||||
```json
|
||||
{
|
||||
"task_id": "llm_1698765432_12345"
|
||||
}
|
||||
```
|
||||
|
||||
**Webhook Payload Received:**
|
||||
```json
|
||||
{
|
||||
"task_id": "llm_1698765432_12345",
|
||||
"task_type": "llm_extraction",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com/article"],
|
||||
"data": {
|
||||
"extracted_content": {
|
||||
"title": "Understanding Web Scraping",
|
||||
"author": "John Doe",
|
||||
"date": "2025-10-21"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Webhook Handler Example
|
||||
|
||||
Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs:
|
||||
|
||||
```python
|
||||
from flask import Flask, request, jsonify
|
||||
import requests
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/webhooks/crawl-complete', methods=['POST'])
|
||||
def handle_crawl_webhook():
|
||||
payload = request.json
|
||||
|
||||
task_id = payload['task_id']
|
||||
task_type = payload['task_type']
|
||||
status = payload['status']
|
||||
|
||||
if status == 'completed':
|
||||
# If data not in payload, fetch it
|
||||
if 'data' not in payload:
|
||||
# Determine endpoint based on task type
|
||||
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
||||
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
||||
data = response.json()
|
||||
else:
|
||||
data = payload['data']
|
||||
|
||||
# Process based on task type
|
||||
if task_type == 'crawl':
|
||||
print(f"Processing crawl results for {task_id}")
|
||||
# Handle crawl results
|
||||
results = data.get('results', [])
|
||||
for result in results:
|
||||
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
||||
|
||||
elif task_type == 'llm_extraction':
|
||||
print(f"Processing LLM extraction for {task_id}")
|
||||
# Handle LLM extraction
|
||||
# Note: Webhook sends 'extracted_content', API returns 'result'
|
||||
extracted = data.get('extracted_content', data.get('result', {}))
|
||||
print(f" - Extracted: {extracted}")
|
||||
|
||||
# Your business logic here...
|
||||
|
||||
elif status == 'failed':
|
||||
error = payload.get('error', 'Unknown error')
|
||||
print(f"{task_type} job {task_id} failed: {error}")
|
||||
# Handle failure...
|
||||
|
||||
return jsonify({"status": "received"}), 200
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(port=8080)
|
||||
```
|
||||
|
||||
## Retry Logic
|
||||
|
||||
The webhook delivery service uses exponential backoff retry logic:
|
||||
|
||||
- **Attempts:** Up to 5 attempts by default
|
||||
- **Delays:** 1s → 2s → 4s → 8s → 16s
|
||||
- **Timeout:** 30 seconds per attempt
|
||||
- **Retry Conditions:**
|
||||
- Server errors (5xx status codes)
|
||||
- Network errors
|
||||
- Timeouts
|
||||
- **No Retry:**
|
||||
- Client errors (4xx status codes)
|
||||
- Successful delivery (2xx status codes)
|
||||
|
||||
## Benefits
|
||||
|
||||
1. **No Polling Required** - Eliminates constant API calls to check job status
|
||||
2. **Real-time Notifications** - Immediate notification when jobs complete
|
||||
3. **Reliable Delivery** - Retries with exponential backoff recover from transient failures (delivery is abandoned only after the maximum number of attempts)
|
||||
4. **Flexible** - Choose between notification-only or full data delivery
|
||||
5. **Secure** - Support for custom headers for authentication
|
||||
6. **Configurable** - Global defaults or per-job configuration
|
||||
7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
|
||||
|
||||
## TypeScript Client Example
|
||||
|
||||
```typescript
|
||||
interface WebhookConfig {
|
||||
webhook_url: string;
|
||||
webhook_data_in_payload?: boolean;
|
||||
webhook_headers?: Record<string, string>;
|
||||
}
|
||||
|
||||
interface CrawlJobRequest {
|
||||
urls: string[];
|
||||
browser_config?: Record<string, any>;
|
||||
crawler_config?: Record<string, any>;
|
||||
webhook_config?: WebhookConfig;
|
||||
}
|
||||
|
||||
interface LLMJobRequest {
|
||||
url: string;
|
||||
q: string;
|
||||
schema?: string;
|
||||
cache?: boolean;
|
||||
provider?: string;
|
||||
webhook_config?: WebhookConfig;
|
||||
}
|
||||
|
||||
async function createCrawlJob(request: CrawlJobRequest) {
|
||||
const response = await fetch('http://localhost:11235/crawl/job', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(request)
|
||||
});
|
||||
|
||||
const { task_id } = await response.json();
|
||||
return task_id;
|
||||
}
|
||||
|
||||
async function createLLMJob(request: LLMJobRequest) {
|
||||
const response = await fetch('http://localhost:11235/llm/job', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(request)
|
||||
});
|
||||
|
||||
const { task_id } = await response.json();
|
||||
return task_id;
|
||||
}
|
||||
|
||||
// Usage - Crawl Job
|
||||
const crawlTaskId = await createCrawlJob({
|
||||
urls: ['https://example.com'],
|
||||
webhook_config: {
|
||||
webhook_url: 'https://myapp.com/webhooks/crawl-complete',
|
||||
webhook_data_in_payload: false,
|
||||
webhook_headers: {
|
||||
'X-Webhook-Secret': 'my-secret'
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Usage - LLM Extraction Job
|
||||
const llmTaskId = await createLLMJob({
|
||||
url: 'https://example.com/article',
|
||||
q: 'Extract the main points from this article',
|
||||
provider: 'openai/gpt-4o-mini',
|
||||
webhook_config: {
|
||||
webhook_url: 'https://myapp.com/webhooks/llm-complete',
|
||||
webhook_data_in_payload: true,
|
||||
webhook_headers: {
|
||||
'X-Webhook-Secret': 'my-secret'
|
||||
}
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## Monitoring and Debugging
|
||||
|
||||
Webhook delivery attempts are logged at INFO level:
|
||||
- Successful deliveries
|
||||
- Retry attempts with delays
|
||||
- Final failures after max attempts
|
||||
|
||||
Check the application logs for webhook delivery status:
|
||||
```bash
|
||||
docker logs crawl4ai-container | grep -i webhook
|
||||
```
|
||||
@@ -46,6 +46,7 @@ from utils import (
|
||||
get_llm_temperature,
|
||||
get_llm_base_url
|
||||
)
|
||||
from webhook import WebhookDeliveryService
|
||||
|
||||
import psutil, time
|
||||
|
||||
@@ -120,10 +121,14 @@ async def process_llm_extraction(
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
provider: Optional[str] = None,
|
||||
webhook_config: Optional[Dict] = None,
|
||||
temperature: Optional[float] = None,
|
||||
base_url: Optional[str] = None
|
||||
) -> None:
|
||||
"""Process LLM extraction in background."""
|
||||
# Initialize webhook service
|
||||
webhook_service = WebhookDeliveryService(config)
|
||||
|
||||
try:
|
||||
# Validate provider
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
@@ -132,6 +137,16 @@ async def process_llm_extraction(
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": error_msg
|
||||
})
|
||||
|
||||
# Send webhook notification on failure
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="llm_extraction",
|
||||
status="failed",
|
||||
urls=[url],
|
||||
webhook_config=webhook_config,
|
||||
error=error_msg
|
||||
)
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
@@ -162,17 +177,40 @@ async def process_llm_extraction(
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": result.error_message
|
||||
})
|
||||
|
||||
# Send webhook notification on failure
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="llm_extraction",
|
||||
status="failed",
|
||||
urls=[url],
|
||||
webhook_config=webhook_config,
|
||||
error=result.error_message
|
||||
)
|
||||
return
|
||||
|
||||
try:
|
||||
content = json.loads(result.extracted_content)
|
||||
except json.JSONDecodeError:
|
||||
content = result.extracted_content
|
||||
|
||||
result_data = {"extracted_content": content}
|
||||
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
"status": TaskStatus.COMPLETED,
|
||||
"result": json.dumps(content)
|
||||
})
|
||||
|
||||
# Send webhook notification on successful completion
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="llm_extraction",
|
||||
status="completed",
|
||||
urls=[url],
|
||||
webhook_config=webhook_config,
|
||||
result=result_data
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
@@ -180,6 +218,16 @@ async def process_llm_extraction(
|
||||
"error": str(e)
|
||||
})
|
||||
|
||||
# Send webhook notification on failure
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="llm_extraction",
|
||||
status="failed",
|
||||
urls=[url],
|
||||
webhook_config=webhook_config,
|
||||
error=str(e)
|
||||
)
|
||||
|
||||
async def handle_markdown_request(
|
||||
url: str,
|
||||
filter_type: FilterType,
|
||||
@@ -261,6 +309,7 @@ async def handle_llm_request(
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None,
|
||||
webhook_config: Optional[Dict] = None,
|
||||
temperature: Optional[float] = None,
|
||||
api_base_url: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
@@ -294,6 +343,7 @@ async def handle_llm_request(
|
||||
base_url,
|
||||
config,
|
||||
provider,
|
||||
webhook_config,
|
||||
temperature,
|
||||
api_base_url
|
||||
)
|
||||
@@ -341,6 +391,7 @@ async def create_new_task(
|
||||
base_url: str,
|
||||
config: dict,
|
||||
provider: Optional[str] = None,
|
||||
webhook_config: Optional[Dict] = None,
|
||||
temperature: Optional[float] = None,
|
||||
api_base_url: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
@@ -351,12 +402,18 @@ async def create_new_task(
|
||||
|
||||
from datetime import datetime
|
||||
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
||||
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
|
||||
task_data = {
|
||||
"status": TaskStatus.PROCESSING,
|
||||
"created_at": datetime.now().isoformat(),
|
||||
"url": decoded_url
|
||||
})
|
||||
}
|
||||
|
||||
# Store webhook config if provided
|
||||
if webhook_config:
|
||||
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||
|
||||
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||
|
||||
background_tasks.add_task(
|
||||
process_llm_extraction,
|
||||
@@ -368,6 +425,7 @@ async def create_new_task(
|
||||
schema,
|
||||
cache,
|
||||
provider,
|
||||
webhook_config,
|
||||
temperature,
|
||||
api_base_url
|
||||
)
|
||||
@@ -442,13 +500,15 @@ async def handle_crawl_request(
|
||||
urls: List[str],
|
||||
browser_config: dict,
|
||||
crawler_config: dict,
|
||||
config: dict
|
||||
config: dict,
|
||||
hooks_config: Optional[dict] = None
|
||||
) -> dict:
|
||||
"""Handle non-streaming crawl requests."""
|
||||
"""Handle non-streaming crawl requests with optional hooks."""
|
||||
start_mem_mb = _get_memory_mb() # <--- Get memory before
|
||||
start_time = time.time()
|
||||
mem_delta_mb = None
|
||||
peak_mem_mb = start_mem_mb
|
||||
hook_manager = None
|
||||
|
||||
try:
|
||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
|
||||
@@ -468,11 +528,27 @@ async def handle_crawl_request(
|
||||
# crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
|
||||
# await crawler.start()
|
||||
|
||||
# Attach hooks if provided
|
||||
hooks_status = {}
|
||||
if hooks_config:
|
||||
from hook_manager import attach_user_hooks_to_crawler, UserHookManager
|
||||
hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
|
||||
hooks_status, hook_manager = await attach_user_hooks_to_crawler(
|
||||
crawler,
|
||||
hooks_config.get('code', {}),
|
||||
timeout=hooks_config.get('timeout', 30),
|
||||
hook_manager=hook_manager
|
||||
)
|
||||
logger.info(f"Hooks attachment status: {hooks_status['status']}")
|
||||
|
||||
base_config = config["crawler"]["base_config"]
|
||||
# Iterate on key-value pairs in global_config then use haseattr to set them
|
||||
# Iterate on key-value pairs in global_config then use hasattr to set them
|
||||
for key, value in base_config.items():
|
||||
if hasattr(crawler_config, key):
|
||||
setattr(crawler_config, key, value)
|
||||
current_value = getattr(crawler_config, key)
|
||||
# Only set base config if user didn't provide a value
|
||||
if current_value is None or current_value == "":
|
||||
setattr(crawler_config, key, value)
|
||||
|
||||
results = []
|
||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||
@@ -481,6 +557,10 @@ async def handle_crawl_request(
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher)
|
||||
results = await partial_func()
|
||||
|
||||
# Ensure results is always a list
|
||||
if not isinstance(results, list):
|
||||
results = [results]
|
||||
|
||||
# await crawler.close()
|
||||
|
||||
@@ -495,22 +575,71 @@ async def handle_crawl_request(
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# if fit_html is not a string, set it to None to avoid serialization errors
|
||||
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
|
||||
result_dict["fit_html"] = None
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
try:
|
||||
# Check if result has model_dump method (is a proper CrawlResult)
|
||||
if hasattr(result, 'model_dump'):
|
||||
result_dict = result.model_dump()
|
||||
elif isinstance(result, dict):
|
||||
result_dict = result
|
||||
else:
|
||||
# Handle unexpected result type
|
||||
logger.warning(f"Unexpected result type: {type(result)}")
|
||||
result_dict = {
|
||||
"url": str(result) if hasattr(result, '__str__') else "unknown",
|
||||
"success": False,
|
||||
"error_message": f"Unexpected result type: {type(result).__name__}"
|
||||
}
|
||||
|
||||
# if fit_html is not a string, set it to None to avoid serialization errors
|
||||
if "fit_html" in result_dict and not (result_dict["fit_html"] is None or isinstance(result_dict["fit_html"], str)):
|
||||
result_dict["fit_html"] = None
|
||||
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None and isinstance(result_dict.get('pdf'), bytes):
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
|
||||
processed_results.append(result_dict)
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing result: {e}")
|
||||
processed_results.append({
|
||||
"url": "unknown",
|
||||
"success": False,
|
||||
"error_message": str(e)
|
||||
})
|
||||
|
||||
return {
|
||||
response = {
|
||||
"success": True,
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
}
|
||||
|
||||
# Add hooks information if hooks were used
|
||||
if hooks_config and hook_manager:
|
||||
from hook_manager import UserHookManager
|
||||
if isinstance(hook_manager, UserHookManager):
|
||||
try:
|
||||
# Ensure all hook data is JSON serializable
|
||||
hook_data = {
|
||||
"status": hooks_status,
|
||||
"execution_log": hook_manager.execution_log,
|
||||
"errors": hook_manager.errors,
|
||||
"summary": hook_manager.get_summary()
|
||||
}
|
||||
# Test that it's serializable
|
||||
json.dumps(hook_data)
|
||||
response["hooks"] = hook_data
|
||||
except (TypeError, ValueError) as e:
|
||||
logger.error(f"Hook data not JSON serializable: {e}")
|
||||
response["hooks"] = {
|
||||
"status": {"status": "error", "message": "Hook data serialization failed"},
|
||||
"execution_log": [],
|
||||
"errors": [{"error": str(e)}],
|
||||
"summary": {}
|
||||
}
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
||||
@@ -539,9 +668,11 @@ async def handle_stream_crawl_request(
|
||||
urls: List[str],
|
||||
browser_config: dict,
|
||||
crawler_config: dict,
|
||||
config: dict
|
||||
) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
|
||||
"""Handle streaming crawl requests."""
|
||||
config: dict,
|
||||
hooks_config: Optional[dict] = None
|
||||
) -> Tuple[AsyncWebCrawler, AsyncGenerator, Optional[Dict]]:
|
||||
"""Handle streaming crawl requests with optional hooks."""
|
||||
hooks_info = None
|
||||
try:
|
||||
browser_config = BrowserConfig.load(browser_config)
|
||||
# browser_config.verbose = True # Set to False or remove for production stress testing
|
||||
@@ -562,6 +693,20 @@ async def handle_stream_crawl_request(
|
||||
|
||||
# crawler = AsyncWebCrawler(config=browser_config)
|
||||
# await crawler.start()
|
||||
|
||||
# Attach hooks if provided
|
||||
if hooks_config:
|
||||
from hook_manager import attach_user_hooks_to_crawler, UserHookManager
|
||||
hook_manager = UserHookManager(timeout=hooks_config.get('timeout', 30))
|
||||
hooks_status, hook_manager = await attach_user_hooks_to_crawler(
|
||||
crawler,
|
||||
hooks_config.get('code', {}),
|
||||
timeout=hooks_config.get('timeout', 30),
|
||||
hook_manager=hook_manager
|
||||
)
|
||||
logger.info(f"Hooks attachment status for streaming: {hooks_status['status']}")
|
||||
# Include hook manager in hooks_info for proper tracking
|
||||
hooks_info = {'status': hooks_status, 'manager': hook_manager}
|
||||
|
||||
results_gen = await crawler.arun_many(
|
||||
urls=urls,
|
||||
@@ -569,7 +714,7 @@ async def handle_stream_crawl_request(
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
|
||||
return crawler, results_gen
|
||||
return crawler, results_gen, hooks_info
|
||||
|
||||
except Exception as e:
|
||||
# Make sure to close crawler if started during an error here
|
||||
@@ -593,6 +738,7 @@ async def handle_crawl_job(
|
||||
browser_config: Dict,
|
||||
crawler_config: Dict,
|
||||
config: Dict,
|
||||
webhook_config: Optional[Dict] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Fire-and-forget version of handle_crawl_request.
|
||||
@@ -600,13 +746,24 @@ async def handle_crawl_job(
|
||||
lets /crawl/job/{task_id} polling fetch the result.
|
||||
"""
|
||||
task_id = f"crawl_{uuid4().hex[:8]}"
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
|
||||
# Store task data in Redis
|
||||
task_data = {
|
||||
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
||||
"created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
|
||||
"url": json.dumps(urls), # store list as JSON string
|
||||
"result": "",
|
||||
"error": "",
|
||||
})
|
||||
}
|
||||
|
||||
# Store webhook config if provided
|
||||
if webhook_config:
|
||||
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||
|
||||
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||
|
||||
# Initialize webhook service
|
||||
webhook_service = WebhookDeliveryService(config)
|
||||
|
||||
async def _runner():
|
||||
try:
|
||||
@@ -620,6 +777,17 @@ async def handle_crawl_job(
|
||||
"status": TaskStatus.COMPLETED,
|
||||
"result": json.dumps(result),
|
||||
})
|
||||
|
||||
# Send webhook notification on successful completion
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="crawl",
|
||||
status="completed",
|
||||
urls=urls,
|
||||
webhook_config=webhook_config,
|
||||
result=result
|
||||
)
|
||||
|
||||
await asyncio.sleep(5) # Give Redis time to process the update
|
||||
except Exception as exc:
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
@@ -627,5 +795,15 @@ async def handle_crawl_job(
|
||||
"error": str(exc),
|
||||
})
|
||||
|
||||
# Send webhook notification on failure
|
||||
await webhook_service.notify_job_completion(
|
||||
task_id=task_id,
|
||||
task_type="crawl",
|
||||
status="failed",
|
||||
urls=urls,
|
||||
webhook_config=webhook_config,
|
||||
error=str(exc)
|
||||
)
|
||||
|
||||
background_tasks.add_task(_runner)
|
||||
return {"task_id": task_id}
|
||||
@@ -7520,17 +7520,18 @@ class BrowserManager:
|
||||
)
|
||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||
|
||||
if self.config.proxy or self.config.proxy_config:
|
||||
if self.config.proxy:
|
||||
warnings.warn(
|
||||
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if self.config.proxy_config:
|
||||
from playwright.async_api import ProxySettings
|
||||
|
||||
proxy_settings = (
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
proxy_settings = ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
|
||||
@@ -87,4 +87,17 @@ observability:
|
||||
enabled: True
|
||||
endpoint: "/metrics"
|
||||
health_check:
|
||||
endpoint: "/health"
|
||||
endpoint: "/health"
|
||||
|
||||
# Webhook Configuration
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: null # Optional: default webhook URL for all jobs
|
||||
data_in_payload: false # Optional: default behavior for including data
|
||||
retry:
|
||||
max_attempts: 5
|
||||
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||
max_delay_ms: 32000
|
||||
timeout_ms: 30000 # 30s timeout per webhook call
|
||||
headers: # Optional: default headers to include
|
||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||
512
deploy/docker/hook_manager.py
Normal file
512
deploy/docker/hook_manager.py
Normal file
@@ -0,0 +1,512 @@
|
||||
"""
|
||||
Hook Manager for User-Provided Hook Functions
|
||||
Handles validation, compilation, and safe execution of user-provided hook code
|
||||
"""
|
||||
|
||||
import ast
|
||||
import asyncio
|
||||
import traceback
|
||||
from typing import Dict, Callable, Optional, Tuple, List, Any
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UserHookManager:
    """Validate, compile and run user-provided hook functions with error isolation.

    Compilation and execution failures are appended to ``errors``; every
    execution attempt (success, failure, timeout) is appended to
    ``execution_log``. Neither kind of failure is raised to the caller.
    """

    # Expected signatures for each hook point
    HOOK_SIGNATURES = {
        "on_browser_created": ["browser"],
        "on_page_context_created": ["page", "context"],
        "before_goto": ["page", "context", "url"],
        "after_goto": ["page", "context", "url", "response"],
        "on_user_agent_updated": ["page", "context", "user_agent"],
        "on_execution_started": ["page", "context"],
        "before_retrieve_html": ["page", "context"],
        "before_return_html": ["page", "context", "html"]
    }

    # Default timeout for hook execution (in seconds)
    DEFAULT_TIMEOUT = 30

    def __init__(self, timeout: int = DEFAULT_TIMEOUT):
        """Create a manager whose hooks may each run for at most ``timeout`` seconds."""
        self.timeout = timeout
        self.errors: List[Dict[str, Any]] = []
        self.compiled_hooks: Dict[str, Callable] = {}
        self.execution_log: List[Dict[str, Any]] = []

    def validate_hook_structure(self, hook_code: str, hook_point: str) -> Tuple[bool, str]:
        """
        Validate the structure of user-provided hook code

        The code must parse, its first top-level function must be ``async def``,
        and that function must either name every parameter listed in
        ``HOOK_SIGNATURES[hook_point]`` or accept ``**kwargs``.

        Args:
            hook_code: The Python code string containing the hook function
            hook_point: The hook point name (e.g., 'on_page_context_created')

        Returns:
            Tuple of (is_valid, error_message); the message is "Valid" on success
        """
        try:
            tree = ast.parse(hook_code)

            if not tree.body:
                return False, "Hook code is empty"

            # The first top-level function definition is treated as the hook
            func_def = next(
                (
                    node for node in tree.body
                    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
                ),
                None,
            )
            if func_def is None:
                return False, "Hook must contain a function definition (def or async def)"

            # All hooks are awaited, so a plain 'def' is rejected up front
            if not isinstance(func_def, ast.AsyncFunctionDef):
                return False, "Hook function must be async (use 'async def' instead of 'def')"

            func_name = func_def.name

            expected_params = self.HOOK_SIGNATURES.get(hook_point, [])
            if not expected_params:
                return False, f"Unknown hook point: {hook_point}"

            func_params = {arg.arg for arg in func_def.args.args}

            # **kwargs can absorb any expected parameter that is not named explicitly
            has_kwargs = func_def.args.kwarg is not None

            missing_params = [name for name in expected_params if name not in func_params]
            if missing_params and not has_kwargs:
                return False, f"Hook function '{func_name}' must accept parameters: {', '.join(expected_params)} (missing: {', '.join(missing_params)})"

            # A missing return is only a warning: callers fall back to the original object
            has_return = any(isinstance(node, ast.Return) for node in ast.walk(func_def))
            if not has_return:
                logger.warning(f"Hook function '{func_name}' should return the {expected_params[0]} object")

            return True, "Valid"

        except SyntaxError as e:
            return False, f"Syntax error at line {e.lineno}: {str(e)}"
        except Exception as e:
            return False, f"Failed to parse hook code: {str(e)}"

    def compile_hook(self, hook_code: str, hook_point: str) -> Optional[Callable]:
        """
        Compile user-provided hook code into a callable function

        The hook is looked up by the name of a function the code defines at top
        level, so callables that merely end up in the namespace through imports
        (for example ``from asyncio import sleep`` or the preloaded typing
        aliases) are never mistaken for the hook.

        Args:
            hook_code: The Python code string
            hook_point: The hook point name

        Returns:
            Compiled function or None if compilation failed (the failure is
            appended to ``self.errors``)
        """
        try:
            import builtins
            import inspect

            # NOTE(security): this is NOT a sandbox. The code is exec'd in-process
            # and '__import__' is exposed, so a hook can import any module.
            # Only accept hook code from trusted callers.
            allowed_builtins = [
                'print', 'len', 'str', 'int', 'float', 'bool',
                'list', 'dict', 'set', 'tuple', 'range', 'enumerate',
                'zip', 'map', 'filter', 'any', 'all', 'sum', 'min', 'max',
                'sorted', 'reversed', 'abs', 'round', 'isinstance', 'type',
                'getattr', 'hasattr', 'setattr', 'callable', 'iter', 'next',
                '__import__', '__build_class__'  # Required for exec
            ]
            safe_builtins = {
                name: getattr(builtins, name)
                for name in allowed_builtins
                if hasattr(builtins, name)
            }

            namespace = {
                '__name__': f'user_hook_{hook_point}',
                '__builtins__': safe_builtins
            }

            # Add commonly needed imports
            exec("import asyncio", namespace)
            exec("import json", namespace)
            exec("import re", namespace)
            exec("from typing import Dict, List, Optional", namespace)

            # Public functions the user code itself defines, in definition order
            defined_names = [
                node.name
                for node in ast.parse(hook_code).body
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
                and not node.name.startswith('_')
            ]

            # Execute the code to define the function
            exec(hook_code, namespace)

            # Prefer the first async function defined by the user
            for name in defined_names:
                candidate = namespace.get(name)
                if inspect.iscoroutinefunction(candidate):
                    return candidate

            # Otherwise wrap the first sync function so it can still be awaited
            for name in defined_names:
                sync_func = namespace.get(name)
                if callable(sync_func):
                    logger.warning(f"Found non-async function '{name}' - wrapping it")

                    async def async_wrapper(*args, **kwargs):
                        return sync_func(*args, **kwargs)

                    return async_wrapper

            raise ValueError("No callable function found in hook code")

        except Exception as e:
            error = {
                'hook_point': hook_point,
                'error': f"Failed to compile hook: {str(e)}",
                'type': 'compilation_error',
                'traceback': traceback.format_exc()
            }
            self.errors.append(error)
            logger.error(f"Hook compilation failed for {hook_point}: {str(e)}")
            return None

    async def execute_hook_safely(
        self,
        hook_func: Callable,
        hook_point: str,
        *args,
        **kwargs
    ) -> Tuple[Any, Optional[Dict]]:
        """
        Execute a user hook with error isolation and timeout

        Args:
            hook_func: The compiled hook function
            hook_point: The hook point name
            *args, **kwargs: Arguments to pass to the hook

        Returns:
            Tuple of (result, error_dict). On success error_dict is None. On
            timeout or exception the result is the first positional argument
            (or None when there is none) and error_dict describes the failure.
        """
        # Timestamps and durations come from the running loop's monotonic clock
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        try:
            # Bound the hook's runtime so a stuck hook cannot stall the crawl
            result = await asyncio.wait_for(
                hook_func(*args, **kwargs),
                timeout=self.timeout
            )

            execution_time = loop.time() - start_time
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'success',
                'execution_time': execution_time,
                'timestamp': start_time
            })

            return result, None

        except asyncio.TimeoutError:
            error = {
                'hook_point': hook_point,
                'error': f'Hook execution timed out ({self.timeout}s limit)',
                'type': 'timeout',
                'execution_time': self.timeout
            }
            self.errors.append(error)
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'timeout',
                'error': error['error'],
                'execution_time': self.timeout,
                'timestamp': start_time
            })
            # Return the first argument (usually page/browser) to continue
            return args[0] if args else None, error

        except Exception as e:
            execution_time = loop.time() - start_time
            error = {
                'hook_point': hook_point,
                'error': str(e),
                'type': type(e).__name__,
                'traceback': traceback.format_exc(),
                'execution_time': execution_time
            }
            self.errors.append(error)
            self.execution_log.append({
                'hook_point': hook_point,
                'status': 'failed',
                'error': str(e),
                'error_type': type(e).__name__,
                'execution_time': execution_time,
                'timestamp': start_time
            })
            # Return the first argument (usually page/browser) to continue
            return args[0] if args else None, error

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of hook execution

        Returns:
            Counts of logged executions by outcome, the success rate as a
            percentage (0 when nothing ran) and the number of recorded errors.
        """
        total_hooks = len(self.execution_log)

        # Single pass over the log instead of one scan per status
        counts = {'success': 0, 'failed': 0, 'timeout': 0}
        for entry in self.execution_log:
            status = entry['status']
            if status in counts:
                counts[status] += 1

        successful = counts['success']
        return {
            'total_executions': total_hooks,
            'successful': successful,
            'failed': counts['failed'],
            'timed_out': counts['timeout'],
            'success_rate': (successful / total_hooks * 100) if total_hooks > 0 else 0,
            'total_errors': len(self.errors)
        }
|
||||
|
||||
|
||||
class IsolatedHookWrapper:
    """Produces hook callables whose failures never escape into the crawl."""

    def __init__(self, hook_manager: UserHookManager):
        self.hook_manager = hook_manager

    def create_hook_wrapper(self, user_hook: Callable, hook_point: str) -> Callable:
        """
        Build an async wrapper around ``user_hook`` for ``hook_point``.

        The wrapper runs the hook through the manager's ``execute_hook_safely``
        and returns the hook's result. When the hook fails, returns None, or
        the wrapper itself hits an unexpected exception, it returns the
        original object instead: the first positional argument, else the
        ``page`` keyword, else the ``browser`` keyword, else None.

        Args:
            user_hook: The compiled user hook function
            hook_point: The hook point name

        Returns:
            Wrapped async function named ``wrapped_<hook_point>``
        """

        async def wrapped_hook(*args, **kwargs):
            """Wrapped hook with error isolation"""
            # Work out what to hand back if the hook gives us nothing usable
            if args:
                fallback = args[0]
            else:
                fallback = kwargs.get('page', kwargs.get('browser'))

            try:
                result, error = await self.hook_manager.execute_hook_safely(
                    user_hook, hook_point, *args, **kwargs
                )

                if error:
                    # Hook failed; carry on with the original object
                    logger.warning(f"User hook failed at {hook_point}: {error['error']}")
                elif result is None:
                    logger.debug(f"Hook at {hook_point} returned None, using original object")
                else:
                    return result

                return fallback

            except Exception as e:
                # execute_hook_safely already traps hook errors, so this is a last resort
                logger.error(f"Unexpected error in hook wrapper for {hook_point}: {e}")
                return fallback

        # Give the wrapper a recognisable name for debugging
        wrapped_hook.__name__ = f"wrapped_{hook_point}"
        return wrapped_hook
|
||||
|
||||
|
||||
async def process_user_hooks(
    hooks_input: Dict[str, str],
    timeout: int = 30
) -> Tuple[Dict[str, Callable], List[Dict], UserHookManager]:
    """
    Process and compile user-provided hook functions

    Creates a fresh UserHookManager and delegates validation, compilation and
    wrapping to process_user_hooks_with_manager, so both entry points share a
    single implementation instead of two copies of the same loop.

    Args:
        hooks_input: Dictionary mapping hook points to code strings
        timeout: Timeout for each hook execution

    Returns:
        Tuple of (compiled_hooks, validation_errors, hook_manager)
    """
    hook_manager = UserHookManager(timeout=timeout)
    compiled_hooks, validation_errors = await process_user_hooks_with_manager(
        hooks_input, hook_manager
    )
    return compiled_hooks, validation_errors, hook_manager
|
||||
|
||||
|
||||
async def process_user_hooks_with_manager(
    hooks_input: Dict[str, str],
    hook_manager: UserHookManager
) -> Tuple[Dict[str, Callable], List[Dict]]:
    """
    Validate, compile and wrap user hooks using an existing manager.

    Empty or whitespace-only entries are skipped. Every other entry either
    ends up in the compiled mapping (wrapped for error isolation) or adds one
    record to the error list.

    Args:
        hooks_input: Dictionary mapping hook points to code strings
        hook_manager: Existing UserHookManager instance

    Returns:
        Tuple of (compiled_hooks, validation_errors)
    """
    isolating_wrapper = IsolatedHookWrapper(hook_manager)
    ready = {}
    problems = []

    def report(point, code, message):
        # Each record carries at most the first 100 characters of the code
        preview = code[:100] + '...' if len(code) > 100 else code
        problems.append({
            'hook_point': point,
            'error': message,
            'code_preview': preview
        })

    for point, code in hooks_input.items():
        # Nothing to do for blank hooks
        if not code or not code.strip():
            continue

        # Reject hook points the manager does not know
        if point not in UserHookManager.HOOK_SIGNATURES:
            report(
                point,
                code,
                f'Unknown hook point. Valid points: {", ".join(UserHookManager.HOOK_SIGNATURES.keys())}',
            )
            continue

        # Structural validation first
        ok, reason = hook_manager.validate_hook_structure(code, point)
        if not ok:
            report(point, code, reason)
            continue

        # Then compilation
        compiled = hook_manager.compile_hook(code, point)
        if not compiled:
            report(point, code, 'Failed to compile hook function - check syntax and structure')
            continue

        ready[point] = isolating_wrapper.create_hook_wrapper(compiled, point)
        logger.info(f"Successfully compiled hook for {point}")

    return ready, problems
|
||||
|
||||
|
||||
async def attach_user_hooks_to_crawler(
    crawler,  # AsyncWebCrawler instance
    user_hooks: Dict[str, str],
    timeout: int = 30,
    hook_manager: Optional[UserHookManager] = None
) -> Tuple[Dict[str, Any], UserHookManager]:
    """
    Compile the user's hooks and register them on the crawler's strategy.

    Args:
        crawler: AsyncWebCrawler instance
        user_hooks: Dictionary mapping hook points to code strings
        timeout: Timeout for each hook execution; only used when a new
            manager has to be created
        hook_manager: Optional existing UserHookManager instance

    Returns:
        Tuple of (status_dict, hook_manager). ``status_dict['status']`` is
        'success' when there were no errors, 'partial' when at least one hook
        was attached despite errors, and 'failed' otherwise.
    """
    # Reuse the caller's manager when one was supplied
    manager = hook_manager if hook_manager is not None else UserHookManager(timeout=timeout)

    compiled_hooks, validation_errors = await process_user_hooks_with_manager(
        user_hooks, manager
    )

    if validation_errors:
        logger.warning(f"Hook validation errors: {validation_errors}")

    # Register every hook that compiled; attach failures join the error list
    attached = []
    for point, hook in compiled_hooks.items():
        try:
            crawler.crawler_strategy.set_hook(point, hook)
            attached.append(point)
            logger.info(f"Attached hook to {point}")
        except Exception as e:
            logger.error(f"Failed to attach hook to {point}: {e}")
            validation_errors.append({
                'hook_point': point,
                'error': f'Failed to attach hook: {str(e)}'
            })

    if not validation_errors:
        outcome = 'success'
    elif attached:
        outcome = 'partial'
    else:
        outcome = 'failed'

    report = {
        'status': outcome,
        'attached_hooks': attached,
        'validation_errors': validation_errors,
        'total_hooks_provided': len(user_hooks),
        'successfully_attached': len(attached),
        'failed_validation': len(validation_errors)
    }
    return report, manager
|
||||
@@ -12,6 +12,7 @@ from api import (
|
||||
handle_crawl_job,
|
||||
handle_task_status,
|
||||
)
|
||||
from schemas import WebhookConfig
|
||||
|
||||
# ------------- dependency placeholders -------------
|
||||
_redis = None # will be injected from server.py
|
||||
@@ -37,6 +38,7 @@ class LlmJobPayload(BaseModel):
|
||||
schema: Optional[str] = None
|
||||
cache: bool = False
|
||||
provider: Optional[str] = None
|
||||
webhook_config: Optional[WebhookConfig] = None
|
||||
temperature: Optional[float] = None
|
||||
base_url: Optional[str] = None
|
||||
|
||||
@@ -45,6 +47,7 @@ class CrawlJobPayload(BaseModel):
|
||||
urls: list[HttpUrl]
|
||||
browser_config: Dict = {}
|
||||
crawler_config: Dict = {}
|
||||
webhook_config: Optional[WebhookConfig] = None
|
||||
|
||||
|
||||
# ---------- LLM job ---------------------------------------------------------
|
||||
@@ -55,6 +58,10 @@ async def llm_job_enqueue(
|
||||
request: Request,
|
||||
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
||||
):
|
||||
webhook_config = None
|
||||
if payload.webhook_config:
|
||||
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||
|
||||
return await handle_llm_request(
|
||||
_redis,
|
||||
background_tasks,
|
||||
@@ -65,6 +72,7 @@ async def llm_job_enqueue(
|
||||
cache=payload.cache,
|
||||
config=_config,
|
||||
provider=payload.provider,
|
||||
webhook_config=webhook_config,
|
||||
temperature=payload.temperature,
|
||||
api_base_url=payload.base_url,
|
||||
)
|
||||
@@ -86,6 +94,10 @@ async def crawl_job_enqueue(
|
||||
background_tasks: BackgroundTasks,
|
||||
_td: Dict = Depends(lambda: _token_dep()),
|
||||
):
|
||||
webhook_config = None
|
||||
if payload.webhook_config:
|
||||
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||
|
||||
return await handle_crawl_job(
|
||||
_redis,
|
||||
background_tasks,
|
||||
@@ -93,6 +105,7 @@ async def crawl_job_enqueue(
|
||||
payload.browser_config,
|
||||
payload.crawler_config,
|
||||
config=_config,
|
||||
webhook_config=webhook_config,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -12,6 +12,6 @@ pydantic>=2.11
|
||||
rank-bm25==0.2.2
|
||||
anyio==4.9.0
|
||||
PyJWT==2.10.1
|
||||
mcp>=1.6.0
|
||||
mcp>=1.18.0
|
||||
websockets>=15.0.1
|
||||
httpx[http2]>=0.27.2
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import List, Optional, Dict
|
||||
from enum import Enum
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
from utils import FilterType
|
||||
|
||||
|
||||
@@ -9,6 +9,50 @@ class CrawlRequest(BaseModel):
|
||||
browser_config: Optional[Dict] = Field(default_factory=dict)
|
||||
crawler_config: Optional[Dict] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class HookConfig(BaseModel):
    """Configuration for user-provided hooks"""
    # Hook point name -> Python source defining the hook function
    code: Dict[str, str] = Field(
        default_factory=dict,
        description="Map of hook points to Python code strings"
    )
    # Per-hook execution limit in seconds, constrained to 1..120
    timeout: int = Field(
        default=30,
        ge=1,
        le=120,
        description="Timeout in seconds for each hook execution"
    )

    # Pydantic v2 configuration: the V1 spelling `class Config: schema_extra`
    # is not applied to the generated JSON schema in v2, so the example is
    # declared through `model_config` / `json_schema_extra` instead.
    model_config = {
        "json_schema_extra": {
            "example": {
                "code": {
                    "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Block images to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif}", lambda route: route.abort())
    return page
""",
                    "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # Scroll to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)
    return page
"""
                },
                "timeout": 30
            }
        }
    }
|
||||
|
||||
|
||||
class CrawlRequestWithHooks(CrawlRequest):
    """Extended crawl request with hooks support"""
    # Inherits every CrawlRequest field and adds an optional hook configuration.
    # None (the default) means the request carries no user hooks.
    hooks: Optional[HookConfig] = Field(
        default=None,
        description="Optional user-provided hook functions"
    )
|
||||
|
||||
class MarkdownRequest(BaseModel):
|
||||
"""Request body for the /md endpoint."""
|
||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||
@@ -41,4 +85,22 @@ class JSEndpointRequest(BaseModel):
|
||||
scripts: List[str] = Field(
|
||||
...,
|
||||
description="List of separated JavaScript snippets to execute"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class WebhookConfig(BaseModel):
    """Configuration for webhook notifications."""
    # Destination URL; validated as an HTTP(S) URL by pydantic
    webhook_url: HttpUrl
    # When True the job result is included in the payload's `data` field
    webhook_data_in_payload: bool = False
    # Optional extra headers for the webhook call
    # NOTE(review): presumably merged with the configured default headers - confirm in the delivery service
    webhook_headers: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
class WebhookPayload(BaseModel):
    """Payload sent to webhook endpoints."""
    # Identifier of the background job this notification is about
    task_id: str
    task_type: str  # "crawl", "llm_extraction", etc.
    status: str  # "completed" or "failed"
    timestamp: str  # ISO 8601 format
    # URLs the job was asked to process
    urls: List[str]
    # Error message; left as None when no error is reported
    error: Optional[str] = None
    data: Optional[Dict] = None  # Included only if webhook_data_in_payload=True
|
||||
@@ -23,7 +23,7 @@ from api import (
|
||||
stream_results
|
||||
)
|
||||
from schemas import (
|
||||
CrawlRequest,
|
||||
CrawlRequestWithHooks,
|
||||
MarkdownRequest,
|
||||
RawCode,
|
||||
HTMLRequest,
|
||||
@@ -462,6 +462,72 @@ async def get_schema():
|
||||
"crawler": CrawlerRunConfig().dump()}
|
||||
|
||||
|
||||
@app.get("/hooks/info")
async def get_hooks_info():
    """Describe every supported hook point.

    For each hook point the response lists its parameters, a description and
    an example snippet, followed by the accepted timeout range.
    """
    from hook_manager import UserHookManager

    available = {
        point: {
            "parameters": signature,
            "description": get_hook_description(point),
            "example": get_hook_example(point)
        }
        for point, signature in UserHookManager.HOOK_SIGNATURES.items()
    }
    limits = {"min": 1, "max": 120, "default": 30}

    return JSONResponse({
        "available_hooks": available,
        "timeout_limits": limits
    })
|
||||
|
||||
|
||||
def get_hook_description(hook_point: str) -> str:
    """Return the human-readable description of a hook point.

    Unknown hook points yield an empty string.
    """
    catalogue = dict(
        on_browser_created="Called after browser instance is created",
        on_page_context_created="Called after page and context are created - ideal for authentication",
        before_goto="Called before navigating to the target URL",
        after_goto="Called after navigation is complete",
        on_user_agent_updated="Called when user agent is updated",
        on_execution_started="Called when custom JavaScript execution begins",
        before_retrieve_html="Called before retrieving the final HTML - ideal for scrolling",
        before_return_html="Called just before returning the HTML content",
    )
    try:
        return catalogue[hook_point]
    except KeyError:
        return ""
|
||||
|
||||
|
||||
def get_hook_example(hook_point: str) -> str:
    """Get example code for each hook point

    Args:
        hook_point: The hook point name to look up.

    Returns:
        A complete ``async def hook(...)`` snippet for the hook points that
        have a dedicated example, otherwise a generic two-line placeholder.
    """
    # Only three hook points have a dedicated example; the rest use the fallback below.
    examples = {
        "on_page_context_created": """async def hook(page, context, **kwargs):
    # Add authentication cookie
    await context.add_cookies([{
        'name': 'session',
        'value': 'my-session-id',
        'domain': '.example.com'
    }])
    return page""",

        "before_retrieve_html": """async def hook(page, context, **kwargs):
    # Scroll to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(2000)
    return page""",

        "before_goto": """async def hook(page, context, url, **kwargs):
    # Set custom headers
    await page.set_extra_http_headers({
        'X-Custom-Header': 'value'
    })
    return page"""
    }
    # NOTE(review): the fallback has no `async def` wrapper, so it is not a complete hook on its own
    return examples.get(hook_point, "# Implement your hook logic here\nreturn page")
|
||||
|
||||
|
||||
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
    """Return a small health payload: status "ok", the current time.time() value and the package version."""
    # The route path is read from the observability.health_check.endpoint config entry
    return {"status": "ok", "timestamp": time.time(), "version": __version__}
|
||||
@@ -477,19 +543,35 @@ async def metrics():
|
||||
@mcp_tool("crawl")
|
||||
async def crawl(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
crawl_request: CrawlRequestWithHooks,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
"""
|
||||
Crawl a list of URLs and return the results as JSON.
|
||||
For streaming responses, use /crawl/stream endpoint.
|
||||
Supports optional user-provided hook functions for customization.
|
||||
"""
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(400, "At least one URL required")
|
||||
# Check whether it is a redirection for a streaming request
|
||||
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
|
||||
if crawler_config.stream:
|
||||
return await stream_process(crawl_request=crawl_request)
|
||||
|
||||
# Prepare hooks config if provided
|
||||
hooks_config = None
|
||||
if crawl_request.hooks:
|
||||
hooks_config = {
|
||||
'code': crawl_request.hooks.code,
|
||||
'timeout': crawl_request.hooks.timeout
|
||||
}
|
||||
|
||||
results = await handle_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config,
|
||||
hooks_config=hooks_config
|
||||
)
|
||||
# check if all of the results are not successful
|
||||
if all(not result["success"] for result in results["results"]):
|
||||
@@ -501,25 +583,46 @@ async def crawl(
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def crawl_stream(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
crawl_request: CrawlRequestWithHooks,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(400, "At least one URL required")
|
||||
crawler, gen = await handle_stream_crawl_request(
|
||||
|
||||
return await stream_process(crawl_request=crawl_request)
|
||||
|
||||
async def stream_process(crawl_request: CrawlRequestWithHooks):
|
||||
|
||||
# Prepare hooks config if provided
|
||||
hooks_config = None
|
||||
if crawl_request.hooks:
|
||||
hooks_config = {
|
||||
'code': crawl_request.hooks.code,
|
||||
'timeout': crawl_request.hooks.timeout
|
||||
}
|
||||
|
||||
crawler, gen, hooks_info = await handle_stream_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config,
|
||||
hooks_config=hooks_config
|
||||
)
|
||||
|
||||
# Add hooks info to response headers if available
|
||||
headers = {
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Stream-Status": "active",
|
||||
}
|
||||
if hooks_info:
|
||||
import json
|
||||
headers["X-Hooks-Status"] = json.dumps(hooks_info['status']['status'])
|
||||
|
||||
return StreamingResponse(
|
||||
stream_results(crawler, gen),
|
||||
media_type="application/x-ndjson",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"Connection": "keep-alive",
|
||||
"X-Stream-Status": "active",
|
||||
},
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -371,7 +371,7 @@
|
||||
|
||||
<div class="flex items-center">
|
||||
<input id="st-stream" type="checkbox" class="mr-2">
|
||||
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
|
||||
<label for="st-stream" class="text-sm">Enable streaming mode</label>
|
||||
<button id="st-run"
|
||||
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
||||
Run Stress Test
|
||||
@@ -596,6 +596,14 @@
|
||||
forceHighlightElement(curlCodeEl);
|
||||
}
|
||||
|
||||
// Detect whether the payload requests streaming: returns true when either
// payload.crawler_config.params.stream or the top-level payload.stream is
// the boolean true or the string 'true' (compared case-insensitively).
|
||||
function shouldUseStream(payload) {
|
||||
// Only literal true or a 'true' string count; every other value is treated as false.
const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
|
||||
// The && chain guards against a missing payload, crawler_config or params object.
const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
|
||||
const direct = payload && payload.stream;
|
||||
return toBool(fromCrawler) || toBool(direct);
|
||||
}
|
||||
|
||||
// Main run function
|
||||
async function runCrawl() {
|
||||
const endpoint = document.getElementById('endpoint').value;
|
||||
@@ -611,16 +619,24 @@
|
||||
: { browser_config: cfgJson };
|
||||
}
|
||||
} catch (err) {
|
||||
updateStatus('error');
|
||||
document.querySelector('#response-content code').textContent =
|
||||
JSON.stringify({ error: err.message }, null, 2);
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
return; // stop run
|
||||
const codeText = cm.getValue();
|
||||
const streamFlag = /stream\s*=\s*True/i.test(codeText);
|
||||
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
|
||||
if (isCrawlEndpoint && streamFlag) {
|
||||
// Fallback: proceed with minimal config only for stream
|
||||
advConfig = { crawler_config: { stream: true } };
|
||||
} else {
|
||||
updateStatus('error');
|
||||
document.querySelector('#response-content code').textContent =
|
||||
JSON.stringify({ error: err.message }, null, 2);
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
return; // stop run
|
||||
}
|
||||
}
|
||||
|
||||
const endpointMap = {
|
||||
crawl: '/crawl',
|
||||
// crawl_stream: '/crawl/stream',
|
||||
crawl_stream: '/crawl/stream', // Keep for backward compatibility
|
||||
md: '/md',
|
||||
llm: '/llm'
|
||||
};
|
||||
@@ -647,7 +663,7 @@
|
||||
// This will be handled directly in the fetch below
|
||||
payload = null;
|
||||
} else {
|
||||
// Default payload for /crawl and /crawl/stream
|
||||
// Default payload for /crawl (supports both streaming and batch modes)
|
||||
payload = {
|
||||
urls,
|
||||
...advConfig
|
||||
@@ -659,6 +675,7 @@
|
||||
try {
|
||||
const startTime = performance.now();
|
||||
let response, responseData;
|
||||
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
|
||||
|
||||
if (endpoint === 'llm') {
|
||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||
@@ -681,8 +698,8 @@
|
||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||
document.querySelector('#response-content code').className = 'json hljs';
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
} else if (endpoint === 'crawl_stream') {
|
||||
// Stream processing
|
||||
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
|
||||
// Stream processing - now handled directly by /crawl endpoint
|
||||
response = await fetch(api, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
@@ -757,6 +774,7 @@
|
||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||
} else {
|
||||
// Use the same API endpoint for both streaming and non-streaming
|
||||
generateSnippets(api, payload);
|
||||
}
|
||||
} catch (error) {
|
||||
@@ -786,7 +804,7 @@
|
||||
document.getElementById('stress-avg-time').textContent = '0';
|
||||
document.getElementById('stress-peak-mem').textContent = '0';
|
||||
|
||||
const api = useStream ? '/crawl/stream' : '/crawl';
|
||||
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
|
||||
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
||||
const chunks = [];
|
||||
|
||||
|
||||
159
deploy/docker/webhook.py
Normal file
159
deploy/docker/webhook.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
Webhook delivery service for Crawl4AI.
|
||||
|
||||
This module provides webhook notification functionality with exponential backoff retry logic.
|
||||
"""
|
||||
import asyncio
|
||||
import httpx
|
||||
import logging
|
||||
from typing import Dict, Optional
|
||||
from datetime import datetime, timezone
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebhookDeliveryService:
    """Deliver webhook notifications over HTTP with exponential-backoff retries.

    All settings come from the ``"webhooks"`` section of the application
    configuration. Keys read by this class:

    * ``retry.max_attempts`` (default 5)
    * ``retry.initial_delay_ms`` (default 1000)
    * ``retry.max_delay_ms`` (default 32000)
    * ``retry.timeout_ms`` (default 30000)
    * ``headers`` - default headers merged into every request
    * ``default_url`` - used when a job supplies no webhook URL
    * ``data_in_payload`` (default False)
    * ``enabled`` (default True)
    """

    def __init__(self, config: Dict):
        """
        Initialize the webhook delivery service.

        Args:
            config: Application configuration dictionary containing webhook settings
        """
        self.config = config.get("webhooks", {})

        # Read the retry section once; millisecond settings are stored as seconds.
        retry = self.config.get("retry", {})
        self.max_attempts = retry.get("max_attempts", 5)
        self.initial_delay = retry.get("initial_delay_ms", 1000) / 1000
        self.max_delay = retry.get("max_delay_ms", 32000) / 1000
        self.timeout = retry.get("timeout_ms", 30000) / 1000

    async def send_webhook(
        self,
        webhook_url: str,
        payload: Dict,
        headers: Optional[Dict[str, str]] = None
    ) -> bool:
        """
        Send webhook with exponential backoff retry logic.

        A 2xx response counts as delivered. Any other status below 500 is
        treated as a rejection and is not retried. Statuses of 500 and above,
        timeouts, request errors and any other exception are retried until
        ``max_attempts`` is reached. The wait before retry ``k`` (0-based) is
        ``min(initial_delay * 2 ** k, max_delay)`` seconds; there is no wait
        after the final attempt.

        Args:
            webhook_url: The URL to send the webhook to
            payload: The JSON payload to send
            headers: Optional custom headers; they override the configured
                default headers, and ``Content-Type`` is always set to
                ``application/json``

        Returns:
            bool: True if delivered successfully, False otherwise
        """
        merged_headers = {**self.config.get("headers", {}), **(headers or {})}
        merged_headers["Content-Type"] = "application/json"

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            for attempt in range(self.max_attempts):
                try:
                    logger.info(
                        "Sending webhook (attempt %s/%s) to %s",
                        attempt + 1, self.max_attempts, webhook_url,
                    )

                    response = await client.post(
                        webhook_url,
                        json=payload,
                        headers=merged_headers
                    )
                    status_code = response.status_code

                    if 200 <= status_code < 300:
                        logger.info("Webhook delivered successfully to %s", webhook_url)
                        return True

                    if status_code < 500:
                        # Anything else below 500 is a rejection - don't retry.
                        logger.warning(
                            "Webhook rejected with status %s: %s",
                            status_code, response.text[:200],
                        )
                        return False

                    # Server error - retry with backoff
                    logger.warning(
                        "Webhook failed with status %s, will retry", status_code
                    )

                except httpx.TimeoutException as exc:
                    logger.error("Webhook timeout (attempt %s): %s", attempt + 1, exc)
                except httpx.RequestError as exc:
                    logger.error("Webhook request error (attempt %s): %s", attempt + 1, exc)
                except Exception as exc:
                    # Deliberately broad: delivery is best-effort and must not
                    # raise into the job that triggered the notification.
                    logger.error("Webhook delivery error (attempt %s): %s", attempt + 1, exc)

                # Back off before the next attempt; skip the wait after the last one.
                if attempt < self.max_attempts - 1:
                    delay = min(self.initial_delay * (2 ** attempt), self.max_delay)
                    logger.info("Retrying in %ss...", delay)
                    await asyncio.sleep(delay)

        logger.error(
            "Webhook delivery failed after %s attempts to %s",
            self.max_attempts, webhook_url,
        )
        return False

    async def notify_job_completion(
        self,
        task_id: str,
        task_type: str,
        status: str,
        urls: list,
        webhook_config: Optional[Dict],
        result: Optional[Dict] = None,
        error: Optional[str] = None
    ) -> None:
        """
        Notify webhook of job completion.

        The target URL is taken from ``webhook_config["webhook_url"]`` and
        falls back to the configured ``default_url``. Nothing is sent when no
        URL is available or when webhooks are disabled in the configuration.

        Args:
            task_id: The task identifier
            task_type: Type of task (e.g., "crawl", "llm_extraction")
            status: Task status ("completed" or "failed")
            urls: List of URLs that were crawled
            webhook_config: Webhook configuration from the job request
            result: Optional crawl result data; included under ``"data"`` only
                when data-in-payload is enabled and the result is non-empty
            error: Optional error message if failed
        """
        # Start from the global settings; the per-job config may override them.
        webhook_url = None
        data_in_payload = self.config.get("data_in_payload", False)
        custom_headers = None

        if webhook_config:
            webhook_url = webhook_config.get("webhook_url")
            data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload)
            custom_headers = webhook_config.get("webhook_headers")

        if not webhook_url:
            webhook_url = self.config.get("default_url")

        if not webhook_url:
            logger.debug("No webhook URL configured, skipping notification")
            return

        # Check if webhooks are enabled
        if not self.config.get("enabled", True):
            logger.debug("Webhooks are disabled, skipping notification")
            return

        # Build payload
        payload = {
            "task_id": task_id,
            "task_type": task_type,
            "status": status,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "urls": urls
        }

        if error:
            payload["error"] = error

        if data_in_payload and result:
            payload["data"] = result

        # This awaits delivery, including every retry and backoff sleep, and
        # discards the boolean outcome. Callers that must not block should
        # schedule this coroutine as a background task.
        await self.send_webhook(webhook_url, payload, custom_headers)
|
||||
@@ -6,15 +6,16 @@ x-base-config: &base-config
|
||||
- "11235:11235" # Gunicorn port
|
||||
env_file:
|
||||
- .llm.env # API keys (create from .llm.env.example)
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||
- GROQ_API_KEY=${GROQ_API_KEY:-}
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
# Uncomment to set default environment variables (will overwrite .llm.env)
|
||||
# environment:
|
||||
# - OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
# - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||
# - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||
# - GROQ_API_KEY=${GROQ_API_KEY:-}
|
||||
# - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
# - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
# - GEMINI_API_KEY=${GEMINI_API_KEY:-}
|
||||
# - LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm # Chromium performance
|
||||
deploy:
|
||||
|
||||
@@ -10,7 +10,6 @@ Today I'm releasing Crawl4AI v0.7.4—the Intelligent Table Extraction & Perform
|
||||
|
||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||
- **⚡ Enhanced Concurrency**: True concurrency improvements for fast-completing tasks in batch operations
|
||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||
- **⌨️ Cross-Platform Browser Profiler**: Improved keyboard handling and quit mechanisms
|
||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||
@@ -158,40 +157,6 @@ async with AsyncWebCrawler() as crawler:
|
||||
- **Monitoring Systems**: Faster health checks and status page monitoring
|
||||
- **Data Aggregation**: Improved performance for real-time data collection
|
||||
|
||||
## 🧹 Memory Management Refactor: Cleaner Architecture
|
||||
|
||||
**The Problem:** Memory utilities were scattered and difficult to maintain, with potential import conflicts and unclear organization.
|
||||
|
||||
**My Solution:** I consolidated all memory-related utilities into the main `utils.py` module, creating a cleaner, more maintainable architecture.
|
||||
|
||||
### Improved Memory Handling
|
||||
|
||||
```python
|
||||
# All memory utilities now consolidated
|
||||
from crawl4ai.utils import get_true_memory_usage_percent, MemoryMonitor
|
||||
|
||||
# Enhanced memory monitoring
|
||||
monitor = MemoryMonitor()
|
||||
monitor.start_monitoring()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Memory-efficient batch processing
|
||||
results = await crawler.arun_many(large_url_list)
|
||||
|
||||
# Get accurate memory metrics
|
||||
memory_usage = get_true_memory_usage_percent()
|
||||
memory_report = monitor.get_report()
|
||||
|
||||
print(f"Memory efficiency: {memory_report['efficiency']:.1f}%")
|
||||
print(f"Peak usage: {memory_report['peak_mb']:.1f} MB")
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Production Stability**: More reliable memory tracking and management
|
||||
- **Code Maintainability**: Cleaner architecture for easier debugging
|
||||
- **Import Clarity**: Resolved potential conflicts and import issues
|
||||
- **Developer Experience**: Simpler API for memory monitoring
|
||||
|
||||
## 🔧 Critical Stability Fixes
|
||||
|
||||
### Browser Manager Race Condition Resolution
|
||||
|
||||
318
docs/blog/release-v0.7.5.md
Normal file
318
docs/blog/release-v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
||||
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||
|
||||
*September 29, 2025 • 8 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||
- **HTTPS Preservation**: Secure internal link handling
|
||||
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||
|
||||
## 🔧 Docker Hooks System: Pipeline Customization
|
||||
|
||||
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||
|
||||
### Real Example: Authentication & Performance
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Real working hooks for httpbin.org
|
||||
hooks_config = {
|
||||
"on_page_context_created": """
|
||||
async def hook(page, context, **kwargs):
|
||||
print("Hook: Setting up page context")
|
||||
# Block images to speed up crawling
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
print("Hook: Images blocked")
|
||||
return page
|
||||
""",
|
||||
|
||||
"before_retrieve_html": """
|
||||
async def hook(page, context, **kwargs):
|
||||
print("Hook: Before retrieving HTML")
|
||||
# Scroll to bottom to load lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
print("Hook: Scrolled to bottom")
|
||||
return page
|
||||
""",
|
||||
|
||||
"before_goto": """
|
||||
async def hook(page, context, url, **kwargs):
|
||||
print(f"Hook: About to navigate to {url}")
|
||||
# Add custom headers
|
||||
await page.set_extra_http_headers({
|
||||
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||
})
|
||||
return page
|
||||
"""
|
||||
}
|
||||
|
||||
# Test with Docker API
|
||||
payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"hooks": {
|
||||
"code": hooks_config,
|
||||
"timeout": 30
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
result = response.json()
|
||||
|
||||
if result.get('success'):
|
||||
print("✅ Hooks executed successfully!")
|
||||
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||
```
|
||||
|
||||
**Available Hook Points:**
|
||||
- `on_browser_created`: Browser setup
|
||||
- `on_page_context_created`: Page context configuration
|
||||
- `before_goto`: Pre-navigation setup
|
||||
- `after_goto`: Post-navigation processing
|
||||
- `on_user_agent_updated`: User agent changes
|
||||
- `on_execution_started`: Crawl initialization
|
||||
- `before_retrieve_html`: Pre-extraction processing
|
||||
- `before_return_html`: Final HTML processing
|
||||
|
||||
### Function-Based Hooks API
|
||||
|
||||
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||
|
||||
**Option 1: Using the `hooks_to_string()` Utility**
|
||||
|
||||
```python
|
||||
from crawl4ai import hooks_to_string
|
||||
import requests
|
||||
|
||||
# Define hooks as regular Python functions (with full IDE support!)
|
||||
async def on_page_context_created(page, context, **kwargs):
|
||||
"""Block images to speed up crawling"""
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
return page
|
||||
|
||||
async def before_goto(page, context, url, **kwargs):
|
||||
"""Add custom headers"""
|
||||
await page.set_extra_http_headers({
|
||||
'X-Crawl4AI': 'v0.7.5',
|
||||
'X-Custom-Header': 'my-value'
|
||||
})
|
||||
return page
|
||||
|
||||
# Convert functions to strings
|
||||
hooks_code = hooks_to_string({
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_goto": before_goto
|
||||
})
|
||||
|
||||
# Use with REST API
|
||||
payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"hooks": {"code": hooks_code, "timeout": 30}
|
||||
}
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
```
|
||||
|
||||
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
# Define hooks as functions (same as above)
|
||||
async def on_page_context_created(page, context, **kwargs):
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
return page
|
||||
|
||||
async def before_retrieve_html(page, context, **kwargs):
|
||||
# Scroll to load lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
return page
|
||||
|
||||
# Use Docker client - conversion happens automatically!
|
||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||
|
||||
results = await client.crawl(
|
||||
urls=["https://httpbin.org/html"],
|
||||
hooks={
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_retrieve_html": before_retrieve_html
|
||||
},
|
||||
hooks_timeout=30
|
||||
)
|
||||
|
||||
if results and results.success:
|
||||
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||
```
|
||||
|
||||
**Benefits of Function-Based Hooks:**
|
||||
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||
- ✅ Type checking and linting
|
||||
- ✅ Easier to test and debug
|
||||
- ✅ Reusable across projects
|
||||
- ✅ Automatic conversion in Docker client
|
||||
- ✅ No breaking changes - string hooks still work!
|
||||
|
||||
## 🤖 Enhanced LLM Integration
|
||||
|
||||
Enhanced LLM integration with custom providers, temperature control, and base URL configuration.
|
||||
|
||||
### Multi-Provider Support
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
# Test with different providers
|
||||
async def test_llm_providers():
|
||||
# Gemini provider with custom temperature
|
||||
openai_strategy = LLMExtractionStrategy(
|
||||
provider="gemini/gemini-2.5-flash-lite",
|
||||
api_token="your-api-token",
|
||||
temperature=0.7, # New in v0.7.5
|
||||
instruction="Summarize this page in one sentence"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://example.com",
|
||||
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("✅ LLM extraction completed")
|
||||
print(result.extracted_content)
|
||||
|
||||
# Docker API with enhanced LLM config
|
||||
llm_payload = {
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"q": "Summarize this page in one sentence.",
|
||||
"provider": "gemini/gemini-2.5-flash-lite",
|
||||
"temperature": 0.7
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||
```
|
||||
|
||||
**New Features:**
|
||||
- Custom `temperature` parameter for creativity control
|
||||
- `base_url` for custom API endpoints
|
||||
- Multi-provider environment variable support
|
||||
- Docker API integration
|
||||
|
||||
## 🔒 HTTPS Preservation
|
||||
|
||||
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||
|
||||
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||
|
||||
async def test_https_preservation():
|
||||
# Enable HTTPS preservation
|
||||
url_filter = URLPatternFilter(
|
||||
patterns=["^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
exclude_external_links=True,
|
||||
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
max_pages=5,
|
||||
filter_chain=FilterChain([url_filter])
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(
|
||||
url="https://quotes.toscrape.com",
|
||||
config=config
|
||||
):
|
||||
# All internal links maintain HTTPS
|
||||
internal_links = [link['href'] for link in result.links['internal']]
|
||||
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||
|
||||
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||
for link in https_links[:3]:
|
||||
print(f" → {link}")
|
||||
```
|
||||
|
||||
## 🛠️ Bug Fixes and Improvements
|
||||
|
||||
### Major Fixes
|
||||
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||
- **Memory Management**: Fixed leaks in long-running sessions
|
||||
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||
|
||||
### Community-Reported Issues Fixed
|
||||
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||
- Fixed browser configuration reference errors
|
||||
- Resolved dependency conflicts with cssselect
|
||||
- Improved error messaging for failed authentications
|
||||
- Enhanced compatibility with various proxy configurations
|
||||
- Fixed edge cases in URL normalization
|
||||
|
||||
### Configuration Updates
|
||||
```python
|
||||
# Old proxy config (deprecated)
|
||||
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||
|
||||
# New enhanced proxy config
|
||||
browser_config = BrowserConfig(
|
||||
proxy_config={
|
||||
"server": "http://proxy:8080",
|
||||
"username": "optional-user",
|
||||
"password": "optional-pass"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## 🔄 Breaking Changes
|
||||
|
||||
1. **Python 3.10+ Required**: Upgrade from Python 3.9
|
||||
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
# Install latest version
|
||||
pip install crawl4ai==0.7.5
|
||||
|
||||
# Docker deployment
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
**Try the Demo:**
|
||||
```bash
|
||||
# Run working examples
|
||||
python docs/releases_review/demo_v0.7.5.py
|
||||
```
|
||||
|
||||
**Resources:**
|
||||
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
314
docs/blog/release-v0.7.6.md
Normal file
314
docs/blog/release-v0.7.6.md
Normal file
@@ -0,0 +1,314 @@
|
||||
# Crawl4AI v0.7.6 Release Notes
|
||||
|
||||
*Release Date: October 22, 2025*
|
||||
|
||||
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### Webhook Support for Docker Job Queue API
|
||||
|
||||
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
||||
|
||||
**Key Capabilities:**
|
||||
|
||||
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
||||
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
||||
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (up to 5 attempts by default, waiting 1s → 2s → 4s → 8s between them)
|
||||
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
||||
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
||||
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
||||
|
||||
### How It Works
|
||||
|
||||
Instead of constantly checking job status:
|
||||
|
||||
**OLD WAY (Polling):**
|
||||
```python
|
||||
# Submit job
|
||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||
task_id = response.json()['task_id']
|
||||
|
||||
# Poll until complete
|
||||
while True:
|
||||
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
||||
if status.json()['status'] == 'completed':
|
||||
break
|
||||
time.sleep(5) # Wait and try again
|
||||
```
|
||||
|
||||
**NEW WAY (Webhooks):**
|
||||
```python
|
||||
# Submit job with webhook
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_data_in_payload": True
|
||||
}
|
||||
}
|
||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||
|
||||
# Done! Webhook will notify you when complete
|
||||
# Your webhook handler receives the results automatically
|
||||
```
|
||||
|
||||
### Crawl Job Webhooks
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": true},
|
||||
"crawler_config": {"cache_mode": "bypass"},
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": false,
|
||||
"webhook_headers": {
|
||||
"X-Webhook-Secret": "your-secret-token"
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### LLM Extraction Job Webhooks (NEW!)
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/llm/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/article",
|
||||
"q": "Extract the article title, author, and publication date",
|
||||
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||
"webhook_data_in_payload": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### Webhook Payload Structure
|
||||
|
||||
**Success (with data):**
|
||||
```json
|
||||
{
|
||||
"task_id": "llm_1698765432",
|
||||
"task_type": "llm_extraction",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com/article"],
|
||||
"data": {
|
||||
"extracted_content": {
|
||||
"title": "Understanding Web Scraping",
|
||||
"author": "John Doe",
|
||||
"date": "2025-10-22"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Failure:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_abc123",
|
||||
"task_type": "crawl",
|
||||
"status": "failed",
|
||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"error": "Connection timeout after 30s"
|
||||
}
|
||||
```
|
||||
|
||||
### Simple Webhook Handler Example
|
||||
|
||||
```python
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/webhook', methods=['POST'])
def handle_webhook():
    """Receive a job notification and react to its status.

    Reads ``task_id``, ``task_type`` and ``status`` from the JSON body.
    For a completed job the result is taken from ``payload['data']`` when
    present; otherwise it is fetched from the job endpoint. For a failed
    job the ``error`` field is printed.

    Returns:
        A JSON acknowledgement with HTTP status 200.
    """
    # Imported here so the snippet runs as shown: the original example
    # called requests.get() without ever importing the module.
    import requests

    payload = request.json

    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']

    if status == 'completed':
        if 'data' in payload:
            # Process data directly
            data = payload['data']
        else:
            # Notification-only webhook: fetch the result from the API.
            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
            response = requests.get(
                f'http://localhost:11235/{endpoint}/job/{task_id}',
                timeout=30,  # never let a webhook request hang indefinitely
            )
            data = response.json()

        # Your business logic here
        print(f"Job {task_id} completed!")

    elif status == 'failed':
        error = payload.get('error', 'Unknown error')
        print(f"Job {task_id} failed: {error}")

    return jsonify({"status": "received"}), 200
|
||||
|
||||
app.run(port=8080)
|
||||
```
|
||||
|
||||
## 📊 Performance Improvements
|
||||
|
||||
- **Reduced Server Load**: Eliminates constant polling requests
|
||||
- **Lower Latency**: Instant notification vs. polling interval delay
|
||||
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
||||
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
||||
|
||||
## 🐛 Bug Fixes
|
||||
|
||||
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
||||
- Improved error handling in webhook delivery service
|
||||
- Enhanced Redis task storage for webhook config persistence
|
||||
|
||||
## 🌍 Expected Real-World Impact
|
||||
|
||||
### For Web Scraping Workflows
|
||||
- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
|
||||
- **Better UX**: Instant notifications improve user experience
|
||||
- **Scalability**: Handle 100s of concurrent jobs without polling overhead
|
||||
|
||||
### For LLM Extraction Pipelines
|
||||
- **Async Processing**: Submit LLM extraction jobs and move on
|
||||
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
||||
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
||||
|
||||
### For Microservices
|
||||
- **Event-Driven**: Perfect for event-driven microservice architectures
|
||||
- **Decoupling**: Decouple job submission from result processing
|
||||
- **Reliability**: Automatic retries ensure webhooks are delivered
|
||||
|
||||
## 🔄 Breaking Changes
|
||||
|
||||
**None!** This release is fully backward compatible.
|
||||
|
||||
- Webhook configuration is optional
|
||||
- Existing code continues to work without modification
|
||||
- Polling is still supported for jobs without webhook config
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
### New Documentation
|
||||
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
||||
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
||||
|
||||
### Updated Documentation
|
||||
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
||||
- API documentation with webhook examples
|
||||
|
||||
## 🛠️ Migration Guide
|
||||
|
||||
No migration needed! Webhooks are opt-in:
|
||||
|
||||
1. **To use webhooks**: Add `webhook_config` to your job payload
|
||||
2. **To keep polling**: Continue using your existing code
|
||||
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
# Just add webhook_config to your existing payload
|
||||
payload = {
|
||||
# Your existing configuration
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {...},
|
||||
"crawler_config": {...},
|
||||
|
||||
# NEW: Add webhook configuration
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_data_in_payload": True
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 🔧 Configuration
|
||||
|
||||
### Global Webhook Configuration (config.yml)
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: "https://myapp.com/webhooks/default" # Optional
|
||||
data_in_payload: false
|
||||
retry:
|
||||
max_attempts: 5
|
||||
initial_delay_ms: 1000
|
||||
max_delay_ms: 32000
|
||||
timeout_ms: 30000
|
||||
headers:
|
||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||
```
|
||||
|
||||
## 🚀 Upgrade Instructions
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
# Pull the latest image
|
||||
docker pull unclecode/crawl4ai:0.7.6
|
||||
|
||||
# Or use latest tag
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Run with webhook support
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--env-file .llm.env \
|
||||
--name crawl4ai \
|
||||
unclecode/crawl4ai:0.7.6
|
||||
```
|
||||
|
||||
### Python Package
|
||||
|
||||
```bash
|
||||
pip install --upgrade crawl4ai
|
||||
```
|
||||
|
||||
## 💡 Pro Tips
|
||||
|
||||
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
||||
2. **Set custom headers** for webhook authentication and request tracking
|
||||
3. **Configure global default webhook** for consistent handling across all jobs
|
||||
4. **Implement idempotent webhook handlers** - the same webhook may be delivered more than once when a delivery is retried
|
||||
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
||||
|
||||
## 🎬 Demo
|
||||
|
||||
Try the release demo:
|
||||
|
||||
```bash
|
||||
python docs/releases_review/demo_v0.7.6.py
|
||||
```
|
||||
|
||||
This comprehensive demo showcases:
|
||||
- Crawl job webhooks (notification-only and with data)
|
||||
- LLM extraction webhooks (with JSON schema support)
|
||||
- Custom headers for authentication
|
||||
- Webhook retry mechanism
|
||||
- Real-time webhook receiver
|
||||
|
||||
## 🙏 Acknowledgments
|
||||
|
||||
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
||||
|
||||
## 📞 Support
|
||||
|
||||
- **Documentation**: https://docs.crawl4ai.com
|
||||
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
||||
- **Discord**: https://discord.gg/crawl4ai
|
||||
|
||||
---
|
||||
|
||||
**Happy crawling with webhooks!** 🕷️🪝
|
||||
|
||||
*- unclecode*
|
||||
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||
|
||||
|
||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
    """Run one adaptive crawl with ``config`` and print a report.

    Args:
        name: Label printed in the report header.
        config: Adaptive crawling configuration passed to ``AdaptiveCrawler``.
        url: Start URL handed to ``digest``.
        query: Query string handed to ``digest``.

    Returns:
        The object returned by ``adaptive.digest``.
    """
    header_rule = "=" * 60
    print(f"\n{header_rule}")
    print(f"Configuration: {name}")
    print(f"{header_rule}")

    section_rule = "=" * 50

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler, config)
        result = await adaptive.digest(start_url=url, query=query)

        print("\n" + section_rule)
        print("CRAWL STATISTICS")
        print(section_rule)
        adaptive.print_stats(detailed=False)

        # Report the highest-scoring pages that were collected.
        print("\n" + section_rule)
        print("MOST RELEVANT PAGES")
        print(section_rule)

        for rank, page in enumerate(adaptive.get_relevant_content(top_k=5), 1):
            print(f"\n{rank}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            # Print at most the first 200 characters, flattened to one line.
            text = page['content'] or ""
            if not text:
                continue
            preview = text[:200].replace('\n', ' ')
            if len(text) > 200:
                preview += "..."
            print(f" Preview: {preview}")

        print(f"\n{section_rule}")
        print(f"Pages crawled: {len(result.crawled_urls)}")
        print(f"Final confidence: {adaptive.confidence:.1%}")
        print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")

        if result.metrics.get('is_irrelevant', False):
            print("⚠️ Query detected as irrelevant!")

        return result
|
||||
|
||||
|
||||
async def llm_embedding():
    """Run the embedding-strategy example with an OpenAI embedding model.

    Builds an ``LLMConfig`` whose API token comes from ``OPENAI_API_KEY``,
    wraps it in an ``AdaptiveConfig`` using the "embedding" strategy, and
    passes it to ``test_configuration``.
    """
    print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
    print("=" * 60)

    # Start URL and query used for the run.
    test_url = "https://docs.python.org/3/library/asyncio.html"
    query = "async await context managers coroutines"

    # NOTE(review): the original also showed a plain dict with 'provider'
    # and 'api_token' keys as an alternative value for embedding_llm_config;
    # confirm that form is still accepted before relying on it.
    config_openai = AdaptiveConfig(
        strategy="embedding",
        max_pages=10,
        # Use OpenAI embeddings
        embedding_llm_config=LLMConfig(
            provider='openai/text-embedding-3-small',
            api_token=os.getenv('OPENAI_API_KEY'),
            temperature=0.7,
            max_tokens=2000,
        ),
        # OpenAI embeddings are high quality, can be stricter
        embedding_k_exp=4.0,
        n_query_variations=12,
    )

    await test_configuration("OpenAI Embeddings", config_openai, test_url, query)
|
||||
|
||||
|
||||
|
||||
async def basic_adaptive_crawling():
    """Run an adaptive crawl with default settings and print a report.

    Prints crawl statistics, the top five relevant pages with a short
    preview, and a confidence verdict.
    """
    rule = "=" * 50

    async with AsyncWebCrawler(verbose=True) as crawler:
        # No AdaptiveConfig is passed, so the crawler's defaults apply.
        # An AdaptiveConfig(strategy="embedding") can be supplied as the
        # second argument instead.
        adaptive = AdaptiveCrawler(crawler)

        print("Starting adaptive crawl for Python async programming information...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers coroutines",
        )

        # Crawl statistics
        print("\n" + rule)
        print("CRAWL STATISTICS")
        print(rule)
        adaptive.print_stats(detailed=False)

        # Most relevant pages
        print("\n" + rule)
        print("MOST RELEVANT PAGES")
        print(rule)

        for rank, page in enumerate(adaptive.get_relevant_content(top_k=5), 1):
            print(f"\n{rank}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            # Print at most the first 200 characters, flattened to one line.
            text = page['content'] or ""
            if not text:
                continue
            preview = text[:200].replace('\n', ' ')
            if len(text) > 200:
                preview += "..."
            print(f" Preview: {preview}")

        # Final summary
        print(f"\n{rule}")
        print(f"Final Confidence: {adaptive.confidence:.2%}")
        print(f"Total Pages Crawled: {len(result.crawled_urls)}")
        print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")

        confidence = adaptive.confidence
        if confidence >= 0.8:
            verdict = "✓ High confidence - can answer detailed questions about async Python"
        elif confidence >= 0.6:
            verdict = "~ Moderate confidence - can answer basic questions"
        else:
            verdict = "✗ Low confidence - need more information"
        print(verdict)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(llm_embedding())
|
||||
# asyncio.run(basic_adaptive_crawling())
|
||||
@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
|
||||
|
||||
2. **Install Dependencies**
|
||||
```bash
|
||||
pip install flask
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. **Launch the Server**
|
||||
@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
|
||||
|
||||
4. **Open in Browser**
|
||||
```
|
||||
http://localhost:8080
|
||||
http://localhost:8000
|
||||
```
|
||||
|
||||
**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)
|
||||
@@ -325,7 +325,7 @@ Powers the recording functionality:
|
||||
### Configuration
|
||||
```python
|
||||
# server.py configuration
|
||||
PORT = 8080
|
||||
PORT = 8000
|
||||
DEBUG = True
|
||||
THREADED = True
|
||||
```
|
||||
@@ -343,9 +343,9 @@ THREADED = True
|
||||
**Port Already in Use**
|
||||
```bash
|
||||
# Kill existing process
|
||||
lsof -ti:8080 | xargs kill -9
|
||||
lsof -ti:8000 | xargs kill -9
|
||||
# Or use different port
|
||||
python server.py --port 8081
|
||||
python server.py --port 8001
|
||||
```
|
||||
|
||||
**Blockly Not Loading**
|
||||
|
||||
@@ -216,7 +216,7 @@ def get_examples():
|
||||
'name': 'Handle Cookie Banner',
|
||||
'description': 'Accept cookies and close newsletter popup',
|
||||
'script': '''# Handle cookie banner and newsletter
|
||||
GO http://127.0.0.1:8080/playground/
|
||||
GO http://127.0.0.1:8000/playground/
|
||||
WAIT `body` 2
|
||||
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
|
||||
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''
|
||||
|
||||
522
docs/examples/docker_client_hooks_example.py
Normal file
522
docs/examples/docker_client_hooks_example.py
Normal file
@@ -0,0 +1,522 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive hooks examples using Docker Client with function objects.
|
||||
|
||||
This approach is recommended because:
|
||||
- Write hooks as regular Python functions
|
||||
- Full IDE support (autocomplete, type checking)
|
||||
- Automatic conversion to API format
|
||||
- Reusable and testable code
|
||||
- Clean, readable syntax
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import Crawl4aiDockerClient
|
||||
|
||||
# Base URL of the Crawl4AI Docker server. 11235 is the port published by the
# documented `docker run -p 11235:11235 ...` command; change it only if the
# container is mapped to a different host port.
API_BASE_URL = "http://localhost:11235"
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Hook Function Definitions
|
||||
# ============================================================================
|
||||
|
||||
# --- All Hooks Demo ---
|
||||
async def browser_created_hook(browser, **kwargs):
    """Log that the browser exists and return it unchanged.

    Registered under the "on_browser_created" hook point in this file.
    """
    message = "[HOOK] Browser created and ready"
    print(message)
    return browser
|
||||
|
||||
|
||||
async def page_context_hook(page, context, **kwargs):
    """Prepare a new page: viewport, a session cookie, and request blocking.

    Returns:
        The ``page`` object that was passed in.
    """
    print("[HOOK] Setting up page environment")

    # Fixed 1920x1080 viewport.
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)

    # Session cookie scoped to httpbin.org.
    session_cookie = {
        "name": "test_session",
        "value": "abc123xyz",
        "domain": ".httpbin.org",
        "path": "/",
    }
    await context.add_cookies([session_cookie])

    # Abort requests for common image types and analytics paths.
    blocked_patterns = ("**/*.{png,jpg,jpeg,gif}", "**/analytics/*")
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print("[HOOK] Environment configured")
    return page
|
||||
|
||||
|
||||
async def user_agent_hook(page, context, user_agent, **kwargs):
    """Print the first 50 characters of the user agent and return ``page``.

    Registered under the "on_user_agent_updated" hook point in this file.
    """
    shown = user_agent[:50]
    print(f"[HOOK] User agent: {shown}...")
    return page
|
||||
|
||||
|
||||
async def before_goto_hook(page, context, url, **kwargs):
    """Log the target URL and attach two extra HTTP headers to the page.

    Registered under the "before_goto" hook point in this file.

    Returns:
        The ``page`` object that was passed in.
    """
    print(f"[HOOK] Navigating to: {url}")

    extra_headers = {
        "X-Custom-Header": "crawl4ai-test",
        "Accept-Language": "en-US",
    }
    await page.set_extra_http_headers(extra_headers)

    return page
|
||||
|
||||
|
||||
async def after_goto_hook(page, context, url, response, **kwargs):
    """Pause briefly after navigation and wait for the ``body`` element.

    Registered under the "after_goto" hook point in this file. Waiting for
    ``body`` is best-effort: a failure is logged and the hook still
    returns the page.

    Returns:
        The ``page`` object that was passed in.
    """
    print(f"[HOOK] Page loaded: {url}")

    await page.wait_for_timeout(1000)

    try:
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element ready")
    except Exception:
        # Deliberately not a bare ``except``: that would also swallow
        # asyncio.CancelledError (a BaseException), preventing the hook
        # from being cancelled, e.g. when a hook timeout fires.
        print("[HOOK] Timeout, continuing")

    return page
|
||||
|
||||
|
||||
async def execution_started_hook(page, context, **kwargs):
    """Log the event and write a marker line to the page's JS console.

    Registered under the "on_execution_started" hook point in this file.
    """
    print("[HOOK] JS execution started")
    console_script = "console.log('[HOOK] Custom JS');"
    await page.evaluate(console_script)
    return page
|
||||
|
||||
|
||||
async def before_retrieve_hook(page, context, **kwargs):
    """Scroll to the bottom, wait 500 ms, then scroll back to the top.

    Registered under the "before_retrieve_html" hook point in this file.
    The scroll is intended to trigger lazily loaded content.
    """
    print("[HOOK] Preparing HTML retrieval")

    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    scroll_to_top = "window.scrollTo(0, 0);"

    await page.evaluate(scroll_to_bottom)
    await page.wait_for_timeout(500)
    await page.evaluate(scroll_to_top)

    print("[HOOK] Scrolling complete")
    return page
|
||||
|
||||
|
||||
async def before_return_hook(page, context, html, **kwargs):
    """Print the HTML length plus image and link counts, then return ``page``.

    Registered under the "before_return_html" hook point in this file.
    """
    print(f"[HOOK] HTML ready: {len(html)} chars")

    # Count a few element collections in the live document.
    count_script = '''() => ({
        images: document.images.length,
        links: document.links.length,
        scripts: document.scripts.length
    })'''
    metrics = await page.evaluate(count_script)

    print(f"[HOOK] Metrics - Images: {metrics['images']}, Links: {metrics['links']}")
    return page
|
||||
|
||||
|
||||
# --- Authentication Hooks ---
|
||||
async def auth_context_hook(page, context, **kwargs):
    """Add an auth cookie to the context and two entries to localStorage.

    Returns:
        The ``page`` object that was passed in.
    """
    print("[HOOK] Setting up authentication")

    # HTTP-only auth cookie scoped to httpbin.org.
    auth_cookie = {
        "name": "auth_token",
        "value": "fake_jwt_token",
        "domain": ".httpbin.org",
        "path": "/",
        "httpOnly": True,
    }
    await context.add_cookies([auth_cookie])

    # Store a user id and the current time in localStorage.
    storage_script = '''
        localStorage.setItem('user_id', '12345');
        localStorage.setItem('auth_time', new Date().toISOString());
    '''
    await page.evaluate(storage_script)

    print("[HOOK] Auth context ready")
    return page
|
||||
|
||||
|
||||
async def auth_headers_hook(page, context, url, **kwargs):
    """Attach a Basic ``Authorization`` header and an API-key header.

    The credentials are the fixed pair ``user:passwd``, base64-encoded.

    Returns:
        The ``page`` object that was passed in.
    """
    print(f"[HOOK] Adding auth headers for {url}")

    # Kept function-local, as in the original block.
    import base64

    token = base64.b64encode(b"user:passwd").decode('ascii')
    auth_headers = {
        'Authorization': f'Basic {token}',
        'X-API-Key': 'test-key-123',
    }
    await page.set_extra_http_headers(auth_headers)

    return page
|
||||
|
||||
|
||||
# --- Performance Optimization Hooks ---
|
||||
async def performance_hook(page, context, **kwargs):
    """Abort heavy or tracking requests and zero out CSS animations.

    Returns:
        The ``page`` object that was passed in.
    """
    print("[HOOK] Optimizing for performance")

    # Glob patterns whose requests are aborted, registered in this order.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg}",
        "**/*.{woff,woff2,ttf}",
        "**/*.{mp4,webm,ogg}",
        "**/googletagmanager.com/*",
        "**/google-analytics.com/*",
        "**/facebook.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda r: r.abort())

    # Force animation and transition durations to zero.
    no_animation_css = '''
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    '''
    await page.add_style_tag(content=no_animation_css)

    print("[HOOK] Optimizations applied")
    return page
|
||||
|
||||
|
||||
async def cleanup_hook(page, context, **kwargs):
    """Remove ad/popup/banner elements and all script and style tags.

    The removal runs inside the page through ``page.evaluate``.

    Returns:
        The ``page`` object that was passed in.
    """
    print("[HOOK] Cleaning page")

    removal_script = '''() => {
        const selectors = [
            '.ad', '.ads', '.advertisement',
            '.popup', '.modal', '.overlay',
            '.cookie-banner', '.newsletter'
        ];

        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => el.remove());
        });

        document.querySelectorAll('script, style').forEach(el => el.remove());
    }'''
    await page.evaluate(removal_script)

    print("[HOOK] Page cleaned")
    return page
|
||||
|
||||
|
||||
# --- Content Extraction Hooks ---
|
||||
async def wait_dynamic_content_hook(page, context, url, response, **kwargs):
    """Wait two seconds, then click a "Load More" control if one exists.

    The click is best-effort: any error while locating or clicking the
    control is logged and the hook still returns the page.

    Returns:
        The ``page`` object that was passed in.
    """
    print(f"[HOOK] Waiting for dynamic content on {url}")

    await page.wait_for_timeout(2000)

    # Click "Load More" if exists
    try:
        load_more = await page.query_selector('[class*="load-more"], button:has-text("Load More")')
        if load_more:
            await load_more.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More'")
    except Exception as exc:
        # Deliberately not a bare ``except``: that would also swallow
        # asyncio.CancelledError, preventing the hook from being cancelled.
        # The failure is reported instead of being silently discarded.
        print(f"[HOOK] 'Load More' step skipped: {exc}")

    return page
|
||||
|
||||
|
||||
async def extract_metadata_hook(page, context, **kwargs):
    """Print basic page metadata, then scroll to the bottom three times.

    Title, description, author and keywords are read in the page and
    printed here. Each scroll is followed by a one-second wait.

    Returns:
        The ``page`` object that was passed in.
    """
    print("[HOOK] Extracting metadata")

    metadata_script = '''() => {
        const getMeta = (name) => {
            const el = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return el ? el.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
        };
    }'''
    metadata = await page.evaluate(metadata_script)

    print(f"[HOOK] Metadata: {metadata}")

    # Infinite scroll
    scroll_rounds = 3
    for step in range(1, scroll_rounds + 1):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll {step}/3")

    return page
|
||||
|
||||
|
||||
# --- Multi-URL Hooks ---
|
||||
async def url_specific_hook(page, context, url, **kwargs):
    """Set an ``X-Type`` header chosen from the URL text.

    URLs containing "html" get ``X-Type: HTML``; otherwise URLs containing
    "json" get ``X-Type: JSON``; any other URL gets no header.

    Returns:
        The ``page`` object that was passed in.
    """
    print(f"[HOOK] Processing URL: {url}")

    # "html" is checked first, so it wins when both substrings occur.
    if 'html' in url:
        type_label = "HTML"
    elif 'json' in url:
        type_label = "JSON"
    else:
        type_label = None

    if type_label is not None:
        await page.set_extra_http_headers({"X-Type": type_label})

    return page
|
||||
|
||||
|
||||
async def track_progress_hook(page, context, url, response, **kwargs):
    """Print the loaded URL with its response status and return ``page``.

    The status is printed as 'unknown' when ``response`` is falsy.
    """
    if response:
        status = response.status
    else:
        status = 'unknown'
    print(f"[HOOK] Loaded {url} - Status: {status}")
    return page
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Test Functions
|
||||
# ============================================================================
|
||||
|
||||
async def test_all_hooks_comprehensive():
    """Crawl one page with a hook registered at each of the 8 hook points."""
    rule = "=" * 70
    print(rule)
    print("Test 1: All Hooks Comprehensive Demo (Docker Client)")
    print(rule)

    # Hook point name -> function object; passed to the client as-is.
    hooks = {
        "on_browser_created": browser_created_hook,
        "on_page_context_created": page_context_hook,
        "on_user_agent_updated": user_agent_hook,
        "before_goto": before_goto_hook,
        "after_goto": after_goto_hook,
        "on_execution_started": execution_started_hook,
        "before_retrieve_html": before_retrieve_hook,
        "before_return_html": before_return_hook,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling with all 8 hooks...")

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks,
            hooks_timeout=30,
        )

        print("\n✅ Success!")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" HTML: {len(result.html)} chars")
|
||||
|
||||
|
||||
async def test_authentication_workflow():
    """Crawl httpbin's basic-auth endpoint using the two auth hooks."""
    rule = "=" * 70
    print("\n" + rule)
    print("Test 2: Authentication Workflow (Docker Client)")
    print(rule)

    hooks = {
        "on_page_context_created": auth_context_hook,
        "before_goto": auth_headers_hook,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting authentication...")

        result = await client.crawl(
            ["https://httpbin.org/basic-auth/user/passwd"],
            hooks=hooks,
            hooks_timeout=15,
        )

        print("\n✅ Authentication completed")

        if not result.success:
            print(f" ❌ Failed: {result.error_message}")
        elif '"authenticated"' in result.html and 'true' in result.html:
            # Both the "authenticated" key and the text "true" are present.
            print(" ✅ Basic auth successful!")
        else:
            print(" ⚠️ Auth status unclear")
|
||||
|
||||
|
||||
async def test_performance_optimization():
    """Crawl one page with the resource-blocking and cleanup hooks."""
    rule = "=" * 70
    print("\n" + rule)
    print("Test 3: Performance Optimization (Docker Client)")
    print(rule)

    hooks = {
        "on_page_context_created": performance_hook,
        "before_retrieve_html": cleanup_hook,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting performance hooks...")

        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks,
            hooks_timeout=10,
        )

        print("\n✅ Optimization completed")
        print(f" HTML size: {len(result.html):,} chars")
        print(" Resources blocked, ads removed")
|
||||
|
||||
|
||||
async def test_content_extraction():
    """Crawl one site with the dynamic-content and metadata hooks."""
    rule = "=" * 70
    print("\n" + rule)
    print("Test 4: Content Extraction (Docker Client)")
    print(rule)

    hooks = {
        "after_goto": wait_dynamic_content_hook,
        "before_retrieve_html": extract_metadata_hook,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nTesting extraction hooks...")

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20,
        )

        print("\n✅ Extraction completed")
        print(f" URL: {result.url}")
        print(f" Success: {result.success}")
        print(f" Metadata: {result.metadata}")
|
||||
|
||||
|
||||
async def test_multi_url_crawl():
    """Crawl three httpbin URLs in one call with per-URL hooks."""
    rule = "=" * 70
    print("\n" + rule)
    print("Test 5: Multi-URL Crawl (Docker Client)")
    print(rule)

    target_urls = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/xml",
    ]
    hooks = {
        "before_goto": url_specific_hook,
        "after_goto": track_progress_hook,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nCrawling multiple URLs...")

        results = await client.crawl(
            target_urls,
            hooks=hooks,
            hooks_timeout=15,
        )

        print("\n✅ Multi-URL crawl completed")
        print(f"\n Crawled {len(results)} URLs:")
        for position, result in enumerate(results, 1):
            if result.success:
                status = "✅"
            else:
                status = "❌"
            print(f" {status} {position}. {result.url}")
|
||||
|
||||
|
||||
async def test_reusable_hook_library():
    """Crawl one site using hooks taken from a small local hook library.

    Only ``block_images`` and ``scroll_infinite`` are registered here;
    ``block_analytics`` is defined but not used by this run.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Test 6: Reusable Hook Library (Docker Client)")
    print(rule)

    class HookLibrary:
        """Namespace grouping reusable hook coroutines as static methods."""

        @staticmethod
        async def block_images(page, context, **kwargs):
            """Abort requests for png/jpg/jpeg/gif files."""
            image_pattern = "**/*.{png,jpg,jpeg,gif}"
            await context.route(image_pattern, lambda r: r.abort())
            print("[LIBRARY] Images blocked")
            return page

        @staticmethod
        async def block_analytics(page, context, **kwargs):
            """Abort requests matching two analytics URL patterns."""
            for pattern in ("**/analytics/*", "**/google-analytics.com/*"):
                await context.route(pattern, lambda r: r.abort())
            print("[LIBRARY] Analytics blocked")
            return page

        @staticmethod
        async def scroll_infinite(page, context, **kwargs):
            """Scroll down up to five times, stopping when the height is stable."""
            height_script = "document.body.scrollHeight"
            for _ in range(5):
                height_before = await page.evaluate(height_script)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
                await page.wait_for_timeout(1000)
                height_after = await page.evaluate(height_script)
                if height_after == height_before:
                    # Nothing new was loaded by the last scroll.
                    break
            print("[LIBRARY] Infinite scroll complete")
            return page

    hooks = {
        "on_page_context_created": HookLibrary.block_images,
        "before_retrieve_html": HookLibrary.scroll_infinite,
    }

    async with Crawl4aiDockerClient(base_url=API_BASE_URL, verbose=False) as client:
        print("\nUsing hook library...")

        result = await client.crawl(
            ["https://www.kidocode.com/"],
            hooks=hooks,
            hooks_timeout=20,
        )

        print("\n✅ Library hooks completed")
        print(f" Success: {result.success}")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Main
|
||||
# ============================================================================
|
||||
|
||||
async def main():
    """Run every Docker client hook example in order.

    A failing example is reported with its traceback and does not stop
    the examples that follow it.
    """
    rule = "=" * 70
    print("🔧 Crawl4AI Docker Client - Hooks Examples (Function-Based)")
    print("Using Python function objects with automatic conversion")
    print(rule)

    tests = [
        ("All Hooks Demo", test_all_hooks_comprehensive),
        ("Authentication", test_authentication_workflow),
        ("Performance", test_performance_optimization),
        ("Extraction", test_content_extraction),
        ("Multi-URL", test_multi_url_crawl),
        ("Hook Library", test_reusable_hook_library),
    ]
    total = len(tests)

    for index, (name, test_func) in enumerate(tests, 1):
        try:
            await test_func()
            print(f"\n✅ Test {index}/{total}: {name} completed\n")
        except Exception as e:
            print(f"\n❌ Test {index}/{total}: {name} failed: {e}\n")
            import traceback
            traceback.print_exc()

    print(rule)
    print("🎉 All Docker client hook examples completed!")
    print("\n💡 Key Benefits of Function-Based Hooks:")
    benefits = (
        " • Write as regular Python functions",
        " • Full IDE support (autocomplete, types)",
        " • Automatic conversion to API format",
        " • Reusable across projects",
        " • Clean, readable code",
        " • Easy to test and debug",
    )
    for line in benefits:
        print(line)
    print(rule)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
627
docs/examples/docker_hooks_examples.py
Normal file
627
docs/examples/docker_hooks_examples.py
Normal file
@@ -0,0 +1,627 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
🚀 Crawl4AI Docker Hooks System - Complete Examples
|
||||
====================================================
|
||||
|
||||
This file demonstrates the Docker Hooks System with three different approaches:
|
||||
|
||||
1. String-based hooks for REST API
|
||||
2. hooks_to_string() utility to convert functions
|
||||
3. Docker Client with automatic conversion (most convenient)
|
||||
|
||||
Requirements:
|
||||
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||
- crawl4ai installed: pip install crawl4ai
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
# Import Crawl4AI components
|
||||
from crawl4ai import hooks_to_string
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
# Configuration
|
||||
DOCKER_URL = "http://localhost:11235"
|
||||
TEST_URLS = [
|
||||
"https://www.kidocode.com",
|
||||
"https://quotes.toscrape.com",
|
||||
"https://httpbin.org/html",
|
||||
]
|
||||
|
||||
|
||||
def print_section(title: str, description: str = ""):
    """Print a section header framed by two 70-character rules.

    Args:
        title: Text printed on the first line inside the frame.
        description: Optional second line; omitted when empty.
    """
    rule = "=" * 70
    lines = ["\n" + rule, f" {title}"]
    if description:
        lines.append(f" {description}")
    lines.append(rule + "\n")
    for line in lines:
        print(line)
|
||||
|
||||
|
||||
def check_docker_service() -> bool:
    """Check if Docker service is running.

    Returns:
        True when ``GET {DOCKER_URL}/health`` answers with HTTP 200 within
        3 seconds; False on any other status code or on a request failure.
    """
    try:
        response = requests.get(f"{DOCKER_URL}/health", timeout=3)
    except requests.RequestException:
        # Connection refused, timeout, DNS failure, ...: treat as "not running".
        # Deliberately narrower than a bare ``except`` so that Ctrl+C and
        # SystemExit still propagate to the caller.
        return False
    return response.status_code == 200
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# REUSABLE HOOK LIBRARY
|
||||
# ============================================================================
|
||||
|
||||
async def performance_optimization_hook(page, context, **kwargs):
    """
    Performance Hook: abort requests for images, ads and analytics

    Registers an aborting route handler on the browser context for each
    blocked URL pattern and returns the page unchanged.
    """
    print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")

    # Kept local to the function so the hook stays self-contained.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",  # images
        "**/analytics/*",                        # ads and analytics
        "**/ads/*",
        "**/google-analytics.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, lambda route: route.abort())

    print(" [Hook] ✓ Performance optimization applied")
    return page
|
||||
|
||||
|
||||
async def viewport_setup_hook(page, context, **kwargs):
    """
    Viewport Hook: resize the page to a fixed 1920x1080 viewport

    Returns the same page object it was given.
    """
    target_size = {"width": 1920, "height": 1080}
    print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
    await page.set_viewport_size(target_size)
    print(" [Hook] ✓ Viewport configured")
    return page
|
||||
|
||||
|
||||
async def authentication_headers_hook(page, context, url, **kwargs):
    """
    Headers Hook: attach extra HTTP headers to the page

    Logs the first 50 characters of the target URL, sets the headers on the
    page, and returns the page.
    """
    extra_headers = {
        'X-Crawl4AI': 'docker-hooks',
        'X-Custom-Hook': 'function-based',
        'Accept-Language': 'en-US,en;q=0.9',
    }
    print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...")
    await page.set_extra_http_headers(extra_headers)
    print(" [Hook] ✓ Custom headers added")
    return page
|
||||
|
||||
|
||||
async def lazy_loading_handler_hook(page, context, **kwargs):
    """
    Content Hook: scroll through the page so lazy-loaded content is requested

    Scrolls to the bottom, then the middle, then back to the top, pausing
    after each step, and returns the page.
    """
    print(" [Hook] 📜 Scrolling to load lazy content...")

    # (script to evaluate, pause in milliseconds) in execution order.
    scroll_steps = (
        ("window.scrollTo(0, document.body.scrollHeight)", 1000),      # bottom
        ("window.scrollTo(0, document.body.scrollHeight / 2)", 500),   # middle
        ("window.scrollTo(0, 0)", 500),                                # top
    )
    for script, pause_ms in scroll_steps:
        await page.evaluate(script)
        await page.wait_for_timeout(pause_ms)

    print(" [Hook] ✓ Lazy content loaded")
    return page
|
||||
|
||||
|
||||
async def page_analytics_hook(page, context, **kwargs):
    """
    Analytics Hook: print basic DOM metrics for the page

    Evaluates a JavaScript snippet in the page that returns the title and the
    counts of images, links, scripts, h1-h3 headings and paragraphs, prints a
    short summary, and returns the page.
    """
    print(" [Hook] 📊 Collecting page analytics...")

    metrics_script = '''
        () => ({
            title: document.title,
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length,
            headings: document.querySelectorAll('h1, h2, h3').length,
            paragraphs: document.querySelectorAll('p').length
        })
    '''
    metrics = await page.evaluate(metrics_script)

    short_title = metrics['title'][:50]
    print(f" [Hook] 📈 Page: {short_title}...")
    print(
        f" Links: {metrics['links']}, Images: {metrics['images']}, "
        f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}"
    )

    return page
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# APPROACH 1: String-Based Hooks (REST API)
|
||||
# ============================================================================
|
||||
|
||||
def example_1_string_based_hooks():
    """
    Approach 1: send hooks to the REST API as source-code strings

    Builds three hooks as strings, posts them with a crawl request to
    ``{DOCKER_URL}/crawl`` and prints the crawl result plus the hook
    execution report when the response contains one. Suitable for direct
    REST usage and non-Python clients.
    """
    print_section(
        "APPROACH 1: String-Based Hooks (REST API)",
        "Define hooks as strings for REST API requests"
    )

    target_url = TEST_URLS[2]  # httpbin.org

    # Each value is the source of an ``async def hook(...)`` function.
    hooks_config = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Setting up page context...")
    # Block images for performance
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f" [String Hook] Navigating to {url[:50]}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI': 'string-based-hooks',
    })
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Scrolling page...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    # Request body for the /crawl endpoint
    payload = {
        "urls": [target_url],
        "hooks": {
            "code": hooks_config,
            "timeout": 30
        },
        "crawler_config": {
            "cache_mode": "bypass"
        }
    }

    print(f"🎯 Target URL: {target_url}")
    print(f"🔧 Configured {len(hooks_config)} string-based hooks")
    print("📡 Sending request to Docker API...\n")

    try:
        started = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        elapsed = time.time() - started

        if response.status_code != 200:
            print(f"❌ Request failed with status {response.status_code}")
            print(f" Error: {response.text[:200]}")
        else:
            body = response.json()
            print(f"\n✅ Request successful! (took {elapsed:.2f}s)")

            crawl_results = body.get('results')
            if not (crawl_results and crawl_results[0].get('success')):
                print("⚠️ Crawl completed but no results")
            else:
                first = body['results'][0]
                html_chars = len(first.get('html', ''))
                markdown_chars = len(first.get('markdown', ''))

                print("\n📊 Results:")
                print(f" • HTML length: {html_chars:,} characters")
                print(f" • Markdown length: {markdown_chars:,} characters")
                print(f" • URL: {first.get('url')}")

                # Hook execution report, when the server returned one
                if 'hooks' in body:
                    report = body['hooks']
                    print("\n🎣 Hooks Execution:")
                    print(f" • Status: {report['status']['status']}")
                    print(f" • Attached hooks: {len(report['status']['attached_hooks'])}")

                    if 'summary' in report:
                        stats = report['summary']
                        print(f" • Total executions: {stats['total_executions']}")
                        print(f" • Successful: {stats['successful']}")
                        print(f" • Success rate: {stats['success_rate']:.1f}%")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except Exception as exc:
        print(f"❌ Error: {str(exc)}")

    print("\n" + "─" * 70)
    print("✓ String-based hooks example complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# APPROACH 2: Function-Based Hooks with hooks_to_string() Utility
|
||||
# ============================================================================
|
||||
|
||||
def example_2_hooks_to_string_utility():
    """
    Approach 2: write hooks as functions, convert them with hooks_to_string()

    Converts three functions from the reusable hook library to strings,
    previews the first converted hook, and posts the result to
    ``{DOCKER_URL}/crawl``.
    """
    print_section(
        "APPROACH 2: hooks_to_string() Utility",
        "Convert Python functions to strings for REST API"
    )

    print("📦 Creating hook functions...")
    for hook_name in (
        "performance_optimization_hook",
        "authentication_headers_hook",
        "lazy_loading_handler_hook",
    ):
        print(f" • {hook_name}")

    # Function objects keyed by hook point, converted by the utility
    print("\n🔄 Converting functions to strings with hooks_to_string()...")
    hooks_dict = {
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
    }
    hooks_as_strings = hooks_to_string(hooks_dict)

    print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")

    # Preview of the first converted hook
    divider = "─" * 70
    print("\n📝 Sample converted hook (first 200 characters):")
    print(divider)
    sample_hook = list(hooks_as_strings.values())[0]
    print(sample_hook[:200] + "...")
    print(divider)

    # Send the converted hooks through the REST API
    print("\n📡 Using converted hooks with REST API...")

    payload = {
        "urls": [TEST_URLS[2]],
        "hooks": {
            "code": hooks_as_strings,
            "timeout": 30
        }
    }

    try:
        started = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        elapsed = time.time() - started

        if response.status_code != 200:
            print(f"❌ Request failed: {response.status_code}")
        else:
            body = response.json()
            print(f"\n✅ Request successful! (took {elapsed:.2f}s)")

            if body.get('results') and body['results'][0].get('success'):
                first = body['results'][0]
                print(f" • HTML length: {len(first.get('html', '')):,} characters")
                print(" • Hooks executed successfully!")

    except Exception as exc:
        print(f"❌ Error: {str(exc)}")

    print("\n💡 Benefits of hooks_to_string():")
    for benefit in (
        "Write hooks as regular Python functions",
        "Full IDE support (autocomplete, syntax highlighting)",
        "Type checking and linting",
        "Easy to test and debug",
        "Reusable across projects",
        "Works with any REST API client",
    ):
        print(f" ✓ {benefit}")

    print("\n" + divider)
    print("✓ hooks_to_string() utility example complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# APPROACH 3: Docker Client with Automatic Conversion (RECOMMENDED)
|
||||
# ============================================================================
|
||||
|
||||
async def example_3_docker_client_auto_conversion():
    """
    Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)

    Passes hook function objects straight to ``Crawl4aiDockerClient.crawl``
    and prints the crawl result (URL, sizes, title, link counts). Errors
    raised by the crawl are printed with a traceback instead of propagating.

    The client is used as an async context manager so that its underlying
    HTTP session is closed when the example finishes, even on failure.
    """
    print_section(
        "APPROACH 3: Docker Client with Auto-Conversion (RECOMMENDED)",
        "Pass function objects directly - conversion happens automatically!"
    )

    print("🐳 Initializing Crawl4AI Docker Client...")
    async with Crawl4aiDockerClient(base_url=DOCKER_URL) as client:
        print("✅ Client ready!\n")

        # Use our reusable hook library - just pass the function objects!
        print("📚 Using reusable hook library:")
        print(" • performance_optimization_hook")
        print(" • authentication_headers_hook")
        print(" • lazy_loading_handler_hook")
        print(" • page_analytics_hook")

        print("\n🎯 Target URL: " + TEST_URLS[0])
        print("🚀 Starting crawl with automatic hook conversion...\n")

        try:
            start_time = time.time()

            # Pass function objects directly - NO manual conversion needed! ✨
            results = await client.crawl(
                urls=[TEST_URLS[0]],
                hooks={
                    "on_page_context_created": performance_optimization_hook,
                    "before_goto": authentication_headers_hook,
                    "before_retrieve_html": lazy_loading_handler_hook,
                    "before_return_html": page_analytics_hook,
                },
                hooks_timeout=30
            )

            execution_time = time.time() - start_time
            print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")

            # Display results
            # NOTE(review): a single-URL crawl is treated as returning one
            # result object rather than a list - confirm against the client.
            if results and results.success:
                result = results
                print("📊 Results:")
                print(f" • URL: {result.url}")
                print(f" • Success: {result.success}")
                print(f" • HTML length: {len(result.html):,} characters")
                print(f" • Markdown length: {len(result.markdown):,} characters")

                # Show metadata
                if result.metadata:
                    # ``or`` also covers a title key that is present but None,
                    # which would otherwise make the slice raise TypeError.
                    title = result.metadata.get('title') or 'N/A'
                    print("\n📋 Metadata:")
                    print(f" • Title: {title[:50]}...")

                # Show links
                if result.links:
                    internal_count = len(result.links.get('internal', []))
                    external_count = len(result.links.get('external', []))
                    print("\n🔗 Links Found:")
                    print(f" • Internal: {internal_count}")
                    print(f" • External: {external_count}")
            else:
                print("⚠️ Crawl completed but no successful results")
                if results:
                    print(f" Error: {results.error_message}")

        except Exception as e:
            # Example boundary: report the failure and keep the demo running.
            print(f"❌ Error: {str(e)}")
            import traceback
            traceback.print_exc()

    print("\n🌟 Why Docker Client is RECOMMENDED:")
    print(" ✓ Automatic function-to-string conversion")
    print(" ✓ No manual hooks_to_string() calls needed")
    print(" ✓ Cleaner, more Pythonic code")
    print(" ✓ Full type hints and IDE support")
    print(" ✓ Built-in error handling")
    print(" ✓ Async/await support")

    print("\n" + "─" * 70)
    print("✓ Docker Client auto-conversion example complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# APPROACH 4: Authentication Example
|
||||
# ============================================================================
|
||||
|
||||
def example_4_authentication_flow():
    """
    Demonstrate authentication flow with multiple hooks

    Sends two string-based hooks to ``{DOCKER_URL}/crawl``: one adds an
    authentication cookie to the browser context, the other adds a Basic
    ``Authorization`` header and an API-key header before navigation. The
    crawled page is httpbin's basic-auth endpoint; the example reports
    whether the returned HTML looks like a successful authentication.

    Request failures (including a 60-second timeout) are printed rather than
    raised, matching the other REST examples.
    """
    print_section(
        "EXAMPLE 4: Authentication Flow",
        "Using hooks for authentication with cookies and headers"
    )

    hooks_code = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("[HOOK] Setting up authentication context")

    # Add authentication cookies
    await context.add_cookies([
        {
            "name": "auth_token",
            "value": "fake_jwt_token_here",
            "domain": ".httpbin.org",
            "path": "/",
            "httpOnly": True,
            "secure": True
        }
    ])

    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f"[HOOK] Adding auth headers for {url}")

    # Add Authorization header
    import base64
    credentials = base64.b64encode(b"user:passwd").decode('ascii')

    await page.set_extra_http_headers({
        'Authorization': f'Basic {credentials}',
        'X-API-Key': 'test-api-key-123'
    })

    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/basic-auth/user/passwd"],
        "hooks": {
            "code": hooks_code,
            "timeout": 15
        }
    }

    print("\nTesting authentication with httpbin endpoints...")
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)

        if response.status_code == 200:
            data = response.json()
            print("✅ Authentication test completed")

            for i, result in enumerate(data.get('results', [])):
                print(f"\n URL {i+1}: {result.get('url')}")
                if result.get('success'):
                    # Check for authentication success indicators
                    html_content = result.get('html', '')
                    if '"authenticated"' in html_content and 'true' in html_content:
                        print(" ✅ Authentication successful! Basic auth worked.")
                    else:
                        print(" ⚠️ Page loaded but auth status unclear")
                else:
                    print(f" ❌ Failed: {result.get('error_message', 'Unknown error')}")
        else:
            print(f"❌ Error: {response.status_code}")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except requests.exceptions.RequestException as e:
        print(f"❌ Error: {str(e)}")

    print("\n" + "─" * 70)
    print("✓ Authentication example complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MAIN EXECUTION
|
||||
# ============================================================================
|
||||
|
||||
async def main():
    """
    Run every example in order and print a closing summary

    Returns early with start-up instructions when the Docker service health
    check fails. Between examples the user is asked to press Enter; a
    KeyboardInterrupt stops the run, any other exception is printed with a
    traceback and the next example is started.
    """
    heavy_rule = "=" * 70

    print("\n" + heavy_rule)
    print(" 🚀 Crawl4AI - Docker Hooks System Examples")
    print(heavy_rule)

    # The examples need a reachable Docker service
    print("\n🔍 Checking Docker service status...")
    if not check_docker_service():
        for line in (
            "❌ Docker service is not running!",
            "\n📋 To start the Docker service:",
            " docker run -p 11235:11235 unclecode/crawl4ai:latest",
            "\nPlease start the service and run this example again.",
        ):
            print(line)
        return

    print("✅ Docker service is running!\n")

    # (display name, callable, whether the callable is a coroutine function)
    examples = [
        ("String-Based Hooks (REST API)", example_1_string_based_hooks, False),
        ("hooks_to_string() Utility", example_2_hooks_to_string_utility, False),
        ("Docker Client Auto-Conversion (Recommended)", example_3_docker_client_auto_conversion, True),
        ("Authentication Flow", example_4_authentication_flow, False),
    ]
    total = len(examples)
    diamonds = '🔷' * 35

    for i, (name, example_func, is_async) in enumerate(examples, 1):
        print(f"\n{diamonds}")
        print(f"Example {i}/{total}: {name}")
        print(f"{diamonds}\n")

        try:
            pending = example_func()
            if is_async:
                await pending

            print(f"✅ Example {i} completed successfully!")

            # No pause after the final example
            if i < total:
                print("\n⏸️ Press Enter to continue to next example...")
                input()

        except KeyboardInterrupt:
            print("\n⏹️ Examples interrupted by user")
            break
        except Exception as exc:
            print(f"\n❌ Example {i} failed: {str(exc)}")
            import traceback
            traceback.print_exc()
            print("\nContinuing to next example...\n")

    # Closing summary
    summary_lines = (
        "\n" + heavy_rule,
        " 🎉 All Examples Complete!",
        heavy_rule,
        "\n📊 Summary - Three Approaches to Docker Hooks:",
        "\n✨ 1. String-Based Hooks:",
        " • Write hooks as strings directly in JSON",
        " • Best for: REST API, non-Python clients, simple use cases",
        " • Cons: No IDE support, harder to debug",
        "\n✨ 2. hooks_to_string() Utility:",
        " • Write hooks as Python functions, convert to strings",
        " • Best for: Python with REST API, reusable hook libraries",
        " • Pros: IDE support, type checking, easy debugging",
        "\n✨ 3. Docker Client (RECOMMENDED):",
        " • Pass function objects directly, automatic conversion",
        " • Best for: Python applications, best developer experience",
        " • Pros: All benefits of #2 + cleaner code, no manual conversion",
        "\n💡 Recommendation:",
        " Use Docker Client (#3) for Python applications",
        " Use hooks_to_string() (#2) when you need REST API flexibility",
        " Use string-based (#1) for non-Python clients or simple scripts",
        "\n🎯 8 Hook Points Available:",
        " • on_browser_created, on_page_context_created",
        " • on_user_agent_updated, before_goto, after_goto",
        " • on_execution_started, before_retrieve_html, before_return_html",
        "\n📚 Resources:",
        " • Docs: https://docs.crawl4ai.com/core/docker-deployment",
        " • GitHub: https://github.com/unclecode/crawl4ai",
        " • Discord: https://discord.gg/jP8KfhDhyN",
        "\n" + heavy_rule,
        " Happy Crawling! 🕷️",
        heavy_rule + "\n",
    )
    for line in summary_lines:
        print(line)
|
||||
|
||||
|
||||
# Script entry point: announce the run, execute main(), and report how it ended.
if __name__ == "__main__":
    print("\n🎬 Starting Crawl4AI Docker Hooks Examples...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C outside the per-example handling inside main()
        print("\n\n👋 Examples stopped by user. Thanks for exploring Crawl4AI!")
    except Exception as exc:
        print(f"\n\n❌ Error: {exc!s}")
        import traceback
        traceback.print_exc()
|
||||
461
docs/examples/docker_webhook_example.py
Normal file
461
docs/examples/docker_webhook_example.py
Normal file
@@ -0,0 +1,461 @@
|
||||
"""
|
||||
Docker Webhook Example for Crawl4AI
|
||||
|
||||
This example demonstrates how to use webhooks with the Crawl4AI job queue API.
|
||||
Instead of polling for results, webhooks notify your application when jobs complete.
|
||||
|
||||
Supports both:
|
||||
- /crawl/job - Raw crawling with markdown extraction
|
||||
- /llm/job - LLM-powered content extraction
|
||||
|
||||
Prerequisites:
|
||||
1. Crawl4AI Docker container running on localhost:11235
|
||||
2. Flask installed: pip install flask requests
|
||||
3. LLM API key configured in .llm.env (for LLM extraction examples)
|
||||
|
||||
Usage:
|
||||
1. Run this script: python docker_webhook_example.py
|
||||
2. The webhook server will start on http://localhost:8080
|
||||
3. Jobs will be submitted and webhooks will be received automatically
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from flask import Flask, request, jsonify
|
||||
from threading import Thread
|
||||
|
||||
# Configuration
# Base URL of the Crawl4AI Docker server that jobs are submitted to.
CRAWL4AI_BASE_URL = "http://localhost:11235"
WEBHOOK_BASE_URL = "http://localhost:8080"  # Your webhook receiver URL

# Initialize Flask app for webhook receiver
app = Flask(__name__)

# Store received webhook data for demonstration
# Both webhook handlers append every payload they receive to this list.
received_webhooks = []
|
||||
|
||||
|
||||
@app.route('/webhooks/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
    """
    Webhook handler that receives notifications when crawl jobs complete.

    Payload structure:
    {
        "task_id": "crawl_abc123",
        "task_type": "crawl",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com"],
        "error": "error message" (only if failed),
        "data": {...} (only if webhook_data_in_payload=True)
    }

    Returns:
        ``({"status": "received"}, 200)`` once the notification was processed
        and stored in ``received_webhooks``; ``({"status": "error", ...}, 400)``
        when the body is not a JSON object carrying ``task_id`` and ``status``.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'task_id' not in payload or 'status' not in payload:
        # Reject malformed notifications explicitly instead of failing with a
        # KeyError (HTTP 500) further down.
        return jsonify({"status": "error", "message": "invalid webhook payload"}), 400

    task_id = payload['task_id']
    status = payload['status']

    print(f"\n{'='*60}")
    print(f"📬 Webhook received for task: {task_id}")
    print(f" Status: {status}")
    print(f" Timestamp: {payload.get('timestamp')}")
    print(f" URLs: {payload.get('urls')}")

    if status == 'completed':
        results = []
        if 'data' in payload:
            # Data is in the payload: process it directly
            print(" ✅ Data included in webhook")
            results = (payload['data'] or {}).get('results', [])
        else:
            # Fetch results from API if not included
            print(" 📥 Fetching results from API...")
            try:
                # Bounded wait so a stalled API cannot block this handler forever.
                result_response = requests.get(
                    f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}", timeout=30
                )
            except requests.RequestException as exc:
                print(f" ❌ Failed to fetch results: {exc}")
            else:
                if result_response.ok:
                    data = result_response.json()
                    print(" ✅ Results fetched successfully")
                    results = (data.get('result') or {}).get('results', [])
                else:
                    print(f" ❌ Failed to fetch results: HTTP {result_response.status_code}")

        # Process the crawl results here
        for result in results:
            print(f" - Crawled: {result.get('url')}")
            print(f" - Markdown length: {len(result.get('markdown', ''))}")

    elif status == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
||||
|
||||
|
||||
@app.route('/webhooks/llm-complete', methods=['POST'])
def handle_llm_webhook():
    """
    Webhook handler that receives notifications when LLM extraction jobs complete.

    Payload structure:
    {
        "task_id": "llm_1698765432_12345",
        "task_type": "llm_extraction",
        "status": "completed" or "failed",
        "timestamp": "2025-10-21T10:30:00.000000+00:00",
        "urls": ["https://example.com/article"],
        "error": "error message" (only if failed),
        "data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
    }

    Returns:
        ``({"status": "received"}, 200)`` once the notification was processed
        and stored in ``received_webhooks``; ``({"status": "error", ...}, 400)``
        when the body is not a JSON object carrying ``task_id`` and ``status``.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'task_id' not in payload or 'status' not in payload:
        # Reject malformed notifications explicitly instead of failing with a
        # KeyError (HTTP 500) further down.
        return jsonify({"status": "error", "message": "invalid webhook payload"}), 400

    task_id = payload['task_id']
    status = payload['status']
    # An absent or empty "urls" list must not crash the handler.
    urls = payload.get('urls') or [None]

    print(f"\n{'='*60}")
    print(f"🤖 LLM Webhook received for task: {task_id}")
    print(f" Task Type: {payload.get('task_type')}")
    print(f" Status: {status}")
    print(f" Timestamp: {payload.get('timestamp')}")
    print(f" URL: {urls[0]}")

    if status == 'completed':
        if 'data' in payload:
            # Data is in the payload: process it directly
            print(" ✅ Data included in webhook")
            data = payload['data'] or {}
            # Webhook wraps extracted content in 'extracted_content' field
            extracted = data.get('extracted_content', {})
            print(" - Extracted content:")
            print(f" {json.dumps(extracted, indent=8)}")
        else:
            # Fetch results from API if not included
            print(" 📥 Fetching results from API...")
            try:
                # Bounded wait so a stalled API cannot block this handler forever.
                result_response = requests.get(
                    f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}", timeout=30
                )
            except requests.RequestException as exc:
                print(f" ❌ Failed to fetch results: {exc}")
            else:
                if result_response.ok:
                    data = result_response.json()
                    print(" ✅ Results fetched successfully")
                    # API returns unwrapped content in 'result' field
                    extracted = data.get('result')
                    print(" - Extracted content:")
                    print(f" {json.dumps(extracted, indent=8)}")
                else:
                    print(f" ❌ Failed to fetch results: HTTP {result_response.status_code}")

    elif status == 'failed':
        print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")

    print(f"{'='*60}\n")

    # Store webhook for demonstration
    received_webhooks.append(payload)

    # Return 200 OK to acknowledge receipt
    return jsonify({"status": "received"}), 200
|
||||
|
||||
|
||||
def start_webhook_server(host='0.0.0.0', port=8080):
    """Run the Flask webhook server (blocking).

    This call does not return while the server is running, so callers that
    need to continue working should invoke it from a separate thread.

    Args:
        host: Interface to bind to. Defaults to all interfaces.
        port: TCP port to listen on. Defaults to 8080.
    """
    # The reloader is disabled explicitly; NOTE(review): presumably because it
    # cannot be used when the server runs outside the main thread - confirm.
    app.run(host=host, port=port, debug=False, use_reloader=False)
|
||||
|
||||
|
||||
def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False):
    """
    Submit a crawl job with webhook notification.

    Args:
        urls: List of URLs to crawl
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload

    Returns:
        task_id: The job's task identifier, or None when the request could
        not be sent or the server rejected it
    """
    payload = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        }
    }

    print("\n🚀 Submitting crawl job...")
    print(f" URLs: {urls}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")

    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=30,  # never wait indefinitely on an unresponsive server
        )
    except requests.RequestException as exc:
        # Connection problems take the same "return None" path as HTTP errors.
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||
|
||||
|
||||
def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None):
    """
    Submit an LLM extraction job with webhook notification.

    Args:
        url: URL to extract content from
        query: Instruction for the LLM (e.g., "Extract article title and author")
        webhook_url: URL to receive webhook notifications
        include_data: Whether to include full results in webhook payload
        schema: Optional JSON schema for structured extraction
        provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")

    Returns:
        task_id: The job's task identifier, or None when the request could
        not be sent or the server rejected it
    """
    payload = {
        "url": url,
        "q": query,
        "cache": False,
        "webhook_config": {
            "webhook_url": webhook_url,
            "webhook_data_in_payload": include_data,
            # Optional: Add custom headers for authentication
            # "webhook_headers": {
            #     "X-Webhook-Secret": "your-secret-token"
            # }
        }
    }

    # Optional fields are only sent when the caller supplied them.
    if schema:
        payload["schema"] = schema

    if provider:
        payload["provider"] = provider

    print("\n🤖 Submitting LLM extraction job...")
    print(f" URL: {url}")
    print(f" Query: {query}")
    print(f" Webhook: {webhook_url}")
    print(f" Include data: {include_data}")
    if provider:
        print(f" Provider: {provider}")

    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/llm/job",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=30,  # never wait indefinitely on an unresponsive server
        )
    except requests.RequestException as exc:
        # Connection problems take the same "return None" path as HTTP errors.
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||
|
||||
|
||||
def submit_job_without_webhook(urls):
    """
    Submit a job without webhook (traditional polling approach).

    Args:
        urls: List of URLs to crawl

    Returns:
        task_id: The job's task identifier, or None when the request could
        not be sent or the server rejected it
    """
    payload = {
        "urls": urls,
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"}
    }

    print("\n🚀 Submitting crawl job (without webhook)...")
    print(f" URLs: {urls}")

    try:
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job",
            json=payload,
            timeout=30,  # never wait indefinitely on an unresponsive server
        )
    except requests.RequestException as exc:
        # Connection problems take the same "return None" path as HTTP errors.
        print(f" ❌ Failed to submit job: {exc}")
        return None

    if not response.ok:
        print(f" ❌ Failed to submit job: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(" ✅ Job submitted successfully")
    print(f" Task ID: {task_id}")
    return task_id
|
||||
|
||||
|
||||
def poll_job_status(task_id, timeout=60):
    """
    Repeatedly query a crawl job until it finishes (no-webhook approach).

    Args:
        task_id: The job's task identifier
        timeout: Maximum time to wait in seconds

    Returns:
        The decoded status response once the job reports 'completed' or
        'failed'; None when a status request is not OK or the timeout elapses.
    """
    print("\n⏳ Polling for job status...")
    began = time.time()
    status_url = f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}"

    while time.time() - began < timeout:
        response = requests.get(status_url)

        if not response.ok:
            print(f" ❌ Failed to get status: {response.text}")
            return None

        data = response.json()
        status = data.get('status', 'unknown')

        if status == 'completed':
            print(" ✅ Job completed!")
            return data
        if status == 'failed':
            print(f" ❌ Job failed: {data.get('error', 'Unknown error')}")
            return data

        # Still pending: wait two seconds before asking again
        print(f" ⏳ Status: {status}, waiting...")
        time.sleep(2)

    print(" ⏰ Timeout reached")
    return None
|
||||
|
||||
|
||||
def main():
    """Run the webhook demonstration.

    Checks that the Crawl4AI server answers on its health endpoint, starts
    the local webhook receiver in a daemon thread, submits four jobs that
    report through webhooks (two crawl, two LLM extraction), runs one job
    with traditional polling, then waits and prints a summary of the
    webhooks collected in ``received_webhooks``.
    """
    # Check if Crawl4AI is running
    try:
        health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
        print(f"✅ Crawl4AI is running: {health.json()}")
    except (requests.RequestException, ValueError):
        # RequestException: server unreachable or timed out.
        # ValueError: the health endpoint did not return valid JSON.
        # A bare ``except:`` here would also swallow Ctrl+C / SystemExit.
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print(" Please make sure Docker container is running:")
        print(" docker run -d -p 11235:11235 --name crawl4ai unclecode/crawl4ai:latest")
        return

    # Start webhook server in background thread
    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
    webhook_thread = Thread(target=start_webhook_server, daemon=True)
    webhook_thread.start()
    time.sleep(2)  # Give server time to start

    # Example 1: Job with webhook (notification only, fetch data separately)
    print(f"\n{'='*60}")
    print("Example 1: Webhook Notification Only")
    print(f"{'='*60}")
    task_id_1 = submit_crawl_job_with_webhook(
        urls=["https://example.com"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=False
    )

    # Example 2: Job with webhook (data included in payload)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 2: Webhook with Full Data")
    print(f"{'='*60}")
    task_id_2 = submit_crawl_job_with_webhook(
        urls=["https://www.python.org"],
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
        include_data=True
    )

    # Example 3: LLM extraction with webhook (notification only)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 3: LLM Extraction with Webhook (Notification Only)")
    print(f"{'='*60}")
    task_id_3 = submit_llm_job_with_webhook(
        url="https://www.example.com",
        query="Extract the main heading and description from this page.",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=False,
        provider="openai/gpt-4o-mini"
    )

    # Example 4: LLM extraction with webhook (data included + schema)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 4: LLM Extraction with Schema and Full Data")
    print(f"{'='*60}")

    # Define a schema for structured extraction
    schema = json.dumps({
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"}
        },
        "required": ["title"]
    })

    task_id_4 = submit_llm_job_with_webhook(
        url="https://www.python.org",
        query="Extract the title and description of this website",
        webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
        include_data=True,
        schema=schema,
        provider="openai/gpt-4o-mini"
    )

    # Example 5: Traditional polling (no webhook)
    time.sleep(5)  # Wait a bit between requests
    print(f"\n{'='*60}")
    print("Example 5: Traditional Polling (No Webhook)")
    print(f"{'='*60}")
    task_id_5 = submit_job_without_webhook(
        urls=["https://github.com"]
    )
    if task_id_5:
        result = poll_job_status(task_id_5)
        if result and result.get('status') == 'completed':
            print(f" ✅ Results retrieved via polling")

    # Wait for webhooks to arrive
    print(f"\n⏳ Waiting for webhooks to be received...")
    time.sleep(30)  # Give jobs time to complete and webhooks to arrive (longer for LLM)

    # Summary
    print(f"\n{'='*60}")
    print("Summary")
    print(f"{'='*60}")
    print(f"Total webhooks received: {len(received_webhooks)}")

    crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
    llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']

    print(f"\n📊 Breakdown:")
    print(f" - Crawl webhooks: {len(crawl_webhooks)}")
    print(f" - LLM extraction webhooks: {len(llm_webhooks)}")

    print(f"\n📋 Details:")
    for i, webhook in enumerate(received_webhooks, 1):
        task_type = webhook['task_type']
        icon = "🕷️" if task_type == "crawl" else "🤖"
        print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")

    print(f"\n✅ Demo completed!")
    print(f"\n💡 Pro tips:")
    print(f" - In production, your webhook URL should be publicly accessible")
    print(f" (e.g., https://myapp.com/webhooks) or use ngrok for testing")
    print(f" - Both /crawl/job and /llm/job support the same webhook configuration")
    print(f" - Use webhook_data_in_payload=true to get results directly in the webhook")
    print(f" - LLM jobs may take longer, adjust timeouts accordingly")


# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
594
docs/md_v2/advanced/cdp-browser-crawling.md
Normal file
594
docs/md_v2/advanced/cdp-browser-crawling.md
Normal file
@@ -0,0 +1,594 @@
|
||||
# CDP Browser Crawling
|
||||
|
||||
> **New in v0.7.6**: Efficient concurrent crawling with managed CDP (Chrome DevTools Protocol) browsers. Connect to a running browser instance and perform multiple crawls without spawning new windows.
|
||||
|
||||
## 1. Overview
|
||||
|
||||
When working with CDP browsers, you can connect to an existing browser instance instead of launching a new one for each crawl. This is particularly useful for:
|
||||
|
||||
- **Development**: Keep your browser open with DevTools for debugging
|
||||
- **Persistent Sessions**: Maintain authentication across multiple crawls
|
||||
- **Resource Efficiency**: Reuse a single browser instance for multiple operations
|
||||
- **Concurrent Crawling**: Run multiple crawls simultaneously with proper isolation
|
||||
|
||||
**Key Benefits:**
|
||||
|
||||
- ✅ Single browser window with multiple tabs (no window clutter)
|
||||
- ✅ Shared state (cookies, localStorage) across crawls
|
||||
- ✅ Concurrent safety with automatic page isolation
|
||||
- ✅ Automatic cleanup to prevent memory leaks
|
||||
- ✅ Works seamlessly with `arun_many()` for parallel crawling
|
||||
|
||||
---
|
||||
|
||||
## 2. Quick Start
|
||||
|
||||
### 2.1 Starting a CDP Browser
|
||||
|
||||
Use the Crawl4AI CLI to start a managed CDP browser:
|
||||
|
||||
```bash
|
||||
# Start CDP browser on default port (9222)
|
||||
crwl cdp
|
||||
|
||||
# Start on custom port
|
||||
crwl cdp -d 9223
|
||||
|
||||
# Start in headless mode
|
||||
crwl cdp --headless
|
||||
```
|
||||
|
||||
The browser will stay running until you press 'q' or close the terminal.
|
||||
|
||||
### 2.2 Basic CDP Connection
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
# Configure CDP connection
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222",
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Crawl a single URL
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Content length: {len(result.markdown)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Concurrent Crawling with arun_many()
|
||||
|
||||
The real power of CDP crawling shines with `arun_many()`. The browser manager automatically handles:
|
||||
|
||||
- **Page Isolation**: Each crawl gets its own tab
|
||||
- **Context Sharing**: All tabs share cookies and localStorage
|
||||
- **Concurrent Safety**: Proper locking prevents race conditions
|
||||
- **Auto Cleanup**: Tabs are closed after crawling (except sessions)
|
||||
|
||||
### 3.1 Basic Concurrent Crawling
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def crawl_multiple_urls():
|
||||
# URLs to crawl
|
||||
urls = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/html",
|
||||
"https://www.python.org",
|
||||
]
|
||||
|
||||
# Configure CDP browser
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222",
|
||||
verbose=False
|
||||
)
|
||||
|
||||
# Configure crawler (bypass cache for fresh data)
|
||||
crawler_cfg = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# Crawl all URLs concurrently
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=crawler_cfg
|
||||
)
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
print(f"\nURL: {result.url}")
|
||||
if result.success:
|
||||
print(f"✓ Success | Content length: {len(result.markdown)}")
|
||||
else:
|
||||
print(f"✗ Failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(crawl_multiple_urls())
|
||||
```
|
||||
|
||||
### 3.2 With Session Management
|
||||
|
||||
Use sessions to maintain authentication and state across individual crawls:
|
||||
|
||||
```python
|
||||
async def crawl_with_sessions():
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
# First crawl: Login page
|
||||
login_result = await crawler.arun(
|
||||
url="https://example.com/login",
|
||||
config=CrawlerRunConfig(
|
||||
session_id="my-session", # Session persists
|
||||
js_code="document.querySelector('#login').click();"
|
||||
)
|
||||
)
|
||||
|
||||
# Second crawl: Reuse authenticated session
|
||||
dashboard_result = await crawler.arun(
|
||||
url="https://example.com/dashboard",
|
||||
config=CrawlerRunConfig(
|
||||
session_id="my-session" # Same session, cookies preserved
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. How It Works
|
||||
|
||||
### 4.1 Browser Context Reuse
|
||||
|
||||
When using CDP browsers, Crawl4AI:
|
||||
|
||||
1. **Connects** to the existing browser via CDP URL
|
||||
2. **Reuses** the default browser context (single window)
|
||||
3. **Creates** new pages (tabs) for each crawl
|
||||
4. **Locks** page creation to prevent concurrent races
|
||||
5. **Cleans up** pages after crawling (unless it's a session)
|
||||
|
||||
```python
|
||||
# Internal behavior (simplified)
|
||||
if self.config.use_managed_browser:
|
||||
context = self.default_context # Shared context
|
||||
|
||||
# Thread-safe page creation
|
||||
async with self._page_lock:
|
||||
page = await context.new_page() # New tab per crawl
|
||||
|
||||
# After crawl completes
|
||||
if not config.session_id:
|
||||
await page.close() # Auto cleanup
|
||||
```
|
||||
|
||||
### 4.2 Page Lifecycle
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
A[Start Crawl] --> B{Has session_id?}
|
||||
B -->|Yes| C[Reuse existing page]
|
||||
B -->|No| D[Create new page/tab]
|
||||
D --> E[Navigate & Extract]
|
||||
C --> E
|
||||
E --> F{Is session?}
|
||||
F -->|Yes| G[Keep page open]
|
||||
F -->|No| H[Close page]
|
||||
H --> I[End]
|
||||
G --> I
|
||||
```
|
||||
|
||||
### 4.3 State Sharing
|
||||
|
||||
All pages in the same context share:
|
||||
|
||||
- 🍪 **Cookies**: Authentication tokens, preferences
|
||||
- 💾 **localStorage**: Client-side data storage
|
||||
- 🔐 **sessionStorage**: Per-tab session data (the exception: each tab keeps its own copy, so it is *not* shared between pages)
|
||||
- 🌐 **Network cache**: Shared HTTP cache
|
||||
|
||||
This makes it perfect for crawling authenticated sites or maintaining state across multiple pages.
|
||||
|
||||
---
|
||||
|
||||
## 5. Configuration Options
|
||||
|
||||
### 5.1 BrowserConfig for CDP
|
||||
|
||||
```python
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium", # Must be "chromium" for CDP
|
||||
cdp_url="http://localhost:9222", # CDP endpoint URL
|
||||
verbose=True, # Log browser operations
|
||||
|
||||
# Optional: Override headers for all requests
|
||||
headers={
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
},
|
||||
|
||||
# Optional: Set user agent
|
||||
user_agent="Mozilla/5.0 ...",
|
||||
|
||||
# Optional: Enable stealth mode (requires dedicated browser)
|
||||
# enable_stealth=False, # Not compatible with CDP
|
||||
)
|
||||
```
|
||||
|
||||
### 5.2 CrawlerRunConfig Options
|
||||
|
||||
```python
|
||||
crawler_cfg = CrawlerRunConfig(
|
||||
# Session management
|
||||
session_id="my-session", # Persist page across calls
|
||||
|
||||
# Caching
|
||||
cache_mode=CacheMode.BYPASS, # Fresh data every time
|
||||
|
||||
# Browser location (affects timezone, locale)
|
||||
locale="en-US",
|
||||
timezone_id="America/New_York",
|
||||
geolocation={
|
||||
"latitude": 40.7128,
|
||||
"longitude": -74.0060
|
||||
},
|
||||
|
||||
# Proxy (per-crawl override)
|
||||
proxy_config={
|
||||
"server": "http://proxy.example.com:8080",
|
||||
"username": "user",
|
||||
"password": "pass"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Advanced Patterns
|
||||
|
||||
### 6.1 Streaming Results
|
||||
|
||||
Process URLs as they complete instead of waiting for all:
|
||||
|
||||
```python
|
||||
async def stream_crawl_results():
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
|
||||
urls = ["https://example.com" for _ in range(100)]
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
# Stream results as they complete
|
||||
async for result in crawler.arun_many(
|
||||
urls=urls,
|
||||
config=CrawlerRunConfig(stream=True)
|
||||
):
|
||||
if result.success:
|
||||
print(f"✓ {result.url}: {len(result.markdown)} chars")
|
||||
# Process immediately instead of waiting for all
|
||||
await save_to_database(result)
|
||||
```
|
||||
|
||||
### 6.2 Custom Concurrency Control
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig
|
||||
|
||||
# Limit concurrent crawls to 3
|
||||
crawler_cfg = CrawlerRunConfig(
|
||||
semaphore_count=3, # Max 3 concurrent requests
|
||||
mean_delay=0.5, # Average 0.5s delay between requests
|
||||
max_range=1.0, # +/- 1s random delay
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
results = await crawler.arun_many(urls, config=crawler_cfg)
|
||||
```
|
||||
|
||||
### 6.3 Multi-Config Crawling
|
||||
|
||||
Different configurations for different URL groups:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig
|
||||
|
||||
# Fast crawl for static pages
|
||||
fast_config = CrawlerRunConfig(
|
||||
wait_until="domcontentloaded",
|
||||
page_timeout=30000
|
||||
)
|
||||
|
||||
# Slow crawl for dynamic pages
|
||||
slow_config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
page_timeout=60000,
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);"
|
||||
)
|
||||
|
||||
configs = [fast_config, slow_config, fast_config]
|
||||
urls = ["https://static.com", "https://dynamic.com", "https://static2.com"]
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
results = await crawler.arun_many(urls, configs=configs)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Best Practices
|
||||
|
||||
### 7.1 Resource Management
|
||||
|
||||
✅ **DO:**
|
||||
```python
|
||||
# Use context manager for automatic cleanup
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
results = await crawler.arun_many(urls)
|
||||
# Browser connection closed automatically
|
||||
```
|
||||
|
||||
❌ **DON'T:**
|
||||
```python
|
||||
# Manual management risks resource leaks
|
||||
crawler = AsyncWebCrawler(config=browser_cfg)
|
||||
await crawler.start()
|
||||
results = await crawler.arun_many(urls)
|
||||
# Forgot to call crawler.close()!
|
||||
```
|
||||
|
||||
### 7.2 Session Management
|
||||
|
||||
✅ **DO:**
|
||||
```python
|
||||
# Use sessions for related crawls
|
||||
config = CrawlerRunConfig(session_id="user-flow")
|
||||
await crawler.arun(login_url, config=config)
|
||||
await crawler.arun(dashboard_url, config=config)
|
||||
await crawler.kill_session("user-flow") # Clean up when done
|
||||
```
|
||||
|
||||
❌ **DON'T:**
|
||||
```python
|
||||
# Creating new session IDs unnecessarily
|
||||
for i in range(100):
|
||||
config = CrawlerRunConfig(session_id=f"session-{i}")
|
||||
await crawler.arun(url, config=config)
|
||||
# 100 unclosed sessions accumulate!
|
||||
```
|
||||
|
||||
### 7.3 Error Handling
|
||||
|
||||
```python
|
||||
async def robust_crawl(urls):
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
results = await crawler.arun_many(urls)
|
||||
|
||||
# Separate successes and failures
|
||||
successes = [r for r in results if r.success]
|
||||
failures = [r for r in results if not r.success]
|
||||
|
||||
print(f"✓ {len(successes)} succeeded")
|
||||
print(f"✗ {len(failures)} failed")
|
||||
|
||||
# Retry failures with different config
|
||||
if failures:
|
||||
retry_urls = [r.url for r in failures]
|
||||
retry_config = CrawlerRunConfig(
|
||||
page_timeout=120000, # Longer timeout
|
||||
wait_until="networkidle"
|
||||
)
|
||||
retry_results = await crawler.arun_many(
|
||||
retry_urls,
|
||||
config=retry_config
|
||||
)
|
||||
|
||||
return successes + retry_results
|
||||
|
||||
except Exception as e:
|
||||
print(f"Fatal error: {e}")
|
||||
return []
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. Troubleshooting
|
||||
|
||||
### 8.1 Connection Issues
|
||||
|
||||
**Problem**: `Cannot connect to CDP browser`
|
||||
|
||||
```bash
|
||||
# Check CDP browser is running
|
||||
$ lsof -i :9222
|
||||
# Should show: Chromium PID USER FD TYPE ...
|
||||
|
||||
# Or start it if not running
|
||||
$ crwl cdp
|
||||
```
|
||||
|
||||
**Problem**: `ERR_ABORTED` errors in concurrent crawls
|
||||
|
||||
✅ **Fixed in v0.7.6**: This issue has been resolved. Pages are now properly isolated with locking.
|
||||
|
||||
### 8.2 Performance Issues
|
||||
|
||||
**Problem**: Too many open tabs
|
||||
|
||||
```python
|
||||
# Ensure you're not using session_id for everything
|
||||
config = CrawlerRunConfig() # No session_id
|
||||
await crawler.arun_many(urls, config=config)
|
||||
# Pages auto-close after crawling
|
||||
```
|
||||
|
||||
**Problem**: Memory leaks
|
||||
|
||||
```python
|
||||
# Always use context manager
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
# Crawling code here
|
||||
pass
|
||||
# Automatic cleanup on exit
|
||||
```
|
||||
|
||||
### 8.3 State Issues
|
||||
|
||||
**Problem**: Cookies not persisting
|
||||
|
||||
```python
|
||||
# Use the same context (automatic with CDP)
|
||||
browser_cfg = BrowserConfig(cdp_url="http://localhost:9222")
|
||||
# All crawls share cookies automatically
|
||||
```
|
||||
|
||||
**Problem**: Need isolated state
|
||||
|
||||
```python
|
||||
# Use different CDP endpoints or non-CDP browsers
|
||||
browser_cfg_1 = BrowserConfig(cdp_url="http://localhost:9222")
|
||||
browser_cfg_2 = BrowserConfig(cdp_url="http://localhost:9223")
|
||||
# Completely isolated browsers
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 9. Comparison: CDP vs Regular Browsers
|
||||
|
||||
| Feature | CDP Browser | Regular Browser |
|
||||
|---------|-------------|-----------------|
|
||||
| **Window Management** | ✅ Single window, multiple tabs | ❌ New window per context |
|
||||
| **Startup Time** | ✅ Instant (already running) | ⏱️ ~2-3s per launch |
|
||||
| **State Sharing** | ✅ Shared cookies/localStorage | ⚠️ Isolated by default |
|
||||
| **Concurrent Safety** | ✅ Automatic locking | ✅ Separate processes |
|
||||
| **Memory Usage** | ✅ Lower (shared browser) | ⚠️ Higher (multiple processes) |
|
||||
| **Session Persistence** | ✅ Native support | ✅ Via session_id |
|
||||
| **Stealth Mode** | ❌ Not compatible | ✅ Full support |
|
||||
| **Best For** | Development, authenticated crawls | Production, isolated crawls |
|
||||
|
||||
---
|
||||
|
||||
## 10. Real-World Examples
|
||||
|
||||
### 10.1 E-commerce Product Scraping
|
||||
|
||||
```python
|
||||
async def scrape_products():
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
|
||||
# Get product URLs from category page
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
category_result = await crawler.arun(
|
||||
url="https://shop.example.com/category",
|
||||
config=CrawlerRunConfig(
|
||||
css_selector=".product-link"
|
||||
)
|
||||
)
|
||||
|
||||
# Extract product URLs
|
||||
product_urls = extract_urls(category_result.links)
|
||||
|
||||
# Crawl all products concurrently
|
||||
product_results = await crawler.arun_many(
|
||||
urls=product_urls,
|
||||
config=CrawlerRunConfig(
|
||||
css_selector=".product-details",
|
||||
semaphore_count=5 # Polite crawling
|
||||
)
|
||||
)
|
||||
|
||||
return [extract_product_data(r) for r in product_results]
|
||||
```
|
||||
|
||||
### 10.2 News Article Monitoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
async def monitor_news_sites():
|
||||
browser_cfg = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
cdp_url="http://localhost:9222"
|
||||
)
|
||||
|
||||
news_sites = [
|
||||
"https://news.site1.com",
|
||||
"https://news.site2.com",
|
||||
"https://news.site3.com"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
while True:
|
||||
print(f"\n[{datetime.now()}] Checking for updates...")
|
||||
|
||||
results = await crawler.arun_many(
|
||||
urls=news_sites,
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS, # Always fresh
|
||||
css_selector=".article-headline"
|
||||
)
|
||||
)
|
||||
|
||||
for result in results:
|
||||
if result.success:
|
||||
headlines = extract_headlines(result)
|
||||
for headline in headlines:
|
||||
if is_new(headline):
|
||||
notify_user(headline)
|
||||
|
||||
# Check every 5 minutes
|
||||
await asyncio.sleep(300)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 11. Summary
|
||||
|
||||
CDP browser crawling offers:
|
||||
|
||||
- 🚀 **Performance**: Faster startup, lower resource usage
|
||||
- 🔄 **State Management**: Shared cookies and authentication
|
||||
- 🎯 **Concurrent Safety**: Automatic page isolation and cleanup
|
||||
- 💻 **Developer Friendly**: Visual debugging with DevTools
|
||||
|
||||
**When to use CDP:**
|
||||
- Development and debugging
|
||||
- Authenticated crawling (login required)
|
||||
- Sequential crawls needing state
|
||||
- Resource-constrained environments
|
||||
|
||||
**When to use regular browsers:**
|
||||
- Production deployments
|
||||
- Maximum isolation required
|
||||
- Stealth mode needed
|
||||
- Distributed/cloud crawling
|
||||
|
||||
For most use cases, **CDP browsers provide the best balance** of performance, convenience, and safety.
|
||||
@@ -82,6 +82,42 @@ If you installed Crawl4AI (which installs Playwright under the hood), you alread
|
||||
|
||||
---
|
||||
|
||||
### Creating a Profile Using the Crawl4AI CLI (Easiest)
|
||||
|
||||
If you prefer a guided, interactive setup, use the built-in CLI to create and manage persistent browser profiles.
|
||||
|
||||
1.⠀Launch the profile manager:
|
||||
```bash
|
||||
crwl profiles
|
||||
```
|
||||
|
||||
2.⠀Choose "Create new profile" and enter a profile name. A Chromium window opens so you can log in to sites and configure settings. When finished, return to the terminal and press `q` to save the profile.
|
||||
|
||||
3.⠀Profiles are saved under `~/.crawl4ai/profiles/<profile_name>` (for example: `/home/<you>/.crawl4ai/profiles/test_profile_1`) along with a `storage_state.json` for cookies and session data.
|
||||
|
||||
4.⠀Optionally, choose "List profiles" in the CLI to view available profiles and their paths.
|
||||
|
||||
5.⠀Use the saved path with `BrowserConfig.user_data_dir`:
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
profile_path = "/home/<you>/.crawl4ai/profiles/test_profile_1"
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
use_managed_browser=True,
|
||||
user_data_dir=profile_path,
|
||||
browser_type="chromium",
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com/private")
|
||||
```
|
||||
|
||||
The CLI also supports listing and deleting profiles, and even testing a crawl directly from the menu.
|
||||
|
||||
---
|
||||
|
||||
## 3. Using Managed Browsers in Crawl4AI
|
||||
|
||||
Once you have a data directory with your session data, pass it to **`BrowserConfig`**:
|
||||
|
||||
@@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
# Using proxy URL
|
||||
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
|
||||
# Using HTTP proxy
|
||||
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Using SOCKS proxy
|
||||
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
|
||||
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
@@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
||||
browser_config = BrowserConfig(proxy_config={
|
||||
"server": "http://[host]:[port]",
|
||||
"username": "[username]",
|
||||
"password": "[password]",
|
||||
})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
@@ -23,7 +23,7 @@ browser_cfg = BrowserConfig(
|
||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||
| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. |
|
||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||
|
||||
@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
|
||||
|
||||
2. **Install Dependencies**
|
||||
```bash
|
||||
pip install flask
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
3. **Launch the Server**
|
||||
@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
|
||||
|
||||
4. **Open in Browser**
|
||||
```
|
||||
http://localhost:8080
|
||||
http://localhost:8000
|
||||
```
|
||||
|
||||
**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)
|
||||
@@ -325,7 +325,7 @@ Powers the recording functionality:
|
||||
### Configuration
|
||||
```python
|
||||
# server.py configuration
|
||||
PORT = 8080
|
||||
PORT = 8000
|
||||
DEBUG = True
|
||||
THREADED = True
|
||||
```
|
||||
@@ -343,9 +343,9 @@ THREADED = True
|
||||
**Port Already in Use**
|
||||
```bash
|
||||
# Kill existing process
|
||||
lsof -ti:8080 | xargs kill -9
|
||||
lsof -ti:8000 | xargs kill -9
|
||||
# Or use different port
|
||||
python server.py --port 8081
|
||||
python server.py --port 8001
|
||||
```
|
||||
|
||||
**Blockly Not Loading**
|
||||
|
||||
@@ -216,7 +216,7 @@ def get_examples():
|
||||
'name': 'Handle Cookie Banner',
|
||||
'description': 'Accept cookies and close newsletter popup',
|
||||
'script': '''# Handle cookie banner and newsletter
|
||||
GO http://127.0.0.1:8080/playground/
|
||||
GO http://127.0.0.1:8000/playground/
|
||||
WAIT `body` 2
|
||||
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
|
||||
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''
|
||||
@@ -283,7 +283,7 @@ WAIT `.success-message` 5'''
|
||||
return jsonify(examples)
|
||||
|
||||
if __name__ == '__main__':
|
||||
port = int(os.environ.get('PORT', 8080))
|
||||
port = int(os.environ.get('PORT', 8000))
|
||||
print(f"""
|
||||
╔══════════════════════════════════════════════════════════╗
|
||||
║ C4A-Script Interactive Tutorial Server ║
|
||||
|
||||
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
BIN
docs/md_v2/assets/crawl4ai-skill.zip
Normal file
Binary file not shown.
BIN
docs/md_v2/assets/images/logo.png
Normal file
BIN
docs/md_v2/assets/images/logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.6 KiB |
376
docs/md_v2/assets/page_actions.css
Normal file
376
docs/md_v2/assets/page_actions.css
Normal file
@@ -0,0 +1,376 @@
|
||||
/* ==== File: assets/page_actions.css ==== */
|
||||
/* Page Actions Dropdown - Terminal Style */
|
||||
|
||||
/* Wrapper - positioned in content area */
|
||||
.page-actions-wrapper {
|
||||
position: absolute;
|
||||
top: 1.3rem;
|
||||
right: 1rem;
|
||||
z-index: 1000;
|
||||
}
|
||||
|
||||
/* Floating Action Button */
|
||||
.page-actions-button {
|
||||
position: relative;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
background: #3f3f44;
|
||||
border: 1px solid #50ffff;
|
||||
color: #e8e9ed;
|
||||
padding: 0.75rem 1rem;
|
||||
border-radius: 6px;
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.875rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s ease;
|
||||
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.page-actions-button:hover {
|
||||
background: #50ffff;
|
||||
color: #070708;
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 6px 16px rgba(80, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.page-actions-button::before {
|
||||
content: '▤';
|
||||
font-size: 1.2rem;
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
.page-actions-button::after {
|
||||
content: '▼';
|
||||
font-size: 0.6rem;
|
||||
transition: transform 0.2s ease;
|
||||
}
|
||||
|
||||
.page-actions-button.active::after {
|
||||
transform: rotate(180deg);
|
||||
}
|
||||
|
||||
/* Dropdown Menu */
|
||||
.page-actions-dropdown {
|
||||
position: absolute;
|
||||
top: 3.5rem;
|
||||
right: 0;
|
||||
z-index: 1001;
|
||||
background: #1a1a1a;
|
||||
border: 1px solid #3f3f44;
|
||||
border-radius: 8px;
|
||||
min-width: 280px;
|
||||
opacity: 0;
|
||||
visibility: hidden;
|
||||
transform: translateY(-10px);
|
||||
transition: all 0.2s ease;
|
||||
box-shadow: 0 8px 24px rgba(0, 0, 0, 0.5);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.page-actions-dropdown.active {
|
||||
opacity: 1;
|
||||
visibility: visible;
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
.page-actions-dropdown::before {
|
||||
content: '';
|
||||
position: absolute;
|
||||
top: -8px;
|
||||
right: 1.5rem;
|
||||
width: 0;
|
||||
height: 0;
|
||||
border-left: 8px solid transparent;
|
||||
border-right: 8px solid transparent;
|
||||
border-bottom: 8px solid #3f3f44;
|
||||
}
|
||||
|
||||
/* Menu Header */
|
||||
.page-actions-header {
|
||||
background: #3f3f44;
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-bottom: 1px solid #50ffff;
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.7rem;
|
||||
color: #a3abba;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.05em;
|
||||
}
|
||||
|
||||
.page-actions-header::before {
|
||||
content: '┌─';
|
||||
margin-right: 0.5rem;
|
||||
color: #50ffff;
|
||||
}
|
||||
|
||||
/* Menu Items */
|
||||
.page-actions-menu {
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0.25rem 0;
|
||||
}
|
||||
|
||||
.page-action-item {
|
||||
display: block;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
ul>li.page-action-item::after{
|
||||
content: '';
|
||||
}
|
||||
.page-action-link {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.5rem 0.75rem;
|
||||
color: #e8e9ed;
|
||||
text-decoration: none !important;
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.8rem;
|
||||
transition: all 0.15s ease;
|
||||
cursor: pointer;
|
||||
border-left: 3px solid transparent;
|
||||
}
|
||||
|
||||
.page-action-link:hover:not(.disabled) {
|
||||
background: #3f3f44;
|
||||
border-left-color: #50ffff;
|
||||
color: #50ffff;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.page-action-link.disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.page-action-link.disabled:hover {
|
||||
background: transparent;
|
||||
color: #e8e9ed;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
/* Icons using ASCII/Terminal characters */
|
||||
.page-action-icon {
|
||||
font-size: 1rem;
|
||||
width: 1.5rem;
|
||||
text-align: center;
|
||||
font-weight: bold;
|
||||
color: #50ffff;
|
||||
}
|
||||
|
||||
.page-action-link:hover:not(.disabled) .page-action-icon {
|
||||
color: #50ffff;
|
||||
}
|
||||
|
||||
.page-action-link.disabled .page-action-icon {
|
||||
color: #666;
|
||||
}
|
||||
|
||||
/* Specific icons */
|
||||
.icon-copy::before {
|
||||
content: '⎘'; /* Copy/duplicate symbol */
|
||||
}
|
||||
|
||||
.icon-view::before {
|
||||
content: '⎙'; /* Document symbol */
|
||||
}
|
||||
|
||||
.icon-ai::before {
|
||||
content: '⚡'; /* Lightning/AI symbol */
|
||||
}
|
||||
|
||||
/* Action Text */
|
||||
.page-action-text {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.page-action-label {
|
||||
display: block;
|
||||
font-weight: 600;
|
||||
margin-bottom: 0.05rem;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
.page-action-description {
|
||||
display: block;
|
||||
font-size: 0.7rem;
|
||||
color: #a3abba;
|
||||
line-height: 1.2;
|
||||
}
|
||||
|
||||
/* Badge */
|
||||
/* External link indicator */
|
||||
.page-action-external::after {
|
||||
content: '→';
|
||||
margin-left: 0.25rem;
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
/* Divider */
|
||||
.page-actions-divider {
|
||||
height: 1px;
|
||||
background: #3f3f44;
|
||||
margin: 0.25rem 0;
|
||||
}
|
||||
|
||||
/* Success/Copy feedback */
|
||||
.page-action-copied {
|
||||
background: #50ff50 !important;
|
||||
color: #070708 !important;
|
||||
border-left-color: #50ff50 !important;
|
||||
}
|
||||
|
||||
.page-action-copied .page-action-icon {
|
||||
color: #070708 !important;
|
||||
}
|
||||
|
||||
.page-action-copied .page-action-icon::before {
|
||||
content: '✓';
|
||||
}
|
||||
|
||||
/* Mobile Responsive */
|
||||
@media (max-width: 768px) {
|
||||
.page-actions-wrapper {
|
||||
top: 0.5rem;
|
||||
right: 0.5rem;
|
||||
}
|
||||
|
||||
.page-actions-button {
|
||||
padding: 0.6rem 0.8rem;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.page-actions-dropdown {
|
||||
min-width: 260px;
|
||||
max-width: calc(100vw - 2rem);
|
||||
right: -0.5rem;
|
||||
}
|
||||
|
||||
.page-action-link {
|
||||
padding: 0.6rem 0.8rem;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.page-action-description {
|
||||
font-size: 0.7rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* Animation for tooltip/notification */
|
||||
@keyframes slideInFromTop {
|
||||
from {
|
||||
transform: translateY(-20px);
|
||||
opacity: 0;
|
||||
}
|
||||
to {
|
||||
transform: translateY(0);
|
||||
opacity: 1;
|
||||
}
|
||||
}
|
||||
|
||||
.page-actions-notification {
|
||||
position: fixed;
|
||||
top: calc(var(--header-height) + 0.5rem);
|
||||
right: 50%;
|
||||
transform: translateX(50%);
|
||||
z-index: 1100;
|
||||
background: #50ff50;
|
||||
color: #070708;
|
||||
padding: 0.75rem 1.5rem;
|
||||
border-radius: 6px;
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.875rem;
|
||||
font-weight: 600;
|
||||
box-shadow: 0 4px 12px rgba(80, 255, 80, 0.4);
|
||||
animation: slideInFromTop 0.3s ease;
|
||||
pointer-events: none;
|
||||
}
|
||||
|
||||
.page-actions-notification::before {
|
||||
content: '✓ ';
|
||||
margin-right: 0.5rem;
|
||||
}
|
||||
|
||||
/* Hide on print */
|
||||
@media print {
|
||||
.page-actions-button,
|
||||
.page-actions-dropdown {
|
||||
display: none !important;
|
||||
}
|
||||
}
|
||||
|
||||
/* Overlay for mobile */
|
||||
.page-actions-overlay {
|
||||
display: none;
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
background: rgba(0, 0, 0, 0.5);
|
||||
z-index: 998;
|
||||
opacity: 0;
|
||||
transition: opacity 0.2s ease;
|
||||
}
|
||||
|
||||
.page-actions-overlay.active {
|
||||
display: block;
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
    /* Keep the overlay rendered on small screens so its opacity transition can
       run, but let taps pass through while it is inactive. Without this the
       invisible, full-viewport fixed layer sits above the page content and
       swallows every tap. */
    .page-actions-overlay {
        display: block;
        pointer-events: none;
    }

    /* Only an activated overlay should catch taps (to dismiss the menu). */
    .page-actions-overlay.active {
        pointer-events: auto;
    }
}
|
||||
|
||||
/* Keyboard focus styles */
|
||||
.page-action-link:focus {
|
||||
outline: 2px solid #50ffff;
|
||||
outline-offset: -2px;
|
||||
}
|
||||
|
||||
.page-actions-button:focus {
|
||||
outline: 2px solid #50ffff;
|
||||
outline-offset: 2px;
|
||||
}
|
||||
|
||||
/* Loading state */
|
||||
.page-action-link.loading {
|
||||
pointer-events: none;
|
||||
opacity: 0.7;
|
||||
}
|
||||
|
||||
.page-action-link.loading .page-action-icon::before {
|
||||
content: '⟳';
|
||||
animation: spin 1s linear infinite;
|
||||
}
|
||||
|
||||
@keyframes spin {
|
||||
from { transform: rotate(0deg); }
|
||||
to { transform: rotate(360deg); }
|
||||
}
|
||||
|
||||
/* Terminal-style border effect on hover */
|
||||
.page-actions-dropdown:hover {
|
||||
border-color: #50ffff;
|
||||
}
|
||||
|
||||
/* Footer info */
|
||||
.page-actions-footer {
|
||||
background: #070708;
|
||||
padding: 0.4rem 0.75rem;
|
||||
border-top: 1px solid #3f3f44;
|
||||
font-size: 0.65rem;
|
||||
color: #666;
|
||||
text-align: center;
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
}
|
||||
|
||||
.page-actions-footer::before {
|
||||
content: '└─';
|
||||
margin-right: 0.5rem;
|
||||
color: #3f3f44;
|
||||
}
|
||||
427
docs/md_v2/assets/page_actions.js
Normal file
427
docs/md_v2/assets/page_actions.js
Normal file
@@ -0,0 +1,427 @@
|
||||
// ==== File: assets/page_actions.js ====
|
||||
// Page Actions - Copy/View Markdown functionality
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
// Configuration: location of the Markdown sources on GitHub, plus the site
// sections that never get the page-actions button.
const config = {
    githubRepo: 'unclecode/crawl4ai',
    githubBranch: 'main',
    docsPath: 'docs/md_v2',
    // Don't show on app pages
    excludePaths: ['/apps/c4a-script/', '/apps/llmtxt/', '/apps/crawl4ai-assistant/', '/core/ask-ai/'],
};

// Single-entry cache: the fetched Markdown and the path it was fetched for.
let cachedMarkdown = null;
let cachedMarkdownPath = null;
|
||||
|
||||
// Check if we should show the button on this page
|
||||
function shouldShowButton() {
|
||||
const currentPath = window.location.pathname;
|
||||
|
||||
// Don't show on homepage
|
||||
if (currentPath === '/' || currentPath === '/index.html') {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Don't show on 404 pages
|
||||
if (document.title && document.title.toLowerCase().includes('404')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Require mkdocs main content container
|
||||
const mainContent = document.getElementById('terminal-mkdocs-main-content');
|
||||
if (!mainContent) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Don't show on excluded paths (apps)
|
||||
for (const excludePath of config.excludePaths) {
|
||||
if (currentPath.includes(excludePath)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Only show on documentation pages
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!shouldShowButton()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Map the current URL path to the page's Markdown file name:
// one leading and one trailing slash are dropped, a trailing ".html" is
// removed, and ".md" is appended. An empty path or "index" yields "index.md".
function getCurrentMarkdownPath() {
    const trimmed = window.location.pathname
        .replace(/^\/|\/$/g, '')
        .replace(/\.html$/, '');

    const isIndex = !trimmed || trimmed === 'index';
    return isIndex ? 'index.md' : `${trimmed}.md`;
}
|
||||
|
||||
// Fetch the current page's Markdown from the GitHub raw URL and store it,
// together with its path, in the module-level cache.
// Resolves to the Markdown text; throws when the path is empty or the
// response status is not OK.
async function loadMarkdownContent() {
    const pagePath = getCurrentMarkdownPath();
    if (!pagePath) {
        throw new Error('Invalid markdown path');
    }

    const res = await fetch(getGithubRawUrl());
    if (!res.ok) {
        throw new Error(`Failed to fetch markdown: ${res.status}`);
    }

    const text = await res.text();
    // Remember which path the content belongs to so the cache can be validated.
    cachedMarkdown = text;
    cachedMarkdownPath = pagePath;
    return text;
}
|
||||
|
||||
// Make sure the cache holds the current page's Markdown, fetching it if
// necessary. Resolves to true when content is available and false otherwise;
// fetch failures are logged as warnings instead of being thrown.
async function ensureMarkdownCached() {
    const pagePath = getCurrentMarkdownPath();
    if (!pagePath) {
        return false;
    }

    const cacheHit = Boolean(cachedMarkdown) && cachedMarkdownPath === pagePath;
    if (cacheHit) {
        return true;
    }

    try {
        await loadMarkdownContent();
    } catch (err) {
        console.warn('Page Actions: Markdown not available for this page.', err);
        // Reset both halves of the cache so a stale entry is never served.
        cachedMarkdown = null;
        cachedMarkdownPath = null;
        return false;
    }
    return true;
}
|
||||
|
||||
// Resolve to the current page's Markdown, loading it on demand.
// Throws when the Markdown cannot be made available.
async function getMarkdownContent() {
    if (await ensureMarkdownCached()) {
        return cachedMarkdown;
    }
    throw new Error('Markdown not available for this page.');
}
|
||||
|
||||
// Build the raw.githubusercontent.com URL of the current page's Markdown
// file from the configured repository, branch and docs path.
function getGithubRawUrl() {
    const { githubRepo, githubBranch, docsPath } = config;
    const pagePath = getCurrentMarkdownPath();
    return `https://raw.githubusercontent.com/${githubRepo}/${githubBranch}/${docsPath}/${pagePath}`;
}
|
||||
|
||||
// Build the github.com "blob" URL of the current page's Markdown file
// (the human-viewable page, as opposed to the raw file).
function getGithubFileUrl() {
    const { githubRepo, githubBranch, docsPath } = config;
    const pagePath = getCurrentMarkdownPath();
    return `https://github.com/${githubRepo}/blob/${githubBranch}/${docsPath}/${pagePath}`;
}
|
||||
|
||||
// Create the UI
// Builds the "Page Copy" button, its dropdown menu and the overlay, appends
// the button and dropdown (inside a wrapper) to the main content container
// and the overlay to <body>.
// Returns { button, dropdown, overlay, wrapper }, or null (after a console
// warning) when the main content container is missing.
function createPageActionsUI() {
    // Find the main content area
    const mainContent = document.getElementById('terminal-mkdocs-main-content');
    if (!mainContent) {
        console.warn('Page Actions: Could not find #terminal-mkdocs-main-content');
        return null;
    }

    // Create button
    const button = document.createElement('button');
    // Explicit type: a bare <button> defaults to "submit" if it ever ends up in a form.
    button.type = 'button';
    button.className = 'page-actions-button';
    button.setAttribute('aria-label', 'Page copy');
    // Announce that this button opens a menu.
    button.setAttribute('aria-haspopup', 'menu');
    button.setAttribute('aria-expanded', 'false');
    button.innerHTML = '<span>Page Copy</span>';

    // Create overlay for mobile
    const overlay = document.createElement('div');
    overlay.className = 'page-actions-overlay';

    // Create dropdown.
    // The <ul>/<li> wrappers carry role="none" so that their list semantics do
    // not sit between the role="menu" container and its role="menuitem" links;
    // the divider is exposed as a separator.
    const dropdown = document.createElement('div');
    dropdown.className = 'page-actions-dropdown';
    dropdown.setAttribute('role', 'menu');
    dropdown.innerHTML = `
        <div class="page-actions-header">Page Copy</div>
        <ul class="page-actions-menu" role="none">
            <li class="page-action-item" role="none">
                <a href="#" class="page-action-link" id="action-copy-markdown" role="menuitem">
                    <span class="page-action-icon icon-copy"></span>
                    <span class="page-action-text">
                        <span class="page-action-label">Copy as Markdown</span>
                        <span class="page-action-description">Copy page for LLMs</span>
                    </span>
                </a>
            </li>
            <li class="page-action-item" role="none">
                <a href="#" class="page-action-link page-action-external" id="action-view-markdown" target="_blank" role="menuitem">
                    <span class="page-action-icon icon-view"></span>
                    <span class="page-action-text">
                        <span class="page-action-label">View as Markdown</span>
                        <span class="page-action-description">Open raw source</span>
                    </span>
                </a>
            </li>
            <div class="page-actions-divider" role="separator"></div>
            <li class="page-action-item" role="none">
                <a href="#" class="page-action-link page-action-external" id="action-open-chatgpt" role="menuitem">
                    <span class="page-action-icon icon-ai"></span>
                    <span class="page-action-text">
                        <span class="page-action-label">Open in ChatGPT</span>
                        <span class="page-action-description">Ask questions about this page</span>
                    </span>
                </a>
            </li>
        </ul>
        <div class="page-actions-footer">ESC to close</div>
    `;

    // Create a wrapper for button and dropdown
    const wrapper = document.createElement('div');
    wrapper.className = 'page-actions-wrapper';
    wrapper.appendChild(button);
    wrapper.appendChild(dropdown);

    // Inject into main content area
    mainContent.appendChild(wrapper);

    // Append overlay to body
    document.body.appendChild(overlay);

    return { button, dropdown, overlay, wrapper };
}
|
||||
|
||||
// Toggle dropdown: close it when it carries the "active" class, open it otherwise.
function toggleDropdown(button, dropdown, overlay) {
    const toggle = dropdown.classList.contains('active') ? closeDropdown : openDropdown;
    toggle(button, dropdown, overlay);
}
|
||||
|
||||
// Show the dropdown and mark the button as expanded.
// `overlay` is part of the signature but is intentionally not activated.
function openDropdown(button, dropdown, overlay) {
    button.classList.add('active');
    button.setAttribute('aria-expanded', 'true');
    dropdown.classList.add('active');
}
|
||||
|
||||
// Hide the dropdown and mark the button as collapsed.
// `overlay` is part of the signature but is intentionally not deactivated.
function closeDropdown(button, dropdown, overlay) {
    button.classList.remove('active');
    button.setAttribute('aria-expanded', 'false');
    dropdown.classList.remove('active');
}
|
||||
|
||||
// Show a transient notification with the given text and remove it again
// after `duration` milliseconds (2000 by default).
function showNotification(message, duration = 2000) {
    const toast = Object.assign(document.createElement('div'), {
        className: 'page-actions-notification',
        textContent: message,
    });
    document.body.appendChild(toast);

    setTimeout(() => toast.remove(), duration);
}
|
||||
|
||||
// Copy the current page's Markdown to the clipboard.
// `link` is the menu link that triggered the action: it shows a loading
// state while the copy is in progress and a "copied" state for two seconds
// afterwards. Failures are logged and reported through a notification.
async function copyMarkdownToClipboard(link) {
    const state = link.classList;
    state.add('loading');

    try {
        const text = await getMarkdownContent();
        await navigator.clipboard.writeText(text);

        // Visual feedback
        state.remove('loading');
        state.add('page-action-copied');
        showNotification('Markdown copied to clipboard!');

        // Reset after delay
        setTimeout(() => state.remove('page-action-copied'), 2000);
    } catch (err) {
        console.error('Error copying markdown:', err);
        state.remove('loading');
        showNotification('Error: Could not copy markdown');
    }
}
|
||||
|
||||
// Open the current page's Markdown file on GitHub in a new tab, without
// giving the new tab an opener or referrer.
function viewMarkdown() {
    window.open(getGithubFileUrl(), '_blank', 'noopener,noreferrer');
}
|
||||
|
||||
// Return the current page URL with everything from the first "#" removed.
function getCurrentPageUrl() {
    const fullUrl = window.location.href;
    const hashAt = fullUrl.indexOf('#');
    return hashAt === -1 ? fullUrl : fullUrl.slice(0, hashAt);
}
|
||||
|
||||
// Open ChatGPT in a new tab with a URL-encoded prompt asking it to read the
// current page.
function openChatGPT() {
    const question = `Read ${getCurrentPageUrl()} so I can ask questions about it.`;
    const target = `https://chatgpt.com/?hint=search&prompt=${encodeURIComponent(question)}`;
    window.open(target, '_blank', 'noopener,noreferrer');
}
|
||||
|
||||
// Initialise the feature: confirm the Markdown source can be fetched, build
// the UI, then attach mouse, keyboard and resize handling.
(async () => {
    // Without a Markdown source the button would be useless, so do not render it.
    const markdownAvailable = await ensureMarkdownCached();
    if (!markdownAvailable) {
        return;
    }

    const ui = createPageActionsUI();
    if (!ui) {
        return;
    }

    const { button, dropdown, overlay } = ui;

    const close = () => closeDropdown(button, dropdown, overlay);
    const isOpen = () => dropdown.classList.contains('active');
    const enabledLinks = () =>
        Array.from(dropdown.querySelectorAll('.page-action-link:not(.disabled)'));

    // Event listeners
    button.addEventListener('click', (e) => {
        e.stopPropagation();
        toggleDropdown(button, dropdown, overlay);
    });

    overlay.addEventListener('click', () => close());

    // Copy markdown action (keeps the menu open so the "copied" state is visible)
    dropdown.querySelector('#action-copy-markdown').addEventListener('click', async (e) => {
        e.preventDefault();
        e.stopPropagation();
        await copyMarkdownToClipboard(e.currentTarget);
    });

    // Actions that open a new tab and then close the menu. Each handler closes
    // the menu itself, so no additional delayed close is needed.
    const externalActions = {
        'action-view-markdown': viewMarkdown,
        'action-open-chatgpt': openChatGPT,
    };
    for (const [id, run] of Object.entries(externalActions)) {
        dropdown.querySelector(`#${id}`).addEventListener('click', (e) => {
            e.preventDefault();
            e.stopPropagation();
            run();
            close();
        });
    }

    // Close on ESC key
    document.addEventListener('keydown', (e) => {
        if (e.key !== 'Escape' || !isOpen()) {
            return;
        }
        // If focus was inside the menu it would be stranded on a hidden
        // element after closing, so hand it back to the button.
        const focusWasInMenu = dropdown.contains(document.activeElement);
        close();
        if (focusWasInMenu) {
            button.focus();
        }
    });

    // Close when clicking outside
    document.addEventListener('click', (e) => {
        if (!dropdown.contains(e.target) && !button.contains(e.target)) {
            close();
        }
    });

    // Prevent dropdown from closing when clicking inside
    dropdown.addEventListener('click', (e) => {
        // Only stop propagation if not clicking on a link
        if (!e.target.closest('.page-action-link')) {
            e.stopPropagation();
        }
    });

    // Handle window resize (debounced)
    let resizeTimer;
    window.addEventListener('resize', () => {
        clearTimeout(resizeTimer);
        resizeTimer = setTimeout(() => {
            // Close dropdown on resize to prevent positioning issues
            if (isOpen()) {
                close();
            }
        }, 250);
    });

    // Accessibility: Focus management
    button.addEventListener('keydown', (e) => {
        if (e.key !== 'Enter' && e.key !== ' ') {
            return;
        }
        e.preventDefault();
        toggleDropdown(button, dropdown, overlay);

        // Focus first menu item when opening
        if (isOpen()) {
            const [firstLink] = enabledLinks();
            if (firstLink) {
                setTimeout(() => firstLink.focus(), 100);
            }
        }
    });

    // Arrow key navigation within menu
    dropdown.addEventListener('keydown', (e) => {
        if (!isOpen()) return;

        const links = enabledLinks();
        // Nothing to move to; also avoids a modulo-by-zero index below.
        if (links.length === 0) return;

        const lastIndex = links.length - 1;
        const currentIndex = links.indexOf(document.activeElement);

        let targetIndex;
        switch (e.key) {
            case 'ArrowDown':
                targetIndex = (currentIndex + 1) % links.length;
                break;
            case 'ArrowUp':
                // Wrap to the last item from the first one, and also when no
                // menu link currently has focus (currentIndex === -1).
                targetIndex = currentIndex <= 0 ? lastIndex : currentIndex - 1;
                break;
            case 'Home':
                targetIndex = 0;
                break;
            case 'End':
                targetIndex = lastIndex;
                break;
            default:
                return;
        }

        e.preventDefault();
        links[targetIndex].focus();
    });

    console.log('Page Actions initialized for:', getCurrentMarkdownPath());
})();
|
||||
});
|
||||
@@ -20,17 +20,43 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
||||
|
||||
## Latest Release
|
||||
|
||||
### [Crawl4AI v0.7.6 – The Webhook Infrastructure Update](../blog/release-v0.7.6.md)
|
||||
*October 22, 2025*
|
||||
|
||||
Crawl4AI v0.7.6 introduces comprehensive webhook support for the Docker job queue API, bringing real-time notifications to both crawling and LLM extraction workflows. No more polling!
|
||||
|
||||
Key highlights:
|
||||
- **🪝 Complete Webhook Support**: Real-time notifications for both `/crawl/job` and `/llm/job` endpoints
|
||||
- **🔄 Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||
- **🔐 Custom Authentication**: Add custom headers for webhook authentication
|
||||
- **📊 Flexible Delivery**: Choose notification-only or include full data in payload
|
||||
- **⚙️ Global Configuration**: Set default webhook URL in config.yml for all jobs
|
||||
- **🎯 Zero Breaking Changes**: Fully backward compatible, webhooks are opt-in
|
||||
|
||||
[Read full release notes →](../blog/release-v0.7.6.md)
|
||||
|
||||
## Recent Releases
|
||||
|
||||
### [Crawl4AI v0.7.5 – The Docker Hooks & Security Update](../blog/release-v0.7.5.md)
|
||||
*September 29, 2025*
|
||||
|
||||
Crawl4AI v0.7.5 introduces the powerful Docker Hooks System for complete pipeline customization, enhanced LLM integration with custom providers, HTTPS preservation for modern web security, and resolves multiple community-reported issues.
|
||||
|
||||
Key highlights:
|
||||
- **🔧 Docker Hooks System**: Custom Python functions at 8 key pipeline points for unprecedented customization
|
||||
- **🤖 Enhanced LLM Integration**: Custom providers with temperature control and base_url configuration
|
||||
- **🔒 HTTPS Preservation**: Secure internal link handling for modern web applications
|
||||
- **🐍 Python 3.10+ Support**: Modern language features and enhanced performance
|
||||
- **🛠️ Bug Fixes**: Resolved multiple community-reported issues including URL processing, JWT authentication, and proxy configuration
|
||||
|
||||
[Read full release notes →](../blog/release-v0.7.5.md)
|
||||
|
||||
## Recent Releases
|
||||
|
||||
### [Crawl4AI v0.7.4 – The Intelligent Table Extraction & Performance Update](../blog/release-v0.7.4.md)
|
||||
*August 17, 2025*
|
||||
|
||||
Crawl4AI v0.7.4 introduces revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes that make Crawl4AI more robust for production workloads.
|
||||
|
||||
Key highlights:
|
||||
- **🚀 LLMTableExtraction**: Revolutionary table extraction with intelligent chunking for massive tables
|
||||
- **⚡ Dispatcher Bug Fix**: Fixed sequential processing issue in arun_many for fast-completing tasks
|
||||
- **🧹 Memory Management Refactor**: Streamlined memory utilities and better resource management
|
||||
- **🔧 Browser Manager Fixes**: Resolved race conditions in concurrent page creation
|
||||
- **🔗 Advanced URL Processing**: Better handling of raw URLs and base tag link resolution
|
||||
Revolutionary LLM-powered table extraction with intelligent chunking, performance improvements for concurrent crawling, enhanced browser management, and critical stability fixes.
|
||||
|
||||
[Read full release notes →](../blog/release-v0.7.4.md)
|
||||
|
||||
|
||||
314
docs/md_v2/blog/releases/0.7.6.md
Normal file
314
docs/md_v2/blog/releases/0.7.6.md
Normal file
@@ -0,0 +1,314 @@
|
||||
# Crawl4AI v0.7.6 Release Notes
|
||||
|
||||
*Release Date: October 22, 2025*
|
||||
|
||||
I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### Webhook Support for Docker Job Queue API
|
||||
|
||||
The headline feature of v0.7.6 is comprehensive webhook support for asynchronous job processing. No more constant polling to check if your jobs are done - get instant notifications when they complete!
|
||||
|
||||
**Key Capabilities:**
|
||||
|
||||
- ✅ **Universal Webhook Support**: Both `/crawl/job` and `/llm/job` endpoints now support webhooks
|
||||
- ✅ **Flexible Delivery Modes**: Choose notification-only or include full data in the webhook payload
|
||||
- ✅ **Reliable Delivery**: Exponential backoff retry mechanism (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||
- ✅ **Custom Authentication**: Add custom headers for webhook authentication
|
||||
- ✅ **Global Configuration**: Set default webhook URL in `config.yml` for all jobs
|
||||
- ✅ **Task Type Identification**: Distinguish between `crawl` and `llm_extraction` tasks
|
||||
|
||||
### How It Works
|
||||
|
||||
Instead of constantly checking job status:
|
||||
|
||||
**OLD WAY (Polling):**
|
||||
```python
|
||||
# Submit job
|
||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||
task_id = response.json()['task_id']
|
||||
|
||||
# Poll until complete
|
||||
while True:
|
||||
status = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
||||
if status.json()['status'] == 'completed':
|
||||
break
|
||||
time.sleep(5) # Wait and try again
|
||||
```
|
||||
|
||||
**NEW WAY (Webhooks):**
|
||||
```python
|
||||
# Submit job with webhook
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_data_in_payload": True
|
||||
}
|
||||
}
|
||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||
|
||||
# Done! Webhook will notify you when complete
|
||||
# Your webhook handler receives the results automatically
|
||||
```
|
||||
|
||||
### Crawl Job Webhooks
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/crawl/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": true},
|
||||
"crawler_config": {"cache_mode": "bypass"},
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||
"webhook_data_in_payload": false,
|
||||
"webhook_headers": {
|
||||
"X-Webhook-Secret": "your-secret-token"
|
||||
}
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### LLM Extraction Job Webhooks (NEW!)
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:11235/llm/job \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"url": "https://example.com/article",
|
||||
"q": "Extract the article title, author, and publication date",
|
||||
"schema": "{\"type\":\"object\",\"properties\":{\"title\":{\"type\":\"string\"}}}",
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||
"webhook_data_in_payload": true
|
||||
}
|
||||
}'
|
||||
```
|
||||
|
||||
### Webhook Payload Structure
|
||||
|
||||
**Success (with data):**
|
||||
```json
|
||||
{
|
||||
"task_id": "llm_1698765432",
|
||||
"task_type": "llm_extraction",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com/article"],
|
||||
"data": {
|
||||
"extracted_content": {
|
||||
"title": "Understanding Web Scraping",
|
||||
"author": "John Doe",
|
||||
"date": "2025-10-22"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Failure:**
|
||||
```json
|
||||
{
|
||||
"task_id": "crawl_abc123",
|
||||
"task_type": "crawl",
|
||||
"status": "failed",
|
||||
"timestamp": "2025-10-22T10:30:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"error": "Connection timeout after 30s"
|
||||
}
|
||||
```
|
||||
|
||||
### Simple Webhook Handler Example
|
||||
|
||||
```python
|
||||
import requests
from flask import Flask, request, jsonify

app = Flask(__name__)


@app.route('/webhook', methods=['POST'])
def handle_webhook():
    """Handle a Crawl4AI job webhook.

    Reads ``task_id``, ``task_type`` and ``status`` from the JSON payload.
    For a completed job the result is taken from the payload when it carries
    ``data``; otherwise it is fetched from the matching job endpoint.
    Always acknowledges the webhook with HTTP 200.
    """
    payload = request.json

    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']

    if status == 'completed':
        if 'data' in payload:
            # Process data directly
            data = payload['data']
        else:
            # Fetch from API
            endpoint = 'crawl' if task_type == 'crawl' else 'llm'
            response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
            data = response.json()

        # Your business logic here
        print(f"Job {task_id} completed!")

    elif status == 'failed':
        error = payload.get('error', 'Unknown error')
        print(f"Job {task_id} failed: {error}")

    return jsonify({"status": "received"}), 200


app.run(port=8080)
|
||||
```
|
||||
|
||||
## 📊 Performance Improvements
|
||||
|
||||
- **Reduced Server Load**: Eliminates constant polling requests
|
||||
- **Lower Latency**: Instant notification vs. polling interval delay
|
||||
- **Better Resource Usage**: Frees up client connections while jobs run in background
|
||||
- **Scalable Architecture**: Handles high-volume crawling workflows efficiently
|
||||
|
||||
## 🐛 Bug Fixes
|
||||
|
||||
- Fixed webhook configuration serialization for Pydantic HttpUrl fields
|
||||
- Improved error handling in webhook delivery service
|
||||
- Enhanced Redis task storage for webhook config persistence
|
||||
|
||||
## 🌍 Expected Real-World Impact
|
||||
|
||||
### For Web Scraping Workflows
|
||||
- **Reduced Costs**: Fewer API calls = lower bandwidth and server costs
|
||||
- **Better UX**: Instant notifications improve user experience
|
||||
- **Scalability**: Handle 100s of concurrent jobs without polling overhead
|
||||
|
||||
### For LLM Extraction Pipelines
|
||||
- **Async Processing**: Submit LLM extraction jobs and move on
|
||||
- **Batch Processing**: Queue multiple extractions, get notified as they complete
|
||||
- **Integration**: Easy integration with workflow automation tools (Zapier, n8n, etc.)
|
||||
|
||||
### For Microservices
|
||||
- **Event-Driven**: Perfect for event-driven microservice architectures
|
||||
- **Decoupling**: Decouple job submission from result processing
|
||||
- **Reliability**: Automatic retries ensure webhooks are delivered
|
||||
|
||||
## 🔄 Breaking Changes
|
||||
|
||||
**None!** This release is fully backward compatible.
|
||||
|
||||
- Webhook configuration is optional
|
||||
- Existing code continues to work without modification
|
||||
- Polling is still supported for jobs without webhook config
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
### New Documentation
|
||||
- **[WEBHOOK_EXAMPLES.md](../deploy/docker/WEBHOOK_EXAMPLES.md)** - Comprehensive webhook usage guide
|
||||
- **[docker_webhook_example.py](../docs/examples/docker_webhook_example.py)** - Working code examples
|
||||
|
||||
### Updated Documentation
|
||||
- **[Docker README](../deploy/docker/README.md)** - Added webhook sections
|
||||
- API documentation with webhook examples
|
||||
|
||||
## 🛠️ Migration Guide
|
||||
|
||||
No migration needed! Webhooks are opt-in:
|
||||
|
||||
1. **To use webhooks**: Add `webhook_config` to your job payload
|
||||
2. **To keep polling**: Continue using your existing code
|
||||
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
# Just add webhook_config to your existing payload
|
||||
payload = {
|
||||
# Your existing configuration
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {...},
|
||||
"crawler_config": {...},
|
||||
|
||||
# NEW: Add webhook configuration
|
||||
"webhook_config": {
|
||||
"webhook_url": "https://myapp.com/webhook",
|
||||
"webhook_data_in_payload": True
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## 🔧 Configuration
|
||||
|
||||
### Global Webhook Configuration (config.yml)
|
||||
|
||||
```yaml
|
||||
webhooks:
|
||||
enabled: true
|
||||
default_url: "https://myapp.com/webhooks/default" # Optional
|
||||
data_in_payload: false
|
||||
retry:
|
||||
max_attempts: 5
|
||||
initial_delay_ms: 1000
|
||||
max_delay_ms: 32000
|
||||
timeout_ms: 30000
|
||||
headers:
|
||||
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||
```
|
||||
|
||||
## 🚀 Upgrade Instructions
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
# Pull the latest image
|
||||
docker pull unclecode/crawl4ai:0.7.6
|
||||
|
||||
# Or use latest tag
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Run with webhook support
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--env-file .llm.env \
|
||||
--name crawl4ai \
|
||||
unclecode/crawl4ai:0.7.6
|
||||
```
|
||||
|
||||
### Python Package
|
||||
|
||||
```bash
|
||||
pip install --upgrade crawl4ai
|
||||
```
|
||||
|
||||
## 💡 Pro Tips
|
||||
|
||||
1. **Use notification-only mode** for large results - fetch data separately to avoid large webhook payloads
|
||||
2. **Set custom headers** for webhook authentication and request tracking
|
||||
3. **Configure global default webhook** for consistent handling across all jobs
|
||||
4. **Implement idempotent webhook handlers** - the same webhook may be delivered multiple times on retry
|
||||
5. **Use structured schemas** with LLM extraction for predictable webhook data
|
||||
|
||||
## 🎬 Demo
|
||||
|
||||
Try the release demo:
|
||||
|
||||
```bash
|
||||
python docs/releases_review/demo_v0.7.6.py
|
||||
```
|
||||
|
||||
This comprehensive demo showcases:
|
||||
- Crawl job webhooks (notification-only and with data)
|
||||
- LLM extraction webhooks (with JSON schema support)
|
||||
- Custom headers for authentication
|
||||
- Webhook retry mechanism
|
||||
- Real-time webhook receiver
|
||||
|
||||
## 🙏 Acknowledgments
|
||||
|
||||
Thank you to the community for the feedback that shaped this feature! Special thanks to everyone who requested webhook support for asynchronous job processing.
|
||||
|
||||
## 📞 Support
|
||||
|
||||
- **Documentation**: https://docs.crawl4ai.com
|
||||
- **GitHub Issues**: https://github.com/unclecode/crawl4ai/issues
|
||||
- **Discord**: https://discord.gg/crawl4ai
|
||||
|
||||
---
|
||||
|
||||
**Happy crawling with webhooks!** 🕷️🪝
|
||||
|
||||
*- unclecode*
|
||||
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
318
docs/md_v2/blog/releases/v0.7.5.md
Normal file
@@ -0,0 +1,318 @@
|
||||
# 🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update
|
||||
|
||||
*September 29, 2025 • 8 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.5—focused on extensibility and security. This update introduces the Docker Hooks System for pipeline customization, enhanced LLM integration, and important security improvements.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Docker Hooks System**: Custom Python functions at key pipeline points with function-based API
|
||||
- **Function-Based Hooks**: New `hooks_to_string()` utility with Docker client auto-conversion
|
||||
- **Enhanced LLM Integration**: Custom providers with temperature control
|
||||
- **HTTPS Preservation**: Secure internal link handling
|
||||
- **Bug Fixes**: Resolved multiple community-reported issues
|
||||
- **Improved Docker Error Handling**: Better debugging and reliability
|
||||
|
||||
## 🔧 Docker Hooks System: Pipeline Customization
|
||||
|
||||
Every scraping project needs custom logic—authentication, performance optimization, content processing. Traditional solutions require forking or complex workarounds. Docker Hooks let you inject custom Python functions at 8 key points in the crawling pipeline.
|
||||
|
||||
### Real Example: Authentication & Performance
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Real working hooks for httpbin.org
|
||||
hooks_config = {
|
||||
"on_page_context_created": """
|
||||
async def hook(page, context, **kwargs):
|
||||
print("Hook: Setting up page context")
|
||||
# Block images to speed up crawling
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
print("Hook: Images blocked")
|
||||
return page
|
||||
""",
|
||||
|
||||
"before_retrieve_html": """
|
||||
async def hook(page, context, **kwargs):
|
||||
print("Hook: Before retrieving HTML")
|
||||
# Scroll to bottom to load lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
print("Hook: Scrolled to bottom")
|
||||
return page
|
||||
""",
|
||||
|
||||
"before_goto": """
|
||||
async def hook(page, context, url, **kwargs):
|
||||
print(f"Hook: About to navigate to {url}")
|
||||
# Add custom headers
|
||||
await page.set_extra_http_headers({
|
||||
'X-Test-Header': 'crawl4ai-hooks-test'
|
||||
})
|
||||
return page
|
||||
"""
|
||||
}
|
||||
|
||||
# Test with Docker API
|
||||
payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"hooks": {
|
||||
"code": hooks_config,
|
||||
"timeout": 30
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
result = response.json()
|
||||
|
||||
if result.get('success'):
|
||||
print("✅ Hooks executed successfully!")
|
||||
print(f"Content length: {len(result.get('markdown', ''))} characters")
|
||||
```
|
||||
|
||||
**Available Hook Points:**
|
||||
- `on_browser_created`: Browser setup
|
||||
- `on_page_context_created`: Page context configuration
|
||||
- `before_goto`: Pre-navigation setup
|
||||
- `after_goto`: Post-navigation processing
|
||||
- `on_user_agent_updated`: User agent changes
|
||||
- `on_execution_started`: Crawl initialization
|
||||
- `before_retrieve_html`: Pre-extraction processing
|
||||
- `before_return_html`: Final HTML processing
|
||||
|
||||
### Function-Based Hooks API
|
||||
|
||||
Writing hooks as strings works, but lacks IDE support and type checking. v0.7.5 introduces a function-based approach with automatic conversion!
|
||||
|
||||
**Option 1: Using the `hooks_to_string()` Utility**
|
||||
|
||||
```python
|
||||
from crawl4ai import hooks_to_string
|
||||
import requests
|
||||
|
||||
# Define hooks as regular Python functions (with full IDE support!)
|
||||
async def on_page_context_created(page, context, **kwargs):
|
||||
"""Block images to speed up crawling"""
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
await page.set_viewport_size({"width": 1920, "height": 1080})
|
||||
return page
|
||||
|
||||
async def before_goto(page, context, url, **kwargs):
|
||||
"""Add custom headers"""
|
||||
await page.set_extra_http_headers({
|
||||
'X-Crawl4AI': 'v0.7.5',
|
||||
'X-Custom-Header': 'my-value'
|
||||
})
|
||||
return page
|
||||
|
||||
# Convert functions to strings
|
||||
hooks_code = hooks_to_string({
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_goto": before_goto
|
||||
})
|
||||
|
||||
# Use with REST API
|
||||
payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"hooks": {"code": hooks_code, "timeout": 30}
|
||||
}
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
```
|
||||
|
||||
**Option 2: Docker Client with Automatic Conversion (Recommended!)**
|
||||
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
# Define hooks as functions (same as above)
|
||||
async def on_page_context_created(page, context, **kwargs):
|
||||
await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
|
||||
return page
|
||||
|
||||
async def before_retrieve_html(page, context, **kwargs):
|
||||
# Scroll to load lazy content
|
||||
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await page.wait_for_timeout(1000)
|
||||
return page
|
||||
|
||||
# Use Docker client - conversion happens automatically!
|
||||
client = Crawl4aiDockerClient(base_url="http://localhost:11235")
|
||||
|
||||
results = await client.crawl(
|
||||
urls=["https://httpbin.org/html"],
|
||||
hooks={
|
||||
"on_page_context_created": on_page_context_created,
|
||||
"before_retrieve_html": before_retrieve_html
|
||||
},
|
||||
hooks_timeout=30
|
||||
)
|
||||
|
||||
if results and results.success:
|
||||
print(f"✅ Hooks executed! HTML length: {len(results.html)}")
|
||||
```
|
||||
|
||||
**Benefits of Function-Based Hooks:**
|
||||
- ✅ Full IDE support (autocomplete, syntax highlighting)
|
||||
- ✅ Type checking and linting
|
||||
- ✅ Easier to test and debug
|
||||
- ✅ Reusable across projects
|
||||
- ✅ Automatic conversion in Docker client
|
||||
- ✅ No breaking changes - string hooks still work!
|
||||
|
||||
## 🤖 Enhanced LLM Integration
|
||||
|
||||
v0.7.5 enhances LLM integration with custom providers, temperature control, and base URL configuration.
|
||||
|
||||
### Multi-Provider Support
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
# Test with different providers
|
||||
async def test_llm_providers():
|
||||
# Gemini with custom temperature
|
||||
openai_strategy = LLMExtractionStrategy(
|
||||
provider="gemini/gemini-2.5-flash-lite",
|
||||
api_token="your-api-token",
|
||||
temperature=0.7, # New in v0.7.5
|
||||
instruction="Summarize this page in one sentence"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://example.com",
|
||||
config=CrawlerRunConfig(extraction_strategy=openai_strategy)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("✅ LLM extraction completed")
|
||||
print(result.extracted_content)
|
||||
|
||||
# Docker API with enhanced LLM config
|
||||
llm_payload = {
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"q": "Summarize this page in one sentence.",
|
||||
"provider": "gemini/gemini-2.5-flash-lite",
|
||||
"temperature": 0.7
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/md", json=llm_payload)
|
||||
```
|
||||
|
||||
**New Features:**
|
||||
- Custom `temperature` parameter for creativity control
|
||||
- `base_url` for custom API endpoints
|
||||
- Multi-provider environment variable support
|
||||
- Docker API integration
|
||||
|
||||
## 🔒 HTTPS Preservation
|
||||
|
||||
**The Problem:** Modern web apps require HTTPS everywhere. When crawlers downgrade internal links from HTTPS to HTTP, authentication breaks and security warnings appear.
|
||||
|
||||
**Solution:** HTTPS preservation maintains secure protocols throughout crawling.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy
|
||||
|
||||
async def test_https_preservation():
|
||||
# Enable HTTPS preservation
|
||||
url_filter = URLPatternFilter(
|
||||
patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
exclude_external_links=True,
|
||||
preserve_https_for_internal_links=True, # New in v0.7.5
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
max_pages=5,
|
||||
filter_chain=FilterChain([url_filter])
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(
|
||||
url="https://quotes.toscrape.com",
|
||||
config=config
|
||||
):
|
||||
# All internal links maintain HTTPS
|
||||
internal_links = [link['href'] for link in result.links['internal']]
|
||||
https_links = [link for link in internal_links if link.startswith('https://')]
|
||||
|
||||
print(f"HTTPS links preserved: {len(https_links)}/{len(internal_links)}")
|
||||
for link in https_links[:3]:
|
||||
print(f" → {link}")
|
||||
```
|
||||
|
||||
## 🛠️ Bug Fixes and Improvements
|
||||
|
||||
### Major Fixes
|
||||
- **URL Processing**: Fixed '+' sign preservation in query parameters (#1332)
|
||||
- **Proxy Configuration**: Enhanced proxy string parsing (old `proxy` parameter deprecated)
|
||||
- **Docker Error Handling**: Comprehensive error messages with status codes
|
||||
- **Memory Management**: Fixed leaks in long-running sessions
|
||||
- **JWT Authentication**: Fixed Docker JWT validation issues (#1442)
|
||||
- **Playwright Stealth**: Fixed stealth features for Playwright integration (#1481)
|
||||
- **API Configuration**: Fixed config handling to prevent overriding user-provided settings (#1505)
|
||||
- **Docker Filter Serialization**: Resolved JSON encoding errors in deep crawl strategy (#1419)
|
||||
- **LLM Provider Support**: Fixed custom LLM provider integration for adaptive crawler (#1291)
|
||||
- **Performance Issues**: Resolved backoff strategy failures and timeout handling (#989)
|
||||
|
||||
### Community-Reported Issues Fixed
|
||||
This release addresses multiple issues reported by the community through GitHub issues and Discord discussions:
|
||||
- Fixed browser configuration reference errors
|
||||
- Resolved dependency conflicts with cssselect
|
||||
- Improved error messaging for failed authentications
|
||||
- Enhanced compatibility with various proxy configurations
|
||||
- Fixed edge cases in URL normalization
|
||||
|
||||
### Configuration Updates
|
||||
```python
|
||||
# Old proxy config (deprecated)
|
||||
# browser_config = BrowserConfig(proxy="http://proxy:8080")
|
||||
|
||||
# New enhanced proxy config
|
||||
browser_config = BrowserConfig(
|
||||
proxy_config={
|
||||
"server": "http://proxy:8080",
|
||||
"username": "optional-user",
|
||||
"password": "optional-pass"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## 🔄 Breaking Changes
|
||||
|
||||
1. **Python 3.10+ Required**: Python 3.9 is no longer supported; upgrade to Python 3.10 or newer
|
||||
2. **Proxy Parameter Deprecated**: Use new `proxy_config` structure
|
||||
3. **New Dependency**: Added `cssselect` for better CSS handling
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
# Install latest version
|
||||
pip install crawl4ai==0.7.5
|
||||
|
||||
# Docker deployment
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
**Try the Demo:**
|
||||
```bash
|
||||
# Run working examples
|
||||
python docs/releases_review/demo_v0.7.5.py
|
||||
```
|
||||
|
||||
**Resources:**
|
||||
- 📖 Documentation: [docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||
- 🐙 GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- 💬 Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- 🐦 Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
1371
docs/md_v2/branding/index.md
Normal file
1371
docs/md_v2/branding/index.md
Normal file
File diff suppressed because it is too large
Load Diff
5196
docs/md_v2/complete-sdk-reference.md
Normal file
5196
docs/md_v2/complete-sdk-reference.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -108,7 +108,19 @@ config = AdaptiveConfig(
|
||||
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
||||
)
|
||||
|
||||
# With custom embedding provider (e.g., OpenAI)
|
||||
# With custom LLM provider for query expansion (recommended)
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_llm_config=LLMConfig(
|
||||
provider='openai/text-embedding-3-small',
|
||||
api_token='your-api-key',
|
||||
temperature=0.7
|
||||
)
|
||||
)
|
||||
|
||||
# Alternative: Dictionary format (backward compatible)
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_llm_config={
|
||||
|
||||
@@ -69,12 +69,12 @@ The tutorial includes a Flask-based web interface with:
|
||||
cd docs/examples/c4a_script/tutorial/
|
||||
|
||||
# Install dependencies
|
||||
pip install flask
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Launch the tutorial server
|
||||
python app.py
|
||||
python server.py
|
||||
|
||||
# Open http://localhost:5000 in your browser
|
||||
# Open http://localhost:8000 in your browser
|
||||
```
|
||||
|
||||
## Core Concepts
|
||||
@@ -111,8 +111,8 @@ CLICK `.submit-btn`
|
||||
# By attribute
|
||||
CLICK `button[type="submit"]`
|
||||
|
||||
# By text content
|
||||
CLICK `button:contains("Sign In")`
|
||||
# By accessible attributes
|
||||
CLICK `button[aria-label="Search"][title="Search"]`
|
||||
|
||||
# Complex selectors
|
||||
CLICK `.form-container input[name="email"]`
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -57,7 +57,28 @@
|
||||
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for large language models, AI agents, and data pipelines. Fully open source, flexible, and built for real-time performance, **Crawl4AI** empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
||||
> Enjoy using Crawl4AI? Consider **[becoming a sponsor](https://github.com/sponsors/unclecode)** to support ongoing development and community growth!
|
||||
|
||||
## 🆕 AI Assistant Skill Now Available!
|
||||
|
||||
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; margin: 20px 0; box-shadow: 0 4px 6px rgba(0,0,0,0.1);">
|
||||
<h3 style="color: white; margin: 0 0 10px 0;">🤖 Crawl4AI Skill for Claude & AI Assistants</h3>
|
||||
<p style="color: white; margin: 10px 0;">Supercharge your AI coding assistant with complete Crawl4AI knowledge! Download our comprehensive skill package that includes:</p>
|
||||
<ul style="color: white; margin: 10px 0;">
|
||||
<li>📚 Complete SDK reference (23K+ words)</li>
|
||||
<li>🚀 Ready-to-use extraction scripts</li>
|
||||
<li>⚡ Schema generation for efficient scraping</li>
|
||||
<li>🔧 Version 0.7.4 compatible</li>
|
||||
</ul>
|
||||
<div style="text-align: center; margin-top: 15px;">
|
||||
<a href="assets/crawl4ai-skill.zip" download style="background: white; color: #667eea; padding: 12px 30px; border-radius: 5px; text-decoration: none; font-weight: bold; display: inline-block; transition: transform 0.2s;">
|
||||
📦 Download Skill Package
|
||||
</a>
|
||||
</div>
|
||||
<p style="color: white; margin: 15px 0 0 0; font-size: 0.9em; text-align: center;">
|
||||
Works with Claude, Cursor, Windsurf, and other AI coding assistants. Import the .zip file into your AI assistant's skill/knowledge system.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 🎯 New: Adaptive Web Crawling
|
||||
|
||||
|
||||
66
docs/md_v2/marketplace/README.md
Normal file
66
docs/md_v2/marketplace/README.md
Normal file
@@ -0,0 +1,66 @@
|
||||
# Crawl4AI Marketplace
|
||||
|
||||
A terminal-themed marketplace for tools, integrations, and resources related to Crawl4AI.
|
||||
|
||||
## Setup
|
||||
|
||||
### Backend
|
||||
|
||||
1. Install dependencies:
|
||||
```bash
|
||||
cd backend
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
2. Generate dummy data:
|
||||
```bash
|
||||
python dummy_data.py
|
||||
```
|
||||
|
||||
3. Run the server:
|
||||
```bash
|
||||
python server.py
|
||||
```
|
||||
|
||||
The API will be available at http://localhost:8100
|
||||
|
||||
### Frontend
|
||||
|
||||
1. Open `frontend/index.html` in your browser
|
||||
2. Or serve via MkDocs as part of the documentation site
|
||||
|
||||
## Database Schema
|
||||
|
||||
The marketplace uses SQLite with automatic migration from `schema.yaml`. Tables include:
|
||||
- **apps**: Tools and integrations
|
||||
- **articles**: Reviews, tutorials, and news
|
||||
- **categories**: App categories
|
||||
- **sponsors**: Sponsored content
|
||||
|
||||
## API Endpoints
|
||||
|
||||
- `GET /api/apps` - List apps with filters
|
||||
- `GET /api/articles` - List articles
|
||||
- `GET /api/categories` - Get all categories
|
||||
- `GET /api/sponsors` - Get active sponsors
|
||||
- `GET /api/search?q=query` - Search across content
|
||||
- `GET /api/stats` - Marketplace statistics
|
||||
|
||||
## Features
|
||||
|
||||
- **Smart caching**: LocalStorage with TTL (1 hour)
|
||||
- **Terminal theme**: Consistent with Crawl4AI branding
|
||||
- **Responsive design**: Works on all devices
|
||||
- **Fast search**: Debounced with 300ms delay
|
||||
- **CORS protected**: Only crawl4ai.com and localhost
|
||||
|
||||
## Admin Panel
|
||||
|
||||
Coming soon - for now, edit the database directly or modify `dummy_data.py`
|
||||
|
||||
## Deployment
|
||||
|
||||
For production deployment on EC2:
|
||||
1. Update `API_BASE` in `marketplace.js` to production URL
|
||||
2. Run FastAPI with proper production settings (use gunicorn/uvicorn)
|
||||
3. Set up nginx proxy if needed
|
||||
759
docs/md_v2/marketplace/admin/admin.css
Normal file
759
docs/md_v2/marketplace/admin/admin.css
Normal file
@@ -0,0 +1,759 @@
|
||||
/* Admin Dashboard - C4AI Terminal Style */
|
||||
|
||||
/* Utility Classes */
|
||||
.hidden {
|
||||
display: none !important;
|
||||
}
|
||||
|
||||
/* Brand Colors */
|
||||
:root {
|
||||
--c4ai-cyan: #50ffff;
|
||||
--c4ai-green: #50ff50;
|
||||
--c4ai-yellow: #ffff50;
|
||||
--c4ai-pink: #ff50ff;
|
||||
--c4ai-blue: #5050ff;
|
||||
}
|
||||
|
||||
.admin-container {
|
||||
min-height: 100vh;
|
||||
background: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* Login Screen */
|
||||
.login-screen {
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
background: linear-gradient(135deg, #070708 0%, #1a1a2e 100%);
|
||||
}
|
||||
|
||||
.login-box {
|
||||
background: var(--bg-secondary);
|
||||
border: 2px solid var(--primary-cyan);
|
||||
padding: 3rem;
|
||||
width: 400px;
|
||||
box-shadow: 0 0 40px rgba(80, 255, 255, 0.2);
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.login-logo {
|
||||
height: 60px;
|
||||
margin-bottom: 2rem;
|
||||
filter: brightness(1.2);
|
||||
}
|
||||
|
||||
.login-box h1 {
|
||||
color: var(--primary-cyan);
|
||||
font-size: 1.5rem;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
#login-form input {
|
||||
width: 100%;
|
||||
padding: 0.75rem;
|
||||
background: var(--bg-dark);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-primary);
|
||||
font-family: inherit;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
#login-form input:focus {
|
||||
outline: none;
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
#login-form button {
|
||||
width: 100%;
|
||||
padding: 0.75rem;
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
border: none;
|
||||
color: var(--bg-dark);
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
#login-form button:hover {
|
||||
box-shadow: 0 4px 15px rgba(80, 255, 255, 0.3);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.error-msg {
|
||||
color: var(--error);
|
||||
font-size: 0.875rem;
|
||||
margin-top: 1rem;
|
||||
}
|
||||
|
||||
/* Admin Dashboard */
|
||||
.admin-dashboard.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.admin-header {
|
||||
background: var(--bg-secondary);
|
||||
border-bottom: 2px solid var(--primary-cyan);
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.header-content {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.header-left {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.header-logo {
|
||||
height: 35px;
|
||||
}
|
||||
|
||||
.admin-header h1 {
|
||||
font-size: 1.25rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.header-right {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.admin-user {
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.logout-btn {
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--error);
|
||||
color: var(--error);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.logout-btn:hover {
|
||||
background: rgba(255, 60, 116, 0.1);
|
||||
}
|
||||
|
||||
/* Layout */
|
||||
.admin-layout {
|
||||
display: flex;
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
min-height: calc(100vh - 60px);
|
||||
}
|
||||
|
||||
/* Sidebar */
|
||||
.admin-sidebar {
|
||||
width: 250px;
|
||||
background: var(--bg-secondary);
|
||||
border-right: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.sidebar-nav {
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
.nav-btn {
|
||||
width: 100%;
|
||||
padding: 1rem 1.5rem;
|
||||
background: transparent;
|
||||
border: none;
|
||||
border-left: 3px solid transparent;
|
||||
color: var(--text-secondary);
|
||||
text-align: left;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.nav-btn:hover {
|
||||
background: rgba(80, 255, 255, 0.05);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.nav-btn.active {
|
||||
border-left-color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.1);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.nav-icon {
|
||||
font-size: 1.25rem;
|
||||
margin-right: 0.25rem;
|
||||
display: inline-block;
|
||||
width: 1.5rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.nav-btn[data-section="stats"] .nav-icon {
|
||||
color: var(--c4ai-cyan);
|
||||
}
|
||||
|
||||
.nav-btn[data-section="apps"] .nav-icon {
|
||||
color: var(--c4ai-green);
|
||||
}
|
||||
|
||||
.nav-btn[data-section="articles"] .nav-icon {
|
||||
color: var(--c4ai-yellow);
|
||||
}
|
||||
|
||||
.nav-btn[data-section="categories"] .nav-icon {
|
||||
color: var(--c4ai-pink);
|
||||
}
|
||||
|
||||
.nav-btn[data-section="sponsors"] .nav-icon {
|
||||
color: var(--c4ai-blue);
|
||||
}
|
||||
|
||||
.sidebar-actions {
|
||||
padding: 1rem;
|
||||
border-top: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.action-btn {
|
||||
width: 100%;
|
||||
padding: 0.75rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
margin-bottom: 0.5rem;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.action-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Main Content */
|
||||
.admin-main {
|
||||
flex: 1;
|
||||
padding: 2rem;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.content-section {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.content-section.active {
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Stats Grid */

/* Auto-fitting grid: as many columns of at least 250px as the row allows. */
.stats-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 1.5rem;
    margin-bottom: 3rem;
}

/* Card: children laid out in a flex row over a faint cyan-to-pink gradient. */
.stat-card {
    display: flex;
    gap: 1.5rem;
    padding: 1.5rem;
    border: 1px solid rgba(80, 255, 255, 0.3);
    background: linear-gradient(135deg, rgba(80, 255, 255, 0.03), rgba(243, 128, 245, 0.02));
}

/* Square, centred icon box. The border declares no colour of its own, so it
   falls back to the text colour; the first four cards set both explicitly
   in the rules below. */
.stat-icon {
    display: flex;
    align-items: center;
    justify-content: center;
    width: 3rem;
    height: 3rem;
    font-size: 2rem;
    border: 2px solid;
    border-radius: 4px;
}

/* Positional accent colours: 1 = cyan, 2 = green, 3 = yellow, 4 = pink. */
.stat-card:nth-child(1) .stat-icon {
    border-color: var(--c4ai-cyan);
    color: var(--c4ai-cyan);
}

.stat-card:nth-child(2) .stat-icon {
    border-color: var(--c4ai-green);
    color: var(--c4ai-green);
}

.stat-card:nth-child(3) .stat-icon {
    border-color: var(--c4ai-yellow);
    color: var(--c4ai-yellow);
}

.stat-card:nth-child(4) .stat-icon {
    border-color: var(--c4ai-pink);
    color: var(--c4ai-pink);
}

/* Large headline figure. */
.stat-number {
    color: var(--primary-cyan);
    font-size: 2rem;
    font-weight: 600;
}

.stat-label {
    color: var(--text-secondary);
}

/* Smaller, dimmer supporting line. */
.stat-detail {
    margin-top: 0.5rem;
    color: var(--text-tertiary);
    font-size: 0.875rem;
}
|
||||
|
||||
/* Quick Actions */

/* Horizontal row of buttons. */
.quick-actions {
    display: flex;
    gap: 1rem;
}

/* Outline-style button: transparent fill with cyan border and text. */
.quick-btn {
    padding: 0.75rem 1.5rem;
    border: 1px solid var(--primary-cyan);
    background: transparent;
    color: var(--primary-cyan);
    cursor: pointer;
    transition: all 0.2s;
}

/* Hover: faint cyan fill and a 2px upward shift. */
.quick-btn:hover {
    transform: translateY(-2px);
    background: rgba(80, 255, 255, 0.1);
}
|
||||
|
||||
/* Section Headers */

/* Header row: children pushed to opposite ends, vertically centred. */
.section-header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    margin-bottom: 2rem;
}

.section-header h2 {
    color: var(--text-primary);
    font-size: 1.5rem;
}

/* Row of controls inside the header. */
.header-actions {
    display: flex;
    gap: 1rem;
}

/* Fixed-width text input on the dark background. */
.search-input {
    width: 250px;
    padding: 0.5rem 1rem;
    border: 1px solid var(--border-color);
    background: var(--bg-dark);
    color: var(--text-primary);
}

/* Focus removes the default outline; the cyan border is the only
   remaining focus indicator. */
.search-input:focus {
    border-color: var(--primary-cyan);
    outline: none;
}

.filter-select {
    padding: 0.5rem;
    border: 1px solid var(--border-color);
    background: var(--bg-dark);
    color: var(--text-primary);
}

/* Borderless button filled with the cyan-to-teal gradient. */
.add-btn {
    padding: 0.5rem 1rem;
    border: none;
    background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
    color: var(--bg-dark);
    font-weight: 600;
    cursor: pointer;
    transition: all 0.2s;
}

/* Hover: cyan glow plus a 2px upward shift. */
.add-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 15px rgba(80, 255, 255, 0.3);
}
|
||||
|
||||
/* Data Tables */

/* Bordered wrapper that scrolls horizontally when the table is too wide. */
.data-table {
    overflow-x: auto;
    border: 1px solid var(--border-color);
    background: var(--bg-secondary);
}

.data-table table {
    width: 100%;
    border-collapse: collapse;
}

/* Header cells.
   NOTE(review): `position: sticky` resolves against the nearest scrolling
   ancestor, which appears to be the `overflow-x: auto` wrapper above -
   confirm the header really sticks where intended. */
.data-table th {
    position: sticky;
    top: 0;
    z-index: 10;
    padding: 1rem;
    border-bottom: 2px solid var(--border-color);
    background: var(--bg-tertiary);
    color: var(--primary-cyan);
    font-weight: 600;
    text-align: left;
}

.data-table td {
    padding: 1rem;
    border-bottom: 1px solid var(--border-color);
}

/* Faint cyan highlight on the hovered row. */
.data-table tr:hover {
    background: rgba(80, 255, 255, 0.03);
}
|
||||
|
||||
/* Table Actions */

/* Row of per-record buttons. */
.table-actions {
    display: flex;
    gap: 0.5rem;
}

/* 48x48 logo thumbnail; `contain` keeps the aspect ratio inside the box. */
.table-logo {
    width: 48px;
    height: 48px;
    padding: 4px;
    border: 1px solid var(--border-color);
    border-radius: 6px;
    background: var(--bg-tertiary);
    object-fit: contain;
}

/* Shared base for the three row buttons; each gets its own hover colour. */
.btn-edit,
.btn-delete,
.btn-duplicate {
    padding: 0.25rem 0.5rem;
    border: 1px solid var(--border-color);
    background: transparent;
    color: var(--text-secondary);
    font-size: 0.875rem;
    cursor: pointer;
}

.btn-edit:hover {
    color: var(--primary-cyan);
    border-color: var(--primary-cyan);
}

.btn-delete:hover {
    color: var(--error);
    border-color: var(--error);
}

.btn-duplicate:hover {
    color: var(--accent-pink);
    border-color: var(--accent-pink);
}
|
||||
|
||||
/* Badges in Tables */

/* Base badge: small, uppercase label. Variants below add the colours. */
.badge {
    padding: 0.25rem 0.5rem;
    font-size: 0.75rem;
    text-transform: uppercase;
}

/* Variants: dark text on a coloured fill. */
.badge.featured {
    color: var(--bg-dark);
    background: var(--primary-cyan);
}

.badge.sponsored {
    color: var(--bg-dark);
    background: var(--warning);
}

.badge.active {
    color: var(--bg-dark);
    background: var(--success);
}
|
||||
|
||||
/* Modal Enhancements */

/* Large modal: 90% wide up to 1000px, at most 90% of the viewport height. */
.modal-content.large {
    width: 90%;
    max-width: 1000px;
    max-height: 90vh;
}

/* Header row with a divider underneath. */
.modal-header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 1.5rem;
    border-bottom: 1px solid var(--border-color);
}

/* Scrollable body, capped at the 90vh modal height minus 140px. */
.modal-body {
    max-height: calc(90vh - 140px);
    overflow-y: auto;
    padding: 1.5rem;
}

/* Footer: buttons aligned to the right, divider on top. */
.modal-footer {
    display: flex;
    justify-content: flex-end;
    gap: 1rem;
    padding: 1rem 1.5rem;
    border-top: 1px solid var(--border-color);
}

/* Properties shared by both footer buttons. */
.btn-cancel,
.btn-save {
    padding: 0.5rem 1.5rem;
    cursor: pointer;
    transition: all 0.2s;
}

/* Cancel: outline button that turns to the error colour on hover. */
.btn-cancel {
    border: 1px solid var(--border-color);
    background: transparent;
    color: var(--text-secondary);
}

.btn-cancel:hover {
    color: var(--error);
    border-color: var(--error);
}

/* Save: borderless gradient button with a cyan glow on hover. */
.btn-save {
    border: none;
    background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
    color: var(--bg-dark);
    font-weight: 600;
}

.btn-save:hover {
    box-shadow: 0 4px 15px rgba(80, 255, 255, 0.3);
}
|
||||
|
||||
/* Form Styles */

/* Auto-fitting form grid: columns of at least 300px. */
.form-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
    gap: 1.5rem;
}

/* One field: children stacked vertically. */
.form-group {
    display: flex;
    flex-direction: column;
    gap: 0.5rem;
}

.form-group label {
    color: var(--text-secondary);
    font-size: 0.875rem;
}

/* Shared look for every control type; font-family is inherited rather
   than left at the control's default. */
.form-group input,
.form-group select,
.form-group textarea {
    padding: 0.5rem;
    border: 1px solid var(--border-color);
    background: var(--bg-dark);
    color: var(--text-primary);
    font-family: inherit;
}

/* Focus removes the default outline; the cyan border is the only
   remaining focus indicator. */
.form-group input:focus,
.form-group select:focus,
.form-group textarea:focus {
    border-color: var(--primary-cyan);
    outline: none;
}

/* Span every column of the grid. */
.form-group.full-width {
    grid-column: 1 / -1;
}

.checkbox-group {
    display: flex;
    gap: 2rem;
}

/* Label laid out as a centred row so its children sit side by side. */
.checkbox-label {
    display: flex;
    align-items: center;
    gap: 0.5rem;
    cursor: pointer;
}
|
||||
|
||||
/* Sponsor form: a fixed 200px first column plus two flexible columns.
   Dense auto-flow lets later items back-fill earlier gaps. */
.sponsor-form {
    grid-template-columns: 200px repeat(2, minmax(220px, 1fr));
    grid-auto-flow: dense;
    align-items: flex-start;
}

/* Logo group spans three grid rows and stacks its children vertically. */
.sponsor-logo-group {
    display: flex;
    flex-direction: column;
    gap: 0.75rem;
    grid-row: span 3;
}

.span-two {
    grid-column: span 2;
}

/* Positioning context for the absolutely placed upload button. */
.logo-upload {
    position: relative;
    width: 180px;
}

/* 180x180 dashed box with centred, clipped content. */
.image-preview {
    display: flex;
    align-items: center;
    justify-content: center;
    width: 180px;
    height: 180px;
    overflow: hidden;
    border: 1px dashed var(--border-color);
    border-radius: 12px;
    background: var(--bg-tertiary);
}

/* Text styling applied when the preview carries the .empty class. */
.image-preview.empty {
    padding: 0.75rem;
    color: var(--text-secondary);
    font-size: 0.75rem;
    text-align: center;
}

/* Scale the image to fit the box without cropping. */
.image-preview img {
    max-width: 100%;
    max-height: 100%;
    object-fit: contain;
}

/* Pill button centred horizontally, 12px above the bottom edge of
   .logo-upload. */
.upload-btn {
    position: absolute;
    bottom: 12px;
    left: 50%;
    transform: translateX(-50%);
    padding: 0.35rem 1rem;
    border: none;
    border-radius: 999px;
    background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
    box-shadow: 0 6px 18px rgba(80, 255, 255, 0.25);
    color: var(--bg-dark);
    font-size: 0.75rem;
    font-weight: 600;
    cursor: pointer;
}

.upload-btn:hover {
    box-shadow: 0 8px 22px rgba(80, 255, 255, 0.35);
}

/* The native file input is never displayed. */
.logo-upload input[type="file"] {
    display: none;
}

.upload-hint {
    margin: 0;
    color: var(--text-secondary);
    font-size: 0.75rem;
}
|
||||
|
||||
/* Viewports up to 960px: the sponsor form drops its fixed logo column and
   the logo group becomes a full-width horizontal strip. */
@media (max-width: 960px) {
    .sponsor-form {
        grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
    }

    .sponsor-logo-group {
        grid-column: 1 / -1;
        grid-row: auto;
        flex-direction: row;
        align-items: center;
        gap: 1.5rem;
    }

    .logo-upload {
        width: 160px;
    }

    /* Shrink the preview together with its wrapper. Without this the
       180px preview overflows the 160px .logo-upload box, and the upload
       button (centred on the wrapper) sits off-centre over the preview. */
    .logo-upload .image-preview {
        width: 160px;
        height: 160px;
    }

    .span-two {
        grid-column: 1 / -1;
    }
}
|
||||
|
||||
/* Rich Text Editor */

/* Toolbar row. The bottom border is removed after the shorthand so only
   the other three sides are drawn; keep `border` before `border-bottom`. */
.editor-toolbar {
    display: flex;
    gap: 0.5rem;
    padding: 0.5rem;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-bottom: none;
}

.editor-btn {
    padding: 0.25rem 0.5rem;
    border: 1px solid var(--border-color);
    background: transparent;
    color: var(--text-secondary);
    cursor: pointer;
}

.editor-btn:hover {
    border-color: var(--primary-cyan);
    background: rgba(80, 255, 255, 0.1);
}

/* Editing surface: monospace, at least 300px tall. */
.editor-content {
    min-height: 300px;
    padding: 1rem;
    border: 1px solid var(--border-color);
    background: var(--bg-dark);
    font-family: 'Dank Mono', Monaco, monospace;
}
|
||||
|
||||
/* Responsive */

/* Viewports up to 1024px: stack the layout vertically and turn the
   sidebar into a horizontally scrolling tab strip. */
@media (max-width: 1024px) {
    .admin-layout {
        flex-direction: column;
    }

    /* Full-width sidebar; its divider moves from the right edge to the bottom. */
    .admin-sidebar {
        width: 100%;
        border-right: none;
        border-bottom: 1px solid var(--border-color);
    }

    .sidebar-nav {
        display: flex;
        overflow-x: auto;
        padding: 0;
    }

    /* Buttons swap the left border for a bottom one, transparent until active. */
    .nav-btn {
        border-left: none;
        border-bottom: 3px solid transparent;
        white-space: nowrap;
    }

    .nav-btn.active {
        border-bottom-color: var(--primary-cyan);
    }

    /* The sidebar actions block is hidden at this width. */
    .sidebar-actions {
        display: none;
    }
}
|
||||
933
docs/md_v2/marketplace/admin/admin.js
Normal file
933
docs/md_v2/marketplace/admin/admin.js
Normal file
@@ -0,0 +1,933 @@
|
||||
// Admin Dashboard - Smart & Powerful
|
||||
/**
 * Work out where the marketplace API lives.
 *
 * Resolution order:
 *   1. `?api_origin=` query parameter (also persisted to localStorage),
 *   2. a previously persisted override,
 *   3. `<protocol>//127.0.0.1:8100` when this page is served from a local
 *      host on any port other than 8100,
 *   4. same-origin relative paths.
 *
 * API_BASE prefixes every API request; API_ORIGIN is the bare origin used to
 * resolve root-relative asset paths ('' means same-origin).
 *
 * SECURITY NOTE(review): the override comes from the URL and is persisted,
 * and the admin password and bearer token are later sent to whatever it
 * names. Only http(s) targets are accepted here, but a crafted link can
 * still point the dashboard at a foreign server. Consider an allow-list.
 */
const { API_BASE, API_ORIGIN } = (() => {
    const STORAGE_KEY = 'marketplace_api_origin';

    // Drop one trailing slash so paths can be appended with a leading '/'.
    const cleanOrigin = (value) => (value ? value.replace(/\/$/, '') : '');

    // Accept an override only when it resolves to an http(s) URL; anything
    // else (javascript:, data:, unparsable input) counts as "no override".
    const sanitizeOverride = (value) => {
        const cleaned = cleanOrigin(value);
        if (!cleaned) return '';
        try {
            const { protocol } = new URL(cleaned, window.location.href);
            return protocol === 'http:' || protocol === 'https:' ? cleaned : '';
        } catch (error) {
            return '';
        }
    };

    const params = new URLSearchParams(window.location.search);
    const overrideParam = sanitizeOverride(params.get('api_origin'));

    let storedOverride = '';
    try {
        storedOverride = sanitizeOverride(localStorage.getItem(STORAGE_KEY));
    } catch (error) {
        // Storage can be unavailable (private mode, blocked cookies).
        storedOverride = '';
    }

    let origin = overrideParam || storedOverride;

    // Remember a new query-string override for later visits.
    if (overrideParam && overrideParam !== storedOverride) {
        try {
            localStorage.setItem(STORAGE_KEY, overrideParam);
        } catch (error) {
            // ignore storage errors (private mode, etc.)
        }
    }

    const { protocol, hostname, port } = window.location;
    const isLocalHost = ['localhost', '127.0.0.1', '0.0.0.0'].includes(hostname);

    // Local page not served from port 8100: target 127.0.0.1:8100 instead.
    if (!origin && isLocalHost && port !== '8100') {
        origin = `${protocol}//127.0.0.1:8100`;
    }

    if (origin) {
        const normalized = cleanOrigin(origin);
        return { API_BASE: `${normalized}/marketplace/api`, API_ORIGIN: normalized };
    }

    return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
})();
|
||||
|
||||
/**
 * Turn an asset path returned by the API into a URL the page can load.
 *
 * - empty / missing path            -> ''
 * - absolute http(s) URL            -> unchanged
 * - protocol-relative URL (//host/) -> unchanged
 * - root-relative path (/x)         -> prefixed with API_ORIGIN when one is set
 * - anything else                   -> unchanged
 */
const resolveAssetUrl = (path) => {
    if (!path) return '';

    // Absolute and protocol-relative URLs already name their host; prefixing
    // API_ORIGIN would yield a broken "https://api//cdn.example.com/..." URL.
    if (/^(?:https?:)?\/\//i.test(path)) return path;

    if (path.startsWith('/') && API_ORIGIN) {
        return `${API_ORIGIN}${path}`;
    }

    return path;
};
|
||||
|
||||
class AdminDashboard {
|
||||
constructor() {
|
||||
this.token = localStorage.getItem('admin_token');
|
||||
this.currentSection = 'stats';
|
||||
this.data = {
|
||||
apps: [],
|
||||
articles: [],
|
||||
categories: [],
|
||||
sponsors: []
|
||||
};
|
||||
this.editingItem = null;
|
||||
this.init();
|
||||
}
|
||||
|
||||
async init() {
|
||||
// Check auth
|
||||
if (!this.token) {
|
||||
this.showLogin();
|
||||
return;
|
||||
}
|
||||
|
||||
// Try to load stats to verify token
|
||||
try {
|
||||
await this.loadStats();
|
||||
this.showDashboard();
|
||||
this.setupEventListeners();
|
||||
await this.loadAllData();
|
||||
} catch (error) {
|
||||
if (error.status === 401) {
|
||||
this.showLogin();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
showLogin() {
|
||||
document.getElementById('login-screen').classList.remove('hidden');
|
||||
document.getElementById('admin-dashboard').classList.add('hidden');
|
||||
|
||||
// Set up login button click handler
|
||||
const loginBtn = document.getElementById('login-btn');
|
||||
if (loginBtn) {
|
||||
loginBtn.onclick = async () => {
|
||||
const password = document.getElementById('password').value;
|
||||
await this.login(password);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
async login(password) {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/admin/login`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ password })
|
||||
});
|
||||
|
||||
if (!response.ok) throw new Error('Invalid password');
|
||||
|
||||
const data = await response.json();
|
||||
this.token = data.token;
|
||||
localStorage.setItem('admin_token', this.token);
|
||||
|
||||
document.getElementById('login-screen').classList.add('hidden');
|
||||
this.showDashboard();
|
||||
this.setupEventListeners();
|
||||
await this.loadAllData();
|
||||
} catch (error) {
|
||||
document.getElementById('login-error').textContent = 'Invalid password';
|
||||
document.getElementById('password').value = '';
|
||||
}
|
||||
}
|
||||
|
||||
showDashboard() {
|
||||
document.getElementById('login-screen').classList.add('hidden');
|
||||
document.getElementById('admin-dashboard').classList.remove('hidden');
|
||||
}
|
||||
|
||||
setupEventListeners() {
|
||||
// Navigation
|
||||
document.querySelectorAll('.nav-btn').forEach(btn => {
|
||||
btn.onclick = () => this.switchSection(btn.dataset.section);
|
||||
});
|
||||
|
||||
// Logout
|
||||
document.getElementById('logout-btn').onclick = () => this.logout();
|
||||
|
||||
// Export/Backup
|
||||
document.getElementById('export-btn').onclick = () => this.exportData();
|
||||
document.getElementById('backup-btn').onclick = () => this.backupDatabase();
|
||||
|
||||
// Search
|
||||
['apps', 'articles'].forEach(type => {
|
||||
const searchInput = document.getElementById(`${type}-search`);
|
||||
if (searchInput) {
|
||||
searchInput.oninput = (e) => this.filterTable(type, e.target.value);
|
||||
}
|
||||
});
|
||||
|
||||
// Category filter
|
||||
const categoryFilter = document.getElementById('apps-filter');
|
||||
if (categoryFilter) {
|
||||
categoryFilter.onchange = (e) => this.filterByCategory(e.target.value);
|
||||
}
|
||||
|
||||
// Save button in modal
|
||||
document.getElementById('save-btn').onclick = () => this.saveItem();
|
||||
}
|
||||
|
||||
async loadAllData() {
|
||||
try {
|
||||
await this.loadStats();
|
||||
} catch (e) {
|
||||
console.error('Failed to load stats:', e);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.loadApps();
|
||||
} catch (e) {
|
||||
console.error('Failed to load apps:', e);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.loadArticles();
|
||||
} catch (e) {
|
||||
console.error('Failed to load articles:', e);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.loadCategories();
|
||||
} catch (e) {
|
||||
console.error('Failed to load categories:', e);
|
||||
}
|
||||
|
||||
try {
|
||||
await this.loadSponsors();
|
||||
} catch (e) {
|
||||
console.error('Failed to load sponsors:', e);
|
||||
}
|
||||
|
||||
this.populateCategoryFilter();
|
||||
}
|
||||
|
||||
async apiCall(endpoint, options = {}) {
|
||||
const isFormData = options.body instanceof FormData;
|
||||
const headers = {
|
||||
'Authorization': `Bearer ${this.token}`,
|
||||
...options.headers
|
||||
};
|
||||
|
||||
if (!isFormData && !headers['Content-Type']) {
|
||||
headers['Content-Type'] = 'application/json';
|
||||
}
|
||||
|
||||
const response = await fetch(`${API_BASE}${endpoint}`, {
|
||||
...options,
|
||||
headers
|
||||
});
|
||||
|
||||
if (response.status === 401) {
|
||||
this.logout();
|
||||
throw { status: 401 };
|
||||
}
|
||||
|
||||
if (!response.ok) throw new Error(`API Error: ${response.status}`);
|
||||
return response.json();
|
||||
}
|
||||
|
||||
async loadStats() {
|
||||
const stats = await this.apiCall(`/admin/stats?_=${Date.now()}`, {
|
||||
cache: 'no-store'
|
||||
});
|
||||
|
||||
document.getElementById('stat-apps').textContent = stats.apps.total;
|
||||
document.getElementById('stat-featured').textContent = stats.apps.featured;
|
||||
document.getElementById('stat-sponsored').textContent = stats.apps.sponsored;
|
||||
document.getElementById('stat-articles').textContent = stats.articles;
|
||||
document.getElementById('stat-sponsors').textContent = stats.sponsors.active;
|
||||
document.getElementById('stat-views').textContent = this.formatNumber(stats.total_views);
|
||||
}
|
||||
|
||||
async loadApps() {
|
||||
this.data.apps = await this.apiCall(`/apps?limit=100&_=${Date.now()}`, {
|
||||
cache: 'no-store'
|
||||
});
|
||||
this.renderAppsTable(this.data.apps);
|
||||
}
|
||||
|
||||
async loadArticles() {
|
||||
this.data.articles = await this.apiCall(`/articles?limit=100&_=${Date.now()}`, {
|
||||
cache: 'no-store'
|
||||
});
|
||||
this.renderArticlesTable(this.data.articles);
|
||||
}
|
||||
|
||||
async loadCategories() {
|
||||
const cacheBuster = Date.now();
|
||||
this.data.categories = await this.apiCall(`/categories?_=${cacheBuster}`, {
|
||||
cache: 'no-store'
|
||||
});
|
||||
this.renderCategoriesTable(this.data.categories);
|
||||
}
|
||||
|
||||
async loadSponsors() {
|
||||
const cacheBuster = Date.now();
|
||||
this.data.sponsors = await this.apiCall(`/sponsors?limit=100&_=${cacheBuster}`, {
|
||||
cache: 'no-store'
|
||||
});
|
||||
this.renderSponsorsTable(this.data.sponsors);
|
||||
}
|
||||
|
||||
renderAppsTable(apps) {
|
||||
const table = document.getElementById('apps-table');
|
||||
table.innerHTML = `
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Name</th>
|
||||
<th>Category</th>
|
||||
<th>Type</th>
|
||||
<th>Rating</th>
|
||||
<th>Downloads</th>
|
||||
<th>Status</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
${apps.map(app => `
|
||||
<tr>
|
||||
<td>${app.id}</td>
|
||||
<td>${app.name}</td>
|
||||
<td>${app.category}</td>
|
||||
<td>${app.type}</td>
|
||||
<td>◆ ${app.rating}/5</td>
|
||||
<td>${this.formatNumber(app.downloads)}</td>
|
||||
<td>
|
||||
${app.featured ? '<span class="badge featured">Featured</span>' : ''}
|
||||
${app.sponsored ? '<span class="badge sponsored">Sponsored</span>' : ''}
|
||||
</td>
|
||||
<td>
|
||||
<div class="table-actions">
|
||||
<button class="btn-edit" onclick="admin.editItem('apps', ${app.id})">Edit</button>
|
||||
<button class="btn-duplicate" onclick="admin.duplicateItem('apps', ${app.id})">Duplicate</button>
|
||||
<button class="btn-delete" onclick="admin.deleteItem('apps', ${app.id})">Delete</button>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
`).join('')}
|
||||
</tbody>
|
||||
</table>
|
||||
`;
|
||||
}
|
||||
|
||||
renderArticlesTable(articles) {
|
||||
const table = document.getElementById('articles-table');
|
||||
table.innerHTML = `
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Title</th>
|
||||
<th>Category</th>
|
||||
<th>Author</th>
|
||||
<th>Published</th>
|
||||
<th>Views</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
${articles.map(article => `
|
||||
<tr>
|
||||
<td>${article.id}</td>
|
||||
<td>${article.title}</td>
|
||||
<td>${article.category}</td>
|
||||
<td>${article.author}</td>
|
||||
<td>${new Date(article.published_date).toLocaleDateString()}</td>
|
||||
<td>${this.formatNumber(article.views)}</td>
|
||||
<td>
|
||||
<div class="table-actions">
|
||||
<button class="btn-edit" onclick="admin.editItem('articles', ${article.id})">Edit</button>
|
||||
<button class="btn-duplicate" onclick="admin.duplicateItem('articles', ${article.id})">Duplicate</button>
|
||||
<button class="btn-delete" onclick="admin.deleteItem('articles', ${article.id})">Delete</button>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
`).join('')}
|
||||
</tbody>
|
||||
</table>
|
||||
`;
|
||||
}
|
||||
|
||||
renderCategoriesTable(categories) {
|
||||
const table = document.getElementById('categories-table');
|
||||
table.innerHTML = `
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Order</th>
|
||||
<th>Icon</th>
|
||||
<th>Name</th>
|
||||
<th>Description</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
${categories.map(cat => `
|
||||
<tr>
|
||||
<td>${cat.order_index}</td>
|
||||
<td>${cat.icon}</td>
|
||||
<td>${cat.name}</td>
|
||||
<td>${cat.description}</td>
|
||||
<td>
|
||||
<div class="table-actions">
|
||||
<button class="btn-edit" onclick="admin.editItem('categories', ${cat.id})">Edit</button>
|
||||
<button class="btn-delete" onclick="admin.deleteCategory(${cat.id})">Delete</button>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
`).join('')}
|
||||
</tbody>
|
||||
</table>
|
||||
`;
|
||||
}
|
||||
|
||||
renderSponsorsTable(sponsors) {
|
||||
const table = document.getElementById('sponsors-table');
|
||||
table.innerHTML = `
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<th>Logo</th>
|
||||
<th>Company</th>
|
||||
<th>Tier</th>
|
||||
<th>Start</th>
|
||||
<th>End</th>
|
||||
<th>Status</th>
|
||||
<th>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
${sponsors.map(sponsor => `
|
||||
<tr>
|
||||
<td>${sponsor.id}</td>
|
||||
<td>${sponsor.logo_url ? `<img class="table-logo" src="${resolveAssetUrl(sponsor.logo_url)}" alt="${sponsor.company_name} logo">` : '-'}</td>
|
||||
<td>${sponsor.company_name}</td>
|
||||
<td>${sponsor.tier}</td>
|
||||
<td>${new Date(sponsor.start_date).toLocaleDateString()}</td>
|
||||
<td>${new Date(sponsor.end_date).toLocaleDateString()}</td>
|
||||
<td>${sponsor.active ? '<span class="badge active">Active</span>' : 'Inactive'}</td>
|
||||
<td>
|
||||
<div class="table-actions">
|
||||
<button class="btn-edit" onclick="admin.editItem('sponsors', ${sponsor.id})">Edit</button>
|
||||
<button class="btn-delete" onclick="admin.deleteItem('sponsors', ${sponsor.id})">Delete</button>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
`).join('')}
|
||||
</tbody>
|
||||
</table>
|
||||
`;
|
||||
}
|
||||
|
||||
showAddForm(type) {
|
||||
this.editingItem = null;
|
||||
this.showModal(type, null);
|
||||
}
|
||||
|
||||
async editItem(type, id) {
|
||||
const item = this.data[type].find(i => i.id === id);
|
||||
if (item) {
|
||||
this.editingItem = item;
|
||||
this.showModal(type, item);
|
||||
}
|
||||
}
|
||||
|
||||
async duplicateItem(type, id) {
|
||||
const item = this.data[type].find(i => i.id === id);
|
||||
if (item) {
|
||||
const newItem = { ...item };
|
||||
delete newItem.id;
|
||||
newItem.name = `${newItem.name || newItem.title} (Copy)`;
|
||||
if (newItem.slug) newItem.slug = `${newItem.slug}-copy-${Date.now()}`;
|
||||
|
||||
this.editingItem = null;
|
||||
this.showModal(type, newItem);
|
||||
}
|
||||
}
|
||||
|
||||
showModal(type, item) {
|
||||
const modal = document.getElementById('form-modal');
|
||||
const title = document.getElementById('modal-title');
|
||||
const body = document.getElementById('modal-body');
|
||||
|
||||
title.textContent = item ? `Edit ${type.slice(0, -1)}` : `Add New ${type.slice(0, -1)}`;
|
||||
|
||||
if (type === 'apps') {
|
||||
body.innerHTML = this.getAppForm(item);
|
||||
} else if (type === 'articles') {
|
||||
body.innerHTML = this.getArticleForm(item);
|
||||
} else if (type === 'categories') {
|
||||
body.innerHTML = this.getCategoryForm(item);
|
||||
} else if (type === 'sponsors') {
|
||||
body.innerHTML = this.getSponsorForm(item);
|
||||
}
|
||||
|
||||
modal.classList.remove('hidden');
|
||||
modal.dataset.type = type;
|
||||
|
||||
if (type === 'sponsors') {
|
||||
this.setupLogoUploadHandlers();
|
||||
}
|
||||
}
|
||||
|
||||
getAppForm(app) {
|
||||
return `
|
||||
<div class="form-grid">
|
||||
<div class="form-group">
|
||||
<label>Name *</label>
|
||||
<input type="text" id="form-name" value="${app?.name || ''}" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Slug</label>
|
||||
<input type="text" id="form-slug" value="${app?.slug || ''}" placeholder="auto-generated">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Category</label>
|
||||
<select id="form-category">
|
||||
${this.data.categories.map(cat =>
|
||||
`<option value="${cat.name}" ${app?.category === cat.name ? 'selected' : ''}>${cat.name}</option>`
|
||||
).join('')}
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Type</label>
|
||||
<select id="form-type">
|
||||
<option value="Open Source" ${app?.type === 'Open Source' ? 'selected' : ''}>Open Source</option>
|
||||
<option value="Paid" ${app?.type === 'Paid' ? 'selected' : ''}>Paid</option>
|
||||
<option value="Freemium" ${app?.type === 'Freemium' ? 'selected' : ''}>Freemium</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Rating</label>
|
||||
<input type="number" id="form-rating" value="${app?.rating || 4.5}" min="0" max="5" step="0.1">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Downloads</label>
|
||||
<input type="number" id="form-downloads" value="${app?.downloads || 0}">
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Description</label>
|
||||
<textarea id="form-description" rows="3">${app?.description || ''}</textarea>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Image URL</label>
|
||||
<input type="text" id="form-image" value="${app?.image || ''}" placeholder="https://...">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Website URL</label>
|
||||
<input type="text" id="form-website" value="${app?.website_url || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>GitHub URL</label>
|
||||
<input type="text" id="form-github" value="${app?.github_url || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Pricing</label>
|
||||
<input type="text" id="form-pricing" value="${app?.pricing || 'Free'}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Contact Email</label>
|
||||
<input type="email" id="form-email" value="${app?.contact_email || ''}">
|
||||
</div>
|
||||
<div class="form-group full-width checkbox-group">
|
||||
<label class="checkbox-label">
|
||||
<input type="checkbox" id="form-featured" ${app?.featured ? 'checked' : ''}>
|
||||
Featured
|
||||
</label>
|
||||
<label class="checkbox-label">
|
||||
<input type="checkbox" id="form-sponsored" ${app?.sponsored ? 'checked' : ''}>
|
||||
Sponsored
|
||||
</label>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Long Description (Markdown - Overview tab)</label>
|
||||
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
|
||||
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Integration Guide (Markdown - Integration tab)</label>
|
||||
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
|
||||
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Documentation (Markdown - Documentation tab)</label>
|
||||
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
|
||||
<small>Full documentation with API reference, examples, best practices, etc.</small>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
getArticleForm(article) {
|
||||
return `
|
||||
<div class="form-grid">
|
||||
<div class="form-group full-width">
|
||||
<label>Title *</label>
|
||||
<input type="text" id="form-title" value="${article?.title || ''}" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Author</label>
|
||||
<input type="text" id="form-author" value="${article?.author || 'Crawl4AI Team'}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Category</label>
|
||||
<select id="form-category">
|
||||
<option value="News" ${article?.category === 'News' ? 'selected' : ''}>News</option>
|
||||
<option value="Tutorial" ${article?.category === 'Tutorial' ? 'selected' : ''}>Tutorial</option>
|
||||
<option value="Review" ${article?.category === 'Review' ? 'selected' : ''}>Review</option>
|
||||
<option value="Comparison" ${article?.category === 'Comparison' ? 'selected' : ''}>Comparison</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Featured Image URL</label>
|
||||
<input type="text" id="form-image" value="${article?.featured_image || ''}">
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Content</label>
|
||||
<textarea id="form-content" rows="20">${article?.content || ''}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
getCategoryForm(category) {
|
||||
return `
|
||||
<div class="form-grid">
|
||||
<div class="form-group">
|
||||
<label>Name *</label>
|
||||
<input type="text" id="form-name" value="${category?.name || ''}" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Icon</label>
|
||||
<input type="text" id="form-icon" value="${category?.icon || '📁'}" maxlength="2">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Order</label>
|
||||
<input type="number" id="form-order" value="${category?.order_index || 0}">
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Description</label>
|
||||
<textarea id="form-description" rows="3">${category?.description || ''}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
getSponsorForm(sponsor) {
|
||||
const existingFile = sponsor?.logo_url ? sponsor.logo_url.split('/').pop().split('?')[0] : '';
|
||||
return `
|
||||
<div class="form-grid sponsor-form">
|
||||
<div class="form-group sponsor-logo-group">
|
||||
<label>Logo</label>
|
||||
<input type="hidden" id="form-logo-url" value="${sponsor?.logo_url || ''}">
|
||||
<div class="logo-upload">
|
||||
<div class="image-preview ${sponsor?.logo_url ? '' : 'empty'}" id="form-logo-preview">
|
||||
${sponsor?.logo_url ? `<img src="${resolveAssetUrl(sponsor.logo_url)}" alt="Logo preview">` : '<span>No logo uploaded</span>'}
|
||||
</div>
|
||||
<button type="button" class="upload-btn" id="form-logo-button">Upload Logo</button>
|
||||
<input type="file" id="form-logo-file" accept="image/png,image/jpeg,image/webp,image/svg+xml" hidden>
|
||||
</div>
|
||||
<p class="upload-hint" id="form-logo-filename">${existingFile ? `Current: ${existingFile}` : 'No file selected'}</p>
|
||||
</div>
|
||||
<div class="form-group span-two">
|
||||
<label>Company Name *</label>
|
||||
<input type="text" id="form-name" value="${sponsor?.company_name || ''}" required>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Tier</label>
|
||||
<select id="form-tier">
|
||||
<option value="Bronze" ${sponsor?.tier === 'Bronze' ? 'selected' : ''}>Bronze</option>
|
||||
<option value="Silver" ${sponsor?.tier === 'Silver' ? 'selected' : ''}>Silver</option>
|
||||
<option value="Gold" ${sponsor?.tier === 'Gold' ? 'selected' : ''}>Gold</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Landing URL</label>
|
||||
<input type="text" id="form-landing" value="${sponsor?.landing_url || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Banner URL</label>
|
||||
<input type="text" id="form-banner" value="${sponsor?.banner_url || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>Start Date</label>
|
||||
<input type="date" id="form-start" value="${sponsor?.start_date?.split('T')[0] || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>End Date</label>
|
||||
<input type="date" id="form-end" value="${sponsor?.end_date?.split('T')[0] || ''}">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="checkbox-label">
|
||||
<input type="checkbox" id="form-active" ${sponsor?.active ? 'checked' : ''}>
|
||||
Active
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
async saveItem() {
|
||||
const modal = document.getElementById('form-modal');
|
||||
const type = modal.dataset.type;
|
||||
|
||||
try {
|
||||
if (type === 'sponsors') {
|
||||
const fileInput = document.getElementById('form-logo-file');
|
||||
if (fileInput && fileInput.files && fileInput.files[0]) {
|
||||
const formData = new FormData();
|
||||
formData.append('file', fileInput.files[0]);
|
||||
formData.append('folder', 'sponsors');
|
||||
|
||||
const uploadResponse = await this.apiCall('/admin/upload-image', {
|
||||
method: 'POST',
|
||||
body: formData
|
||||
});
|
||||
|
||||
if (!uploadResponse.url) {
|
||||
throw new Error('Image upload failed');
|
||||
}
|
||||
|
||||
document.getElementById('form-logo-url').value = uploadResponse.url;
|
||||
}
|
||||
}
|
||||
|
||||
const data = this.collectFormData(type);
|
||||
|
||||
if (this.editingItem) {
|
||||
await this.apiCall(`/admin/${type}/${this.editingItem.id}`, {
|
||||
method: 'PUT',
|
||||
body: JSON.stringify(data)
|
||||
});
|
||||
} else {
|
||||
await this.apiCall(`/admin/${type}`, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify(data)
|
||||
});
|
||||
}
|
||||
|
||||
this.closeModal();
|
||||
await this[`load${type.charAt(0).toUpperCase() + type.slice(1)}`]();
|
||||
await this.loadStats();
|
||||
} catch (error) {
|
||||
alert('Error saving item: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
collectFormData(type) {
|
||||
const data = {};
|
||||
|
||||
if (type === 'apps') {
|
||||
data.name = document.getElementById('form-name').value;
|
||||
data.slug = document.getElementById('form-slug').value || this.generateSlug(data.name);
|
||||
data.description = document.getElementById('form-description').value;
|
||||
data.category = document.getElementById('form-category').value;
|
||||
data.type = document.getElementById('form-type').value;
|
||||
const rating = parseFloat(document.getElementById('form-rating').value);
|
||||
const downloads = parseInt(document.getElementById('form-downloads').value, 10);
|
||||
data.rating = Number.isFinite(rating) ? rating : 0;
|
||||
data.downloads = Number.isFinite(downloads) ? downloads : 0;
|
||||
data.image = document.getElementById('form-image').value;
|
||||
data.website_url = document.getElementById('form-website').value;
|
||||
data.github_url = document.getElementById('form-github').value;
|
||||
data.pricing = document.getElementById('form-pricing').value;
|
||||
data.contact_email = document.getElementById('form-email').value;
|
||||
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
||||
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
||||
data.long_description = document.getElementById('form-long-description').value;
|
||||
data.integration_guide = document.getElementById('form-integration').value;
|
||||
data.documentation = document.getElementById('form-documentation').value;
|
||||
} else if (type === 'articles') {
|
||||
data.title = document.getElementById('form-title').value;
|
||||
data.slug = this.generateSlug(data.title);
|
||||
data.author = document.getElementById('form-author').value;
|
||||
data.category = document.getElementById('form-category').value;
|
||||
data.featured_image = document.getElementById('form-image').value;
|
||||
data.content = document.getElementById('form-content').value;
|
||||
} else if (type === 'categories') {
|
||||
data.name = document.getElementById('form-name').value;
|
||||
data.slug = this.generateSlug(data.name);
|
||||
data.icon = document.getElementById('form-icon').value;
|
||||
data.description = document.getElementById('form-description').value;
|
||||
const orderIndex = parseInt(document.getElementById('form-order').value, 10);
|
||||
data.order_index = Number.isFinite(orderIndex) ? orderIndex : 0;
|
||||
} else if (type === 'sponsors') {
|
||||
data.company_name = document.getElementById('form-name').value;
|
||||
data.logo_url = document.getElementById('form-logo-url').value;
|
||||
data.tier = document.getElementById('form-tier').value;
|
||||
data.landing_url = document.getElementById('form-landing').value;
|
||||
data.banner_url = document.getElementById('form-banner').value;
|
||||
data.start_date = document.getElementById('form-start').value;
|
||||
data.end_date = document.getElementById('form-end').value;
|
||||
data.active = document.getElementById('form-active').checked ? 1 : 0;
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
setupLogoUploadHandlers() {
|
||||
const fileInput = document.getElementById('form-logo-file');
|
||||
const preview = document.getElementById('form-logo-preview');
|
||||
const logoUrlInput = document.getElementById('form-logo-url');
|
||||
const trigger = document.getElementById('form-logo-button');
|
||||
const fileNameEl = document.getElementById('form-logo-filename');
|
||||
|
||||
if (!fileInput || !preview || !logoUrlInput) return;
|
||||
|
||||
const setFileName = (text) => {
|
||||
if (fileNameEl) {
|
||||
fileNameEl.textContent = text;
|
||||
}
|
||||
};
|
||||
|
||||
const setEmptyState = () => {
|
||||
preview.innerHTML = '<span>No logo uploaded</span>';
|
||||
preview.classList.add('empty');
|
||||
setFileName('No file selected');
|
||||
};
|
||||
|
||||
const setExistingState = () => {
|
||||
if (logoUrlInput.value) {
|
||||
const existingFile = logoUrlInput.value.split('/').pop().split('?')[0];
|
||||
preview.innerHTML = `<img src="${resolveAssetUrl(logoUrlInput.value)}" alt="Logo preview">`;
|
||||
preview.classList.remove('empty');
|
||||
setFileName(existingFile ? `Current: ${existingFile}` : 'Current logo');
|
||||
} else {
|
||||
setEmptyState();
|
||||
}
|
||||
};
|
||||
|
||||
setExistingState();
|
||||
|
||||
if (trigger) {
|
||||
trigger.onclick = () => fileInput.click();
|
||||
}
|
||||
|
||||
fileInput.addEventListener('change', (event) => {
|
||||
const file = event.target.files && event.target.files[0];
|
||||
|
||||
if (!file) {
|
||||
setExistingState();
|
||||
return;
|
||||
}
|
||||
|
||||
setFileName(file.name);
|
||||
|
||||
const reader = new FileReader();
|
||||
reader.onload = () => {
|
||||
preview.innerHTML = `<img src="${reader.result}" alt="Logo preview">`;
|
||||
preview.classList.remove('empty');
|
||||
};
|
||||
reader.readAsDataURL(file);
|
||||
});
|
||||
}
|
||||
|
||||
async deleteItem(type, id) {
|
||||
if (!confirm(`Are you sure you want to delete this ${type.slice(0, -1)}?`)) return;
|
||||
|
||||
try {
|
||||
await this.apiCall(`/admin/${type}/${id}`, { method: 'DELETE' });
|
||||
await this[`load${type.charAt(0).toUpperCase() + type.slice(1)}`]();
|
||||
await this.loadStats();
|
||||
} catch (error) {
|
||||
alert('Error deleting item: ' + error.message);
|
||||
}
|
||||
}
|
||||
|
||||
async deleteCategory(id) {
|
||||
const hasApps = this.data.apps.some(app =>
|
||||
app.category === this.data.categories.find(c => c.id === id)?.name
|
||||
);
|
||||
|
||||
if (hasApps) {
|
||||
alert('Cannot delete category with existing apps');
|
||||
return;
|
||||
}
|
||||
|
||||
await this.deleteItem('categories', id);
|
||||
}
|
||||
|
||||
closeModal() {
|
||||
document.getElementById('form-modal').classList.add('hidden');
|
||||
this.editingItem = null;
|
||||
}
|
||||
|
||||
switchSection(section) {
|
||||
// Update navigation
|
||||
document.querySelectorAll('.nav-btn').forEach(btn => {
|
||||
btn.classList.toggle('active', btn.dataset.section === section);
|
||||
});
|
||||
|
||||
// Show section
|
||||
document.querySelectorAll('.content-section').forEach(sec => {
|
||||
sec.classList.remove('active');
|
||||
});
|
||||
document.getElementById(`${section}-section`).classList.add('active');
|
||||
|
||||
this.currentSection = section;
|
||||
}
|
||||
|
||||
filterTable(type, query) {
|
||||
const items = this.data[type].filter(item => {
|
||||
const searchText = Object.values(item).join(' ').toLowerCase();
|
||||
return searchText.includes(query.toLowerCase());
|
||||
});
|
||||
|
||||
if (type === 'apps') {
|
||||
this.renderAppsTable(items);
|
||||
} else if (type === 'articles') {
|
||||
this.renderArticlesTable(items);
|
||||
}
|
||||
}
|
||||
|
||||
filterByCategory(category) {
|
||||
const apps = category
|
||||
? this.data.apps.filter(app => app.category === category)
|
||||
: this.data.apps;
|
||||
this.renderAppsTable(apps);
|
||||
}
|
||||
|
||||
populateCategoryFilter() {
|
||||
const filter = document.getElementById('apps-filter');
|
||||
if (!filter) return;
|
||||
|
||||
filter.innerHTML = '<option value="">All Categories</option>';
|
||||
this.data.categories.forEach(cat => {
|
||||
filter.innerHTML += `<option value="${cat.name}">${cat.name}</option>`;
|
||||
});
|
||||
}
|
||||
|
||||
async exportData() {
|
||||
const data = {
|
||||
apps: this.data.apps,
|
||||
articles: this.data.articles,
|
||||
categories: this.data.categories,
|
||||
sponsors: this.data.sponsors,
|
||||
exported: new Date().toISOString()
|
||||
};
|
||||
|
||||
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
|
||||
const url = URL.createObjectURL(blob);
|
||||
const a = document.createElement('a');
|
||||
a.href = url;
|
||||
a.download = `marketplace-export-${Date.now()}.json`;
|
||||
a.click();
|
||||
}
|
||||
|
||||
async backupDatabase() {
|
||||
// In production, this would download the SQLite file
|
||||
alert('Database backup would be implemented on the server side');
|
||||
}
|
||||
|
||||
generateSlug(text) {
|
||||
return text.toLowerCase()
|
||||
.replace(/[^\w\s-]/g, '')
|
||||
.replace(/\s+/g, '-')
|
||||
.replace(/-+/g, '-')
|
||||
.trim();
|
||||
}
|
||||
|
||||
formatNumber(num) {
|
||||
if (num >= 1000000) return (num / 1000000).toFixed(1) + 'M';
|
||||
if (num >= 1000) return (num / 1000).toFixed(1) + 'K';
|
||||
return num.toString();
|
||||
}
|
||||
|
||||
logout() {
|
||||
localStorage.removeItem('admin_token');
|
||||
this.token = null;
|
||||
this.showLogin();
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize
|
||||
const admin = new AdminDashboard();
|
||||
215
docs/md_v2/marketplace/admin/index.html
Normal file
215
docs/md_v2/marketplace/admin/index.html
Normal file
@@ -0,0 +1,215 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Admin Dashboard - Crawl4AI Marketplace</title>
|
||||
<link rel="stylesheet" href="../frontend/marketplace.css?v=1759329000">
|
||||
<link rel="stylesheet" href="admin.css?v=1759329000">
|
||||
</head>
|
||||
<body>
|
||||
<div class="admin-container">
|
||||
<!-- Login Screen -->
|
||||
<div id="login-screen" class="login-screen">
|
||||
<div class="login-box">
|
||||
<img src="../../assets/images/logo.png" alt="Crawl4AI" class="login-logo">
|
||||
<h1>[ Admin Access ]</h1>
|
||||
<div id="login-form">
    <!-- keydown replaces the deprecated keypress event; pressing Enter clicks the login button -->
    <input type="password" id="password" placeholder="Enter admin password" autofocus onkeydown="if(event.key==='Enter'){document.getElementById('login-btn').click()}">
    <button type="button" id="login-btn">→ Login</button>
</div>
|
||||
<div id="login-error" class="error-msg"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Admin Dashboard -->
|
||||
<div id="admin-dashboard" class="admin-dashboard hidden">
|
||||
<!-- Header -->
|
||||
<header class="admin-header">
|
||||
<div class="header-content">
|
||||
<div class="header-left">
|
||||
<img src="../../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||
<h1>[ Admin Dashboard ]</h1>
|
||||
</div>
|
||||
<div class="header-right">
|
||||
<span class="admin-user">Administrator</span>
|
||||
<button id="logout-btn" class="logout-btn">↗ Logout</button>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Main Layout -->
|
||||
<div class="admin-layout">
|
||||
<!-- Sidebar -->
|
||||
<aside class="admin-sidebar">
|
||||
<nav class="sidebar-nav">
|
||||
<button class="nav-btn active" data-section="stats">
|
||||
<span class="nav-icon">▓</span> Dashboard
|
||||
</button>
|
||||
<button class="nav-btn" data-section="apps">
|
||||
<span class="nav-icon">◆</span> Apps
|
||||
</button>
|
||||
<button class="nav-btn" data-section="articles">
|
||||
<span class="nav-icon">■</span> Articles
|
||||
</button>
|
||||
<button class="nav-btn" data-section="categories">
|
||||
<span class="nav-icon">□</span> Categories
|
||||
</button>
|
||||
<button class="nav-btn" data-section="sponsors">
|
||||
<span class="nav-icon">◆</span> Sponsors
|
||||
</button>
|
||||
</nav>
|
||||
|
||||
<div class="sidebar-actions">
|
||||
<button id="export-btn" class="action-btn">
|
||||
<span>↓</span> Export Data
|
||||
</button>
|
||||
<button id="backup-btn" class="action-btn">
|
||||
<span>▪</span> Backup DB
|
||||
</button>
|
||||
</div>
|
||||
</aside>
|
||||
|
||||
<!-- Main Content -->
|
||||
<main class="admin-main">
|
||||
<!-- Stats Section -->
|
||||
<section id="stats-section" class="content-section active">
|
||||
<h2>Dashboard Overview</h2>
|
||||
<div class="stats-grid">
|
||||
<div class="stat-card">
|
||||
<div class="stat-icon">◆</div>
|
||||
<div class="stat-info">
|
||||
<div class="stat-number" id="stat-apps">--</div>
|
||||
<div class="stat-label">Total Apps</div>
|
||||
<div class="stat-detail">
|
||||
<span id="stat-featured">--</span> featured,
|
||||
<span id="stat-sponsored">--</span> sponsored
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-icon">■</div>
|
||||
<div class="stat-info">
|
||||
<div class="stat-number" id="stat-articles">--</div>
|
||||
<div class="stat-label">Articles</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-icon">◆</div>
|
||||
<div class="stat-info">
|
||||
<div class="stat-number" id="stat-sponsors">--</div>
|
||||
<div class="stat-label">Active Sponsors</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="stat-card">
|
||||
<div class="stat-icon">●</div>
|
||||
<div class="stat-info">
|
||||
<div class="stat-number" id="stat-views">--</div>
|
||||
<div class="stat-label">Total Views</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>Quick Actions</h3>
|
||||
<div class="quick-actions">
|
||||
<button class="quick-btn" onclick="admin.showAddForm('apps')">
|
||||
<span>→</span> Add New App
|
||||
</button>
|
||||
<button class="quick-btn" onclick="admin.showAddForm('articles')">
|
||||
<span>→</span> Write Article
|
||||
</button>
|
||||
<button class="quick-btn" onclick="admin.showAddForm('sponsors')">
|
||||
<span>→</span> Add Sponsor
|
||||
</button>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Apps Section -->
|
||||
<section id="apps-section" class="content-section">
|
||||
<div class="section-header">
|
||||
<h2>Apps Management</h2>
|
||||
<div class="header-actions">
|
||||
<input type="text" id="apps-search" class="search-input" placeholder="Search apps...">
|
||||
<select id="apps-filter" class="filter-select">
|
||||
<option value="">All Categories</option>
|
||||
</select>
|
||||
<button class="add-btn" onclick="admin.showAddForm('apps')">
|
||||
<span>→</span> Add App
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="data-table" id="apps-table">
|
||||
<!-- Apps table will be populated here -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Articles Section -->
|
||||
<section id="articles-section" class="content-section">
|
||||
<div class="section-header">
|
||||
<h2>Articles Management</h2>
|
||||
<div class="header-actions">
|
||||
<input type="text" id="articles-search" class="search-input" placeholder="Search articles...">
|
||||
<button class="add-btn" onclick="admin.showAddForm('articles')">
|
||||
<span>→</span> Add Article
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="data-table" id="articles-table">
|
||||
<!-- Articles table will be populated here -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Categories Section -->
|
||||
<section id="categories-section" class="content-section">
|
||||
<div class="section-header">
|
||||
<h2>Categories Management</h2>
|
||||
<div class="header-actions">
|
||||
<button class="add-btn" onclick="admin.showAddForm('categories')">
|
||||
<span>→</span> Add Category
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="data-table" id="categories-table">
|
||||
<!-- Categories table will be populated here -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Sponsors Section -->
|
||||
<section id="sponsors-section" class="content-section">
|
||||
<div class="section-header">
|
||||
<h2>Sponsors Management</h2>
|
||||
<div class="header-actions">
|
||||
<button class="add-btn" onclick="admin.showAddForm('sponsors')">
|
||||
<span>→</span> Add Sponsor
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
<div class="data-table" id="sponsors-table">
|
||||
<!-- Sponsors table will be populated here -->
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Modal for Add/Edit Forms -->
|
||||
<div id="form-modal" class="modal hidden">
|
||||
<div class="modal-content large">
|
||||
<div class="modal-header">
|
||||
<h2 id="modal-title">Add/Edit</h2>
|
||||
<button class="modal-close" onclick="admin.closeModal()">✕</button>
|
||||
</div>
|
||||
<div class="modal-body" id="modal-body">
|
||||
<!-- Dynamic form content -->
|
||||
</div>
|
||||
<div class="modal-footer">
|
||||
<button class="btn-cancel" onclick="admin.closeModal()">Cancel</button>
|
||||
<button class="btn-save" id="save-btn">Save</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="admin.js?v=1759335000"></script>
|
||||
</body>
|
||||
</html>
|
||||
683
docs/md_v2/marketplace/app-detail.css
Normal file
683
docs/md_v2/marketplace/app-detail.css
Normal file
@@ -0,0 +1,683 @@
|
||||
/* App Detail Page Styles */
|
||||
|
||||
.app-detail-container {
|
||||
min-height: 100vh;
|
||||
background: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* Back Button */
|
||||
.header-nav {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.back-btn {
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.back-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.1);
|
||||
}
|
||||
|
||||
/* App Hero Section */
|
||||
.app-hero {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
.app-hero-content {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 2fr;
|
||||
gap: 3rem;
|
||||
background: linear-gradient(135deg, #1a1a2e, #0f0f1e);
|
||||
border: 2px solid var(--primary-cyan);
|
||||
padding: 2rem;
|
||||
box-shadow: 0 0 30px rgba(80, 255, 255, 0.15),
|
||||
inset 0 0 20px rgba(80, 255, 255, 0.05);
|
||||
}
|
||||
|
||||
.app-hero-image {
|
||||
width: 100%;
|
||||
height: 300px;
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
border: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 4rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.app-badges {
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.app-badge {
|
||||
padding: 0.3rem 0.6rem;
|
||||
background: var(--bg-tertiary);
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.75rem;
|
||||
text-transform: uppercase;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.app-badge.featured {
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
box-shadow: 0 2px 10px rgba(80, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.app-badge.sponsored {
|
||||
background: linear-gradient(135deg, var(--warning), #ff8c00);
|
||||
color: var(--bg-dark);
|
||||
box-shadow: 0 2px 10px rgba(245, 158, 11, 0.3);
|
||||
}
|
||||
|
||||
.app-hero-info h1 {
|
||||
font-size: 2.5rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0.5rem 0;
|
||||
text-shadow: 0 0 20px rgba(80, 255, 255, 0.5);
|
||||
}
|
||||
|
||||
.app-tagline {
|
||||
font-size: 1.1rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
/* Stats */
|
||||
.app-stats {
|
||||
display: flex;
|
||||
gap: 2rem;
|
||||
margin: 2rem 0;
|
||||
padding: 1rem 0;
|
||||
border-top: 1px solid var(--border-color);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.stat {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.25rem;
|
||||
}
|
||||
|
||||
.stat-value {
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Action Buttons */
|
||||
.app-actions {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.action-btn {
|
||||
padding: 0.75rem 1.5rem;
|
||||
border: 1px solid var(--border-color);
|
||||
background: transparent;
|
||||
color: var(--text-primary);
|
||||
text-decoration: none;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
transition: all 0.2s;
|
||||
cursor: pointer;
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.action-btn.primary {
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.action-btn.primary:hover {
|
||||
box-shadow: 0 4px 15px rgba(80, 255, 255, 0.3);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.action-btn.secondary {
|
||||
border-color: var(--accent-pink);
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.action-btn.secondary:hover {
|
||||
background: rgba(243, 128, 245, 0.1);
|
||||
box-shadow: 0 4px 15px rgba(243, 128, 245, 0.2);
|
||||
}
|
||||
|
||||
.action-btn.ghost {
|
||||
border-color: var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.action-btn.ghost:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Pricing */
|
||||
.pricing-info {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
.pricing-label {
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.pricing-value {
|
||||
color: var(--warning);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* Navigation Tabs */
|
||||
.tabs {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
gap: 0;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
margin-bottom: 0;
|
||||
background: var(--bg-tertiary);
|
||||
}
|
||||
|
||||
.tab-btn {
|
||||
padding: 1rem 2rem;
|
||||
background: transparent;
|
||||
border: none;
|
||||
border-bottom: 3px solid transparent;
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-family: inherit;
|
||||
font-size: 0.95rem;
|
||||
margin-bottom: -2px;
|
||||
white-space: nowrap;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.tab-btn:hover {
|
||||
color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.05);
|
||||
}
|
||||
|
||||
.tab-btn.active {
|
||||
color: var(--primary-cyan);
|
||||
border-bottom-color: var(--primary-cyan);
|
||||
background: var(--bg-secondary);
|
||||
}
|
||||
|
||||
.app-nav {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto 0;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
}
|
||||
|
||||
.nav-tab {
|
||||
padding: 1rem 1.5rem;
|
||||
background: transparent;
|
||||
border: none;
|
||||
border-bottom: 2px solid transparent;
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
margin-bottom: -2px;
|
||||
}
|
||||
|
||||
.nav-tab:hover {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.nav-tab.active {
|
||||
color: var(--primary-cyan);
|
||||
border-bottom-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Main Content Wrapper */
|
||||
.app-main {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
/* Content Sections */
|
||||
.app-content {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.tab-content {
|
||||
display: none !important;
|
||||
padding: 2rem;
|
||||
}
|
||||
|
||||
.tab-content.active {
|
||||
display: block !important;
|
||||
}
|
||||
|
||||
/* Overview Layout */
|
||||
.overview-columns {
|
||||
display: grid;
|
||||
grid-template-columns: 2fr 1fr;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.overview-main h2, .overview-main h3 {
|
||||
color: var(--primary-cyan);
|
||||
margin-top: 2rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.overview-main h2:first-child {
|
||||
margin-top: 0;
|
||||
}
|
||||
|
||||
.overview-main h2 {
|
||||
font-size: 1.8rem;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.overview-main h3 {
|
||||
font-size: 1.3rem;
|
||||
}
|
||||
|
||||
.features-list {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.features-list li {
|
||||
padding: 0.5rem 0;
|
||||
padding-left: 1.5rem;
|
||||
position: relative;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.features-list li:before {
|
||||
content: "▸";
|
||||
position: absolute;
|
||||
left: 0;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.use-cases p {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
/* Sidebar */
|
||||
.sidebar {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.sidebar-card {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1.5rem;
|
||||
}
|
||||
|
||||
.sidebar-card h3 {
|
||||
font-size: 1.1rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0 0 1rem 0;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.stats-grid {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.stats-grid > div {
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.metadata {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.metadata div {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
padding: 0.75rem 0;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.metadata dt {
|
||||
color: var(--text-tertiary);
|
||||
font-weight: normal;
|
||||
}
|
||||
|
||||
.metadata dd {
|
||||
color: var(--text-primary);
|
||||
margin: 0;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.sidebar-card p {
|
||||
color: var(--text-secondary);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* Integration Content */
|
||||
.integration-content {
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.integration-content h2 {
|
||||
font-size: 1.8rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0 0 2rem 0;
|
||||
padding-bottom: 0.5rem;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
}
|
||||
|
||||
.integration-content h3 {
|
||||
font-size: 1.3rem;
|
||||
color: var(--text-primary);
|
||||
margin: 2rem 0 1rem;
|
||||
}
|
||||
|
||||
.docs-content {
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.docs-content h2 {
|
||||
font-size: 1.8rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0 0 1.5rem 0;
|
||||
padding-bottom: 0.5rem;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
}
|
||||
|
||||
.docs-content h3 {
|
||||
font-size: 1.3rem;
|
||||
color: var(--text-primary);
|
||||
margin: 2rem 0 1rem;
|
||||
}
|
||||
|
||||
.docs-content h4 {
|
||||
font-size: 1.1rem;
|
||||
color: var(--accent-pink);
|
||||
margin: 1.5rem 0 0.5rem;
|
||||
}
|
||||
|
||||
.docs-content p {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.6;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.docs-content code {
|
||||
background: var(--bg-tertiary);
|
||||
padding: 0.2rem 0.4rem;
|
||||
color: var(--primary-cyan);
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
/* Code Blocks */
|
||||
.code-block {
|
||||
background: var(--bg-dark);
|
||||
border: 1px solid var(--border-color);
|
||||
margin: 1rem 0;
|
||||
overflow: hidden;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.code-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0.5rem 1rem;
|
||||
background: var(--bg-tertiary);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.code-lang {
|
||||
color: var(--primary-cyan);
|
||||
font-size: 0.875rem;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
.copy-btn {
|
||||
position: absolute;
|
||||
top: 0.5rem;
|
||||
right: 0.5rem;
|
||||
padding: 0.4rem 0.8rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
font-size: 0.75rem;
|
||||
transition: all 0.2s;
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
.copy-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
background: var(--bg-secondary);
|
||||
}
|
||||
|
||||
.code-block pre {
|
||||
margin: 0;
|
||||
padding: 1rem;
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
.code-block code {
|
||||
background: transparent;
|
||||
padding: 0;
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* Markdown rendered code blocks */
|
||||
.integration-content pre,
|
||||
.docs-content pre {
|
||||
background: var(--bg-dark);
|
||||
border: 1px solid var(--border-color);
|
||||
margin: 1rem 0;
|
||||
padding: 1rem;
|
||||
padding-top: 2.5rem; /* Space for copy button */
|
||||
overflow-x: auto;
|
||||
position: relative;
|
||||
max-height: none; /* Remove any height restrictions */
|
||||
height: auto; /* Allow content to expand */
|
||||
}
|
||||
|
||||
.integration-content pre code,
|
||||
.docs-content pre code {
|
||||
background: transparent;
|
||||
padding: 0;
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.5;
|
||||
white-space: pre; /* Preserve whitespace and line breaks */
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Feature Grid */
|
||||
.feature-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.feature-card {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1.5rem;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.feature-card:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.05);
|
||||
}
|
||||
|
||||
.feature-card h4 {
|
||||
margin-top: 0;
|
||||
}
|
||||
|
||||
/* Info Box */
|
||||
.info-box {
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.05), rgba(243, 128, 245, 0.03));
|
||||
border: 1px solid var(--primary-cyan);
|
||||
border-left: 4px solid var(--primary-cyan);
|
||||
padding: 1.5rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.info-box h4 {
|
||||
margin-top: 0;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Support Grid */
|
||||
.support-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.support-card {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1.5rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.support-card h3 {
|
||||
color: var(--primary-cyan);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
/* Related Apps */
|
||||
.related-apps {
|
||||
max-width: 1800px;
|
||||
margin: 4rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
.related-apps h2 {
|
||||
font-size: 1.5rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 1.5rem;
|
||||
}
|
||||
|
||||
.related-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.related-app-card {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.related-app-card:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
/* Responsive */
|
||||
@media (max-width: 1024px) {
|
||||
.app-hero-content {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.app-stats {
|
||||
justify-content: space-around;
|
||||
}
|
||||
|
||||
.overview-columns {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.app-hero-info h1 {
|
||||
font-size: 2rem;
|
||||
}
|
||||
|
||||
.app-actions {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.tabs {
|
||||
overflow-x: auto;
|
||||
-webkit-overflow-scrolling: touch;
|
||||
}
|
||||
|
||||
.tab-btn {
|
||||
padding: 0.75rem 1.5rem;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.app-nav {
|
||||
overflow-x: auto;
|
||||
gap: 0;
|
||||
}
|
||||
|
||||
.nav-tab {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.feature-grid,
|
||||
.support-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.tab-content {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.app-main {
|
||||
padding: 0 1rem;
|
||||
}
|
||||
}
|
||||
175
docs/md_v2/marketplace/app-detail.html
Normal file
175
docs/md_v2/marketplace/app-detail.html
Normal file
@@ -0,0 +1,175 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>App Details - Crawl4AI Marketplace</title>
|
||||
<link rel="stylesheet" href="marketplace.css">
|
||||
<link rel="stylesheet" href="app-detail.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="app-detail-container">
|
||||
<!-- Header -->
|
||||
<header class="marketplace-header">
|
||||
<div class="header-content">
|
||||
<div class="header-left">
|
||||
<div class="logo-title">
|
||||
<img src="../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||
<h1>
|
||||
<span class="ascii-border">[</span>
|
||||
Marketplace
|
||||
<span class="ascii-border">]</span>
|
||||
</h1>
|
||||
</div>
|
||||
</div>
|
||||
<div class="header-nav">
|
||||
<a href="index.html" class="back-btn">← Back to Marketplace</a>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- App Hero Section -->
|
||||
<section class="app-hero">
|
||||
<div class="app-hero-content">
|
||||
<div class="app-hero-image" id="app-image">
|
||||
<!-- Dynamic image -->
|
||||
</div>
|
||||
<div class="app-hero-info">
|
||||
<div class="app-badges">
|
||||
<span class="app-badge" id="app-type">Open Source</span>
|
||||
<span class="app-badge featured" id="app-featured" style="display:none">FEATURED</span>
|
||||
<span class="app-badge sponsored" id="app-sponsored" style="display:none">SPONSORED</span>
|
||||
</div>
|
||||
<h1 id="app-name">App Name</h1>
|
||||
<p id="app-description" class="app-tagline">App description goes here</p>
|
||||
|
||||
<div class="app-stats">
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-rating">★★★★★</span>
|
||||
<span class="stat-label">Rating</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-downloads">0</span>
|
||||
<span class="stat-label">Downloads</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-category">Category</span>
|
||||
<span class="stat-label">Category</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Action links open third-party sites in a new tab; rel="noopener noreferrer"
     stops the opened page from reaching back through window.opener. -->
<div class="app-actions">
    <a href="#" id="app-website" class="action-btn primary" target="_blank" rel="noopener noreferrer">Visit Website</a>
    <a href="#" id="app-github" class="action-btn" target="_blank" rel="noopener noreferrer">View GitHub</a>
    <a href="#" id="app-demo" class="action-btn" target="_blank" rel="noopener noreferrer" style="display:none">Live Demo</a>
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- App Details Section -->
|
||||
<main class="app-main">
|
||||
<div class="app-content">
|
||||
<div class="tabs">
|
||||
<button class="tab-btn active" data-tab="overview">Overview</button>
|
||||
<button class="tab-btn" data-tab="integration">Integration</button>
|
||||
<!-- <button class="tab-btn" data-tab="docs">Documentation</button>
|
||||
<button class="tab-btn" data-tab="support">Support</button> -->
|
||||
</div>
|
||||
|
||||
<section id="overview-tab" class="tab-content active">
|
||||
<div class="overview-columns">
|
||||
<div class="overview-main">
|
||||
<div id="app-overview">Overview content goes here.</div>
|
||||
</div>
|
||||
|
||||
<aside class="sidebar">
|
||||
<div class="sidebar-card">
|
||||
<h3>Download Stats</h3>
|
||||
<div class="stats-grid">
|
||||
<div>
|
||||
<span class="stat-value" id="sidebar-downloads">0</span>
|
||||
<span class="stat-label">Downloads</span>
|
||||
</div>
|
||||
<div>
|
||||
<span class="stat-value" id="sidebar-rating">0.0</span>
|
||||
<span class="stat-label">Rating</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="sidebar-card">
|
||||
<h3>App Metadata</h3>
|
||||
<dl class="metadata">
|
||||
<div>
|
||||
<dt>Category</dt>
|
||||
<dd id="sidebar-category">-</dd>
|
||||
</div>
|
||||
<div>
|
||||
<dt>Type</dt>
|
||||
<dd id="sidebar-type">-</dd>
|
||||
</div>
|
||||
<div>
|
||||
<dt>Status</dt>
|
||||
<dd id="sidebar-status">Active</dd>
|
||||
</div>
|
||||
<div>
|
||||
<dt>Pricing</dt>
|
||||
<dd id="sidebar-pricing">-</dd>
|
||||
</div>
|
||||
</dl>
|
||||
</div>
|
||||
|
||||
<div class="sidebar-card">
|
||||
<h3>Contact</h3>
|
||||
<p id="sidebar-contact">contact@example.com</p>
|
||||
</div>
|
||||
</aside>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<section id="integration-tab" class="tab-content">
|
||||
<div class="integration-content" id="app-integration">
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- <section id="docs-tab" class="tab-content">
|
||||
<div class="docs-content" id="app-docs">
|
||||
</div>
|
||||
</section> -->
|
||||
|
||||
<!-- <section id="support-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Support</h2>
|
||||
<div class="support-grid">
|
||||
<div class="support-card">
|
||||
<h3>📧 Contact</h3>
|
||||
<p id="app-contact">contact@example.com</p>
|
||||
</div>
|
||||
<div class="support-card">
|
||||
<h3>🐛 Report Issues</h3>
|
||||
<p>Found a bug? Report it on GitHub Issues.</p>
|
||||
</div>
|
||||
<div class="support-card">
|
||||
<h3>💬 Community</h3>
|
||||
<p>Join our Discord for help and discussions.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section> -->
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<!-- Related Apps -->
|
||||
<section class="related-apps">
|
||||
<h2>Related Apps</h2>
|
||||
<div id="related-apps-grid" class="related-grid">
|
||||
<!-- Dynamic related apps -->
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<script src="app-detail.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
318
docs/md_v2/marketplace/app-detail.js
Normal file
318
docs/md_v2/marketplace/app-detail.js
Normal file
@@ -0,0 +1,318 @@
|
||||
// App Detail Page JavaScript
|
||||
// Resolve where the marketplace API lives.
// When the page is served from a local host on an explicit port other than
// 8100, requests are sent to http(s)://127.0.0.1:8100; in every other case
// the API is addressed relative to the current origin.
const { API_BASE, API_ORIGIN } = (() => {
    const LOCAL_HOSTS = ['localhost', '127.0.0.1', '0.0.0.0'];
    const LOCAL_API_PORT = '8100';
    const loc = window.location;

    const servedLocally = LOCAL_HOSTS.includes(loc.hostname);
    if (!servedLocally || !loc.port || loc.port === LOCAL_API_PORT) {
        return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
    }

    const apiOrigin = `${loc.protocol}//127.0.0.1:${LOCAL_API_PORT}`;
    return { API_BASE: `${apiOrigin}/marketplace/api`, API_ORIGIN: apiOrigin };
})();
|
||||
|
||||
/**
 * Controller for the marketplace app-detail page.
 *
 * Reads the app slug from the `?app=` query parameter, fetches the app from
 * the marketplace API, fills the page's placeholder elements, wires up the
 * tab buttons and renders a short list of related apps.
 *
 * Every value that comes from the API is treated as untrusted: it is written
 * with `textContent`, or HTML-escaped before it is placed in `innerHTML`,
 * and URLs are scheme-checked before they are used as link targets.
 */
class AppDetailPage {
    constructor() {
        this.appSlug = this.getAppSlugFromURL();
        this.appData = null;
        this.init();
    }

    /** Return the `app` query parameter, or '' when it is absent. */
    getAppSlugFromURL() {
        const params = new URLSearchParams(window.location.search);
        return params.get('app') || '';
    }

    /** Load and render the page; redirects to the index when no slug is given. */
    async init() {
        if (!this.appSlug) {
            window.location.href = 'index.html';
            return;
        }

        await this.loadAppDetails();
        // loadAppDetails() redirects to the index when nothing could be
        // loaded; there is no app to build tabs or related apps for.
        if (!this.appData) return;

        this.setupEventListeners();
        await this.loadRelatedApps();
    }

    /** Derive a slug from an app name the same way the list fallback expects. */
    slugify(name) {
        return String(name == null ? '' : name).toLowerCase().replace(/\s+/g, '-');
    }

    /**
     * Fetch the app by slug. If that fails, fall back to fetching the full
     * app list and matching on slug or name-derived slug. Redirects to the
     * index when the app cannot be found either way.
     */
    async loadAppDetails() {
        try {
            const response = await fetch(`${API_BASE}/apps/${encodeURIComponent(this.appSlug)}`);
            if (!response.ok) throw new Error('App not found');

            this.appData = await response.json();
            this.renderAppDetails();
        } catch (error) {
            console.error('Error loading app details:', error);
            // Fallback to loading all apps and finding the right one
            try {
                const response = await fetch(`${API_BASE}/apps`);
                if (!response.ok) throw new Error(`App list request failed (${response.status})`);

                const payload = await response.json();
                const apps = Array.isArray(payload) ? payload : [];
                this.appData = apps.find(app =>
                    app.slug === this.appSlug || this.slugify(app.name) === this.appSlug
                ) || null;

                if (this.appData) {
                    this.renderAppDetails();
                } else {
                    window.location.href = 'index.html';
                }
            } catch (err) {
                console.error('Error loading apps:', err);
                this.appData = null;
                window.location.href = 'index.html';
            }
        }
    }

    /**
     * Return `url` (trimmed) when it is relative or uses http/https/mailto,
     * otherwise ''. Used for every link target built from API data so that
     * `javascript:` and similar schemes never end up in an href.
     */
    sanitizeUrl(url) {
        if (url == null) return '';
        const candidate = String(url).trim();
        // URL parsers drop tabs/newlines, so remove whitespace and control
        // characters before looking for a scheme ("java\tscript:" etc.).
        const compact = candidate.replace(/[\u0000-\u0020\u007f]/g, '');
        const scheme = /^([a-z][a-z0-9+.-]*):/i.exec(compact);
        if (scheme && !['http', 'https', 'mailto'].includes(scheme[1].toLowerCase())) {
            return '';
        }
        return candidate;
    }

    /** Build a CSS `url("...")` value whose content cannot end the string. */
    cssUrl(value) {
        const encoded = String(value).replace(/["\\\s]/g, ch => encodeURIComponent(ch));
        return `url("${encoded}")`;
    }

    /** Copy the loaded app data into the hero, stats, buttons and sidebar. */
    renderAppDetails() {
        if (!this.appData) return;
        const app = this.appData;

        // Update title
        document.title = `${app.name} - Crawl4AI Marketplace`;

        // Hero image
        const appImage = document.getElementById('app-image');
        if (app.image) {
            appImage.style.backgroundImage = this.cssUrl(app.image);
            appImage.textContent = '';
        } else {
            appImage.textContent = `[${app.category || 'APP'}]`;
        }

        // Basic info
        document.getElementById('app-name').textContent = app.name;
        document.getElementById('app-description').textContent = app.description;
        document.getElementById('app-type').textContent = app.type || 'Open Source';
        document.getElementById('app-category').textContent = app.category;

        // Badges
        if (app.featured) {
            document.getElementById('app-featured').style.display = 'inline-block';
        }
        if (app.sponsored) {
            document.getElementById('app-sponsored').style.display = 'inline-block';
        }

        // Stats. The star count is clamped to 0..5 because String.repeat()
        // throws on a negative count (rating > 5).
        const rating = Number(app.rating) || 0;
        const fullStars = Math.min(5, Math.max(0, Math.floor(rating)));
        const stars = '★'.repeat(fullStars) + '☆'.repeat(5 - fullStars);
        document.getElementById('app-rating').textContent = stars + ` ${rating}/5`;
        document.getElementById('app-downloads').textContent = this.formatNumber(app.downloads || 0);

        // Action buttons: hidden when the URL is missing or not a safe scheme.
        const websiteBtn = document.getElementById('app-website');
        const githubBtn = document.getElementById('app-github');

        const websiteUrl = this.sanitizeUrl(app.website_url);
        if (websiteUrl) {
            websiteBtn.href = websiteUrl;
        } else {
            websiteBtn.style.display = 'none';
        }

        const githubUrl = this.sanitizeUrl(app.github_url);
        if (githubUrl) {
            githubBtn.href = githubUrl;
        } else {
            githubBtn.style.display = 'none';
        }

        // Contact (the element only exists when the support tab is enabled)
        const contactEl = document.getElementById('app-contact');
        if (contactEl) {
            contactEl.textContent = app.contact_email || 'Not available';
        }

        // Sidebar info
        document.getElementById('sidebar-downloads').textContent = this.formatNumber(app.downloads || 0);
        document.getElementById('sidebar-rating').textContent = rating.toFixed(1);
        document.getElementById('sidebar-category').textContent = app.category || '-';
        document.getElementById('sidebar-type').textContent = app.type || '-';
        document.getElementById('sidebar-status').textContent = app.status || 'Active';
        document.getElementById('sidebar-pricing').textContent = app.pricing || 'Free';
        document.getElementById('sidebar-contact').textContent = app.contact_email || 'contact@example.com';

        // Render tab contents from database fields
        this.renderTabContents();
    }

    /** Fill the overview / integration / documentation tabs that exist in the DOM. */
    renderTabContents() {
        // Overview tab - use long_description from database
        const overviewDiv = document.getElementById('app-overview');
        if (overviewDiv) {
            if (this.appData.long_description) {
                overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
            } else {
                const fallback = this.appData.description || 'No overview available.';
                overviewDiv.innerHTML = `<p>${this.escapeHtml(fallback)}</p>`;
            }
        }

        // Integration tab - use integration_guide field from database
        const integrationDiv = document.getElementById('app-integration');
        if (integrationDiv) {
            if (this.appData.integration_guide) {
                integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
                this.addCopyButtonsToCodeBlocks(integrationDiv);
            } else {
                integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
            }
        }

        // Documentation tab - use documentation field from database
        const docsDiv = document.getElementById('app-docs');
        if (docsDiv) {
            if (this.appData.documentation) {
                docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
                this.addCopyButtonsToCodeBlocks(docsDiv);
            } else {
                docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
            }
        }
    }

    /** Add a "Copy" button to every `<pre><code>` block inside `container`. */
    addCopyButtonsToCodeBlocks(container) {
        container.querySelectorAll('pre code').forEach(codeBlock => {
            const pre = codeBlock.parentElement;

            // Skip if already has a copy button
            if (pre.querySelector('.copy-btn')) return;

            const copyBtn = document.createElement('button');
            copyBtn.type = 'button';
            copyBtn.className = 'copy-btn';
            copyBtn.textContent = 'Copy';

            const resetLabel = () => setTimeout(() => {
                copyBtn.textContent = 'Copy';
            }, 2000);

            copyBtn.onclick = () => {
                // navigator.clipboard is undefined outside secure contexts.
                if (!navigator.clipboard || !navigator.clipboard.writeText) {
                    copyBtn.textContent = 'Copy failed';
                    resetLabel();
                    return;
                }
                navigator.clipboard.writeText(codeBlock.textContent).then(() => {
                    copyBtn.textContent = '✓ Copied!';
                    resetLabel();
                }).catch(err => {
                    console.error('Error copying code:', err);
                    copyBtn.textContent = 'Copy failed';
                    resetLabel();
                });
            };

            // Add button to pre element
            pre.style.position = 'relative';
            pre.insertBefore(copyBtn, codeBlock);
        });
    }

    /**
     * Convert a small Markdown subset to HTML.
     *
     * Supported: fenced code blocks, inline code, `#`-style headings,
     * `**bold**`, `*italic*`, `[label](url)` links, `*`/`-` bullet lists and
     * paragraphs (single newlines become `<br>`). All other text is
     * HTML-escaped, and links with a disallowed scheme are reduced to their
     * label.
     *
     * @param {string} text Markdown source.
     * @returns {string} HTML, or '' for empty input.
     */
    renderMarkdown(text) {
        if (!text) return '';

        // Code is swapped for private-use-character tokens so that neither
        // escaping nor the inline rules touch it; the tokens are stripped
        // from the input first so user text cannot forge one.
        const stash = [];
        const hold = (kind, html) => {
            stash.push(html);
            return `\uE000${kind}${stash.length - 1}\uE001`;
        };

        let source = String(text)
            .replace(/\r\n?/g, '\n')
            .replace(/[\uE000\uE001]/g, '');

        // Fenced blocks get their own line so they are emitted as blocks.
        source = source.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
            const block = `<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`;
            return `\n${hold('B', block)}\n`;
        });

        source = source.replace(/`([^`]+)`/g, (match, code) =>
            hold('I', `<code>${this.escapeHtml(code)}</code>`));

        // Everything left is plain text from the API: escape it before any
        // markup is generated from it.
        source = this.escapeHtml(source);

        const inline = (value) => value
            .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
            .replace(/\*(.+?)\*/g, '<em>$1</em>')
            .replace(/\[([^\]]+)\]\(([^)]+)\)/g, (match, label, href) => {
                const url = this.sanitizeUrl(href);
                return url
                    ? `<a href="${url}" target="_blank" rel="noopener noreferrer">${label}</a>`
                    : label;
            });

        const blocks = [];
        let paragraph = [];
        let items = [];
        // At most one of paragraph/items is non-empty at any time.
        const flush = () => {
            if (paragraph.length) {
                blocks.push(`<p>${paragraph.join('<br>')}</p>`);
                paragraph = [];
            }
            if (items.length) {
                blocks.push(`<ul>${items.map(item => `<li>${item}</li>`).join('')}</ul>`);
                items = [];
            }
        };

        for (const rawLine of source.split('\n')) {
            const line = rawLine.trim();
            const heading = /^(#{1,6})\s+(.*)$/.exec(line);
            const bullet = /^[*-]\s+(.*)$/.exec(line);

            if (!line) {
                flush();
            } else if (/^\uE000B\d+\uE001$/.test(line)) {
                flush();
                blocks.push(line);
            } else if (heading) {
                flush();
                const level = heading[1].length;
                blocks.push(`<h${level}>${inline(heading[2])}</h${level}>`);
            } else if (bullet) {
                if (paragraph.length) flush();
                items.push(inline(bullet[1]));
            } else {
                if (items.length) flush();
                paragraph.push(inline(line));
            }
        }
        flush();

        // A function replacer keeps "$&", "$1" etc. inside code literal.
        return blocks.join('\n').replace(/\uE000[BI](\d+)\uE001/g,
            (match, index) => stash[Number(index)]);
    }

    /**
     * Escape `text` for use in HTML text or a quoted attribute value.
     * null/undefined become ''.
     */
    escapeHtml(text) {
        const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
        const value = text == null ? '' : String(text);
        return value.replace(/[&<>"']/g, ch => entities[ch]);
    }

    /** Format a count as e.g. "950", "1.5K" or "2.5M"; non-numbers become "0". */
    formatNumber(num) {
        const value = Number(num) || 0;
        if (value >= 1000000) {
            return (value / 1000000).toFixed(1) + 'M';
        } else if (value >= 1000) {
            return (value / 1000).toFixed(1) + 'K';
        }
        return value.toString();
    }

    /** Wire the tab buttons to show the matching `<name>-tab` section. */
    setupEventListeners() {
        const tabs = document.querySelectorAll('.tab-btn');

        tabs.forEach(tab => {
            tab.addEventListener('click', () => {
                // Update active tab button
                tabs.forEach(t => t.classList.remove('active'));
                tab.classList.add('active');

                // Hide all tab contents
                document.querySelectorAll('.tab-content').forEach(content => {
                    content.classList.remove('active');
                });

                // Show the selected tab content
                const targetTab = document.getElementById(`${tab.dataset.tab}-tab`);
                if (targetTab) {
                    targetTab.classList.add('active');
                }
            });
        });
    }

    /** Render up to three other apps from the same category. */
    async loadRelatedApps() {
        if (!this.appData) return;

        try {
            const category = encodeURIComponent(this.appData.category);
            const response = await fetch(`${API_BASE}/apps?category=${category}&limit=4`);
            if (!response.ok) throw new Error(`Related apps request failed (${response.status})`);

            const payload = await response.json();
            const apps = Array.isArray(payload) ? payload : [];
            const slugOf = app => app.slug || this.slugify(app.name);

            const relatedApps = apps.filter(app => slugOf(app) !== this.appSlug).slice(0, 3);
            const grid = document.getElementById('related-apps-grid');

            grid.innerHTML = relatedApps.map(app => {
                const description = String(app.description || '');
                const summary = description.length > 100
                    ? `${description.substring(0, 100)}...`
                    : description;
                const rating = app.rating == null ? 0 : app.rating;
                return `
                <div class="related-app-card" data-slug="${this.escapeHtml(slugOf(app))}">
                    <h4>${this.escapeHtml(app.name)}</h4>
                    <p>${this.escapeHtml(summary)}</p>
                    <div style="display: flex; justify-content: space-between; margin-top: 0.5rem; font-size: 0.75rem;">
                        <span style="color: var(--primary-cyan)">${this.escapeHtml(app.type)}</span>
                        <span style="color: var(--warning)">★ ${this.escapeHtml(rating)}/5</span>
                    </div>
                </div>
            `;
            }).join('');

            // Navigation is attached as a listener instead of an inline
            // onclick so the slug is never interpreted as script.
            grid.querySelectorAll('.related-app-card').forEach(card => {
                card.addEventListener('click', () => {
                    window.location.href = `app-detail.html?app=${encodeURIComponent(card.dataset.slug)}`;
                });
            });
        } catch (error) {
            console.error('Error loading related apps:', error);
        }
    }
}
|
||||
|
||||
// Initialize once the DOM is ready. If this script is evaluated after
// DOMContentLoaded has already fired (e.g. it was injected or deferred),
// the event would never arrive, so start immediately in that case.
if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', () => {
        new AppDetailPage();
    });
} else {
    new AppDetailPage();
}
|
||||
14
docs/md_v2/marketplace/backend/.env.example
Normal file
14
docs/md_v2/marketplace/backend/.env.example
Normal file
@@ -0,0 +1,14 @@
|
||||
# Marketplace Configuration
|
||||
# Copy this to .env and update with your values
|
||||
|
||||
# Admin password (required)
|
||||
MARKETPLACE_ADMIN_PASSWORD=change_this_password
|
||||
|
||||
# JWT secret key (required) - generate with: python3 -c "import secrets; print(secrets.token_urlsafe(32))"
|
||||
MARKETPLACE_JWT_SECRET=change_this_to_a_secure_random_key
|
||||
|
||||
# Database path (optional, defaults to ./marketplace.db)
|
||||
MARKETPLACE_DB_PATH=./marketplace.db
|
||||
|
||||
# Token expiry in hours (optional, defaults to 4)
|
||||
MARKETPLACE_TOKEN_EXPIRY=4
|
||||
59
docs/md_v2/marketplace/backend/config.py
Normal file
59
docs/md_v2/marketplace/backend/config.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Marketplace Configuration - Loads from .env file
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load .env file
|
||||
env_path = Path(__file__).parent / '.env'
|
||||
if not env_path.exists():
|
||||
print("\n❌ ERROR: No .env file found!")
|
||||
print("Please copy .env.example to .env and update with your values:")
|
||||
print(f" cp {Path(__file__).parent}/.env.example {Path(__file__).parent}/.env")
|
||||
print("\nThen edit .env with your secure values.")
|
||||
sys.exit(1)
|
||||
|
||||
load_dotenv(env_path)
|
||||
|
||||
# Required environment variables
|
||||
required_vars = ['MARKETPLACE_ADMIN_PASSWORD', 'MARKETPLACE_JWT_SECRET']
|
||||
missing_vars = [var for var in required_vars if not os.getenv(var)]
|
||||
|
||||
if missing_vars:
|
||||
print(f"\n❌ ERROR: Missing required environment variables: {', '.join(missing_vars)}")
|
||||
print("Please check your .env file and ensure all required variables are set.")
|
||||
sys.exit(1)
|
||||
|
||||
def _token_expiry_hours(default: int = 4) -> int:
    """Return MARKETPLACE_TOKEN_EXPIRY as a positive whole number of hours.

    An unset or empty variable yields ``default``. Any other value that is
    not a positive integer stops startup with the same print-and-exit style
    used for the other configuration errors in this module, instead of an
    unexplained ``ValueError`` traceback.
    """
    raw = os.getenv('MARKETPLACE_TOKEN_EXPIRY') or str(default)
    try:
        hours = int(raw)
    except ValueError:
        hours = 0
    if hours <= 0:
        print(f"\n❌ ERROR: MARKETPLACE_TOKEN_EXPIRY must be a positive integer (got {raw!r})")
        sys.exit(1)
    return hours


class Config:
    """Configuration loaded from environment variables"""

    # Admin authentication - SHA-256 hex digest of the password in .env.
    # NOTE(review): an unsalted SHA-256 digest is cheap to brute-force; a
    # password KDF (hashlib.scrypt / hashlib.pbkdf2_hmac) would be stronger.
    # The code that compares against this value is not in this file, so the
    # digest format is deliberately left unchanged here.
    ADMIN_PASSWORD_HASH = hashlib.sha256(
        os.getenv('MARKETPLACE_ADMIN_PASSWORD').encode()
    ).hexdigest()

    # JWT secret for token generation
    JWT_SECRET_KEY = os.getenv('MARKETPLACE_JWT_SECRET')

    # Database path
    DATABASE_PATH = os.getenv('MARKETPLACE_DB_PATH', './marketplace.db')

    # Token expiry in hours (validated; defaults to 4)
    TOKEN_EXPIRY_HOURS = _token_expiry_hours()

    # CORS origins - hardcoded as they don't contain secrets
    ALLOWED_ORIGINS = [
        "http://localhost:8000",
        "http://localhost:8080",
        "http://localhost:8100",
        "http://127.0.0.1:8000",
        "http://127.0.0.1:8080",
        "http://127.0.0.1:8100",
        "https://crawl4ai.com",
        "https://www.crawl4ai.com",
        "https://docs.crawl4ai.com",
        "https://market.crawl4ai.com"
    ]
|
||||
117
docs/md_v2/marketplace/backend/database.py
Normal file
117
docs/md_v2/marketplace/backend/database.py
Normal file
@@ -0,0 +1,117 @@
|
||||
import sqlite3
|
||||
import yaml
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any
|
||||
|
||||
class DatabaseManager:
    """SQLite store whose tables are created and extended from a YAML schema.

    The schema is read as a mapping with a ``database.name`` entry and a
    ``tables`` mapping of ``{table: {'columns': {column: spec}}}``. Each
    column ``spec`` has a ``type`` and may set ``primary``, ``autoincrement``,
    ``unique``, ``required`` and ``default``.
    """

    def __init__(self, db_path=None, schema_path='schema.yaml'):
        """Load the schema, open the database and create/extend its tables.

        Args:
            db_path: SQLite file to open; when omitted or empty the schema's
                ``database.name`` is used.
            schema_path: Path of the YAML schema file.
        """
        self.schema = self._load_schema(schema_path)
        # Use provided path or fallback to schema default
        self.db_path = db_path or self.schema['database']['name']
        self.conn = None
        self._init_database()

    def _load_schema(self, path: str) -> Dict:
        """Parse the YAML schema at ``path`` (read as UTF-8)."""
        with open(path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)

    def _init_database(self):
        """Auto-create/migrate database from schema"""
        # check_same_thread=False lets the single connection be used from
        # threads other than the one that created it.
        self.conn = sqlite3.connect(self.db_path, check_same_thread=False)
        self.conn.row_factory = sqlite3.Row

        for table_name, table_def in self.schema['tables'].items():
            self._create_or_update_table(table_name, table_def['columns'])

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return ``name`` as a double-quoted SQL identifier."""
        return '"' + str(name).replace('"', '""') + '"'

    @staticmethod
    def _default_clause(col_spec: Dict) -> str:
        """Return the `` DEFAULT ...`` fragment for a column spec, or ''.

        Strings are emitted as SQL string literals with embedded single
        quotes doubled, except ``CURRENT_TIMESTAMP`` which is passed through
        as a keyword. ``None`` becomes ``NULL`` and booleans become 0/1 so
        the clause never contains a Python repr.
        """
        if 'default' not in col_spec:
            return ""
        default = col_spec['default']
        if default is None:
            literal = "NULL"
        elif isinstance(default, bool):
            literal = str(int(default))
        elif isinstance(default, str) and default != 'CURRENT_TIMESTAMP':
            literal = "'" + default.replace("'", "''") + "'"
        else:
            literal = str(default)
        return f" DEFAULT {literal}"

    def _create_or_update_table(self, table_name: str, columns: Dict):
        """Create ``table_name`` from ``columns``, or add any missing columns.

        Table and column names come from the schema file and are used in the
        SQL as written. For an existing table only the type and default of a
        new column are emitted; ``primary``/``unique``/``required`` are not
        applied when adding a column.
        """
        cursor = self.conn.cursor()

        # Check if table exists
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table_name,),
        )
        table_exists = cursor.fetchone() is not None

        if not table_exists:
            col_defs = []
            for col_name, col_spec in columns.items():
                col_def = f"{col_name} {col_spec['type']}"
                if col_spec.get('primary'):
                    col_def += " PRIMARY KEY"
                if col_spec.get('autoincrement'):
                    col_def += " AUTOINCREMENT"
                if col_spec.get('unique'):
                    col_def += " UNIQUE"
                if col_spec.get('required'):
                    col_def += " NOT NULL"
                col_def += self._default_clause(col_spec)
                col_defs.append(col_def)

            cursor.execute(f"CREATE TABLE {table_name} ({', '.join(col_defs)})")
        else:
            # Check for new columns and add them
            cursor.execute(f"PRAGMA table_info({table_name})")
            existing_columns = {row[1] for row in cursor.fetchall()}

            for col_name, col_spec in columns.items():
                if col_name in existing_columns:
                    continue
                col_def = f"{col_spec['type']}{self._default_clause(col_spec)}"
                cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {col_name} {col_def}")

        self.conn.commit()

    def get_all(self, table: str, limit: int = 100, offset: int = 0,
                where: str = None, params: Any = None) -> List[Dict]:
        """Return rows of ``table`` as dicts.

        Args:
            table: Table name; quoted as an identifier before use.
            limit: Maximum number of rows (bound as a parameter).
            offset: Number of rows to skip (bound as a parameter).
            where: Optional SQL condition inserted verbatim after ``WHERE``.
                It must never contain untrusted text; put values in
                ``params`` and reference them with ``?``.
            params: Optional sequence of values for the ``?`` placeholders
                used in ``where``.

        Returns:
            A list with one ``dict`` per row.
        """
        query = f"SELECT * FROM {self._quote_identifier(table)}"
        if where:
            query += f" WHERE {where}"
        query += " LIMIT ? OFFSET ?"
        bindings = list(params or ()) + [int(limit), int(offset)]

        cursor = self.conn.cursor()
        cursor.execute(query, bindings)
        return [dict(row) for row in cursor.fetchall()]

    def search(self, query: str, tables: List[str] = None) -> Dict[str, List[Dict]]:
        """Substring-search the TEXT columns of the given tables.

        Args:
            query: Text to look for. ``%`` and ``_`` are matched literally
                rather than acting as LIKE wildcards.
            tables: Table names to search; all schema tables when omitted
                or empty.

        Returns:
            ``{table: [row dicts]}`` with at most 10 rows per table; tables
            without matches are left out.

        Raises:
            KeyError: If a name in ``tables`` is not defined in the schema.
        """
        if not tables:
            tables = list(self.schema['tables'].keys())

        escaped = query.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        pattern = f"%{escaped}%"

        results = {}
        cursor = self.conn.cursor()

        for table in tables:
            # Search in text columns
            columns = self.schema['tables'][table]['columns']
            text_cols = [col for col, spec in columns.items()
                         if spec['type'] == 'TEXT' and col != 'id']
            if not text_cols:
                continue

            where_clause = ' OR '.join(f"{col} LIKE ? ESCAPE '\\'" for col in text_cols)
            cursor.execute(
                f"SELECT * FROM {table} WHERE {where_clause} LIMIT 10",
                [pattern] * len(text_cols),
            )
            rows = cursor.fetchall()
            if rows:
                results[table] = [dict(row) for row in rows]

        return results

    def close(self):
        """Close the connection if one is open; safe to call repeatedly."""
        if self.conn:
            self.conn.close()
            self.conn = None
|
||||
267
docs/md_v2/marketplace/backend/dummy_data.py
Normal file
267
docs/md_v2/marketplace/backend/dummy_data.py
Normal file
@@ -0,0 +1,267 @@
|
||||
import sqlite3
|
||||
import json
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from database import DatabaseManager
|
||||
|
||||
def generate_slug(text):
    """Return a URL slug for *text*.

    The text is lowercased, every run of whitespace becomes a single hyphen
    (leading/trailing whitespace is dropped) and ``&`` is spelled out as
    ``and``.

    >>> generate_slug("Browser Automation")
    'browser-automation'
    >>> generate_slug("Q & A")
    'q-and-a'
    """
    return '-'.join(text.lower().split()).replace('&', 'and')
|
||||
|
||||
def generate_dummy_data():
    """Wipe the marketplace tables and repopulate them with sample data.

    Deletes every row from ``apps``, ``articles``, ``categories`` and
    ``sponsors``, then inserts a fixed set of categories, apps, articles and
    sponsors. View counts, sponsor date windows and screenshot URLs are
    randomised on each run. All changes are committed together; if any
    statement fails the transaction is rolled back and the error re-raised.
    The database handle is closed before returning.
    """
    # Local import: names containing spaces must be URL-encoded before being
    # placed in the avatar query string.
    from urllib.parse import quote_plus

    # Categories: (name, icon, description)
    categories = [
        ("Browser Automation", "⚙", "Tools for browser automation and control"),
        ("Proxy Services", "🔒", "Proxy providers and rotation services"),
        ("LLM Integration", "🤖", "AI/LLM tools and integrations"),
        ("Data Processing", "📊", "Data extraction and processing tools"),
        ("Cloud Infrastructure", "☁", "Cloud browser and computing services"),
        ("Developer Tools", "🛠", "Development and testing utilities")
    ]

    # Apps with real Unsplash images:
    # (name, category, type, featured, sponsored, description, website_url,
    #  github_url, pricing, rating, downloads, image)
    apps_data = [
        # Browser Automation
        ("Playwright Cloud", "Browser Automation", "Paid", True, True,
         "Scalable browser automation in the cloud with Playwright", "https://playwright.cloud",
         None, "$99/month starter", 4.8, 12500,
         "https://images.unsplash.com/photo-1633356122544-f134324a6cee?w=800&h=400&fit=crop"),

        ("Selenium Grid Hub", "Browser Automation", "Freemium", False, False,
         "Distributed Selenium grid for parallel testing", "https://seleniumhub.io",
         "https://github.com/seleniumhub/grid", "Free - $299/month", 4.2, 8400,
         "https://images.unsplash.com/photo-1555066931-4365d14bab8c?w=800&h=400&fit=crop"),

        ("Puppeteer Extra", "Browser Automation", "Open Source", True, False,
         "Enhanced Puppeteer with stealth plugins and more", "https://puppeteer-extra.dev",
         "https://github.com/berstend/puppeteer-extra", "Free", 4.6, 15200,
         "https://images.unsplash.com/photo-1461749280684-dccba630e2f6?w=800&h=400&fit=crop"),

        # Proxy Services
        ("BrightData", "Proxy Services", "Paid", True, True,
         "Premium proxy network with 72M+ IPs worldwide", "https://brightdata.com",
         None, "Starting $500/month", 4.7, 9800,
         "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=800&h=400&fit=crop"),

        ("SmartProxy", "Proxy Services", "Paid", False, True,
         "Residential and datacenter proxies with rotation", "https://smartproxy.com",
         None, "Starting $75/month", 4.3, 7600,
         "https://images.unsplash.com/photo-1544197150-b99a580bb7a8?w=800&h=400&fit=crop"),

        ("ProxyMesh", "Proxy Services", "Freemium", False, False,
         "Rotating proxy servers with sticky sessions", "https://proxymesh.com",
         None, "$10-$50/month", 4.0, 4200,
         "https://images.unsplash.com/photo-1451187580459-43490279c0fa?w=800&h=400&fit=crop"),

        # LLM Integration
        ("LangChain Crawl", "LLM Integration", "Open Source", True, False,
         "LangChain integration for Crawl4AI workflows", "https://langchain-crawl.dev",
         "https://github.com/langchain/crawl", "Free", 4.5, 18900,
         "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=800&h=400&fit=crop"),

        ("GPT Scraper", "LLM Integration", "Freemium", False, False,
         "Extract structured data using GPT models", "https://gptscraper.ai",
         None, "Free - $99/month", 4.1, 5600,
         "https://images.unsplash.com/photo-1655720828018-edd2daec9349?w=800&h=400&fit=crop"),

        ("Claude Extract", "LLM Integration", "Paid", True, True,
         "Professional extraction using Claude AI", "https://claude-extract.com",
         None, "$199/month", 4.9, 3200,
         "https://images.unsplash.com/photo-1686191128892-3b09ad503b4f?w=800&h=400&fit=crop"),

        # Data Processing
        ("DataMiner Pro", "Data Processing", "Paid", False, False,
         "Advanced data extraction and transformation", "https://dataminer.pro",
         None, "$149/month", 4.2, 6700,
         "https://images.unsplash.com/photo-1551288049-bebda4e38f71?w=800&h=400&fit=crop"),

        ("ScraperAPI", "Data Processing", "Freemium", True, True,
         "Simple API for web scraping with proxy rotation", "https://scraperapi.com",
         None, "Free - $299/month", 4.6, 22300,
         "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=800&h=400&fit=crop"),

        ("Apify", "Data Processing", "Freemium", False, False,
         "Web scraping and automation platform", "https://apify.com",
         None, "$49-$499/month", 4.4, 14500,
         "https://images.unsplash.com/photo-1504639725590-34d0984388bd?w=800&h=400&fit=crop"),

        # Cloud Infrastructure
        ("BrowserCloud", "Cloud Infrastructure", "Paid", True, True,
         "Managed headless browsers in the cloud", "https://browsercloud.io",
         None, "$199/month", 4.5, 8900,
         "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=800&h=400&fit=crop"),

        ("LambdaTest", "Cloud Infrastructure", "Freemium", False, False,
         "Cross-browser testing on cloud", "https://lambdatest.com",
         None, "Free - $99/month", 4.1, 11200,
         "https://images.unsplash.com/photo-1451187580459-43490279c0fa?w=800&h=400&fit=crop"),

        ("Browserless", "Cloud Infrastructure", "Freemium", True, False,
         "Headless browser automation API", "https://browserless.io",
         None, "$50-$500/month", 4.7, 19800,
         "https://images.unsplash.com/photo-1639762681485-074b7f938ba0?w=800&h=400&fit=crop"),

        # Developer Tools
        ("Crawl4AI VSCode", "Developer Tools", "Open Source", True, False,
         "VSCode extension for Crawl4AI development", "https://marketplace.visualstudio.com",
         "https://github.com/crawl4ai/vscode", "Free", 4.8, 34500,
         "https://images.unsplash.com/photo-1629654297299-c8506221ca97?w=800&h=400&fit=crop"),

        ("Postman Collection", "Developer Tools", "Open Source", False, False,
         "Postman collection for Crawl4AI API testing", "https://postman.com/crawl4ai",
         "https://github.com/crawl4ai/postman", "Free", 4.3, 7800,
         "https://images.unsplash.com/photo-1599507593499-a3f7d7d97667?w=800&h=400&fit=crop"),

        ("Debug Toolkit", "Developer Tools", "Open Source", False, False,
         "Debugging tools for crawler development", "https://debug.crawl4ai.com",
         "https://github.com/crawl4ai/debug", "Free", 4.0, 4300,
         "https://images.unsplash.com/photo-1515879218367-8466d910aaa4?w=800&h=400&fit=crop"),
    ]

    # Articles with real images:
    # (title, category, author, related app names, tags, featured image)
    articles_data = [
        ("Browser Automation Showdown: Playwright vs Puppeteer vs Selenium",
         "Review", "John Doe", ["Playwright Cloud", "Puppeteer Extra"],
         ["browser-automation", "comparison", "2024"],
         "https://images.unsplash.com/photo-1587620962725-abab7fe55159?w=1200&h=630&fit=crop"),

        ("Top 5 Proxy Services for Web Scraping in 2024",
         "Comparison", "Jane Smith", ["BrightData", "SmartProxy", "ProxyMesh"],
         ["proxy", "web-scraping", "guide"],
         "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=1200&h=630&fit=crop"),

        ("Integrating LLMs with Crawl4AI: A Complete Guide",
         "Tutorial", "Crawl4AI Team", ["LangChain Crawl", "GPT Scraper", "Claude Extract"],
         ["llm", "integration", "tutorial"],
         "https://images.unsplash.com/photo-1677442136019-21780ecad995?w=1200&h=630&fit=crop"),

        ("Building Scalable Crawlers with Cloud Infrastructure",
         "Tutorial", "Mike Johnson", ["BrowserCloud", "Browserless"],
         ["cloud", "scalability", "architecture"],
         "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=1200&h=630&fit=crop"),

        ("What's New in Crawl4AI Marketplace",
         "News", "Crawl4AI Team", [],
         ["marketplace", "announcement", "news"],
         "https://images.unsplash.com/photo-1556075798-4825dfaaf498?w=1200&h=630&fit=crop"),

        ("Cost Analysis: Self-Hosted vs Cloud Browser Solutions",
         "Comparison", "Sarah Chen", ["BrowserCloud", "LambdaTest", "Browserless"],
         ["cost", "cloud", "comparison"],
         "https://images.unsplash.com/photo-1554224155-8d04cb21cd6c?w=1200&h=630&fit=crop"),

        ("Getting Started with Browser Automation",
         "Tutorial", "Crawl4AI Team", ["Playwright Cloud", "Selenium Grid Hub"],
         ["beginner", "tutorial", "automation"],
         "https://images.unsplash.com/photo-1498050108023-c5249f4df085?w=1200&h=630&fit=crop"),

        ("The Future of Web Scraping: AI-Powered Extraction",
         "News", "Dr. Alan Turing", ["Claude Extract", "GPT Scraper"],
         ["ai", "future", "trends"],
         "https://images.unsplash.com/photo-1593720213428-28a5b9e94613?w=1200&h=630&fit=crop")
    ]

    # Sponsors: (company, tier, landing_url, banner_url)
    sponsors_data = [
        ("BrightData", "Gold", "https://brightdata.com",
         "https://images.unsplash.com/photo-1558494949-ef010cbdcc31?w=728&h=90&fit=crop"),
        ("ScraperAPI", "Gold", "https://scraperapi.com",
         "https://images.unsplash.com/photo-1460925895917-afdab827c52f?w=728&h=90&fit=crop"),
        ("BrowserCloud", "Silver", "https://browsercloud.io",
         "https://images.unsplash.com/photo-1667372393119-3d4c48d07fc9?w=728&h=90&fit=crop"),
        ("Claude Extract", "Silver", "https://claude-extract.com",
         "https://images.unsplash.com/photo-1686191128892-3b09ad503b4f?w=728&h=90&fit=crop"),
        ("SmartProxy", "Bronze", "https://smartproxy.com",
         "https://images.unsplash.com/photo-1544197150-b99a580bb7a8?w=728&h=90&fit=crop")
    ]

    db = DatabaseManager()
    conn = db.conn
    cursor = conn.cursor()

    try:
        # Clear existing data
        for table in ['apps', 'articles', 'categories', 'sponsors']:
            cursor.execute(f"DELETE FROM {table}")

        for i, (name, icon, desc) in enumerate(categories):
            cursor.execute("""
                INSERT INTO categories (name, slug, icon, description, order_index)
                VALUES (?, ?, ?, ?, ?)
            """, (name, generate_slug(name), icon, desc, i))

        for name, category, type_, featured, sponsored, desc, url, github, pricing, rating, downloads, image in apps_data:
            # Screenshot URLs use random photo ids; they are placeholders and
            # are not guaranteed to resolve to real images.
            screenshots = json.dumps([
                f"https://images.unsplash.com/photo-{random.randint(1500000000000, 1700000000000)}-{random.randint(1000000000000, 9999999999999)}?w=800&h=600&fit=crop",
                f"https://images.unsplash.com/photo-{random.randint(1500000000000, 1700000000000)}-{random.randint(1000000000000, 9999999999999)}?w=800&h=600&fit=crop"
            ])
            cursor.execute("""
                INSERT INTO apps (name, slug, description, category, type, featured, sponsored,
                                  website_url, github_url, pricing, rating, downloads, image, screenshots, logo_url,
                                  integration_guide, contact_email, views)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (name, generate_slug(name), desc, category, type_, featured, sponsored,
                  url, github, pricing, rating, downloads, image, screenshots,
                  f"https://ui-avatars.com/api/?name={quote_plus(name)}&background=50ffff&color=070708&size=128",
                  f"# {name} Integration\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n# Integration code coming soon...\n```",
                  f"contact@{generate_slug(name)}.com",
                  random.randint(100, 5000)))

        for title, category, author, related_apps, tags, image in articles_data:
            # Get app IDs for related apps; names with no matching app are skipped.
            related_ids = []
            for app_name in related_apps:
                cursor.execute("SELECT id FROM apps WHERE name = ?", (app_name,))
                result = cursor.fetchone()
                if result:
                    related_ids.append(result[0])

            # The literal is kept flush-left so the stored markdown has no
            # leading indentation.
            content = f"""# {title}

By {author} | {datetime.now().strftime('%B %d, %Y')}

## Introduction

This is a comprehensive article about {title.lower()}. Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

## Key Points

- Important point about the topic
- Another crucial insight
- Technical details and specifications
- Performance comparisons

## Conclusion

In summary, this article explored various aspects of the topic. Stay tuned for more updates!
"""

            cursor.execute("""
                INSERT INTO articles (title, slug, content, author, category, related_apps,
                                      featured_image, tags, views)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (title, generate_slug(title), content, author, category,
                  json.dumps(related_ids), image, json.dumps(tags),
                  random.randint(200, 10000)))

        for company, tier, landing_url, banner in sponsors_data:
            start_date = datetime.now() - timedelta(days=random.randint(1, 30))
            end_date = datetime.now() + timedelta(days=random.randint(30, 180))

            cursor.execute("""
                INSERT INTO sponsors (company_name, logo_url, tier, banner_url,
                                      landing_url, active, start_date, end_date)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """, (company,
                  f"https://ui-avatars.com/api/?name={quote_plus(company)}&background=09b5a5&color=fff&size=200",
                  tier, banner, landing_url, 1,
                  start_date.isoformat(), end_date.isoformat()))

        conn.commit()
    except Exception:
        # Do not leave the tables half-cleared or half-populated.
        conn.rollback()
        raise
    finally:
        db.close()

    print("✓ Dummy data generated successfully!")
    print(f" - {len(categories)} categories")
    print(f" - {len(apps_data)} apps")
    print(f" - {len(articles_data)} articles")
    print(f" - {len(sponsors_data)} sponsors")
|
||||
|
||||
# Seed the database only when this file is executed as a script.
if __name__ == "__main__":
    generate_dummy_data()
|
||||
5
docs/md_v2/marketplace/backend/requirements.txt
Normal file
5
docs/md_v2/marketplace/backend/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
pyyaml
|
||||
python-multipart
|
||||
python-dotenv
|
||||
75
docs/md_v2/marketplace/backend/schema.yaml
Normal file
75
docs/md_v2/marketplace/backend/schema.yaml
Normal file
@@ -0,0 +1,75 @@
|
||||
database:
|
||||
name: marketplace.db
|
||||
|
||||
tables:
|
||||
apps:
|
||||
columns:
|
||||
id: {type: INTEGER, primary: true, autoincrement: true}
|
||||
name: {type: TEXT, required: true}
|
||||
slug: {type: TEXT, unique: true}
|
||||
description: {type: TEXT}
|
||||
long_description: {type: TEXT}
|
||||
logo_url: {type: TEXT}
|
||||
image: {type: TEXT}
|
||||
screenshots: {type: JSON, default: '[]'}
|
||||
category: {type: TEXT}
|
||||
type: {type: TEXT, default: 'Open Source'}
|
||||
status: {type: TEXT, default: 'Active'}
|
||||
website_url: {type: TEXT}
|
||||
github_url: {type: TEXT}
|
||||
demo_url: {type: TEXT}
|
||||
video_url: {type: TEXT}
|
||||
documentation_url: {type: TEXT}
|
||||
support_url: {type: TEXT}
|
||||
discord_url: {type: TEXT}
|
||||
pricing: {type: TEXT}
|
||||
rating: {type: REAL, default: 0.0}
|
||||
downloads: {type: INTEGER, default: 0}
|
||||
featured: {type: BOOLEAN, default: 0}
|
||||
sponsored: {type: BOOLEAN, default: 0}
|
||||
integration_guide: {type: TEXT}
|
||||
documentation: {type: TEXT}
|
||||
examples: {type: TEXT}
|
||||
installation_command: {type: TEXT}
|
||||
requirements: {type: TEXT}
|
||||
changelog: {type: TEXT}
|
||||
tags: {type: JSON, default: '[]'}
|
||||
added_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
|
||||
updated_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
|
||||
contact_email: {type: TEXT}
|
||||
views: {type: INTEGER, default: 0}
|
||||
|
||||
articles:
|
||||
columns:
|
||||
id: {type: INTEGER, primary: true, autoincrement: true}
|
||||
title: {type: TEXT, required: true}
|
||||
slug: {type: TEXT, unique: true}
|
||||
content: {type: TEXT}
|
||||
author: {type: TEXT, default: 'Crawl4AI Team'}
|
||||
category: {type: TEXT}
|
||||
related_apps: {type: JSON, default: '[]'}
|
||||
featured_image: {type: TEXT}
|
||||
published_date: {type: DATETIME, default: CURRENT_TIMESTAMP}
|
||||
tags: {type: JSON, default: '[]'}
|
||||
views: {type: INTEGER, default: 0}
|
||||
|
||||
categories:
|
||||
columns:
|
||||
id: {type: INTEGER, primary: true, autoincrement: true}
|
||||
name: {type: TEXT, unique: true}
|
||||
slug: {type: TEXT, unique: true}
|
||||
icon: {type: TEXT}
|
||||
description: {type: TEXT}
|
||||
order_index: {type: INTEGER, default: 0}
|
||||
|
||||
sponsors:
|
||||
columns:
|
||||
id: {type: INTEGER, primary: true, autoincrement: true}
|
||||
company_name: {type: TEXT, required: true}
|
||||
logo_url: {type: TEXT}
|
||||
tier: {type: TEXT, default: 'Bronze'}
|
||||
banner_url: {type: TEXT}
|
||||
landing_url: {type: TEXT}
|
||||
active: {type: BOOLEAN, default: 1}
|
||||
start_date: {type: DATETIME}
|
||||
end_date: {type: DATETIME}
|
||||
497
docs/md_v2/marketplace/backend/server.py
Normal file
497
docs/md_v2/marketplace/backend/server.py
Normal file
@@ -0,0 +1,497 @@
|
||||
from fastapi import FastAPI, HTTPException, Query, Depends, Body, UploadFile, File, Form, APIRouter
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from typing import Optional, Dict, Any
|
||||
import json
|
||||
import hashlib
|
||||
import secrets
|
||||
import re
|
||||
from pathlib import Path
|
||||
from database import DatabaseManager
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Import configuration (will exit if .env not found or invalid)
|
||||
from config import Config
|
||||
|
||||
# Application object and the router that carries every endpoint below under
# the /marketplace/api prefix.
app = FastAPI(title="Crawl4AI Marketplace API")
router = APIRouter(prefix="/marketplace/api")

# Security setup
security = HTTPBearer()
# Maps an issued admin token to the datetime at which it expires.
tokens = {}  # In production, use Redis or database for token storage

# CORS configuration
app.add_middleware(
    CORSMiddleware,
    allow_origins=Config.ALLOWED_ORIGINS,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"],
    allow_headers=["*"],
    max_age=3600
)

# Initialize database with configurable path
db = DatabaseManager(Config.DATABASE_PATH)

# Uploaded files live in an "uploads" directory next to this module; it is
# created at import time if missing.
BASE_DIR = Path(__file__).parent
UPLOAD_ROOT = BASE_DIR / "uploads"
UPLOAD_ROOT.mkdir(parents=True, exist_ok=True)

# Serve uploaded files back as static content under /uploads.
app.mount("/uploads", StaticFiles(directory=UPLOAD_ROOT), name="uploads")

# Accepted upload MIME types and the file extension stored for each.
# NOTE(review): SVG files can embed scripts and are served from this app's
# own origin via /uploads — confirm this is acceptable.
ALLOWED_IMAGE_TYPES = {
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "image/webp": ".webp",
    "image/svg+xml": ".svg"
}
# Sub-folders of UPLOAD_ROOT that uploads may be written to.
ALLOWED_UPLOAD_FOLDERS = {"sponsors"}
MAX_UPLOAD_SIZE = 2 * 1024 * 1024  # 2 MB
|
||||
|
||||
def json_response(data, cache_time=3600):
    """Wrap ``data`` in a JSONResponse with public cache and nosniff headers.

    Args:
        data: JSON-serialisable payload.
        cache_time: Value for the ``max-age`` directive, in seconds.
    """
    response_headers = {
        "Cache-Control": f"public, max-age={cache_time}",
        "X-Content-Type-Options": "nosniff"
    }
    return JSONResponse(content=data, headers=response_headers)
|
||||
|
||||
|
||||
def to_int(value, default=0):
    """Coerce incoming values to integers, falling back to ``default``.

    Rules:
        * ``None`` -> ``default``.
        * ``bool`` / ``int`` / ``float`` -> ``int(value)`` (floats truncate
          toward zero); NaN and infinities -> ``default``.
        * ``str`` -> the leading optionally-negative integer after stripping
          whitespace (``"42px"`` -> 42); no leading digits -> ``default``.
        * Any other type -> ``default``.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(inf) raises OverflowError;
            # neither has an integer representation.
            return default

    if isinstance(value, str):
        match = re.match(r"^-?\d+", value.strip())
        if match:
            try:
                return int(match.group())
            except ValueError:
                # e.g. digit strings longer than the interpreter's int
                # conversion limit.
                return default
    return default
|
||||
|
||||
# ============= PUBLIC ENDPOINTS =============
|
||||
|
||||
@router.get("/apps")
async def get_apps(
    category: Optional[str] = None,
    type: Optional[str] = None,
    featured: Optional[bool] = None,
    sponsored: Optional[bool] = None,
    limit: int = Query(default=20, le=10000),
    offset: int = Query(default=0)
):
    """Get apps with optional filters.

    Every filter value is bound as a SQL parameter; request input is never
    formatted into the statement text.

    Returns:
        JSON list of app rows, with ``screenshots`` decoded from its stored
        JSON string when present.
    """
    conditions = []
    bindings = []
    if category:
        conditions.append("category = ?")
        bindings.append(category)
    if type:
        conditions.append("type = ?")
        bindings.append(type)
    if featured is not None:
        conditions.append("featured = ?")
        bindings.append(1 if featured else 0)
    if sponsored is not None:
        conditions.append("sponsored = ?")
        bindings.append(1 if sponsored else 0)

    query = "SELECT * FROM apps"
    if conditions:
        query += " WHERE " + " AND ".join(conditions)
    query += " LIMIT ? OFFSET ?"
    bindings.extend([limit, offset])

    cursor = db.conn.cursor()
    cursor.execute(query, bindings)
    apps = [dict(row) for row in cursor.fetchall()]

    # Parse JSON fields
    for record in apps:
        if record.get('screenshots'):
            record['screenshots'] = json.loads(record['screenshots'])

    return json_response(apps)
|
||||
|
||||
@router.get("/apps/{slug}")
async def get_app(slug: str):
    """Get single app by slug.

    The slug comes from the URL path, so it is bound as a SQL parameter.

    Raises:
        HTTPException: 404 when no app has this slug.
    """
    cursor = db.conn.cursor()
    cursor.execute("SELECT * FROM apps WHERE slug = ? LIMIT 1", (slug,))
    row = cursor.fetchone()
    if row is None:
        raise HTTPException(status_code=404, detail="App not found")

    record = dict(row)
    if record.get('screenshots'):
        record['screenshots'] = json.loads(record['screenshots'])

    return json_response(record)
|
||||
|
||||
@router.get("/articles")
async def get_articles(
    category: Optional[str] = None,
    limit: int = Query(default=20, le=10000),
    offset: int = Query(default=0)
):
    """Get articles with optional category filter.

    The category value is bound as a SQL parameter; request input is never
    formatted into the statement text.

    Returns:
        JSON list of article rows, with ``related_apps`` and ``tags`` decoded
        from their stored JSON strings when present.
    """
    query = "SELECT * FROM articles"
    bindings = []
    if category:
        query += " WHERE category = ?"
        bindings.append(category)
    query += " LIMIT ? OFFSET ?"
    bindings.extend([limit, offset])

    cursor = db.conn.cursor()
    cursor.execute(query, bindings)
    articles = [dict(row) for row in cursor.fetchall()]

    # Parse JSON fields
    for article in articles:
        if article.get('related_apps'):
            article['related_apps'] = json.loads(article['related_apps'])
        if article.get('tags'):
            article['tags'] = json.loads(article['tags'])

    return json_response(articles)
|
||||
|
||||
@router.get("/articles/{slug}")
async def get_article(slug: str):
    """Get single article by slug.

    The slug comes from the URL path, so it is bound as a SQL parameter.

    Raises:
        HTTPException: 404 when no article has this slug.
    """
    cursor = db.conn.cursor()
    cursor.execute("SELECT * FROM articles WHERE slug = ? LIMIT 1", (slug,))
    row = cursor.fetchone()
    if row is None:
        raise HTTPException(status_code=404, detail="Article not found")

    article = dict(row)
    if article.get('related_apps'):
        article['related_apps'] = json.loads(article['related_apps'])
    if article.get('tags'):
        article['tags'] = json.loads(article['tags'])

    return json_response(article)
|
||||
|
||||
@router.get("/categories")
async def get_categories():
    """Return up to 50 categories sorted by ascending ``order_index``."""
    rows = db.get_all('categories', limit=50)
    # Stored order_index values are coerced to ints before sorting.
    for entry in rows:
        entry['order_index'] = to_int(entry.get('order_index'), 0)
    ordered = sorted(rows, key=lambda entry: entry.get('order_index', 0))
    return json_response(ordered, cache_time=7200)
|
||||
|
||||
@router.get("/sponsors")
async def get_sponsors(active: Optional[bool] = True):
    """List sponsors (at most 20), by default only the active ones.

    When ``active`` is truthy, sponsors whose start date is still in the
    future or whose end date has passed are dropped; a missing date counts as
    unbounded. When ``active`` is ``None`` no filter is applied at all.
    """
    if active is None:
        where = None
    else:
        where = f"active = {1 if active else 0}"
    sponsors = db.get_all('sponsors', where=where, limit=20)

    if not active:
        return json_response(sponsors)

    # Dates are compared as ISO-8601 strings against the current local time.
    now = datetime.now().isoformat()
    current = []
    for sponsor in sponsors:
        starts = sponsor.get('start_date')
        if starts and starts > now:
            continue
        ends = sponsor.get('end_date')
        if ends and ends < now:
            continue
        current.append(sponsor)

    return json_response(current)
|
||||
|
||||
@router.get("/search")
async def search(q: str = Query(min_length=2)):
    """Full-text style lookup over the apps and articles tables.

    Queries shorter than two characters return an empty object. JSON-encoded
    columns in the matches are decoded before the response is built.
    """
    if len(q) < 2:
        return json_response({})

    results = db.search(q, tables=['apps', 'articles'])

    # JSON-encoded columns to decode, per table.
    json_columns = {
        'apps': ('screenshots',),
        'articles': ('related_apps', 'tags'),
    }
    for table, items in results.items():
        columns = json_columns.get(table, ())
        for item in items:
            for column in columns:
                if item.get(column):
                    item[column] = json.loads(item[column])

    return json_response(results, cache_time=1800)
|
||||
|
||||
@router.get("/stats")
async def get_stats():
    """Get marketplace statistics.

    Totals are computed with ``COUNT(*)`` so no rows are loaded into memory
    and the result is not capped by a fetch limit.
    """
    cursor = db.conn.cursor()

    def count(sql):
        # Each statement is a fixed literal; no request input is involved.
        return cursor.execute(sql).fetchone()[0]

    stats = {
        "total_apps": count("SELECT COUNT(*) FROM apps"),
        "total_articles": count("SELECT COUNT(*) FROM articles"),
        "total_categories": count("SELECT COUNT(*) FROM categories"),
        "active_sponsors": count("SELECT COUNT(*) FROM sponsors WHERE active = 1")
    }
    return json_response(stats, cache_time=1800)
|
||||
|
||||
# ============= ADMIN AUTHENTICATION =============
|
||||
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Verify admin authentication token.

    Returns:
        The bearer token when it is known and not yet expired.

    Raises:
        HTTPException: 401 when the token is unknown or expired.
    """
    token = credentials.credentials
    expires_at = tokens.get(token)
    if expires_at is None or expires_at < datetime.now():
        # Evict expired tokens so the in-memory store does not grow without
        # bound.
        tokens.pop(token, None)
        raise HTTPException(status_code=401, detail="Invalid or expired token")
    return token
|
||||
|
||||
|
||||
@router.post("/admin/upload-image", dependencies=[Depends(verify_token)])
async def upload_image(file: UploadFile = File(...), folder: str = Form("sponsors")):
    """Upload image files for admin assets.

    Args:
        file: The uploaded image; its declared content type must be one of
            ``ALLOWED_IMAGE_TYPES``.
        folder: Target sub-folder; must be one of ``ALLOWED_UPLOAD_FOLDERS``.

    Returns:
        ``{"url": ...}`` with the path under ``/uploads`` of the stored file.

    Raises:
        HTTPException: 400 for an unknown folder, unsupported type or a file
            larger than ``MAX_UPLOAD_SIZE``.
    """
    folder = (folder or "").strip().lower()
    if folder not in ALLOWED_UPLOAD_FOLDERS:
        raise HTTPException(status_code=400, detail="Invalid upload folder")

    # NOTE(review): content_type is supplied by the client and is not checked
    # against the actual file bytes.
    if file.content_type not in ALLOWED_IMAGE_TYPES:
        raise HTTPException(status_code=400, detail="Unsupported file type")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without buffering an arbitrarily large body in memory.
    contents = await file.read(MAX_UPLOAD_SIZE + 1)
    if len(contents) > MAX_UPLOAD_SIZE:
        raise HTTPException(status_code=400, detail="File too large (max 2MB)")

    # The stored name is generated server-side; the client-supplied filename
    # is never used.
    extension = ALLOWED_IMAGE_TYPES[file.content_type]
    filename = f"{datetime.now().strftime('%Y%m%d%H%M%S')}_{secrets.token_hex(8)}{extension}"

    target_dir = UPLOAD_ROOT / folder
    target_dir.mkdir(parents=True, exist_ok=True)
    target_path = target_dir / filename
    target_path.write_bytes(contents)

    return {"url": f"/uploads/{folder}/{filename}"}
|
||||
|
||||
@router.post("/admin/login")
async def admin_login(password: str = Body(..., embed=True)):
    """Admin login with password.

    Returns:
        ``{"token": ..., "expires_in": seconds}`` for a correct password.

    Raises:
        HTTPException: 401 when the password does not match.
    """
    # NOTE(review): a single unsalted SHA-256 is weak for password storage;
    # changing the scheme would require re-issuing Config.ADMIN_PASSWORD_HASH.
    provided_hash = hashlib.sha256(password.encode()).hexdigest()
    expected_hash = str(Config.ADMIN_PASSWORD_HASH or "")

    # Constant-time comparison avoids leaking how many leading characters of
    # the hash matched.
    if not secrets.compare_digest(provided_hash.encode(), expected_hash.encode()):
        # Log failed attempt in production
        print(f"Failed login attempt at {datetime.now()}")
        raise HTTPException(status_code=401, detail="Invalid password")

    # Generate secure token
    token = secrets.token_urlsafe(32)
    tokens[token] = datetime.now() + timedelta(hours=Config.TOKEN_EXPIRY_HOURS)

    return {
        "token": token,
        "expires_in": Config.TOKEN_EXPIRY_HOURS * 3600
    }
|
||||
|
||||
# ============= ADMIN ENDPOINTS =============
|
||||
|
||||
@router.get("/admin/stats", dependencies=[Depends(verify_token)])
async def get_admin_stats():
    """Get detailed admin statistics.

    All figures are computed by SQL aggregates, so they are exact regardless
    of table size, and a NULL ``views`` value cannot break the total.
    """
    cursor = db.conn.cursor()

    def scalar(sql):
        # Each statement is a fixed literal; no request input is involved.
        return cursor.execute(sql).fetchone()[0]

    stats = {
        "apps": {
            "total": scalar("SELECT COUNT(*) FROM apps"),
            "featured": scalar("SELECT COUNT(*) FROM apps WHERE featured = 1"),
            "sponsored": scalar("SELECT COUNT(*) FROM apps WHERE sponsored = 1")
        },
        "articles": scalar("SELECT COUNT(*) FROM articles"),
        "categories": scalar("SELECT COUNT(*) FROM categories"),
        "sponsors": {
            "active": scalar("SELECT COUNT(*) FROM sponsors WHERE active = 1"),
            "total": scalar("SELECT COUNT(*) FROM sponsors")
        },
        # COALESCE turns the NULL that SUM yields for an empty table into 0.
        "total_views": scalar("SELECT COALESCE(SUM(views), 0) FROM apps")
    }
    return stats
|
||||
|
||||
# Apps CRUD
|
||||
@router.post("/admin/apps", dependencies=[Depends(verify_token)])
async def create_app(app_data: Dict[str, Any]):
    """Create new app.

    Args:
        app_data: Column name -> value mapping for the new row. List values
            for ``screenshots`` and ``tags`` are stored as JSON strings.

    Raises:
        HTTPException: 400 for an empty payload, a key that is not a plain
            identifier, or any database error.
    """
    if not app_data:
        raise HTTPException(status_code=400, detail="No fields provided")
    # Column names cannot be bound as parameters, so each key must be a plain
    # identifier before it is placed in the statement text.
    bad_fields = [str(k) for k in app_data if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(k))]
    if bad_fields:
        raise HTTPException(status_code=400, detail=f"Invalid field name(s): {', '.join(bad_fields)}")

    try:
        # Handle JSON fields
        for field in ['screenshots', 'tags']:
            if field in app_data and isinstance(app_data[field], list):
                app_data[field] = json.dumps(app_data[field])

        cursor = db.conn.cursor()
        columns = ', '.join(app_data.keys())
        placeholders = ', '.join(['?' for _ in app_data])
        cursor.execute(f"INSERT INTO apps ({columns}) VALUES ({placeholders})",
                       list(app_data.values()))
        db.conn.commit()
        return {"id": cursor.lastrowid, "message": "App created"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.put("/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
async def update_app(app_id: int, app_data: Dict[str, Any]):
    """Update app.

    Args:
        app_id: Primary key of the app to update.
        app_data: Column name -> new value mapping. List values for
            ``screenshots`` and ``tags`` are stored as JSON strings.

    Raises:
        HTTPException: 400 for an empty payload, a key that is not a plain
            identifier, or any database error.
    """
    if not app_data:
        raise HTTPException(status_code=400, detail="No fields provided")
    # Column names cannot be bound as parameters, so each key must be a plain
    # identifier before it is placed in the statement text.
    bad_fields = [str(k) for k in app_data if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(k))]
    if bad_fields:
        raise HTTPException(status_code=400, detail=f"Invalid field name(s): {', '.join(bad_fields)}")

    try:
        # Handle JSON fields
        for field in ['screenshots', 'tags']:
            if field in app_data and isinstance(app_data[field], list):
                app_data[field] = json.dumps(app_data[field])

        set_clause = ', '.join([f"{k} = ?" for k in app_data.keys()])
        cursor = db.conn.cursor()
        cursor.execute(f"UPDATE apps SET {set_clause} WHERE id = ?",
                       list(app_data.values()) + [app_id])
        db.conn.commit()
        return {"message": "App updated"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.delete("/admin/apps/{app_id}", dependencies=[Depends(verify_token)])
async def delete_app(app_id: int):
    """Remove the app with the given id and commit the change."""
    connection = db.conn
    connection.cursor().execute("DELETE FROM apps WHERE id = ?", (app_id,))
    connection.commit()
    return {"message": "App deleted"}
|
||||
|
||||
# Articles CRUD
|
||||
@router.post("/admin/articles", dependencies=[Depends(verify_token)])
async def create_article(article_data: Dict[str, Any]):
    """Create new article.

    Args:
        article_data: Column name -> value mapping for the new row. List
            values for ``related_apps`` and ``tags`` are stored as JSON
            strings.

    Raises:
        HTTPException: 400 for an empty payload, a key that is not a plain
            identifier, or any database error.
    """
    if not article_data:
        raise HTTPException(status_code=400, detail="No fields provided")
    # Column names cannot be bound as parameters, so each key must be a plain
    # identifier before it is placed in the statement text.
    bad_fields = [str(k) for k in article_data if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(k))]
    if bad_fields:
        raise HTTPException(status_code=400, detail=f"Invalid field name(s): {', '.join(bad_fields)}")

    try:
        for field in ['related_apps', 'tags']:
            if field in article_data and isinstance(article_data[field], list):
                article_data[field] = json.dumps(article_data[field])

        cursor = db.conn.cursor()
        columns = ', '.join(article_data.keys())
        placeholders = ', '.join(['?' for _ in article_data])
        cursor.execute(f"INSERT INTO articles ({columns}) VALUES ({placeholders})",
                       list(article_data.values()))
        db.conn.commit()
        return {"id": cursor.lastrowid, "message": "Article created"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.put("/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
async def update_article(article_id: int, article_data: Dict[str, Any]):
    """Update article.

    Args:
        article_id: Primary key of the article to update.
        article_data: Column name -> new value mapping. List values for
            ``related_apps`` and ``tags`` are stored as JSON strings.

    Raises:
        HTTPException: 400 for an empty payload, a key that is not a plain
            identifier, or any database error.
    """
    if not article_data:
        raise HTTPException(status_code=400, detail="No fields provided")
    # Column names cannot be bound as parameters, so each key must be a plain
    # identifier before it is placed in the statement text.
    bad_fields = [str(k) for k in article_data if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(k))]
    if bad_fields:
        raise HTTPException(status_code=400, detail=f"Invalid field name(s): {', '.join(bad_fields)}")

    try:
        for field in ['related_apps', 'tags']:
            if field in article_data and isinstance(article_data[field], list):
                article_data[field] = json.dumps(article_data[field])

        set_clause = ', '.join([f"{k} = ?" for k in article_data.keys()])
        cursor = db.conn.cursor()
        cursor.execute(f"UPDATE articles SET {set_clause} WHERE id = ?",
                       list(article_data.values()) + [article_id])
        db.conn.commit()
        return {"message": "Article updated"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.delete("/admin/articles/{article_id}", dependencies=[Depends(verify_token)])
async def delete_article(article_id: int):
    """Remove the article with the given id and commit the change."""
    connection = db.conn
    connection.cursor().execute("DELETE FROM articles WHERE id = ?", (article_id,))
    connection.commit()
    return {"message": "Article deleted"}
|
||||
|
||||
# Categories CRUD
|
||||
@router.post("/admin/categories", dependencies=[Depends(verify_token)])
async def create_category(category_data: Dict[str, Any]):
    """Create new category.

    Args:
        category_data: Column name -> value mapping for the new row.
            ``order_index`` is coerced to an int and defaults to 0 when
            missing or unparsable.

    Raises:
        HTTPException: 400 for a key that is not a plain identifier or any
            database error.
    """
    # Column names cannot be bound as parameters, so each key must be a plain
    # identifier before it is placed in the statement text.
    bad_fields = [str(k) for k in category_data if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(k))]
    if bad_fields:
        raise HTTPException(status_code=400, detail=f"Invalid field name(s): {', '.join(bad_fields)}")

    try:
        # Work on a copy so the caller's payload is not mutated.
        category_data = dict(category_data)
        category_data['order_index'] = to_int(category_data.get('order_index'), 0)

        cursor = db.conn.cursor()
        columns = ', '.join(category_data.keys())
        placeholders = ', '.join(['?' for _ in category_data])
        cursor.execute(f"INSERT INTO categories ({columns}) VALUES ({placeholders})",
                       list(category_data.values()))
        db.conn.commit()
        return {"id": cursor.lastrowid, "message": "Category created"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.put("/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
async def update_category(cat_id: int, category_data: Dict[str, Any]):
    """Update the category whose ``id`` equals ``cat_id``.

    When ``order_index`` is present it is passed through ``to_int`` with a
    fallback of ``0``. Every key in the payload is used as a column name.

    Args:
        cat_id: Value matched against the ``id`` column.
        category_data: Mapping of column name to new value.

    Returns:
        A dict containing a confirmation ``message``.

    Raises:
        HTTPException: 400 when the payload is empty, a key is not a plain
            identifier, or executing the statement raises.
    """
    try:
        category_data = dict(category_data)
        if not category_data:
            raise ValueError("No fields to update")

        # Column names cannot be bound as "?" parameters and are therefore
        # interpolated into the statement; only plain identifiers are
        # accepted so a crafted key cannot inject SQL.
        for column in category_data:
            if not (isinstance(column, str) and column.isidentifier()):
                raise ValueError(f"Invalid field name: {column!r}")

        if 'order_index' in category_data:
            category_data['order_index'] = to_int(category_data.get('order_index'), 0)

        set_clause = ', '.join(f"{column} = ?" for column in category_data)
        cursor = db.conn.cursor()
        cursor.execute(
            f"UPDATE categories SET {set_clause} WHERE id = ?",
            list(category_data.values()) + [cat_id],
        )
        db.conn.commit()
        return {"message": "Category updated"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/admin/categories/{cat_id}", dependencies=[Depends(verify_token)])
async def delete_category(cat_id: int):
    """Remove the category row whose ``id`` equals ``cat_id``.

    Returns a confirmation message; any exception raised while executing or
    committing the statement is reported as an HTTP 400 carrying the
    exception text.
    """
    try:
        cur = db.conn.cursor()
        cur.execute("DELETE FROM categories WHERE id = ?", (cat_id,))
        db.conn.commit()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {"message": "Category deleted"}
|
||||
|
||||
# Sponsors CRUD
|
||||
@router.post("/admin/sponsors", dependencies=[Depends(verify_token)])
async def create_sponsor(sponsor_data: Dict[str, Any]):
    """Insert a new sponsor row built from the request payload.

    Every key in the payload is used as a column name.

    Args:
        sponsor_data: Mapping of column name to value.

    Returns:
        A dict with the new row ``id`` (``cursor.lastrowid``) and a
        confirmation ``message``.

    Raises:
        HTTPException: 400 when the payload is empty, a key is not a plain
            identifier, or executing the statement raises.
    """
    try:
        if not sponsor_data:
            raise ValueError("No fields provided")

        # Column names cannot be bound as "?" parameters and are therefore
        # interpolated into the statement; only plain identifiers are
        # accepted so a crafted key cannot inject SQL.
        for column in sponsor_data:
            if not (isinstance(column, str) and column.isidentifier()):
                raise ValueError(f"Invalid field name: {column!r}")

        cursor = db.conn.cursor()
        columns = ', '.join(sponsor_data)
        placeholders = ', '.join('?' for _ in sponsor_data)
        cursor.execute(
            f"INSERT INTO sponsors ({columns}) VALUES ({placeholders})",
            list(sponsor_data.values()),
        )
        db.conn.commit()
        return {"id": cursor.lastrowid, "message": "Sponsor created"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@router.put("/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
async def update_sponsor(sponsor_id: int, sponsor_data: Dict[str, Any]):
    """Update the sponsor whose ``id`` equals ``sponsor_id``.

    Every key in the payload is used as the name of the column to update.

    Args:
        sponsor_id: Value matched against the ``id`` column.
        sponsor_data: Mapping of column name to new value.

    Returns:
        A dict containing a confirmation ``message``.

    Raises:
        HTTPException: 400 when the payload is empty, a key is not a plain
            identifier, or executing the statement raises.
    """
    try:
        if not sponsor_data:
            raise ValueError("No fields to update")

        # Column names cannot be bound as "?" parameters and are therefore
        # interpolated into the statement; only plain identifiers are
        # accepted so a crafted key cannot inject SQL.
        for column in sponsor_data:
            if not (isinstance(column, str) and column.isidentifier()):
                raise ValueError(f"Invalid field name: {column!r}")

        set_clause = ', '.join(f"{column} = ?" for column in sponsor_data)
        cursor = db.conn.cursor()
        cursor.execute(
            f"UPDATE sponsors SET {set_clause} WHERE id = ?",
            list(sponsor_data.values()) + [sponsor_id],
        )
        db.conn.commit()
        return {"message": "Sponsor updated"}
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
|
||||
@router.delete("/admin/sponsors/{sponsor_id}", dependencies=[Depends(verify_token)])
async def delete_sponsor(sponsor_id: int):
    """Remove the sponsor row whose ``id`` equals ``sponsor_id``.

    Returns a confirmation message; any exception raised while executing or
    committing the statement is reported as an HTTP 400 carrying the
    exception text.
    """
    try:
        cur = db.conn.cursor()
        cur.execute("DELETE FROM sponsors WHERE id = ?", (sponsor_id,))
        db.conn.commit()
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    return {"message": "Sponsor deleted"}
|
||||
|
||||
app.include_router(router)
|
||||
|
||||
# Version info reported by the root endpoint
VERSION = "1.1.0"
BUILD_DATE = "2025-10-26"


@app.get("/")
async def root():
    """Describe the API: its name, version, build date and endpoint paths."""
    endpoint_paths = [
        "/marketplace/api/apps",
        "/marketplace/api/articles",
        "/marketplace/api/categories",
        "/marketplace/api/sponsors",
        "/marketplace/api/search?q=query",
        "/marketplace/api/stats",
    ]
    info = {"name": "Crawl4AI Marketplace API"}
    info["version"] = VERSION
    info["build_date"] = BUILD_DATE
    info["endpoints"] = endpoint_paths
    return info
|
||||
|
||||
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the module is run as a
    # script; the server binds to the loopback interface on port 8100.
    import uvicorn

    uvicorn.run(app, port=8100, host="127.0.0.1")
|
||||
2
docs/md_v2/marketplace/backend/uploads/.gitignore
vendored
Normal file
2
docs/md_v2/marketplace/backend/uploads/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
*
|
||||
!.gitignore
|
||||
462
docs/md_v2/marketplace/frontend/app-detail.css
Normal file
462
docs/md_v2/marketplace/frontend/app-detail.css
Normal file
@@ -0,0 +1,462 @@
|
||||
/* App Detail Page Styles */
|
||||
|
||||
.app-detail-container {
|
||||
min-height: 100vh;
|
||||
background: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* Back Button */
|
||||
.header-nav {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.back-btn {
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.back-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.1);
|
||||
}
|
||||
|
||||
/* App Hero Section */
|
||||
.app-hero {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
.app-hero-content {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 2fr;
|
||||
gap: 3rem;
|
||||
background: linear-gradient(135deg, #1a1a2e, #0f0f1e);
|
||||
border: 2px solid var(--primary-cyan);
|
||||
padding: 2rem;
|
||||
box-shadow: 0 0 30px rgba(80, 255, 255, 0.15),
|
||||
inset 0 0 20px rgba(80, 255, 255, 0.05);
|
||||
}
|
||||
|
||||
.app-hero-image {
|
||||
width: 100%;
|
||||
height: 300px;
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
border: 1px solid var(--border-color);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 4rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.app-badges {
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.app-badge {
|
||||
padding: 0.3rem 0.6rem;
|
||||
background: var(--bg-tertiary);
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.75rem;
|
||||
text-transform: uppercase;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.app-badge.featured {
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
box-shadow: 0 2px 10px rgba(80, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.app-badge.sponsored {
|
||||
background: linear-gradient(135deg, var(--warning), #ff8c00);
|
||||
color: var(--bg-dark);
|
||||
box-shadow: 0 2px 10px rgba(245, 158, 11, 0.3);
|
||||
}
|
||||
|
||||
.app-hero-info h1 {
|
||||
font-size: 2.5rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0.5rem 0;
|
||||
text-shadow: 0 0 20px rgba(80, 255, 255, 0.5);
|
||||
}
|
||||
|
||||
.app-tagline {
|
||||
font-size: 1.1rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
/* Stats */
|
||||
.app-stats {
|
||||
display: flex;
|
||||
gap: 2rem;
|
||||
margin: 2rem 0;
|
||||
padding: 1rem 0;
|
||||
border-top: 1px solid var(--border-color);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.stat {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.25rem;
|
||||
}
|
||||
|
||||
.stat-value {
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Action Buttons */
|
||||
.app-actions {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.action-btn {
|
||||
padding: 0.75rem 1.5rem;
|
||||
border: 1px solid var(--border-color);
|
||||
background: transparent;
|
||||
color: var(--text-primary);
|
||||
text-decoration: none;
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
transition: all 0.2s;
|
||||
cursor: pointer;
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.action-btn.primary {
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.action-btn.primary:hover {
|
||||
box-shadow: 0 4px 15px rgba(80, 255, 255, 0.3);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.action-btn.secondary {
|
||||
border-color: var(--accent-pink);
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.action-btn.secondary:hover {
|
||||
background: rgba(243, 128, 245, 0.1);
|
||||
box-shadow: 0 4px 15px rgba(243, 128, 245, 0.2);
|
||||
}
|
||||
|
||||
.action-btn.ghost {
|
||||
border-color: var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.action-btn.ghost:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Pricing */
|
||||
.pricing-info {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
font-size: 1.1rem;
|
||||
}
|
||||
|
||||
.pricing-label {
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.pricing-value {
|
||||
color: var(--warning);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* Navigation Tabs */
|
||||
.app-nav {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto 0;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
border-bottom: 2px solid var(--border-color);
|
||||
}
|
||||
|
||||
.nav-tab {
|
||||
padding: 1rem 1.5rem;
|
||||
background: transparent;
|
||||
border: none;
|
||||
border-bottom: 2px solid transparent;
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
margin-bottom: -2px;
|
||||
}
|
||||
|
||||
.nav-tab:hover {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.nav-tab.active {
|
||||
color: var(--primary-cyan);
|
||||
border-bottom-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Content Sections */
|
||||
.app-content {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
.tab-content {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.tab-content.active {
|
||||
display: block;
|
||||
}
|
||||
|
||||
.docs-content {
|
||||
max-width: 1200px;
|
||||
padding: 2rem;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.docs-content h2 {
|
||||
font-size: 1.8rem;
|
||||
color: var(--primary-cyan);
|
||||
margin-bottom: 1rem;
|
||||
padding-bottom: 0.5rem;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.docs-content h3 {
|
||||
font-size: 1.3rem;
|
||||
color: var(--text-primary);
|
||||
margin: 2rem 0 1rem;
|
||||
}
|
||||
|
||||
.docs-content h4 {
|
||||
font-size: 1.1rem;
|
||||
color: var(--accent-pink);
|
||||
margin: 1.5rem 0 0.5rem;
|
||||
}
|
||||
|
||||
.docs-content p {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.6;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.docs-content code {
|
||||
background: var(--bg-tertiary);
|
||||
padding: 0.2rem 0.4rem;
|
||||
color: var(--primary-cyan);
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
/* Code Blocks */
|
||||
.code-block {
|
||||
background: var(--bg-dark);
|
||||
border: 1px solid var(--border-color);
|
||||
margin: 1rem 0;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.code-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0.5rem 1rem;
|
||||
background: var(--bg-tertiary);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.code-lang {
|
||||
color: var(--primary-cyan);
|
||||
font-size: 0.875rem;
|
||||
text-transform: uppercase;
|
||||
}
|
||||
|
||||
.copy-btn {
|
||||
padding: 0.25rem 0.5rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
font-size: 0.75rem;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.copy-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.code-block pre {
|
||||
margin: 0;
|
||||
padding: 1rem;
|
||||
overflow-x: auto;
|
||||
}
|
||||
|
||||
.code-block code {
|
||||
background: transparent;
|
||||
padding: 0;
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* Feature Grid */
|
||||
.feature-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.feature-card {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1.5rem;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.feature-card:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
background: rgba(80, 255, 255, 0.05);
|
||||
}
|
||||
|
||||
.feature-card h4 {
|
||||
margin-top: 0;
|
||||
}
|
||||
|
||||
/* Info Box */
|
||||
.info-box {
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.05), rgba(243, 128, 245, 0.03));
|
||||
border: 1px solid var(--primary-cyan);
|
||||
border-left: 4px solid var(--primary-cyan);
|
||||
padding: 1.5rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.info-box h4 {
|
||||
margin-top: 0;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Support Grid */
|
||||
.support-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
margin: 2rem 0;
|
||||
}
|
||||
|
||||
.support-card {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1.5rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.support-card h3 {
|
||||
color: var(--primary-cyan);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
/* Related Apps */
|
||||
.related-apps {
|
||||
max-width: 1800px;
|
||||
margin: 4rem auto;
|
||||
padding: 0 2rem;
|
||||
}
|
||||
|
||||
.related-apps h2 {
|
||||
font-size: 1.5rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 1.5rem;
|
||||
}
|
||||
|
||||
.related-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.related-app-card {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 1rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.related-app-card:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
/* Responsive */
|
||||
@media (max-width: 1024px) {
|
||||
.app-hero-content {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.app-stats {
|
||||
justify-content: space-around;
|
||||
}
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.app-hero-info h1 {
|
||||
font-size: 2rem;
|
||||
}
|
||||
|
||||
.app-actions {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.app-nav {
|
||||
overflow-x: auto;
|
||||
gap: 0;
|
||||
}
|
||||
|
||||
.nav-tab {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.feature-grid,
|
||||
.support-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
}
|
||||
234
docs/md_v2/marketplace/frontend/app-detail.html
Normal file
234
docs/md_v2/marketplace/frontend/app-detail.html
Normal file
@@ -0,0 +1,234 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>App Details - Crawl4AI Marketplace</title>
|
||||
<link rel="stylesheet" href="marketplace.css">
|
||||
<link rel="stylesheet" href="app-detail.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="app-detail-container">
|
||||
<!-- Header -->
|
||||
<header class="marketplace-header">
|
||||
<div class="header-content">
|
||||
<div class="header-left">
|
||||
<div class="logo-title">
|
||||
<img src="../../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||
<h1>
|
||||
<span class="ascii-border">[</span>
|
||||
Marketplace
|
||||
<span class="ascii-border">]</span>
|
||||
</h1>
|
||||
</div>
|
||||
</div>
|
||||
<div class="header-nav">
|
||||
<a href="index.html" class="back-btn">← Back to Marketplace</a>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- App Hero Section -->
|
||||
<section class="app-hero">
|
||||
<div class="app-hero-content">
|
||||
<div class="app-hero-image" id="app-image">
|
||||
<!-- Dynamic image -->
|
||||
</div>
|
||||
<div class="app-hero-info">
|
||||
<div class="app-badges">
|
||||
<span class="app-badge" id="app-type">Open Source</span>
|
||||
<span class="app-badge featured" id="app-featured" style="display:none">FEATURED</span>
|
||||
<span class="app-badge sponsored" id="app-sponsored" style="display:none">SPONSORED</span>
|
||||
</div>
|
||||
<h1 id="app-name">App Name</h1>
|
||||
<p id="app-description" class="app-tagline">App description goes here</p>
|
||||
|
||||
<div class="app-stats">
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-rating">★★★★★</span>
|
||||
<span class="stat-label">Rating</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-downloads">0</span>
|
||||
<span class="stat-label">Downloads</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-value" id="app-category">Category</span>
|
||||
<span class="stat-label">Category</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="app-actions">
|
||||
<a href="#" id="app-website" class="action-btn primary" target="_blank">
|
||||
<span>→</span> Visit Website
|
||||
</a>
|
||||
<a href="#" id="app-github" class="action-btn secondary" target="_blank">
|
||||
<span>⚡</span> View on GitHub
|
||||
</a>
|
||||
<button id="copy-integration" class="action-btn ghost">
|
||||
<span>📋</span> Copy Integration
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<div class="pricing-info">
|
||||
<span class="pricing-label">Pricing:</span>
|
||||
<span id="app-pricing" class="pricing-value">Free</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Navigation Tabs -->
|
||||
<nav class="app-nav">
|
||||
<button class="nav-tab active" data-tab="integration">Integration Guide</button>
|
||||
<button class="nav-tab" data-tab="docs">Documentation</button>
|
||||
<button class="nav-tab" data-tab="examples">Examples</button>
|
||||
<button class="nav-tab" data-tab="support">Support</button>
|
||||
</nav>
|
||||
|
||||
<!-- Content Sections -->
|
||||
<main class="app-content">
|
||||
<!-- Integration Guide Tab -->
|
||||
<section id="integration-tab" class="tab-content active">
|
||||
<div class="docs-content">
|
||||
<h2>Quick Start</h2>
|
||||
<p>Get started with this integration in just a few steps.</p>
|
||||
|
||||
<h3>Installation</h3>
|
||||
<div class="code-block">
|
||||
<div class="code-header">
|
||||
<span class="code-lang">bash</span>
|
||||
<button class="copy-btn">Copy</button>
|
||||
</div>
|
||||
<pre><code id="install-code">pip install crawl4ai</code></pre>
|
||||
</div>
|
||||
|
||||
<h3>Basic Usage</h3>
|
||||
<div class="code-block">
|
||||
<div class="code-header">
|
||||
<span class="code-lang">python</span>
|
||||
<button class="copy-btn">Copy</button>
|
||||
</div>
|
||||
<pre><code id="usage-code">from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
# Your configuration here
|
||||
)
|
||||
print(result.markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())</code></pre>
|
||||
</div>
|
||||
|
||||
<h3>Advanced Configuration</h3>
|
||||
<p>Customize the crawler with these advanced options:</p>
|
||||
|
||||
<div class="feature-grid">
|
||||
<div class="feature-card">
|
||||
<h4>🚀 Performance</h4>
|
||||
<p>Optimize crawling speed with parallel processing and caching strategies.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<h4>🔒 Authentication</h4>
|
||||
<p>Handle login forms, cookies, and session management automatically.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<h4>🎯 Extraction</h4>
|
||||
<p>Use CSS selectors, XPath, or AI-powered content extraction.</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<h4>🔄 Proxy Support</h4>
|
||||
<p>Rotate proxies and bypass rate limiting with built-in proxy management.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>Integration Example</h3>
|
||||
<div class="code-block">
|
||||
<div class="code-header">
|
||||
<span class="code-lang">python</span>
|
||||
<button class="copy-btn">Copy</button>
|
||||
</div>
|
||||
<pre><code id="integration-code">from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
async def extract_with_llm():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai",
|
||||
api_key="your-api-key",
|
||||
instruction="Extract product information"
|
||||
),
|
||||
bypass_cache=True
|
||||
)
|
||||
return result.extracted_content
|
||||
|
||||
# Run the extraction
|
||||
data = await extract_with_llm()
|
||||
print(data)</code></pre>
|
||||
</div>
|
||||
|
||||
<div class="info-box">
|
||||
<h4>💡 Pro Tip</h4>
|
||||
<p>Use the <code>bypass_cache=True</code> parameter when you need fresh data, or set <code>cache_mode="write"</code> to update the cache with new content.</p>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Documentation Tab -->
|
||||
<section id="docs-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Documentation</h2>
|
||||
<p>Complete documentation and API reference.</p>
|
||||
<!-- Dynamic content loaded here -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Examples Tab -->
|
||||
<section id="examples-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Examples</h2>
|
||||
<p>Real-world examples and use cases.</p>
|
||||
<!-- Dynamic content loaded here -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Support Tab -->
|
||||
<section id="support-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Support</h2>
|
||||
<div class="support-grid">
|
||||
<div class="support-card">
|
||||
<h3>📧 Contact</h3>
|
||||
<p id="app-contact">contact@example.com</p>
|
||||
</div>
|
||||
<div class="support-card">
|
||||
<h3>🐛 Report Issues</h3>
|
||||
<p>Found a bug? Report it on GitHub Issues.</p>
|
||||
</div>
|
||||
<div class="support-card">
|
||||
<h3>💬 Community</h3>
|
||||
<p>Join our Discord for help and discussions.</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
|
||||
<!-- Related Apps -->
|
||||
<section class="related-apps">
|
||||
<h2>Related Apps</h2>
|
||||
<div id="related-apps-grid" class="related-grid">
|
||||
<!-- Dynamic related apps -->
|
||||
</div>
|
||||
</section>
|
||||
</div>
|
||||
|
||||
<script src="app-detail.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
334
docs/md_v2/marketplace/frontend/app-detail.js
Normal file
334
docs/md_v2/marketplace/frontend/app-detail.js
Normal file
@@ -0,0 +1,334 @@
|
||||
// App Detail Page JavaScript
|
||||
// Resolve where the marketplace API lives. When the page is served from a
// local host on a port other than 8100, requests go to 127.0.0.1:8100;
// otherwise same-origin relative paths are used.
const { API_BASE, API_ORIGIN } = (function resolveApiLocation() {
    const loc = window.location;
    const localHosts = ['localhost', '127.0.0.1', '0.0.0.0'];
    const servedLocally = localHosts.includes(loc.hostname);

    if (!servedLocally || !loc.port || loc.port === '8100') {
        return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
    }

    const origin = `${loc.protocol}//127.0.0.1:8100`;
    return { API_BASE: `${origin}/marketplace/api`, API_ORIGIN: origin };
})();
|
||||
|
||||
class AppDetailPage {
|
||||
constructor() {
|
||||
this.appSlug = this.getAppSlugFromURL();
|
||||
this.appData = null;
|
||||
this.init();
|
||||
}
|
||||
|
||||
getAppSlugFromURL() {
|
||||
const params = new URLSearchParams(window.location.search);
|
||||
return params.get('app') || '';
|
||||
}
|
||||
|
||||
async init() {
|
||||
if (!this.appSlug) {
|
||||
window.location.href = 'index.html';
|
||||
return;
|
||||
}
|
||||
|
||||
await this.loadAppDetails();
|
||||
this.setupEventListeners();
|
||||
await this.loadRelatedApps();
|
||||
}
|
||||
|
||||
async loadAppDetails() {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/apps/${this.appSlug}`);
|
||||
if (!response.ok) throw new Error('App not found');
|
||||
|
||||
this.appData = await response.json();
|
||||
this.renderAppDetails();
|
||||
} catch (error) {
|
||||
console.error('Error loading app details:', error);
|
||||
// Fallback to loading all apps and finding the right one
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/apps`);
|
||||
const apps = await response.json();
|
||||
this.appData = apps.find(app => app.slug === this.appSlug || app.name.toLowerCase().replace(/\s+/g, '-') === this.appSlug);
|
||||
if (this.appData) {
|
||||
this.renderAppDetails();
|
||||
} else {
|
||||
window.location.href = 'index.html';
|
||||
}
|
||||
} catch (err) {
|
||||
console.error('Error loading apps:', err);
|
||||
window.location.href = 'index.html';
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
renderAppDetails() {
|
||||
if (!this.appData) return;
|
||||
|
||||
// Update title
|
||||
document.title = `${this.appData.name} - Crawl4AI Marketplace`;
|
||||
|
||||
// Hero image
|
||||
const appImage = document.getElementById('app-image');
|
||||
if (this.appData.image) {
|
||||
appImage.style.backgroundImage = `url('${this.appData.image}')`;
|
||||
appImage.innerHTML = '';
|
||||
} else {
|
||||
appImage.innerHTML = `[${this.appData.category || 'APP'}]`;
|
||||
}
|
||||
|
||||
// Basic info
|
||||
document.getElementById('app-name').textContent = this.appData.name;
|
||||
document.getElementById('app-description').textContent = this.appData.description;
|
||||
document.getElementById('app-type').textContent = this.appData.type || 'Open Source';
|
||||
document.getElementById('app-category').textContent = this.appData.category;
|
||||
document.getElementById('app-pricing').textContent = this.appData.pricing || 'Free';
|
||||
|
||||
// Badges
|
||||
if (this.appData.featured) {
|
||||
document.getElementById('app-featured').style.display = 'inline-block';
|
||||
}
|
||||
if (this.appData.sponsored) {
|
||||
document.getElementById('app-sponsored').style.display = 'inline-block';
|
||||
}
|
||||
|
||||
// Stats
|
||||
const rating = this.appData.rating || 0;
|
||||
const stars = '★'.repeat(Math.floor(rating)) + '☆'.repeat(5 - Math.floor(rating));
|
||||
document.getElementById('app-rating').textContent = stars + ` ${rating}/5`;
|
||||
document.getElementById('app-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
|
||||
|
||||
// Action buttons
|
||||
const websiteBtn = document.getElementById('app-website');
|
||||
const githubBtn = document.getElementById('app-github');
|
||||
|
||||
if (this.appData.website_url) {
|
||||
websiteBtn.href = this.appData.website_url;
|
||||
} else {
|
||||
websiteBtn.style.display = 'none';
|
||||
}
|
||||
|
||||
if (this.appData.github_url) {
|
||||
githubBtn.href = this.appData.github_url;
|
||||
} else {
|
||||
githubBtn.style.display = 'none';
|
||||
}
|
||||
|
||||
// Contact
|
||||
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
|
||||
|
||||
// Integration guide
|
||||
this.renderIntegrationGuide();
|
||||
}
|
||||
|
||||
renderIntegrationGuide() {
|
||||
// Installation code
|
||||
const installCode = document.getElementById('install-code');
|
||||
if (this.appData.type === 'Open Source' && this.appData.github_url) {
|
||||
installCode.textContent = `# Clone from GitHub
|
||||
git clone ${this.appData.github_url}
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements.txt`;
|
||||
} else if (this.appData.name.toLowerCase().includes('api')) {
|
||||
installCode.textContent = `# Install via pip
|
||||
pip install ${this.appData.slug}
|
||||
|
||||
# Or install from source
|
||||
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
|
||||
}
|
||||
|
||||
// Usage code - customize based on category
|
||||
const usageCode = document.getElementById('usage-code');
|
||||
if (this.appData.category === 'Browser Automation') {
|
||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
|
||||
|
||||
async def main():
|
||||
# Initialize ${this.appData.name}
|
||||
automation = ${this.appData.name.replace(/\s+/g, '')}()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
browser_config=automation.config,
|
||||
wait_for="css:body"
|
||||
)
|
||||
print(result.markdown)`;
|
||||
} else if (this.appData.category === 'Proxy Services') {
|
||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||
import ${this.appData.slug.replace(/-/g, '_')}
|
||||
|
||||
# Configure proxy
|
||||
proxy_config = {
|
||||
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
|
||||
"username": "your_username",
|
||||
"password": "your_password"
|
||||
}
|
||||
|
||||
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
bypass_cache=True
|
||||
)
|
||||
print(result.status_code)`;
|
||||
} else if (this.appData.category === 'LLM Integration') {
|
||||
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
# Configure LLM extraction
|
||||
strategy = LLMExtractionStrategy(
|
||||
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
|
||||
api_key="your-api-key",
|
||||
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
|
||||
instruction="Extract structured data"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
extraction_strategy=strategy
|
||||
)
|
||||
print(result.extracted_content)`;
|
||||
}
|
||||
|
||||
// Integration example
|
||||
const integrationCode = document.getElementById('integration-code');
|
||||
integrationCode.textContent = this.appData.integration_guide ||
|
||||
`# Complete ${this.appData.name} Integration Example
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
import json
|
||||
|
||||
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
|
||||
"""
|
||||
Complete example showing how to use ${this.appData.name}
|
||||
with Crawl4AI for production web scraping
|
||||
"""
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "ProductList",
|
||||
"baseSelector": "div.product",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h2", "type": "text"},
|
||||
{"name": "price", "selector": ".price", "type": "text"},
|
||||
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
|
||||
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
||||
]
|
||||
}
|
||||
|
||||
# Initialize crawler with ${this.appData.name}
|
||||
async with AsyncWebCrawler(
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
verbose=True
|
||||
) as crawler:
|
||||
|
||||
# Crawl with extraction
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/products",
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||
cache_mode="bypass",
|
||||
wait_for="css:.product",
|
||||
screenshot=True
|
||||
)
|
||||
|
||||
# Process results
|
||||
if result.success:
|
||||
products = json.loads(result.extracted_content)
|
||||
print(f"Found {len(products)} products")
|
||||
|
||||
for product in products[:5]:
|
||||
print(f"- {product['title']}: {product['price']}")
|
||||
|
||||
return products
|
||||
|
||||
# Run the crawler
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
|
||||
}
|
||||
|
||||
formatNumber(num) {
|
||||
if (num >= 1000000) {
|
||||
return (num / 1000000).toFixed(1) + 'M';
|
||||
} else if (num >= 1000) {
|
||||
return (num / 1000).toFixed(1) + 'K';
|
||||
}
|
||||
return num.toString();
|
||||
}
|
||||
|
||||
setupEventListeners() {
|
||||
// Tab switching
|
||||
const tabs = document.querySelectorAll('.nav-tab');
|
||||
tabs.forEach(tab => {
|
||||
tab.addEventListener('click', () => {
|
||||
// Update active tab
|
||||
tabs.forEach(t => t.classList.remove('active'));
|
||||
tab.classList.add('active');
|
||||
|
||||
// Show corresponding content
|
||||
const tabName = tab.dataset.tab;
|
||||
document.querySelectorAll('.tab-content').forEach(content => {
|
||||
content.classList.remove('active');
|
||||
});
|
||||
document.getElementById(`${tabName}-tab`).classList.add('active');
|
||||
});
|
||||
});
|
||||
|
||||
// Copy integration code
|
||||
document.getElementById('copy-integration').addEventListener('click', () => {
|
||||
const code = document.getElementById('integration-code').textContent;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
const btn = document.getElementById('copy-integration');
|
||||
const originalText = btn.innerHTML;
|
||||
btn.innerHTML = '<span>✓</span> Copied!';
|
||||
setTimeout(() => {
|
||||
btn.innerHTML = originalText;
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
|
||||
// Copy code buttons
|
||||
document.querySelectorAll('.copy-btn').forEach(btn => {
|
||||
btn.addEventListener('click', (e) => {
|
||||
const codeBlock = e.target.closest('.code-block');
|
||||
const code = codeBlock.querySelector('code').textContent;
|
||||
navigator.clipboard.writeText(code).then(() => {
|
||||
btn.textContent = 'Copied!';
|
||||
setTimeout(() => {
|
||||
btn.textContent = 'Copy';
|
||||
}, 2000);
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async loadRelatedApps() {
|
||||
try {
|
||||
const response = await fetch(`${API_BASE}/apps?category=${encodeURIComponent(this.appData.category)}&limit=4`);
|
||||
const apps = await response.json();
|
||||
|
||||
const relatedApps = apps.filter(app => app.slug !== this.appSlug).slice(0, 3);
|
||||
const grid = document.getElementById('related-apps-grid');
|
||||
|
||||
grid.innerHTML = relatedApps.map(app => `
|
||||
<div class="related-app-card" onclick="window.location.href='app-detail.html?app=${app.slug || app.name.toLowerCase().replace(/\s+/g, '-')}'">
|
||||
<h4>${app.name}</h4>
|
||||
<p>${app.description.substring(0, 100)}...</p>
|
||||
<div style="display: flex; justify-content: space-between; margin-top: 0.5rem; font-size: 0.75rem;">
|
||||
<span style="color: var(--primary-cyan)">${app.type}</span>
|
||||
<span style="color: var(--warning)">★ ${app.rating}/5</span>
|
||||
</div>
|
||||
</div>
|
||||
`).join('');
|
||||
} catch (error) {
|
||||
console.error('Error loading related apps:', error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bootstrap the app detail page once the document has been parsed.
document.addEventListener('DOMContentLoaded', function bootAppDetailPage() {
    new AppDetailPage();
});
|
||||
147
docs/md_v2/marketplace/frontend/index.html
Normal file
147
docs/md_v2/marketplace/frontend/index.html
Normal file
@@ -0,0 +1,147 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Marketplace - Crawl4AI</title>
|
||||
<link rel="stylesheet" href="marketplace.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="marketplace-container">
|
||||
<!-- Header -->
|
||||
<header class="marketplace-header">
|
||||
<div class="header-content">
|
||||
<div class="header-left">
|
||||
<div class="logo-title">
|
||||
<img src="../../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||
<h1>
|
||||
<span class="ascii-border">[</span>
|
||||
Marketplace
|
||||
<span class="ascii-border">]</span>
|
||||
</h1>
|
||||
</div>
|
||||
<p class="tagline">Tools, Integrations & Resources for Web Crawling</p>
|
||||
</div>
|
||||
<div class="header-stats" id="stats">
|
||||
<span class="stat-item">Apps: <span id="total-apps">--</span></span>
|
||||
<span class="stat-item">Articles: <span id="total-articles">--</span></span>
|
||||
<span class="stat-item">Downloads: <span id="total-downloads">--</span></span>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Search and Category Bar -->
|
||||
<div class="search-filter-bar">
|
||||
<!-- Search field: the ">" prompt is decorative, so it is hidden from
     assistive technology and the input carries its own accessible name. -->
<div class="search-box">
    <span class="search-icon" aria-hidden="true">&gt;</span>
    <input type="text" id="search-input" placeholder="Search apps, articles, tools..." aria-label="Search apps, articles, tools" />
    <kbd>/</kbd>
</div>
|
||||
<div class="category-filter" id="category-filter">
|
||||
<button class="filter-btn active" data-category="all">All</button>
|
||||
<!-- Categories will be loaded here -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Magazine Grid Layout -->
|
||||
<main class="magazine-layout">
|
||||
<!-- Hero Featured Section -->
|
||||
<section class="hero-featured">
|
||||
<div id="featured-hero" class="featured-hero-card">
|
||||
<!-- Large featured card with big image -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Secondary Featured -->
|
||||
<section class="secondary-featured">
|
||||
<div id="featured-secondary" class="featured-secondary-cards">
|
||||
<!-- 2-3 medium featured cards with images -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Sponsored Section -->
|
||||
<section class="sponsored-section">
|
||||
<div class="section-label">SPONSORED</div>
|
||||
<div id="sponsored-content" class="sponsored-cards">
|
||||
<!-- Sponsored content cards -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Main Content Grid -->
|
||||
<section class="main-content">
|
||||
<!-- Apps Column -->
|
||||
<div class="apps-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">></span> Latest Apps</h2>
|
||||
<select id="type-filter" class="mini-filter">
|
||||
<option value="">All</option>
|
||||
<option value="Open Source">Open Source</option>
|
||||
<option value="Paid">Paid</option>
|
||||
</select>
|
||||
</div>
|
||||
<div id="apps-grid" class="apps-compact-grid">
|
||||
<!-- Compact app cards -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Articles Column -->
|
||||
<div class="articles-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">></span> Latest Articles</h2>
|
||||
</div>
|
||||
<div id="articles-list" class="articles-compact-list">
|
||||
<!-- Article items -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Trending/Tools Column -->
|
||||
<div class="trending-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">#</span> Trending</h2>
|
||||
</div>
|
||||
<div id="trending-list" class="trending-items">
|
||||
<!-- Trending items -->
|
||||
</div>
|
||||
|
||||
<div class="submit-box">
|
||||
<h3><span class="ascii-icon">+</span> Submit Your Tool</h3>
|
||||
<p>Share your integration</p>
|
||||
<a href="mailto:marketplace@crawl4ai.com" class="submit-btn">Submit →</a>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- More Apps Grid -->
|
||||
<section class="more-apps">
|
||||
<div class="section-header">
|
||||
<h2><span class="ascii-icon">></span> More Apps</h2>
|
||||
<button id="load-more" class="load-more-btn">Load More ↓</button>
|
||||
</div>
|
||||
<div id="more-apps-grid" class="more-apps-grid">
|
||||
<!-- Additional app cards -->
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
|
||||
<!-- Footer -->
|
||||
<footer class="marketplace-footer">
|
||||
<div class="footer-content">
|
||||
<div class="footer-section">
|
||||
<h3>About Marketplace</h3>
|
||||
<p>Discover tools and integrations built by the Crawl4AI community.</p>
|
||||
</div>
|
||||
<div class="footer-section">
|
||||
<h3>Become a Sponsor</h3>
|
||||
<p>Reach developers building with Crawl4AI</p>
|
||||
<a href="mailto:sponsors@crawl4ai.com" class="sponsor-btn">Learn More →</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="footer-bottom">
|
||||
<p>[ Crawl4AI Marketplace · Updated <span id="last-update">--</span> ]</p>
|
||||
</div>
|
||||
</footer>
|
||||
</div>
|
||||
|
||||
<script src="marketplace.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
957
docs/md_v2/marketplace/frontend/marketplace.css
Normal file
957
docs/md_v2/marketplace/frontend/marketplace.css
Normal file
@@ -0,0 +1,957 @@
|
||||
/* Marketplace CSS - Magazine Style Terminal Theme */
|
||||
@import url('../../assets/styles.css');
|
||||
|
||||
:root {
|
||||
--primary-cyan: #50ffff;
|
||||
--primary-teal: #09b5a5;
|
||||
--accent-pink: #f380f5;
|
||||
--bg-dark: #070708;
|
||||
--bg-secondary: #1a1a1a;
|
||||
--bg-tertiary: #3f3f44;
|
||||
--text-primary: #e8e9ed;
|
||||
--text-secondary: #d5cec0;
|
||||
--text-tertiary: #a3abba;
|
||||
--border-color: #3f3f44;
|
||||
--success: #50ff50;
|
||||
--error: #ff3c74;
|
||||
--warning: #f59e0b;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
background: var(--bg-dark);
|
||||
color: var(--text-primary);
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
/* Global link styles */
|
||||
a {
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: color 0.2s;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.marketplace-container {
|
||||
min-height: 100vh;
|
||||
}
|
||||
|
||||
/* Header */
|
||||
.marketplace-header {
|
||||
background: var(--bg-secondary);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding: 1.5rem 0;
|
||||
}
|
||||
|
||||
.header-content {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.logo-title {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.header-logo {
|
||||
height: 40px;
|
||||
width: auto;
|
||||
filter: brightness(1.2);
|
||||
}
|
||||
|
||||
.marketplace-header h1 {
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.ascii-border {
|
||||
color: var(--border-color);
|
||||
}
|
||||
|
||||
.tagline {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-top: 0.25rem;
|
||||
}
|
||||
|
||||
.header-stats {
|
||||
display: flex;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.stat-item {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.stat-item span {
|
||||
color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* Search and Filter Bar */
|
||||
.search-filter-bar {
|
||||
max-width: 1800px;
|
||||
margin: 1.5rem auto;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.search-box {
|
||||
flex: 1;
|
||||
max-width: 500px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 0.75rem 1rem;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
|
||||
.search-box:focus-within {
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.search-icon {
|
||||
color: var(--text-tertiary);
|
||||
margin-right: 1rem;
|
||||
}
|
||||
|
||||
#search-input {
|
||||
flex: 1;
|
||||
background: transparent;
|
||||
border: none;
|
||||
color: var(--text-primary);
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
.search-box kbd {
|
||||
font-size: 0.75rem;
|
||||
padding: 0.2rem 0.5rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.category-filter {
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.filter-btn {
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
padding: 0.5rem 1rem;
|
||||
font-family: inherit;
|
||||
font-size: 0.875rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.filter-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.filter-btn.active {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Magazine Layout */
|
||||
.magazine-layout {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem 4rem;
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
/* Hero Featured Section */
|
||||
.hero-featured {
|
||||
grid-column: 1 / -1;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.hero-featured::before {
|
||||
content: '';
|
||||
position: absolute;
|
||||
top: -20px;
|
||||
left: -20px;
|
||||
right: -20px;
|
||||
bottom: -20px;
|
||||
background: radial-gradient(ellipse at center, rgba(80, 255, 255, 0.05), transparent 70%);
|
||||
pointer-events: none;
|
||||
z-index: -1;
|
||||
}
|
||||
|
||||
.featured-hero-card {
|
||||
background: linear-gradient(135deg, #1a1a2e, #0f0f1e);
|
||||
border: 2px solid var(--primary-cyan);
|
||||
box-shadow: 0 0 30px rgba(80, 255, 255, 0.15),
|
||||
inset 0 0 20px rgba(80, 255, 255, 0.05);
|
||||
height: 380px;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.featured-hero-card:hover {
|
||||
border-color: var(--accent-pink);
|
||||
box-shadow: 0 0 40px rgba(243, 128, 245, 0.2),
|
||||
inset 0 0 30px rgba(243, 128, 245, 0.05);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.hero-image {
|
||||
width: 100%;
|
||||
height: 240px;
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 3rem;
|
||||
color: var(--primary-cyan);
|
||||
flex-shrink: 0;
|
||||
position: relative;
|
||||
filter: brightness(1.1) contrast(1.1);
|
||||
}
|
||||
|
||||
.hero-image::after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
height: 60%;
|
||||
background: linear-gradient(to top, rgba(10, 10, 20, 0.95), transparent);
|
||||
}
|
||||
|
||||
.hero-content {
|
||||
padding: 1.5rem;
|
||||
}
|
||||
|
||||
.hero-badge {
|
||||
display: inline-block;
|
||||
padding: 0.3rem 0.6rem;
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
font-size: 0.7rem;
|
||||
text-transform: uppercase;
|
||||
margin-bottom: 0.5rem;
|
||||
font-weight: 600;
|
||||
box-shadow: 0 2px 10px rgba(80, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.hero-title {
|
||||
font-size: 1.6rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0.5rem 0;
|
||||
text-shadow: 0 0 20px rgba(80, 255, 255, 0.5);
|
||||
}
|
||||
|
||||
.hero-description {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.hero-meta {
|
||||
display: flex;
|
||||
gap: 1.5rem;
|
||||
margin-top: 1rem;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.hero-meta span {
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.hero-meta span:first-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
/* Secondary Featured */
|
||||
.secondary-featured {
|
||||
grid-column: 1 / -1;
|
||||
height: 380px;
|
||||
display: flex;
|
||||
align-items: stretch;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.secondary-card {
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.03), rgba(243, 128, 245, 0.02));
|
||||
border: 1px solid rgba(80, 255, 255, 0.3);
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
display: flex;
|
||||
overflow: hidden;
|
||||
height: calc((380px - 1.5rem) / 3);
|
||||
flex: 1;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.secondary-card:hover {
|
||||
border-color: var(--accent-pink);
|
||||
background: linear-gradient(135deg, rgba(243, 128, 245, 0.05), rgba(80, 255, 255, 0.03));
|
||||
box-shadow: 0 4px 15px rgba(243, 128, 245, 0.2);
|
||||
transform: translateX(-3px);
|
||||
}
|
||||
|
||||
.secondary-image {
|
||||
width: 120px;
|
||||
background: linear-gradient(135deg, var(--bg-tertiary), var(--bg-secondary));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.secondary-content {
|
||||
flex: 1;
|
||||
padding: 1rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.secondary-title {
|
||||
font-size: 1rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.secondary-desc {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-secondary);
|
||||
display: -webkit-box;
|
||||
-webkit-line-clamp: 2;
|
||||
-webkit-box-orient: vertical;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.secondary-meta {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.secondary-meta span:last-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
/* Sponsored Section */
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--warning);
|
||||
padding: 1rem;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.section-label {
|
||||
position: absolute;
|
||||
top: -0.5rem;
|
||||
left: 1rem;
|
||||
background: var(--bg-secondary);
|
||||
padding: 0 0.5rem;
|
||||
color: var(--warning);
|
||||
font-size: 0.65rem;
|
||||
letter-spacing: 0.1em;
|
||||
}
|
||||
|
||||
.sponsored-cards {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.sponsor-card {
|
||||
padding: 1rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.sponsor-card h4 {
|
||||
color: var(--accent-pink);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.sponsor-card p {
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.85rem;
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
|
||||
.sponsor-card a {
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.sponsor-card a:hover {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
/* Main Content Grid */
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
/* Column Headers */
|
||||
.column-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 1rem;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.column-header h2 {
|
||||
font-size: 1.1rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.mini-filter {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-primary);
|
||||
padding: 0.25rem 0.5rem;
|
||||
font-family: inherit;
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
.ascii-icon {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Apps Column */
|
||||
.apps-compact-grid {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.app-compact {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
border-left: 3px solid var(--border-color);
|
||||
padding: 0.75rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.app-compact:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
border-left-color: var(--accent-pink);
|
||||
transform: translateX(2px);
|
||||
}
|
||||
|
||||
.app-compact-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.app-compact-header span:first-child {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.app-compact-header span:last-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
.app-compact-title {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.app-compact-desc {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-secondary);
|
||||
display: -webkit-box;
|
||||
-webkit-line-clamp: 2;
|
||||
-webkit-box-orient: vertical;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
/* Articles Column */
|
||||
.articles-compact-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.article-compact {
|
||||
border-left: 2px solid var(--border-color);
|
||||
padding-left: 1rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.article-compact:hover {
|
||||
border-left-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.article-meta {
|
||||
font-size: 0.7rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.article-meta span:first-child {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.article-title {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.article-author {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
/* Trending Column */
|
||||
.trending-items {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.trending-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
padding: 0.5rem;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.trending-item:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.trending-rank {
|
||||
font-size: 1.2rem;
|
||||
color: var(--primary-cyan);
|
||||
width: 2rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.trending-info {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.trending-name {
|
||||
font-size: 0.85rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.trending-stats {
|
||||
font-size: 0.7rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Submit Box */
|
||||
.submit-box {
|
||||
margin-top: 1.5rem;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--primary-cyan);
|
||||
padding: 1rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.submit-box h3 {
|
||||
font-size: 1rem;
|
||||
color: var(--primary-cyan);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.submit-box p {
|
||||
font-size: 0.8rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
|
||||
.submit-btn {
|
||||
display: inline-block;
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.submit-btn:hover {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* More Apps Section */
|
||||
.more-apps {
|
||||
grid-column: 1 / -1;
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.section-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.load-more-btn {
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
padding: 0.5rem 1.5rem;
|
||||
font-family: inherit;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.load-more-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Footer */
|
||||
.marketplace-footer {
|
||||
background: var(--bg-secondary);
|
||||
border-top: 1px solid var(--border-color);
|
||||
margin-top: 4rem;
|
||||
padding: 2rem 0;
|
||||
}
|
||||
|
||||
.footer-content {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem;
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.footer-section h3 {
|
||||
font-size: 1rem;
|
||||
margin-bottom: 0.5rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.footer-section p {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.sponsor-btn {
|
||||
display: inline-block;
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.sponsor-btn:hover {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
}
|
||||
|
||||
.footer-bottom {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto 0;
|
||||
padding: 1rem 2rem 0;
|
||||
border-top: 1px solid var(--border-color);
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Modal */
|
||||
.modal {
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
background: rgba(0, 0, 0, 0.8);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
z-index: 1000;
|
||||
}
|
||||
|
||||
.modal.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.modal-content {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--primary-cyan);
|
||||
max-width: 800px;
|
||||
width: 90%;
|
||||
max-height: 80vh;
|
||||
overflow-y: auto;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.modal-close {
|
||||
position: absolute;
|
||||
top: 1rem;
|
||||
right: 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-primary);
|
||||
padding: 0.25rem 0.5rem;
|
||||
cursor: pointer;
|
||||
font-size: 1.2rem;
|
||||
}
|
||||
|
||||
.modal-close:hover {
|
||||
border-color: var(--error);
|
||||
color: var(--error);
|
||||
}
|
||||
|
||||
.app-detail {
|
||||
padding: 2rem;
|
||||
}
|
||||
|
||||
.app-detail h2 {
|
||||
font-size: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Loading */
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 2rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.no-results {
|
||||
text-align: center;
|
||||
padding: 2rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Responsive - Tablet */
|
||||
@media (min-width: 768px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Desktop */
|
||||
@media (min-width: 1024px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
grid-row: 1;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 4;
|
||||
grid-row: 1;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Wide Desktop */
|
||||
@media (min-width: 1400px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 5;
|
||||
grid-row: 1;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
}
|
||||
|
||||
.apps-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Ultra Wide Desktop (for coders with wide monitors) */
|
||||
@media (min-width: 1800px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 6;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.sponsored-cards {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.apps-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.articles-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Mobile */
|
||||
@media (max-width: 767px) {
    .header-content {
        flex-direction: column;
        gap: 1rem;
    }

    .search-filter-bar {
        flex-direction: column;
        align-items: stretch;
    }

    .search-box {
        max-width: none;
    }

    .magazine-layout {
        padding: 0 1rem 2rem;
    }

    .footer-content {
        grid-template-columns: 1fr;
    }

    /* The base rules pin this section to 380px and split it evenly between
       the cards. Stacked cards need more room than that, so let the section
       grow with its content on small screens. */
    .secondary-featured {
        height: auto;
    }

    /* Stack image above text; release the fixed height / equal-share flex
       sizing so the text below the 150px image is not clipped by
       overflow: hidden. */
    .secondary-card {
        flex-direction: column;
        height: auto;
        flex: none;
    }

    .secondary-image {
        width: 100%;
        height: 150px;
    }
}
|
||||
395
docs/md_v2/marketplace/frontend/marketplace.js
Normal file
395
docs/md_v2/marketplace/frontend/marketplace.js
Normal file
@@ -0,0 +1,395 @@
|
||||
// Marketplace JS - Magazine Layout
|
||||
// Root path under which every marketplace API endpoint is served.
const API_BASE = '/marketplace/api';
// Default lifetime of a cached response: one hour, in milliseconds.
const CACHE_TTL = 60 * 60 * 1000;
|
||||
|
||||
/**
 * Best-effort TTL cache on top of localStorage.
 *
 * Each entry is stored as JSON `{ value, expires }` under a prefixed key.
 * Storage problems (corrupt entries, quota exceeded, storage disabled) are
 * treated as cache misses / skipped writes and never thrown to the caller.
 */
class MarketplaceCache {
    constructor() {
        // Namespace for this cache's keys; clear() removes only these.
        this.prefix = 'c4ai_market_';
    }

    /**
     * Read a cached value.
     *
     * @param {string} key - Cache key (without prefix).
     * @returns {*} The stored value, or null when the entry is missing,
     *   expired, or unreadable. Expired and unreadable entries are removed.
     */
    get(key) {
        const storageKey = this.prefix + key;
        let data;

        try {
            const item = localStorage.getItem(storageKey);
            if (!item) return null;
            data = JSON.parse(item);
        } catch (error) {
            // Corrupt JSON or inaccessible storage: behave like a miss.
            this._discard(storageKey);
            return null;
        }

        const wellFormed = data !== null
            && typeof data === 'object'
            && typeof data.expires === 'number';
        if (!wellFormed || Date.now() > data.expires) {
            this._discard(storageKey);
            return null;
        }
        return data.value;
    }

    /**
     * Store a value with a time-to-live.
     *
     * @param {string} key - Cache key (without prefix).
     * @param {*} value - JSON-serialisable value.
     * @param {number} [ttl=CACHE_TTL] - Lifetime in milliseconds.
     */
    set(key, value, ttl = CACHE_TTL) {
        const data = {
            value: value,
            expires: Date.now() + ttl
        };
        try {
            localStorage.setItem(this.prefix + key, JSON.stringify(data));
        } catch (error) {
            // Quota exceeded or storage disabled: caching is optional, so
            // skip the write instead of failing the caller.
            console.warn('MarketplaceCache: could not store entry', key, error);
        }
    }

    /** Remove every entry written under this cache's prefix. */
    clear() {
        try {
            Object.keys(localStorage)
                .filter(k => k.startsWith(this.prefix))
                .forEach(k => localStorage.removeItem(k));
        } catch (error) {
            console.warn('MarketplaceCache: could not clear entries', error);
        }
    }

    /** Remove one raw storage key, ignoring storage errors. */
    _discard(storageKey) {
        try {
            localStorage.removeItem(storageKey);
        } catch (error) {
            // Nothing more can be done if storage itself is unavailable.
        }
    }
}
|
||||
|
||||
/**
 * Thin client for the marketplace REST API with optional response caching.
 * All request methods resolve to the parsed JSON body, or to null when the
 * request fails (the error is logged, never thrown).
 */
class MarketplaceAPI {
    constructor() {
        this.cache = new MarketplaceCache();
        this.searchTimeout = null;
    }

    /**
     * GETs `API_BASE + endpoint` and resolves to the parsed JSON.
     *
     * @param {string} endpoint  Path plus optional query string.
     * @param {boolean} useCache When true the response is read from / written
     *                           to the cache; when false the cache is bypassed
     *                           in both directions.
     * @returns {Promise<*>} Parsed body, or null on network/HTTP/parse error.
     */
    async fetch(endpoint, useCache = true) {
        const cacheKey = endpoint.replace(/[^\w]/g, '_');

        if (useCache) {
            const cached = this._readCache(cacheKey);
            if (cached !== null && cached !== undefined) return cached;
        }

        let data;
        try {
            const response = await fetch(`${API_BASE}${endpoint}`);
            if (!response.ok) throw new Error(`HTTP ${response.status}`);
            data = await response.json();
        } catch (error) {
            console.error('API Error:', error);
            return null;
        }

        // Cache outside the network try-block so a storage failure can never
        // turn a successful response into null, and skip uncached requests
        // (e.g. searches) so they do not fill storage with unread entries.
        if (useCache) this._writeCache(cacheKey, data);
        return data;
    }

    async getStats() {
        return this.fetch('/stats');
    }

    async getCategories() {
        return this.fetch('/categories');
    }

    /** Lists apps; `params` become query parameters (null/undefined skipped). */
    async getApps(params = {}) {
        return this.fetch(`/apps${this._buildQuery(params)}`);
    }

    /** Lists articles; `params` become query parameters (null/undefined skipped). */
    async getArticles(params = {}) {
        return this.fetch(`/articles${this._buildQuery(params)}`);
    }

    async getSponsors() {
        return this.fetch('/sponsors');
    }

    /**
     * Searches the marketplace. Queries shorter than two characters (after
     * trimming) resolve to an empty object without hitting the network.
     */
    async search(query) {
        const term = typeof query === 'string' ? query.trim() : '';
        if (term.length < 2) return {};
        return this.fetch(`/search?q=${encodeURIComponent(term)}`, false);
    }

    /**
     * Serializes `params` to `?a=1&b=2`, or '' when nothing remains.
     * Null/undefined values are dropped instead of being sent as the literal
     * strings "null"/"undefined".
     */
    _buildQuery(params) {
        const search = new URLSearchParams();
        Object.keys(params || {}).forEach(name => {
            const value = params[name];
            if (value !== undefined && value !== null) {
                search.append(name, value);
            }
        });
        const query = search.toString();
        return query ? '?' + query : '';
    }

    /** Cache read that treats any cache failure as a miss. */
    _readCache(cacheKey) {
        try {
            return this.cache.get(cacheKey);
        } catch (error) {
            return null;
        }
    }

    /** Best-effort cache write; failures are logged and ignored. */
    _writeCache(cacheKey, data) {
        try {
            this.cache.set(cacheKey, data);
        } catch (error) {
            console.warn('Cache write skipped:', error);
        }
    }
}
|
||||
|
||||
/**
 * Controller for the marketplace landing page: loads stats, categories,
 * featured apps, sponsors and the main columns, and wires up search,
 * filtering and "load more".
 *
 * All API-provided strings are HTML-escaped before being placed in markup,
 * and click handlers are attached as DOM listeners rather than inline
 * attributes.
 */
class MarketplaceUI {
    constructor() {
        this.api = new MarketplaceAPI();
        this.currentCategory = 'all';
        this.currentType = '';
        this.searchTimeout = null;
        // Apps [0, appsColumnSize) fill the "Latest Apps" column; the
        // "More Apps" grid continues from there in pages of moreAppsPageSize.
        this.appsColumnSize = 8;
        this.moreAppsPageSize = 12;
        // Offset of the next app to request for the "More Apps" grid.
        this.loadedApps = this.appsColumnSize;
        // Bumped on every search() call so stale responses can be dropped.
        this.searchSeq = 0;
        this.init();
    }

    /**
     * Loads every page section in order, then attaches event listeners.
     * A failure in one section is logged and does not prevent the remaining
     * sections or the listeners from being set up.
     */
    async init() {
        const steps = [
            () => this.loadStats(),
            () => this.loadCategories(),
            () => this.loadFeaturedContent(),
            () => this.loadSponsors(),
            () => this.loadMainContent()
        ];
        for (const step of steps) {
            try {
                await step();
            } catch (error) {
                console.error('Marketplace init error:', error);
            }
        }
        this.setupEventListeners();
    }

    /** Fills the header counters and the footer date. */
    async loadStats() {
        const stats = await this.api.getStats();
        if (!stats) return;

        const setText = (id, value) => {
            const el = document.getElementById(id);
            if (el) el.textContent = value;
        };
        setText('total-apps', stats.total_apps || '0');
        setText('total-articles', stats.total_articles || '0');
        setText('total-downloads', stats.total_downloads || '0');
        setText('last-update', new Date().toLocaleDateString());
    }

    /** Appends one filter button per category returned by the API. */
    async loadCategories() {
        const categories = await this.api.getCategories();
        if (!Array.isArray(categories)) return;

        const filter = document.getElementById('category-filter');
        categories.forEach(cat => {
            const btn = document.createElement('button');
            btn.className = 'filter-btn';
            btn.dataset.category = cat.slug;
            btn.textContent = cat.name;
            btn.onclick = () => this.filterByCategory(cat.slug);
            filter.appendChild(btn);
        });
    }

    /** Renders the hero card (first featured app) and up to three secondary cards. */
    async loadFeaturedContent() {
        const featured = await this.api.getApps({ featured: true, limit: 4 });
        if (!Array.isArray(featured) || !featured.length) return;

        const esc = this._escapeHtml;

        // Hero card (first featured)
        const hero = featured[0];
        const heroCard = document.getElementById('featured-hero');
        if (hero) {
            heroCard.innerHTML = `
                <div class="hero-image">
                    ${hero.image ? '' : `[${esc(hero.category || 'APP')}]`}
                </div>
                <div class="hero-content">
                    <span class="hero-badge">${esc(hero.type || 'PAID')}</span>
                    <h2 class="hero-title">${esc(hero.name)}</h2>
                    <p class="hero-description">${esc(hero.description)}</p>
                    <div class="hero-meta">
                        <span>★ ${esc(hero.rating || 0)}/5</span>
                        <span>${esc(hero.downloads || 0)} downloads</span>
                    </div>
                </div>
            `;
            this._setBackgroundImage(heroCard.querySelector('.hero-image'), hero.image);
            heroCard.onclick = () => this.showAppDetail(hero);
        }

        // Secondary featured cards
        const secondary = document.getElementById('featured-secondary');
        secondary.innerHTML = '';
        featured.slice(1, 4).forEach(app => {
            const description = String(app.description || '');
            // Only show an ellipsis when text was actually cut.
            const summary = description.length > 100
                ? description.substring(0, 100) + '...'
                : description;

            const card = document.createElement('div');
            card.className = 'secondary-card';
            card.innerHTML = `
                <div class="secondary-image">
                    ${app.image ? '' : `[${esc(app.category || 'APP')}]`}
                </div>
                <div class="secondary-content">
                    <h3 class="secondary-title">${esc(app.name)}</h3>
                    <p class="secondary-desc">${esc(summary)}</p>
                    <div class="secondary-meta">
                        <span>${esc(app.type || 'Open Source')}</span> · <span>★ ${esc(app.rating || 0)}/5</span>
                    </div>
                </div>
            `;
            this._setBackgroundImage(card.querySelector('.secondary-image'), app.image);
            card.onclick = () => this.showAppDetail(app);
            secondary.appendChild(card);
        });
    }

    /** Renders up to five sponsor cards, or a "become a sponsor" placeholder. */
    async loadSponsors() {
        const sponsors = await this.api.getSponsors();
        const container = document.getElementById('sponsored-content');

        if (!Array.isArray(sponsors) || !sponsors.length) {
            // Show placeholder if no sponsors
            container.innerHTML = `
                <div class="sponsor-card">
                    <h4>Become a Sponsor</h4>
                    <p>Reach thousands of developers using Crawl4AI</p>
                    <a href="mailto:sponsors@crawl4ai.com">Contact Us →</a>
                </div>
            `;
            return;
        }

        const esc = this._escapeHtml;
        container.innerHTML = sponsors.slice(0, 5).map(sponsor => `
            <div class="sponsor-card">
                <h4>${esc(sponsor.company_name)}</h4>
                <p>${esc(sponsor.tier)} Sponsor - Premium Solutions</p>
                <a href="${esc(this._safeUrl(sponsor.landing_url))}" target="_blank" rel="noopener noreferrer">Learn More →</a>
            </div>
        `).join('');
    }

    /**
     * Renders the apps column, articles column, trending list and the first
     * page of the "More Apps" grid, honouring the active category/type filter.
     * Sections whose request failed (null) keep their current content.
     */
    async loadMainContent() {
        const filters = this._filterParams();

        // Load apps column
        const apps = await this.api.getApps(
            Object.assign({}, filters, { limit: this.appsColumnSize })
        );
        if (Array.isArray(apps)) {
            this._fill('apps-grid', apps.map(app => this._appCard(app, true)));
        }

        // Load articles column
        const articles = await this.api.getArticles({ limit: 6 });
        if (Array.isArray(articles)) {
            this._fill('articles-list', articles.map(article => this._articleCard(article)));
        }

        // Load trending
        if (Array.isArray(apps)) {
            this._fill('trending-list', apps.slice(0, 5).map((app, i) => this._trendingItem(app, i + 1)));
        }

        // Load more apps grid
        const moreApps = await this.api.getApps(
            Object.assign({}, filters, { offset: this.appsColumnSize, limit: this.moreAppsPageSize })
        );
        if (Array.isArray(moreApps)) {
            this._fill('more-apps-grid', moreApps.map(app => this._appCard(app, false)));
            // Next "load more" continues right after what is now on screen.
            this.loadedApps = this.appsColumnSize + moreApps.length;
        }
    }

    /** Wires search input, keyboard shortcuts, filters and the load-more button. */
    setupEventListeners() {
        // Search
        const searchInput = document.getElementById('search-input');
        searchInput.addEventListener('input', (e) => {
            clearTimeout(this.searchTimeout);
            this.searchTimeout = setTimeout(() => this.search(e.target.value), 300);
        });

        // Keyboard shortcut
        document.addEventListener('keydown', (e) => {
            const active = document.activeElement;
            // Do not hijack "/" while the user is typing in any form control.
            const typing = !!active && (
                /^(INPUT|TEXTAREA|SELECT)$/.test(active.tagName) || active.isContentEditable
            );
            if (e.key === '/' && !typing) {
                e.preventDefault();
                searchInput.focus();
            }
            if (e.key === 'Escape' && searchInput.contains(active)) {
                searchInput.blur();
                searchInput.value = '';
                // Setting .value fires no input event, so restore the default
                // listing explicitly instead of leaving stale results.
                clearTimeout(this.searchTimeout);
                this.search('');
            }
        });

        // The static "All" button is not created by loadCategories().
        const allButton = document.querySelector('#category-filter .filter-btn[data-category="all"]');
        if (allButton) {
            allButton.addEventListener('click', () => this.filterByCategory('all'));
        }

        // Type filter
        const typeFilter = document.getElementById('type-filter');
        typeFilter.addEventListener('change', (e) => {
            this.currentType = e.target.value;
            this.loadMainContent();
        });

        // Load more
        const loadMore = document.getElementById('load-more');
        loadMore.addEventListener('click', () => this.loadMoreApps());
    }

    /** Marks `category` as active and reloads the main content for it. */
    async filterByCategory(category) {
        // Update active state
        document.querySelectorAll('.filter-btn').forEach(btn => {
            btn.classList.toggle('active', btn.dataset.category === category);
        });

        this.currentCategory = category;
        await this.loadMainContent();
    }

    /**
     * Replaces the apps and articles columns with search results.
     * An empty query restores the default listing. Responses that arrive
     * after a newer search was started are discarded.
     */
    async search(query) {
        const ticket = ++this.searchSeq;
        const term = typeof query === 'string' ? query.trim() : '';

        if (!term) {
            await this.loadMainContent();
            return;
        }

        const results = await this.api.search(term);
        if (!results || ticket !== this.searchSeq) return;

        // Update apps grid with search results
        if (Array.isArray(results.apps)) {
            this._fill('apps-grid', results.apps.map(app => this._appCard(app, true)));
        }

        // Update articles with search results
        if (Array.isArray(results.articles)) {
            this._fill('articles-list', results.articles.map(article => this._articleCard(article)));
        }
    }

    /**
     * Appends the next page of apps to the "More Apps" grid. The offset only
     * advances by the number of apps actually received, so failed or short
     * responses never cause items to be skipped.
     */
    async loadMoreApps() {
        const moreApps = await this.api.getApps(
            Object.assign({}, this._filterParams(), {
                offset: this.loadedApps,
                limit: this.moreAppsPageSize
            })
        );
        if (!Array.isArray(moreApps) || !moreApps.length) return;

        const moreGrid = document.getElementById('more-apps-grid');
        moreApps.forEach(app => moreGrid.appendChild(this._appCard(app, false)));
        this.loadedApps += moreApps.length;
    }

    /** Navigates to the detail page of `app`. */
    showAppDetail(app) {
        // Navigate to detail page instead of showing modal
        const slug = app.slug || String(app.name || '').toLowerCase().replace(/\s+/g, '-');
        if (!slug) return;
        window.location.href = `app-detail.html?app=${encodeURIComponent(slug)}`;
    }

    showArticle(articleId) {
        // Could create article detail page similarly
        console.log('Show article:', articleId);
    }

    /**
     * Query parameters for the active category/type filter.
     * NOTE(review): assumes the /apps endpoint accepts `category` (slug) and
     * `type` parameters - confirm against the backend.
     */
    _filterParams() {
        const params = {};
        if (this.currentCategory && this.currentCategory !== 'all') {
            params.category = this.currentCategory;
        }
        if (this.currentType) {
            params.type = this.currentType;
        }
        return params;
    }

    /** Builds a compact app card; `detailed` adds rating and description. */
    _appCard(app, detailed) {
        const esc = this._escapeHtml;
        const badge = detailed ? `★ ${esc(app.rating || 0)}/5` : esc(app.type);

        const card = document.createElement('div');
        card.className = 'app-compact';
        card.innerHTML = `
            <div class="app-compact-header">
                <span>${esc(app.category)}</span>
                <span>${badge}</span>
            </div>
            <div class="app-compact-title">${esc(app.name)}</div>
            ${detailed ? `<div class="app-compact-desc">${esc(app.description)}</div>` : ''}
        `;
        card.onclick = () => this.showAppDetail(app);
        return card;
    }

    /** Builds one entry of the articles list. */
    _articleCard(article) {
        const esc = this._escapeHtml;
        const published = article.published_at ? new Date(article.published_at) : null;
        const dateText = published && !isNaN(published.getTime())
            ? published.toLocaleDateString()
            : '';

        const card = document.createElement('div');
        card.className = 'article-compact';
        card.innerHTML = `
            <div class="article-meta">
                <span>${esc(article.category)}</span> · <span>${esc(dateText)}</span>
            </div>
            <div class="article-title">${esc(article.title)}</div>
            <div class="article-author">by ${esc(article.author)}</div>
        `;
        card.onclick = () => this.showArticle(article.id);
        return card;
    }

    /** Builds one ranked entry of the trending list. */
    _trendingItem(app, rank) {
        const esc = this._escapeHtml;

        const item = document.createElement('div');
        item.className = 'trending-item';
        item.innerHTML = `
            <div class="trending-rank">${rank}</div>
            <div class="trending-info">
                <div class="trending-name">${esc(app.name)}</div>
                <div class="trending-stats">${esc(app.downloads || 0)} downloads</div>
            </div>
        `;
        item.onclick = () => this.showAppDetail(app);
        return item;
    }

    /** Replaces the children of the element with id `containerId` by `nodes`. */
    _fill(containerId, nodes) {
        const container = document.getElementById(containerId);
        if (!container) return;
        container.innerHTML = '';
        nodes.forEach(node => container.appendChild(node));
    }

    /** Sets `url` as CSS background image of `element` (no-op if either is missing). */
    _setBackgroundImage(element, url) {
        if (!element || !url) return;
        // Escape quote and backslash so the value cannot leave the url("...") string.
        const safe = String(url).replace(/["\\]/g, '\\$&');
        element.style.backgroundImage = `url("${safe}")`;
    }

    /** Escapes a value for use in HTML text or a quoted attribute; null/undefined become ''. */
    _escapeHtml(value) {
        return String(value === null || value === undefined ? '' : value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }

    /** Returns `url` normalised if it is http(s); anything else becomes '#'. */
    _safeUrl(url) {
        try {
            const base = typeof window !== 'undefined' && window.location
                ? window.location.href
                : undefined;
            const parsed = new URL(String(url), base);
            return parsed.protocol === 'http:' || parsed.protocol === 'https:'
                ? parsed.href
                : '#';
        } catch (error) {
            return '#';
        }
    }
}
|
||||
|
||||
// Page bootstrap: the controller is created only after the document has been
// parsed, because MarketplaceUI looks up its DOM elements by id. The instance
// is kept in a script-level binding so it stays reachable outside the class.
let marketplace;

document.addEventListener('DOMContentLoaded', function () {
    marketplace = new MarketplaceUI();
});
|
||||
147
docs/md_v2/marketplace/index.html
Normal file
147
docs/md_v2/marketplace/index.html
Normal file
@@ -0,0 +1,147 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" data-theme="dark">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Marketplace - Crawl4AI</title>
|
||||
<link rel="stylesheet" href="marketplace.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="marketplace-container">
|
||||
<!-- Header -->
|
||||
<header class="marketplace-header">
|
||||
<div class="header-content">
|
||||
<div class="header-left">
|
||||
<div class="logo-title">
|
||||
<img src="../assets/images/logo.png" alt="Crawl4AI" class="header-logo">
|
||||
<h1>
|
||||
<span class="ascii-border">[</span>
|
||||
Marketplace
|
||||
<span class="ascii-border">]</span>
|
||||
</h1>
|
||||
</div>
|
||||
<p class="tagline">Tools, Integrations &amp; Resources for Web Crawling</p>
|
||||
</div>
|
||||
<div class="header-stats" id="stats">
|
||||
<span class="stat-item">Apps: <span id="total-apps">--</span></span>
|
||||
<span class="stat-item">Articles: <span id="total-articles">--</span></span>
|
||||
<span class="stat-item">Downloads: <span id="total-downloads">--</span></span>
|
||||
</div>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Search and Category Bar -->
|
||||
<div class="search-filter-bar">
|
||||
<div class="search-box">
|
||||
<span class="search-icon">&gt;</span>
|
||||
<input type="text" id="search-input" placeholder="Search apps, articles, tools..." />
|
||||
<kbd>/</kbd>
|
||||
</div>
|
||||
<div class="category-filter" id="category-filter">
|
||||
<button class="filter-btn active" data-category="all">All</button>
|
||||
<!-- Categories will be loaded here -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Magazine Grid Layout -->
|
||||
<main class="magazine-layout">
|
||||
<!-- Hero Featured Section -->
|
||||
<section class="hero-featured">
|
||||
<div id="featured-hero" class="featured-hero-card">
|
||||
<!-- Large featured card with big image -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Secondary Featured -->
|
||||
<section class="secondary-featured">
|
||||
<div id="featured-secondary" class="featured-secondary-cards">
|
||||
<!-- 2-3 medium featured cards with images -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Sponsored Section -->
|
||||
<section class="sponsored-section">
|
||||
<div class="section-label">SPONSORED</div>
|
||||
<div id="sponsored-content" class="sponsored-cards">
|
||||
<!-- Sponsored content cards -->
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Main Content Grid -->
|
||||
<section class="main-content">
|
||||
<!-- Apps Column -->
|
||||
<div class="apps-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">&gt;</span> Latest Apps</h2>
|
||||
<select id="type-filter" class="mini-filter">
|
||||
<option value="">All</option>
|
||||
<option value="Open Source">Open Source</option>
|
||||
<option value="Paid">Paid</option>
|
||||
</select>
|
||||
</div>
|
||||
<div id="apps-grid" class="apps-compact-grid">
|
||||
<!-- Compact app cards -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Articles Column -->
|
||||
<div class="articles-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">&gt;</span> Latest Articles</h2>
|
||||
</div>
|
||||
<div id="articles-list" class="articles-compact-list">
|
||||
<!-- Article items -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Trending/Tools Column -->
|
||||
<div class="trending-column">
|
||||
<div class="column-header">
|
||||
<h2><span class="ascii-icon">#</span> Trending</h2>
|
||||
</div>
|
||||
<div id="trending-list" class="trending-items">
|
||||
<!-- Trending items -->
|
||||
</div>
|
||||
|
||||
<div class="submit-box">
|
||||
<h3><span class="ascii-icon">+</span> Submit Your Tool</h3>
|
||||
<p>Share your integration</p>
|
||||
<a href="mailto:marketplace@crawl4ai.com" class="submit-btn">Submit →</a>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- More Apps Grid -->
|
||||
<section class="more-apps">
|
||||
<div class="section-header">
|
||||
<h2><span class="ascii-icon">&gt;</span> More Apps</h2>
|
||||
<button id="load-more" class="load-more-btn">Load More ↓</button>
|
||||
</div>
|
||||
<div id="more-apps-grid" class="more-apps-grid">
|
||||
<!-- Additional app cards -->
|
||||
</div>
|
||||
</section>
|
||||
</main>
|
||||
|
||||
<!-- Footer -->
|
||||
<footer class="marketplace-footer">
|
||||
<div class="footer-content">
|
||||
<div class="footer-section">
|
||||
<h3>About Marketplace</h3>
|
||||
<p>Discover tools and integrations built by the Crawl4AI community.</p>
|
||||
</div>
|
||||
<div class="footer-section">
|
||||
<h3>Become a Sponsor</h3>
|
||||
<p>Reach developers building with Crawl4AI</p>
|
||||
<a href="mailto:sponsors@crawl4ai.com" class="sponsor-btn">Learn More →</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="footer-bottom">
|
||||
<p>[ Crawl4AI Marketplace · Updated <span id="last-update">--</span> ]</p>
|
||||
</div>
|
||||
</footer>
|
||||
</div>
|
||||
|
||||
<script src="marketplace.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
994
docs/md_v2/marketplace/marketplace.css
Normal file
994
docs/md_v2/marketplace/marketplace.css
Normal file
@@ -0,0 +1,994 @@
|
||||
/* Marketplace CSS - Magazine Style Terminal Theme */
|
||||
@import url('../../assets/styles.css');
|
||||
|
||||
:root {
|
||||
--primary-cyan: #50ffff;
|
||||
--primary-teal: #09b5a5;
|
||||
--accent-pink: #f380f5;
|
||||
--bg-dark: #070708;
|
||||
--bg-secondary: #1a1a1a;
|
||||
--bg-tertiary: #3f3f44;
|
||||
--text-primary: #e8e9ed;
|
||||
--text-secondary: #d5cec0;
|
||||
--text-tertiary: #a3abba;
|
||||
--border-color: #3f3f44;
|
||||
--success: #50ff50;
|
||||
--error: #ff3c74;
|
||||
--warning: #f59e0b;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: 'Dank Mono', Monaco, monospace;
|
||||
background: var(--bg-dark);
|
||||
color: var(--text-primary);
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
/* Global link styles */
|
||||
a {
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: color 0.2s;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.marketplace-container {
|
||||
min-height: 100vh;
|
||||
}
|
||||
|
||||
/* Header */
|
||||
.marketplace-header {
|
||||
background: var(--bg-secondary);
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding: 1.5rem 0;
|
||||
}
|
||||
|
||||
.header-content {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.logo-title {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.header-logo {
|
||||
height: 40px;
|
||||
width: auto;
|
||||
filter: brightness(1.2);
|
||||
}
|
||||
|
||||
.marketplace-header h1 {
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.ascii-border {
|
||||
color: var(--border-color);
|
||||
}
|
||||
|
||||
.tagline {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-top: 0.25rem;
|
||||
}
|
||||
|
||||
.header-stats {
|
||||
display: flex;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.stat-item {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.stat-item span {
|
||||
color: var(--primary-cyan);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* Search and Filter Bar */
|
||||
.search-filter-bar {
|
||||
max-width: 1800px;
|
||||
margin: 1.5rem auto;
|
||||
padding: 0 2rem;
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.search-box {
|
||||
flex: 1;
|
||||
max-width: 500px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 0.75rem 1rem;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
|
||||
.search-box:focus-within {
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.search-icon {
|
||||
color: var(--text-tertiary);
|
||||
margin-right: 1rem;
|
||||
}
|
||||
|
||||
#search-input {
|
||||
flex: 1;
|
||||
background: transparent;
|
||||
border: none;
|
||||
color: var(--text-primary);
|
||||
font-family: inherit;
|
||||
font-size: 0.9rem;
|
||||
outline: none;
|
||||
}
|
||||
|
||||
.search-box kbd {
|
||||
font-size: 0.75rem;
|
||||
padding: 0.2rem 0.5rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.category-filter {
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.filter-btn {
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
padding: 0.5rem 1rem;
|
||||
font-family: inherit;
|
||||
font-size: 0.875rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.filter-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.filter-btn.active {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Magazine Layout */
|
||||
.magazine-layout {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem 4rem;
|
||||
display: grid;
|
||||
grid-template-columns: 1fr;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
/* Hero Featured Section */
|
||||
.hero-featured {
|
||||
grid-column: 1 / -1;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.hero-featured::before {
|
||||
content: '';
|
||||
position: absolute;
|
||||
top: -20px;
|
||||
left: -20px;
|
||||
right: -20px;
|
||||
bottom: -20px;
|
||||
background: radial-gradient(ellipse at center, rgba(80, 255, 255, 0.05), transparent 70%);
|
||||
pointer-events: none;
|
||||
z-index: -1;
|
||||
}
|
||||
|
||||
.featured-hero-card {
|
||||
background: linear-gradient(135deg, #1a1a2e, #0f0f1e);
|
||||
border: 2px solid var(--primary-cyan);
|
||||
box-shadow: 0 0 30px rgba(80, 255, 255, 0.15),
|
||||
inset 0 0 20px rgba(80, 255, 255, 0.05);
|
||||
height: 380px;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.featured-hero-card:hover {
|
||||
border-color: var(--accent-pink);
|
||||
box-shadow: 0 0 40px rgba(243, 128, 245, 0.2),
|
||||
inset 0 0 30px rgba(243, 128, 245, 0.05);
|
||||
transform: translateY(-2px);
|
||||
}
|
||||
|
||||
.hero-image {
|
||||
width: 100%;
|
||||
height: 200px;
|
||||
min-height: 200px;
|
||||
max-height: 200px;
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.1), rgba(243, 128, 245, 0.05));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 3rem;
|
||||
color: var(--primary-cyan);
|
||||
flex-shrink: 0;
|
||||
position: relative;
|
||||
filter: brightness(1.1) contrast(1.1);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.hero-image img {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
object-position: center;
|
||||
}
|
||||
|
||||
.hero-image::after {
|
||||
content: '';
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
height: 60%;
|
||||
background: linear-gradient(to top, rgba(10, 10, 20, 0.95), transparent);
|
||||
}
|
||||
|
||||
.hero-content {
|
||||
padding: 1.5rem;
|
||||
flex: 1;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.hero-badge {
|
||||
display: inline-block;
|
||||
padding: 0.3rem 0.6rem;
|
||||
background: linear-gradient(135deg, var(--primary-cyan), var(--primary-teal));
|
||||
color: var(--bg-dark);
|
||||
font-size: 0.7rem;
|
||||
text-transform: uppercase;
|
||||
margin-bottom: 0.5rem;
|
||||
font-weight: 600;
|
||||
box-shadow: 0 2px 10px rgba(80, 255, 255, 0.3);
|
||||
}
|
||||
|
||||
.hero-title {
|
||||
font-size: 1.6rem;
|
||||
color: var(--primary-cyan);
|
||||
margin: 0.5rem 0;
|
||||
text-shadow: 0 0 20px rgba(80, 255, 255, 0.5);
|
||||
}
|
||||
|
||||
.hero-description {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.hero-meta {
|
||||
display: flex;
|
||||
gap: 1.5rem;
|
||||
margin-top: 1rem;
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.hero-meta span {
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.hero-meta span:first-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
/* Secondary Featured */
|
||||
.secondary-featured {
|
||||
grid-column: 1 / -1;
|
||||
min-height: 380px;
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
align-items: stretch;
|
||||
}
|
||||
|
||||
.secondary-card {
|
||||
background: linear-gradient(135deg, rgba(80, 255, 255, 0.03), rgba(243, 128, 245, 0.02));
|
||||
border: 1px solid rgba(80, 255, 255, 0.3);
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
display: flex;
|
||||
overflow: hidden;
|
||||
height: 118px;
|
||||
min-height: 118px;
|
||||
max-height: 118px;
|
||||
flex-shrink: 0;
|
||||
box-shadow: 0 2px 10px rgba(0, 0, 0, 0.3);
|
||||
}
|
||||
|
||||
.secondary-card:hover {
|
||||
border-color: var(--accent-pink);
|
||||
background: linear-gradient(135deg, rgba(243, 128, 245, 0.05), rgba(80, 255, 255, 0.03));
|
||||
box-shadow: 0 4px 15px rgba(243, 128, 245, 0.2);
|
||||
transform: translateX(-3px);
|
||||
}
|
||||
|
||||
.secondary-image {
|
||||
width: 120px;
|
||||
background: linear-gradient(135deg, var(--bg-tertiary), var(--bg-secondary));
|
||||
background-size: cover;
|
||||
background-position: center;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 1.5rem;
|
||||
color: var(--primary-cyan);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.secondary-content {
|
||||
flex: 1;
|
||||
padding: 1rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between;
|
||||
}
|
||||
|
||||
.secondary-title {
|
||||
font-size: 1rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
/* Secondary card description, truncated to two lines. */
.secondary-desc {
    font-size: 0.75rem;
    color: var(--text-secondary);
    /* Multi-line truncation: the prefixed trio is what engines act on today;
       the unprefixed property is declared alongside for forward compatibility. */
    display: -webkit-box;
    -webkit-line-clamp: 2;
    line-clamp: 2;
    -webkit-box-orient: vertical;
    overflow: hidden;
}
|
||||
|
||||
.secondary-meta {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.secondary-meta span:last-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
/* Sponsored Section */
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--warning);
|
||||
padding: 1rem;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.section-label {
|
||||
position: absolute;
|
||||
top: -0.5rem;
|
||||
left: 1rem;
|
||||
background: var(--bg-secondary);
|
||||
padding: 0 0.5rem;
|
||||
color: var(--warning);
|
||||
font-size: 0.65rem;
|
||||
letter-spacing: 0.1em;
|
||||
}
|
||||
|
||||
.sponsored-cards {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.sponsor-card {
|
||||
padding: 1rem;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
}
|
||||
|
||||
.sponsor-logo {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
height: 60px;
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
|
||||
.sponsor-logo img {
|
||||
max-height: 60px;
|
||||
max-width: 100%;
|
||||
width: auto;
|
||||
object-fit: contain;
|
||||
}
|
||||
|
||||
.sponsor-card h4 {
|
||||
color: var(--accent-pink);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.sponsor-card p {
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.85rem;
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
|
||||
.sponsor-card a {
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.sponsor-card a:hover {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
/* Main Content Grid */
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
/* Column Headers */
|
||||
.column-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 1rem;
|
||||
border-bottom: 1px solid var(--border-color);
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.column-header h2 {
|
||||
font-size: 1.1rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.mini-filter {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-primary);
|
||||
padding: 0.25rem 0.5rem;
|
||||
font-family: inherit;
|
||||
font-size: 0.75rem;
|
||||
}
|
||||
|
||||
.ascii-icon {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Apps Column */
|
||||
.apps-compact-grid {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.app-compact {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
border-left: 3px solid var(--border-color);
|
||||
padding: 0.75rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.app-compact:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
border-left-color: var(--accent-pink);
|
||||
transform: translateX(2px);
|
||||
}
|
||||
|
||||
.app-compact-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.app-compact-header span:first-child {
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.app-compact-header span:last-child {
|
||||
color: var(--warning);
|
||||
}
|
||||
|
||||
.app-compact-title {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
/* Compact app card description, truncated to two lines. */
.app-compact-desc {
    font-size: 0.75rem;
    color: var(--text-secondary);
    /* Multi-line truncation: the prefixed trio is what engines act on today;
       the unprefixed property is declared alongside for forward compatibility. */
    display: -webkit-box;
    -webkit-line-clamp: 2;
    line-clamp: 2;
    -webkit-box-orient: vertical;
    overflow: hidden;
}
|
||||
|
||||
/* Articles Column */
|
||||
.articles-compact-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.article-compact {
|
||||
border-left: 2px solid var(--border-color);
|
||||
padding-left: 1rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.article-compact:hover {
|
||||
border-left-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.article-meta {
|
||||
font-size: 0.7rem;
|
||||
color: var(--text-tertiary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.article-meta span:first-child {
|
||||
color: var(--accent-pink);
|
||||
}
|
||||
|
||||
.article-title {
|
||||
font-size: 0.9rem;
|
||||
color: var(--text-primary);
|
||||
margin-bottom: 0.25rem;
|
||||
}
|
||||
|
||||
.article-author {
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
/* Trending Column */
|
||||
.trending-items {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.trending-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
padding: 0.5rem;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.trending-item:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.trending-rank {
|
||||
font-size: 1.2rem;
|
||||
color: var(--primary-cyan);
|
||||
width: 2rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.trending-info {
|
||||
flex: 1;
|
||||
}
|
||||
|
||||
.trending-name {
|
||||
font-size: 0.85rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.trending-stats {
|
||||
font-size: 0.7rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Submit Box */
|
||||
.submit-box {
|
||||
margin-top: 1.5rem;
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--primary-cyan);
|
||||
padding: 1rem;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.submit-box h3 {
|
||||
font-size: 1rem;
|
||||
color: var(--primary-cyan);
|
||||
margin-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.submit-box p {
|
||||
font-size: 0.8rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
|
||||
.submit-btn {
|
||||
display: inline-block;
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.submit-btn:hover {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* More Apps Section */
|
||||
.more-apps {
|
||||
grid-column: 1 / -1;
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.section-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.load-more-btn {
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
padding: 0.5rem 1.5rem;
|
||||
font-family: inherit;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.load-more-btn:hover {
|
||||
border-color: var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Footer */
|
||||
.marketplace-footer {
|
||||
background: var(--bg-secondary);
|
||||
border-top: 1px solid var(--border-color);
|
||||
margin-top: 4rem;
|
||||
padding: 2rem 0;
|
||||
}
|
||||
|
||||
.footer-content {
|
||||
max-width: 1800px;
|
||||
margin: 0 auto;
|
||||
padding: 0 2rem;
|
||||
display: grid;
|
||||
grid-template-columns: 1fr 1fr;
|
||||
gap: 2rem;
|
||||
}
|
||||
|
||||
.footer-section h3 {
|
||||
font-size: 1rem;
|
||||
margin-bottom: 0.5rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
.footer-section p {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.sponsor-btn {
|
||||
display: inline-block;
|
||||
padding: 0.5rem 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--primary-cyan);
|
||||
color: var(--primary-cyan);
|
||||
text-decoration: none;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.sponsor-btn:hover {
|
||||
background: var(--primary-cyan);
|
||||
color: var(--bg-dark);
|
||||
}
|
||||
|
||||
.footer-bottom {
|
||||
max-width: 1800px;
|
||||
margin: 2rem auto 0;
|
||||
padding: 1rem 2rem 0;
|
||||
border-top: 1px solid var(--border-color);
|
||||
font-size: 0.75rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Modal */
|
||||
.modal {
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
background: rgba(0, 0, 0, 0.8);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
z-index: 1000;
|
||||
}
|
||||
|
||||
.modal.hidden {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.modal-content {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--primary-cyan);
|
||||
max-width: 800px;
|
||||
width: 90%;
|
||||
max-height: 80vh;
|
||||
overflow-y: auto;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.modal-close {
|
||||
position: absolute;
|
||||
top: 1rem;
|
||||
right: 1rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-primary);
|
||||
padding: 0.25rem 0.5rem;
|
||||
cursor: pointer;
|
||||
font-size: 1.2rem;
|
||||
}
|
||||
|
||||
.modal-close:hover {
|
||||
border-color: var(--error);
|
||||
color: var(--error);
|
||||
}
|
||||
|
||||
.app-detail {
|
||||
padding: 2rem;
|
||||
}
|
||||
|
||||
.app-detail h2 {
|
||||
font-size: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
color: var(--primary-cyan);
|
||||
}
|
||||
|
||||
/* Loading */
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 2rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
.no-results {
|
||||
text-align: center;
|
||||
padding: 2rem;
|
||||
color: var(--text-tertiary);
|
||||
}
|
||||
|
||||
/* Responsive - Tablet */
|
||||
@media (min-width: 768px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Desktop */
|
||||
@media (min-width: 1024px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
grid-row: 1;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 4;
|
||||
grid-row: 1;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-column: 1 / -1;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Wide Desktop */
|
||||
@media (min-width: 1400px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 5;
|
||||
grid-row: 1;
|
||||
min-height: auto;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(2, 1fr);
|
||||
flex-direction: unset;
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-template-columns: repeat(4, 1fr);
|
||||
}
|
||||
|
||||
.apps-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Ultra Wide Desktop (for coders with wide monitors) */
|
||||
@media (min-width: 1800px) {
|
||||
.magazine-layout {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.hero-featured {
|
||||
grid-column: 1 / 3;
|
||||
}
|
||||
|
||||
.secondary-featured {
|
||||
grid-column: 3 / 6;
|
||||
min-height: auto;
|
||||
}
|
||||
|
||||
.featured-secondary-cards {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
flex-direction: unset;
|
||||
}
|
||||
|
||||
.sponsored-section {
|
||||
grid-column: 1 / -1;
|
||||
}
|
||||
|
||||
.sponsored-cards {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.main-content {
|
||||
grid-template-columns: repeat(5, 1fr);
|
||||
}
|
||||
|
||||
.apps-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.articles-column {
|
||||
grid-column: span 2;
|
||||
}
|
||||
|
||||
.more-apps-grid {
|
||||
grid-template-columns: repeat(auto-fill, minmax(300px, 1fr));
|
||||
}
|
||||
}
|
||||
|
||||
/* Responsive - Mobile */
|
||||
@media (max-width: 767px) {
|
||||
.header-content {
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.search-filter-bar {
|
||||
flex-direction: column;
|
||||
align-items: stretch;
|
||||
}
|
||||
|
||||
.search-box {
|
||||
max-width: none;
|
||||
}
|
||||
|
||||
.magazine-layout {
|
||||
padding: 0 1rem 2rem;
|
||||
}
|
||||
|
||||
.footer-content {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.secondary-card {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.secondary-image {
|
||||
width: 100%;
|
||||
height: 150px;
|
||||
}
|
||||
}
|
||||
412
docs/md_v2/marketplace/marketplace.js
Normal file
412
docs/md_v2/marketplace/marketplace.js
Normal file
@@ -0,0 +1,412 @@
|
||||
// Marketplace JS - Magazine Layout
|
||||
// Decide once, at load time, where the marketplace API lives.
// API_BASE is the prefix for every API request; API_ORIGIN is the scheme+host
// to prepend to root-relative asset paths ('' means same origin as the page).
const { API_BASE, API_ORIGIN } = (() => {
    const { hostname, port } = window.location;
    // Page served from localhost / 127.0.0.1 on port 8000: the API is reached
    // on a separate origin, http://127.0.0.1:8100.
    if ((hostname === 'localhost' || hostname === '127.0.0.1') && port === '8000') {
        const origin = 'http://127.0.0.1:8100';
        return { API_BASE: `${origin}/marketplace/api`, API_ORIGIN: origin };
    }
    // Any other host/port: use same-origin relative paths.
    return { API_BASE: '/marketplace/api', API_ORIGIN: '' };
})();
|
||||
|
||||
/**
 * Turn an asset path into a URL the page can load.
 *
 * - Falsy input yields ''.
 * - Absolute http(s) URLs and protocol-relative URLs (//host/path) are
 *   returned unchanged because they already name their host.
 * - Root-relative paths (/uploads/x.png) are prefixed with API_ORIGIN when
 *   one is configured.
 * - Anything else is returned unchanged.
 *
 * @param {string} path Asset path or URL.
 * @returns {string} Loadable URL, or '' when no path was given.
 */
const resolveAssetUrl = (path) => {
    if (!path) return '';
    // The optional scheme also matches '//cdn.example.com/x', which starts with
    // '/' and would otherwise be glued onto API_ORIGIN below.
    if (/^(https?:)?\/\//i.test(path)) return path;
    if (path.startsWith('/') && API_ORIGIN) {
        return `${API_ORIGIN}${path}`;
    }
    return path;
};
|
||||
const CACHE_TTL = 3600000; // 1 hour in ms
|
||||
|
||||
/**
 * Small localStorage-backed cache with per-entry expiry.
 *
 * Entries are stored as JSON `{ value, expires }` under a common key prefix.
 * Storage problems (corrupted entries, quota exceeded, storage disabled) are
 * treated as cache misses / skipped writes so they never break the caller.
 */
class MarketplaceCache {
    constructor() {
        // Namespace for every key written by this cache; clear() relies on it.
        this.prefix = 'c4ai_market_';
    }

    /**
     * Read a cached value.
     *
     * @param {string} key Cache key (without prefix).
     * @returns {*} The stored value, or null when missing, expired or unreadable.
     */
    get(key) {
        const storageKey = this.prefix + key;
        try {
            const item = localStorage.getItem(storageKey);
            if (!item) return null;

            const data = JSON.parse(item);
            // A non-object payload is not something set() wrote; drop it.
            const malformed = data === null || typeof data !== 'object';
            if (malformed || Date.now() > data.expires) {
                localStorage.removeItem(storageKey);
                return null;
            }
            return data.value;
        } catch (error) {
            // Corrupted JSON or unavailable storage: behave like a miss and try
            // to remove the bad entry so it does not fail again next time.
            try {
                localStorage.removeItem(storageKey);
            } catch (removeError) {
                // Storage itself is unusable; nothing more to do.
            }
            return null;
        }
    }

    /**
     * Store a value with a time-to-live.
     *
     * @param {string} key Cache key (without prefix).
     * @param {*} value JSON-serialisable value.
     * @param {number} [ttl=CACHE_TTL] Lifetime in milliseconds.
     */
    set(key, value, ttl = CACHE_TTL) {
        const data = {
            value: value,
            expires: Date.now() + ttl
        };
        try {
            localStorage.setItem(this.prefix + key, JSON.stringify(data));
        } catch (error) {
            // Quota exceeded / storage disabled / unserialisable value:
            // caching is best-effort, so skip the write.
            console.warn('Cache write skipped:', error);
        }
    }

    /** Remove every entry written by this cache (keys starting with the prefix). */
    clear() {
        try {
            Object.keys(localStorage)
                .filter(k => k.startsWith(this.prefix))
                .forEach(k => localStorage.removeItem(k));
        } catch (error) {
            console.warn('Cache clear failed:', error);
        }
    }
}
|
||||
|
||||
/**
 * Client for the marketplace REST API with optional response caching.
 *
 * Every method resolves to the parsed JSON body, or to null when the request
 * fails (network error, non-2xx status, invalid JSON); failures are logged.
 */
class MarketplaceAPI {
    constructor() {
        this.cache = new MarketplaceCache();
        this.searchTimeout = null;
    }

    /**
     * GET `${API_BASE}${endpoint}` and parse the JSON response.
     *
     * @param {string} endpoint Path (and query string) relative to API_BASE.
     * @param {boolean} [useCache=true] When false the cache is neither read
     *     nor written, so one-off queries do not accumulate in storage.
     * @returns {Promise<*>} Parsed JSON, or null on failure.
     */
    async fetch(endpoint, useCache = true) {
        // Flatten the endpoint into a storage-friendly key.
        const cacheKey = endpoint.replace(/[^\w]/g, '_');

        if (useCache) {
            try {
                const cached = this.cache.get(cacheKey);
                if (cached) return cached;
            } catch (error) {
                // An unreadable cache entry must not block the network request.
                console.warn('Cache read failed:', error);
            }
        }

        let data;
        try {
            // Inside this method `fetch` is the global Fetch API, not this.fetch.
            const response = await fetch(`${API_BASE}${endpoint}`);
            if (!response.ok) throw new Error(`HTTP ${response.status}`);
            data = await response.json();
        } catch (error) {
            console.error('API Error:', error);
            return null;
        }

        if (useCache) {
            // Kept outside the request try-block: a failed cache write must not
            // turn a successful response into null.
            try {
                this.cache.set(cacheKey, data);
            } catch (error) {
                console.warn('Cache write failed:', error);
            }
        }
        return data;
    }

    /** Fetch aggregate marketplace statistics (`/stats`). */
    async getStats() {
        return this.fetch('/stats');
    }

    /** Fetch the category list (`/categories`). */
    async getCategories() {
        return this.fetch('/categories');
    }

    /**
     * Fetch apps (`/apps`).
     * @param {Object} [params={}] Query parameters, serialised with URLSearchParams.
     */
    async getApps(params = {}) {
        const query = new URLSearchParams(params).toString();
        return this.fetch(`/apps${query ? '?' + query : ''}`);
    }

    /**
     * Fetch articles (`/articles`).
     * @param {Object} [params={}] Query parameters, serialised with URLSearchParams.
     */
    async getArticles(params = {}) {
        const query = new URLSearchParams(params).toString();
        return this.fetch(`/articles${query ? '?' + query : ''}`);
    }

    /** Fetch sponsors (`/sponsors`). */
    async getSponsors() {
        return this.fetch('/sponsors');
    }

    /**
     * Search the marketplace (`/search?q=...`), bypassing the cache.
     *
     * @param {string} query Search text.
     * @returns {Promise<*>} `{}` for missing queries or queries shorter than
     *     two characters (no request is made), otherwise the fetch() result.
     */
    async search(query) {
        if (!query || query.length < 2) return {};
        return this.fetch(`/search?q=${encodeURIComponent(query)}`, false);
    }
}
|
||||
|
||||
/**
 * Controller for the marketplace landing page (magazine layout).
 *
 * Loads stats, categories, featured apps, sponsors and the main columns from
 * MarketplaceAPI and renders them into fixed element ids on the page.
 *
 * All API-provided text is inserted through DOM APIs (textContent, properties)
 * rather than HTML strings, and click handlers are attached as functions, so
 * values coming from the API can never be interpreted as markup or script.
 */
class MarketplaceUI {
    constructor() {
        this.api = new MarketplaceAPI();
        this.currentCategory = 'all';
        this.currentType = '';
        this.searchTimeout = null;
        // Incremented per search; lets a slow response detect that it is stale.
        this.searchSeq = 0;
        // Page sizes: apps column, and each batch of the "more apps" grid.
        this.appsColumnLimit = 8;
        this.moreAppsPageSize = 12;
        // Offset of the next app to request for the "more apps" grid.
        this.loadedApps = this.appsColumnLimit;
        // True while a load-more request is in flight (prevents duplicate batches).
        this.loadingMore = false;
        // init() is async and not awaited here; log failures instead of leaving
        // an unhandled promise rejection.
        this.init().catch(error => console.error('Marketplace init failed:', error));
    }

    /** Load every page section in order, then wire up user interaction. */
    async init() {
        await this.loadStats();
        await this.loadCategories();
        await this.loadFeaturedContent();
        await this.loadSponsors();
        await this.loadMainContent();
        this.setupEventListeners();
    }

    // ---- private DOM helpers -------------------------------------------

    /** Create an element with an optional class and optional plain-text content. */
    _el(tag, className, text) {
        const node = document.createElement(tag);
        if (className) node.className = className;
        if (text !== undefined && text !== null) node.textContent = String(text);
        return node;
    }

    /** Replace the children of `container` with `nodes`. */
    _fill(container, nodes) {
        container.innerHTML = '';
        nodes.forEach(node => container.appendChild(node));
    }

    /** Set the text of the element with the given id, if it exists. */
    _setText(id, value) {
        const node = document.getElementById(id);
        if (node) node.textContent = value;
    }

    /** Build an image box: background image when a URL is given, else a `[CATEGORY]` placeholder. */
    _imageBox(className, imageUrl, category) {
        const box = this._el('div', className);
        if (imageUrl) {
            // Escape quotes/backslashes so the URL stays inside the CSS string.
            const escaped = String(imageUrl).replace(/["\\]/g, '\\$&');
            box.style.backgroundImage = `url("${escaped}")`;
        } else {
            box.textContent = `[${category || 'APP'}]`;
        }
        return box;
    }

    /** Return `url` as an absolute http(s) URL, or '#' for anything else (e.g. javascript:). */
    _safeHttpUrl(url) {
        if (!url) return '#';
        try {
            const parsed = new URL(String(url), window.location.href);
            const allowed = parsed.protocol === 'http:' || parsed.protocol === 'https:';
            return allowed ? parsed.href : '#';
        } catch (error) {
            return '#';
        }
    }

    /** True when `node` is a form field or editable region the user may be typing in. */
    _isEditable(node) {
        if (!node) return false;
        const tag = node.tagName;
        return tag === 'INPUT' || tag === 'TEXTAREA' || tag === 'SELECT' || node.isContentEditable === true;
    }

    /** Build a compact app card; `secondLabel` is the right-hand header text. */
    _appCard(app, secondLabel, withDescription) {
        const card = this._el('div', 'app-compact');
        const header = this._el('div', 'app-compact-header');
        header.appendChild(this._el('span', '', app.category));
        header.appendChild(this._el('span', '', secondLabel));
        card.appendChild(header);
        card.appendChild(this._el('div', 'app-compact-title', app.name));
        if (withDescription) {
            card.appendChild(this._el('div', 'app-compact-desc', app.description));
        }
        card.onclick = () => this.showAppDetail(app);
        return card;
    }

    /** Build a compact article entry. */
    _articleCard(article) {
        const card = this._el('div', 'article-compact');
        const meta = this._el('div', 'article-meta');
        meta.appendChild(this._el('span', '', article.category));
        meta.appendChild(document.createTextNode(' · '));
        meta.appendChild(this._el('span', '', new Date(article.published_at).toLocaleDateString()));
        card.appendChild(meta);
        card.appendChild(this._el('div', 'article-title', article.title));
        card.appendChild(this._el('div', 'article-author', `by ${article.author}`));
        card.onclick = () => this.showArticle(article.id);
        return card;
    }

    /** Build one row of the trending list; `index` is zero-based. */
    _trendingItem(app, index) {
        const item = this._el('div', 'trending-item');
        item.appendChild(this._el('div', 'trending-rank', index + 1));
        const info = this._el('div', 'trending-info');
        info.appendChild(this._el('div', 'trending-name', app.name));
        info.appendChild(this._el('div', 'trending-stats', `${app.downloads || 0} downloads`));
        item.appendChild(info);
        item.onclick = () => this.showAppDetail(app);
        return item;
    }

    // ---- section loaders -----------------------------------------------

    /** Fill the header counters from `/stats`; left untouched when the request fails. */
    async loadStats() {
        const stats = await this.api.getStats();
        if (!stats) return;
        this._setText('total-apps', stats.total_apps || '0');
        this._setText('total-articles', stats.total_articles || '0');
        this._setText('total-downloads', stats.total_downloads || '0');
        this._setText('last-update', new Date().toLocaleDateString());
    }

    /** Append one filter button per category to `#category-filter`. */
    async loadCategories() {
        const categories = await this.api.getCategories();
        if (!Array.isArray(categories)) return;

        const filter = document.getElementById('category-filter');
        if (!filter) return;
        categories.forEach(cat => {
            const btn = document.createElement('button');
            btn.className = 'filter-btn';
            btn.dataset.category = cat.slug;
            btn.textContent = cat.name;
            btn.onclick = () => this.filterByCategory(cat.slug);
            filter.appendChild(btn);
        });
    }

    /** Render the hero card (first featured app) and up to three secondary cards. */
    async loadFeaturedContent() {
        const featured = await this.api.getApps({ featured: true, limit: 4 });
        if (!Array.isArray(featured) || !featured.length) return;

        const [hero, ...others] = featured;

        // Hero card (first featured)
        const heroCard = document.getElementById('featured-hero');
        if (heroCard && hero) {
            const content = this._el('div', 'hero-content');
            content.appendChild(this._el('span', 'hero-badge', hero.type || 'PAID'));
            content.appendChild(this._el('h2', 'hero-title', hero.name));
            content.appendChild(this._el('p', 'hero-description', hero.description));

            const meta = this._el('div', 'hero-meta');
            meta.appendChild(this._el('span', '', `★ ${hero.rating || 0}/5`));
            // Keeps the inter-span whitespace the previous markup had.
            meta.appendChild(document.createTextNode(' '));
            meta.appendChild(this._el('span', '', `${hero.downloads || 0} downloads`));
            content.appendChild(meta);

            this._fill(heroCard, [this._imageBox('hero-image', hero.image, hero.category), content]);
            heroCard.onclick = () => this.showAppDetail(hero);
        }

        // Secondary featured cards
        const secondary = document.getElementById('featured-secondary');
        if (!secondary) return;
        const cards = others.slice(0, 3).map(app => {
            const card = this._el('div', 'secondary-card');
            card.appendChild(this._imageBox('secondary-image', app.image, app.category));

            const description = app.description || '';
            // Only add the ellipsis when text was actually cut.
            const shortDesc = description.length > 100 ? `${description.substring(0, 100)}...` : description;

            const content = this._el('div', 'secondary-content');
            content.appendChild(this._el('h3', 'secondary-title', app.name));
            content.appendChild(this._el('p', 'secondary-desc', shortDesc));

            const meta = this._el('div', 'secondary-meta');
            meta.appendChild(this._el('span', '', app.type || 'Open Source'));
            meta.appendChild(document.createTextNode(' · '));
            meta.appendChild(this._el('span', '', `★ ${app.rating || 0}/5`));
            content.appendChild(meta);

            card.appendChild(content);
            card.onclick = () => this.showAppDetail(app);
            return card;
        });
        this._fill(secondary, cards);
    }

    /** Render up to five sponsor cards, or a "Become a Sponsor" placeholder when there are none. */
    async loadSponsors() {
        const sponsors = await this.api.getSponsors();
        const container = document.getElementById('sponsored-content');
        if (!container) return;

        if (!Array.isArray(sponsors) || !sponsors.length) {
            // Show placeholder if no sponsors (static markup, no API data involved)
            container.innerHTML = `
                <div class="sponsor-card">
                    <h4>Become a Sponsor</h4>
                    <p>Reach thousands of developers using Crawl4AI</p>
                    <a href="mailto:sponsors@crawl4ai.com">Contact Us →</a>
                </div>
            `;
            return;
        }

        const cards = sponsors.slice(0, 5).map(sponsor => {
            const card = this._el('div', 'sponsor-card');
            if (sponsor.logo_url) {
                const logo = this._el('div', 'sponsor-logo');
                const img = document.createElement('img');
                img.src = resolveAssetUrl(sponsor.logo_url);
                img.alt = `${sponsor.company_name} logo`;
                logo.appendChild(img);
                card.appendChild(logo);
            }
            card.appendChild(this._el('h4', '', sponsor.company_name));
            card.appendChild(this._el('p', '', `${sponsor.tier} Sponsor - Premium Solutions`));

            const link = this._el('a', '', 'Learn More →');
            link.href = this._safeHttpUrl(sponsor.landing_url);
            link.target = '_blank';
            // Prevent the opened page from reaching back through window.opener.
            link.rel = 'noopener noreferrer';
            card.appendChild(link);
            return card;
        });
        this._fill(container, cards);
    }

    /**
     * Render the apps column, articles column, trending list and the first
     * batch of the "more apps" grid. A section whose request fails or returns
     * nothing is left as it was.
     *
     * NOTE(review): currentCategory / currentType are not sent to the API here,
     * so the category and type filters do not change what is listed. Confirm
     * the filter parameters the /apps endpoint accepts before wiring them in.
     */
    async loadMainContent() {
        // Load apps column
        const apps = await this.api.getApps({ limit: this.appsColumnLimit });
        const hasApps = Array.isArray(apps) && apps.length > 0;
        if (hasApps) {
            const appsGrid = document.getElementById('apps-grid');
            if (appsGrid) {
                this._fill(appsGrid, apps.map(app => this._appCard(app, `★ ${app.rating || 0}/5`, true)));
            }
        }

        // Load articles column
        const articles = await this.api.getArticles({ limit: 6 });
        if (Array.isArray(articles) && articles.length) {
            const articlesList = document.getElementById('articles-list');
            if (articlesList) {
                this._fill(articlesList, articles.map(article => this._articleCard(article)));
            }
        }

        // Load trending (first five of the apps column)
        if (hasApps) {
            const trendingList = document.getElementById('trending-list');
            if (trendingList) {
                this._fill(trendingList, apps.slice(0, 5).map((app, i) => this._trendingItem(app, i)));
            }
        }

        // Load more apps grid
        const moreApps = await this.api.getApps({
            offset: this.appsColumnLimit,
            limit: this.moreAppsPageSize
        });
        const hasMore = Array.isArray(moreApps) && moreApps.length > 0;
        if (hasMore) {
            const moreGrid = document.getElementById('more-apps-grid');
            if (moreGrid) {
                this._fill(moreGrid, moreApps.map(app => this._appCard(app, app.type, false)));
            }
        }
        // The next load-more request continues right after what is displayed.
        this.loadedApps = this.appsColumnLimit + (hasMore ? moreApps.length : 0);
    }

    /** Attach search, keyboard-shortcut, type-filter and load-more handlers. */
    setupEventListeners() {
        // Search (debounced by 300 ms)
        const searchInput = document.getElementById('search-input');
        if (searchInput) {
            searchInput.addEventListener('input', (e) => {
                clearTimeout(this.searchTimeout);
                this.searchTimeout = setTimeout(() => this.search(e.target.value), 300);
            });

            // Keyboard shortcuts: '/' focuses search, Escape leaves and clears it
            document.addEventListener('keydown', (e) => {
                const active = document.activeElement;
                const inSearch = searchInput.contains(active);
                // Do not steal '/' while the user is typing in another field.
                if (e.key === '/' && !inSearch && !this._isEditable(active)) {
                    e.preventDefault();
                    searchInput.focus();
                }
                if (e.key === 'Escape' && inSearch) {
                    searchInput.blur();
                    if (searchInput.value) {
                        searchInput.value = '';
                        // Drop any pending debounced search and restore the
                        // default listing so it matches the now-empty box.
                        clearTimeout(this.searchTimeout);
                        this.search('');
                    }
                }
            });
        }

        // Type filter
        const typeFilter = document.getElementById('type-filter');
        if (typeFilter) {
            typeFilter.addEventListener('change', (e) => {
                this.currentType = e.target.value;
                this.loadMainContent();
            });
        }

        // Load more
        const loadMore = document.getElementById('load-more');
        if (loadMore) {
            loadMore.addEventListener('click', () => this.loadMoreApps());
        }
    }

    /** Mark the chosen category button active, remember the choice, and reload the main content. */
    async filterByCategory(category) {
        // Update active state
        document.querySelectorAll('.filter-btn').forEach(btn => {
            btn.classList.toggle('active', btn.dataset.category === category);
        });

        this.currentCategory = category;
        await this.loadMainContent();
    }

    /**
     * Run a search and show matching apps and articles in their columns.
     * An empty query restores the default listing. Responses that arrive after
     * a newer search has started are ignored.
     */
    async search(query) {
        const token = ++this.searchSeq;

        if (!query) {
            await this.loadMainContent();
            return;
        }

        const results = await this.api.search(query);
        if (!results || token !== this.searchSeq) return;

        // Update apps grid with search results. A missing `apps` key (e.g. the
        // query was too short to be sent) leaves the grid as it is.
        const appsGrid = document.getElementById('apps-grid');
        if (appsGrid && Array.isArray(results.apps)) {
            const cards = results.apps.length
                ? results.apps.map(app => this._appCard(app, `★ ${app.rating || 0}/5`, true))
                : [this._el('div', 'no-results', 'No matching apps')];
            this._fill(appsGrid, cards);
        }

        // Update articles with search results
        const articlesList = document.getElementById('articles-list');
        if (articlesList && Array.isArray(results.articles)) {
            const cards = results.articles.length
                ? results.articles.map(article => this._articleCard(article))
                : [this._el('div', 'no-results', 'No matching articles')];
            this._fill(articlesList, cards);
        }
    }

    /** Append the next batch of apps to the "more apps" grid. */
    async loadMoreApps() {
        if (this.loadingMore) return;
        this.loadingMore = true;
        try {
            const moreApps = await this.api.getApps({
                offset: this.loadedApps,
                limit: this.moreAppsPageSize
            });
            if (!Array.isArray(moreApps) || !moreApps.length) return;

            const moreGrid = document.getElementById('more-apps-grid');
            if (!moreGrid) return;
            moreApps.forEach(app => moreGrid.appendChild(this._appCard(app, app.type, false)));
            // Advance only by what was actually shown, so no app is skipped.
            this.loadedApps += moreApps.length;
        } finally {
            this.loadingMore = false;
        }
    }

    /** Navigate to the detail page for `app` (by slug, or a slug derived from its name). */
    showAppDetail(app) {
        // Navigate to detail page instead of showing modal
        const name = String((app && app.name) || '');
        const slug = (app && app.slug) || name.toLowerCase().replace(/\s+/g, '-');
        window.location.href = `app-detail.html?app=${encodeURIComponent(slug)}`;
    }

    /** Placeholder: currently only logs the requested article id. */
    showArticle(articleId) {
        // Could create article detail page similarly
        console.log('Show article:', articleId);
    }
}
|
||||
|
||||
// Initialize marketplace
// Script-scope handle to the single MarketplaceUI instance; it stays undefined
// until the DOM has finished loading.
let marketplace;
document.addEventListener('DOMContentLoaded', () => {
    // Constructing the UI starts loading and rendering the page content.
    marketplace = new MarketplaceUI();
});
|
||||
338
docs/releases_review/demo_v0.7.5.py
Normal file
338
docs/releases_review/demo_v0.7.5.py
Normal file
@@ -0,0 +1,338 @@
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.5 Release Demo - Working Examples
|
||||
==================================================
|
||||
This demo showcases key features introduced in v0.7.5 with real, executable examples.
|
||||
|
||||
Featured Demos:
|
||||
1. ✅ Docker Hooks System - Real API calls with custom hooks (string & function-based)
|
||||
2. ✅ Enhanced LLM Integration - Working LLM configurations
|
||||
3. ✅ HTTPS Preservation - Live crawling with HTTPS maintenance
|
||||
|
||||
Requirements:
|
||||
- crawl4ai v0.7.5 installed
|
||||
- Docker running with crawl4ai image (optional for Docker demos)
|
||||
- Valid API keys for LLM demos (optional)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import requests
|
||||
import time
|
||||
import sys
|
||||
|
||||
from crawl4ai import (AsyncWebCrawler, CrawlerRunConfig, BrowserConfig,
|
||||
CacheMode, FilterChain, URLPatternFilter, BFSDeepCrawlStrategy,
|
||||
hooks_to_string)
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
|
||||
def print_section(title: str, description: str = ""):
    """Print a banner: *title* (plus *description*, if non-empty) between two 60-char ``=`` rules.

    A blank line precedes the top rule and follows the bottom rule.
    """
    rule = "=" * 60
    banner = [f"\n{rule}", f"{title}"]
    if description:
        banner.append(f"{description}")
    banner.append(f"{rule}\n")
    # One print call; the joined text matches the line-by-line output exactly.
    print("\n".join(banner))
|
||||
|
||||
|
||||
async def demo_1_docker_hooks_system():
    """Demo 1: Docker Hooks System - Real API calls with custom hooks.

    Requires a Crawl4AI Docker service on http://localhost:11235; when it is
    not reachable, setup instructions are printed and the demo returns early.

    Part 1 posts string-based hook code to the REST ``/crawl`` endpoint.
    Part 2 defines the hooks as regular async functions, converts them with
    ``hooks_to_string`` and passes the function objects to
    ``Crawl4aiDockerClient.crawl``.

    Request and client errors are caught and reported; nothing is returned.
    """
    print_section(
        "Demo 1: Docker Hooks System",
        "Testing both string-based and function-based hooks (NEW in v0.7.5!)"
    )

    # Check Docker service availability
    def check_docker_service():
        """Return True when the service answers HTTP 200 on localhost:11235."""
        try:
            response = requests.get("http://localhost:11235/", timeout=3)
            return response.status_code == 200
        except requests.RequestException:
            # Connection refused / timeout / bad response: service is not up.
            # Anything else (e.g. KeyboardInterrupt) is left to propagate.
            return False

    print("Checking Docker service...")
    docker_running = check_docker_service()

    if not docker_running:
        print("⚠️ Docker service not running on localhost:11235")
        print("To test Docker hooks:")
        print("1. Run: docker run -p 11235:11235 unclecode/crawl4ai:latest")
        print("2. Wait for service to start")
        print("3. Re-run this demo\n")
        return

    print("✓ Docker service detected!")

    # ============================================================================
    # PART 1: Traditional String-Based Hooks (Works with REST API)
    # ============================================================================
    print("\n" + "─" * 60)
    print("Part 1: String-Based Hooks (REST API)")
    print("─" * 60)

    # Hook source is sent as text, so each value must be a complete,
    # self-contained function definition.
    hooks_config_string = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("[String Hook] Setting up page context")
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    return page
""",
        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("[String Hook] Before retrieving HTML")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_config_string,
            "timeout": 30
        }
    }

    print("🔧 Using string-based hooks for REST API...")
    try:
        start_time = time.time()
        response = requests.post("http://localhost:11235/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()
            print(f"✅ String-based hooks executed in {execution_time:.2f}s")
            if result.get('results') and result['results'][0].get('success'):
                html_length = len(result['results'][0].get('html', ''))
                print(f" 📄 HTML length: {html_length} characters")
        else:
            print(f"❌ Request failed: {response.status_code}")
    except Exception as e:
        # Demo boundary: report the failure and continue with Part 2.
        print(f"❌ Error: {str(e)}")

    # ============================================================================
    # PART 2: NEW Function-Based Hooks with Docker Client (v0.7.5)
    # ============================================================================
    print("\n" + "─" * 60)
    print("Part 2: Function-Based Hooks with Docker Client (✨ NEW!)")
    print("─" * 60)

    # Define hooks as regular Python functions
    async def on_page_context_created_func(page, context, **kwargs):
        """Block images to speed up crawling"""
        print("[Function Hook] Setting up page context")
        await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def before_goto_func(page, context, url, **kwargs):
        """Add custom headers before navigation"""
        print(f"[Function Hook] About to navigate to {url}")
        await page.set_extra_http_headers({
            'X-Crawl4AI': 'v0.7.5-function-hooks',
            'X-Test-Header': 'demo'
        })
        return page

    async def before_retrieve_html_func(page, context, **kwargs):
        """Scroll to load lazy content"""
        print("[Function Hook] Scrolling page for lazy-loaded content")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await page.wait_for_timeout(500)
        await page.evaluate("window.scrollTo(0, 0)")
        return page

    # Use the hooks_to_string utility (can be used standalone)
    print("\n📦 Converting functions to strings with hooks_to_string()...")
    hooks_as_strings = hooks_to_string({
        "on_page_context_created": on_page_context_created_func,
        "before_goto": before_goto_func,
        "before_retrieve_html": before_retrieve_html_func
    })
    print(f" ✓ Converted {len(hooks_as_strings)} hooks to string format")

    # OR use Docker Client which does conversion automatically!
    print("\n🐳 Using Docker Client with automatic conversion...")
    try:
        client = Crawl4aiDockerClient(base_url="http://localhost:11235")

        # Pass function objects directly - conversion happens automatically!
        results = await client.crawl(
            urls=["https://httpbin.org/html"],
            hooks={
                "on_page_context_created": on_page_context_created_func,
                "before_goto": before_goto_func,
                "before_retrieve_html": before_retrieve_html_func
            },
            hooks_timeout=30
        )

        if results and results.success:
            print("✅ Function-based hooks executed successfully!")
            print(f" 📄 HTML length: {len(results.html)} characters")
            print(f" 🎯 URL: {results.url}")
        else:
            print("⚠️ Crawl completed but may have warnings")

    except Exception as e:
        # Demo boundary: report the failure and still print the summary below.
        print(f"❌ Docker client error: {str(e)}")

    # Show the benefits
    print("\n" + "=" * 60)
    print("✨ Benefits of Function-Based Hooks:")
    print("=" * 60)
    print("✓ Full IDE support (autocomplete, syntax highlighting)")
    print("✓ Type checking and linting")
    print("✓ Easier to test and debug")
    print("✓ Reusable across projects")
    print("✓ Automatic conversion in Docker client")
    print("=" * 60)
|
||||
|
||||
|
||||
async def demo_2_enhanced_llm_integration():
    """Demo 2: Enhanced LLM Integration - Working LLM configurations.

    Posts a single request to the local ``/md`` endpoint with an explicit
    LLM provider and prints either a short description of the JSON response
    or the failure details. Nothing is returned; all output goes to stdout.
    """
    print_section(
        "Demo 2: Enhanced LLM Integration",
        "Testing custom LLM providers and configurations"
    )

    print("🤖 Testing Enhanced LLM Integration Features")

    provider = "gemini/gemini-2.5-flash-lite"
    payload = {
        "url": "https://example.com",
        "f": "llm",
        "q": "Summarize this page in one sentence.",
        "provider": provider,  # Explicitly set provider
        "temperature": 0.7
    }
    try:
        response = requests.post(
            "http://localhost:11235/md",
            json=payload,
            timeout=60
        )
        if response.status_code == 200:
            result = response.json()
            print(f"✓ Request successful with provider: {provider}")
            print(f" - Response keys: {list(result.keys())}")
            print(f" - Content length: {len(result.get('markdown', ''))} characters")
            print(" - Note: Actual LLM call may fail without valid API key")
        else:
            print(f"❌ Request failed: {response.status_code}")
            print(f" - Response: {response.text[:500]}")

    except Exception as e:
        # Demo boundary: report any failure and let the next demo run.
        # The message goes through plain print(), so console-markup tags
        # such as "[red]" would be shown literally; use the same plain
        # marker as the other error messages in this function.
        print(f"❌ Error: {e}")
|
||||
|
||||
|
||||
async def demo_3_https_preservation():
    """Demo 3: HTTPS Preservation - Live crawling with HTTPS maintenance.

    Runs a streamed BFS deep crawl (depth 2, at most 5 pages) starting at
    quotes.toscrape.com with ``preserve_https_for_internal_links=True`` and
    prints the internal links reported for every crawled page.
    """
    print_section(
        "Demo 3: HTTPS Preservation",
        "Testing HTTPS preservation for internal links"
    )

    print("🔒 Testing HTTPS Preservation Feature")

    # Test with HTTPS preservation enabled
    print("\nTest 1: HTTPS Preservation ENABLED")

    url_filter = URLPatternFilter(
        # Raw string: "\/" and "\." are not valid escapes in a normal string
        # literal and trigger a warning on recent Python versions. The
        # resulting pattern text is unchanged.
        patterns=[r"^(https:\/\/)?quotes\.toscrape\.com(\/.*)?$"]
    )
    config = CrawlerRunConfig(
        exclude_external_links=True,
        stream=True,
        verbose=False,
        preserve_https_for_internal_links=True,
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,
            filter_chain=FilterChain([url_filter])
        )
    )

    test_url = "https://quotes.toscrape.com"
    print(f"🎯 Testing URL: {test_url}")

    async with AsyncWebCrawler() as crawler:
        # With stream=True the awaited call yields results as they complete.
        async for result in await crawler.arun(url=test_url, config=config):
            print("✓ HTTPS Preservation Test Completed")
            internal_links = [i['href'] for i in result.links['internal']]
            for link in internal_links:
                print(f" → {link}")
|
||||
|
||||
|
||||
async def main():
    """Run every demo in order, pausing for Enter between them."""
    rule = "=" * 60

    print("\n" + rule)
    print("🚀 Crawl4AI v0.7.5 Working Demo")
    print(rule)

    # Check system requirements
    print("🔍 System Requirements Check:")
    python_version = sys.version.split()[0]
    if sys.version_info >= (3, 10):
        python_status = '✓'
    else:
        python_status = '❌ (3.10+ required)'
    print(f" - Python version: {python_version} {python_status}")

    try:
        import requests
    except ImportError:
        print(" - Requests library: ❌")
    else:
        print(" - Requests library: ✓")

    print()

    demos = [
        ("Docker Hooks System", demo_1_docker_hooks_system),
        ("Enhanced LLM Integration", demo_2_enhanced_llm_integration),
        ("HTTPS Preservation", demo_3_https_preservation),
    ]
    total = len(demos)

    for index, (title, run_demo) in enumerate(demos, 1):
        try:
            print(f"\n📍 Starting Demo {index}/{total}: {title}")
            await run_demo()

            # No pause after the final demo.
            if index < total:
                print(f"\n✨ Demo {index} complete! Press Enter for next demo...")
                input()

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as err:
            # One failing demo must not stop the remaining ones.
            print(f"❌ Demo {index} error: {err}")
            print("Continuing to next demo...")

    closing_lines = (
        "\n" + rule,
        "🎉 Demo Complete!",
        rule,
        "You've experienced the power of Crawl4AI v0.7.5!",
        "",
        "Key Features Demonstrated:",
        "🔧 Docker Hooks - String-based & function-based (NEW!)",
        " • hooks_to_string() utility for function conversion",
        " • Docker client with automatic conversion",
        " • Full IDE support and type checking",
        "🤖 Enhanced LLM - Better AI integration",
        "🔒 HTTPS Preservation - Secure link handling",
        "",
        "Ready to build something amazing? 🚀",
        "",
        "📖 Docs: https://docs.crawl4ai.com/",
        "🐙 GitHub: https://github.com/unclecode/crawl4ai",
        rule,
    )
    for text in closing_lines:
        print(text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the demo, then drive the async main().
    print("🚀 Crawl4AI v0.7.5 Live Demo Starting...")
    print("Press Ctrl+C anytime to exit\n")

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C outside of a running demo ends the script quietly.
        print("\n👋 Demo stopped by user. Thanks for trying Crawl4AI v0.7.5!")
    except Exception as err:
        print(f"\n❌ Demo error: {err}")
        print("Make sure you have the required dependencies installed.")
|
||||
359
docs/releases_review/demo_v0.7.6.py
Normal file
359
docs/releases_review/demo_v0.7.6.py
Normal file
@@ -0,0 +1,359 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Crawl4AI v0.7.6 Release Demo
|
||||
============================
|
||||
|
||||
This demo showcases the major feature in v0.7.6:
|
||||
**Webhook Support for Docker Job Queue API**
|
||||
|
||||
Features Demonstrated:
|
||||
1. Asynchronous job processing with webhook notifications
|
||||
2. Webhook support for /crawl/job endpoint
|
||||
3. Webhook support for /llm/job endpoint
|
||||
4. Notification-only vs data-in-payload modes
|
||||
5. Custom webhook headers for authentication
|
||||
6. Structured extraction with JSON schemas
|
||||
7. Exponential backoff retry for reliable delivery
|
||||
|
||||
Prerequisites:
|
||||
- Crawl4AI Docker container running on localhost:11235
|
||||
- Flask installed: pip install flask requests
|
||||
- LLM API key configured (for LLM examples)
|
||||
|
||||
Usage:
|
||||
python docs/releases_review/demo_v0.7.6.py
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from flask import Flask, request, jsonify
|
||||
from threading import Thread
|
||||
|
||||
# Configuration
# Base URL of the Crawl4AI server that receives the job submissions.
CRAWL4AI_BASE_URL = "http://localhost:11235"
# Base URL under which this script's own webhook receiver is advertised.
WEBHOOK_BASE_URL = "http://localhost:8080"

# Flask app for webhook receiver
app = Flask(__name__)
# Every payload accepted by the /webhook route, in arrival order.
received_webhooks = []
|
||||
|
||||
|
||||
@app.route('/webhook', methods=['POST'])
def webhook_handler():
    """Universal webhook handler for both crawl and LLM extraction jobs.

    Prints a summary of the notification, stores the payload in
    ``received_webhooks`` and acknowledges it.

    Returns:
        ``({"status": "received"}, 200)`` for a well-formed notification, or
        a 400 response when the body is not a JSON object or lacks one of
        ``task_id``, ``task_type`` or ``status``.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({"status": "error", "error": "expected a JSON object"}), 400

    missing = [key for key in ('task_id', 'task_type', 'status') if key not in payload]
    if missing:
        # Reject malformed payloads as a client error rather than failing
        # with an unhandled KeyError (which would surface as a 500).
        return jsonify({"status": "error", "error": f"missing fields: {missing}"}), 400

    task_id = payload['task_id']
    task_type = payload['task_type']
    status = payload['status']

    print(f"\n{'='*70}")
    print("📬 Webhook Received!")
    print(f" Task ID: {task_id}")
    print(f" Task Type: {task_type}")
    print(f" Status: {status}")
    print(f" Timestamp: {payload.get('timestamp', 'N/A')}")

    if status == 'completed':
        if 'data' in payload:
            print(" ✅ Data included in webhook")
            if task_type == 'crawl':
                results = payload['data'].get('results', [])
                print(f" 📊 Crawled {len(results)} URL(s)")
            elif task_type == 'llm_extraction':
                extracted = payload['data'].get('extracted_content', {})
                print(f" 🤖 Extracted: {json.dumps(extracted, indent=6)}")
        else:
            print(" 📥 Notification only (fetch data separately)")
    elif status == 'failed':
        print(f" ❌ Error: {payload.get('error', 'Unknown')}")

    print(f"{'='*70}\n")
    received_webhooks.append(payload)

    return jsonify({"status": "received"}), 200
|
||||
|
||||
|
||||
def start_webhook_server():
    """Run the Flask webhook receiver; blocks, so call it from a thread."""
    server_options = {
        "host": '0.0.0.0',
        "port": 8080,
        "debug": False,
        # The reloader must stay off when running outside the main thread.
        "use_reloader": False,
    }
    app.run(**server_options)
|
||||
|
||||
|
||||
def demo_1_crawl_webhook_notification_only():
    """Demo 1: Crawl job with webhook notification (data fetched separately).

    Returns:
        The ``task_id`` reported by the server, or ``None`` when the job
        could not be submitted.
    """
    print("\n" + "="*70)
    print("DEMO 1: Crawl Job - Webhook Notification Only")
    print("="*70)
    print("Submitting crawl job with webhook notification...")

    payload = {
        "urls": ["https://example.com"],
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": False,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "crawl"
            }
        }
    }

    try:
        # A timeout keeps the demo from hanging on an unresponsive server.
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload, timeout=30
        )
    except requests.RequestException as exc:
        print(f"❌ Failed: {exc}")
        return None

    if not response.ok:
        print(f"❌ Failed: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f"✅ Job submitted: {task_id}")
    print("⏳ Webhook will notify when complete...")
    return task_id
|
||||
|
||||
|
||||
def demo_2_crawl_webhook_with_data():
    """Demo 2: Crawl job with full data in webhook payload.

    Returns:
        The ``task_id`` reported by the server, or ``None`` when the job
        could not be submitted.
    """
    print("\n" + "="*70)
    print("DEMO 2: Crawl Job - Webhook with Full Data")
    print("="*70)
    print("Submitting crawl job with data included in webhook...")

    payload = {
        "urls": ["https://www.python.org"],
        "browser_config": {"headless": True},
        "crawler_config": {"cache_mode": "bypass"},
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "crawl-with-data"
            }
        }
    }

    try:
        # A timeout keeps the demo from hanging on an unresponsive server.
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl/job", json=payload, timeout=30
        )
    except requests.RequestException as exc:
        print(f"❌ Failed: {exc}")
        return None

    if not response.ok:
        print(f"❌ Failed: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f"✅ Job submitted: {task_id}")
    print("⏳ Webhook will include full results...")
    return task_id
|
||||
|
||||
|
||||
def demo_3_llm_webhook_notification_only():
    """Demo 3: LLM extraction with webhook notification (NEW in v0.7.6!).

    Returns:
        The ``task_id`` reported by the server, or ``None`` when the job
        could not be submitted.
    """
    print("\n" + "="*70)
    print("DEMO 3: LLM Extraction - Webhook Notification Only (NEW!)")
    print("="*70)
    print("Submitting LLM extraction job with webhook notification...")

    payload = {
        "url": "https://www.example.com",
        "q": "Extract the main heading and description from this page",
        "provider": "openai/gpt-4o-mini",
        "cache": False,
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": False,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "llm"
            }
        }
    }

    try:
        # A timeout keeps the demo from hanging on an unresponsive server.
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/llm/job", json=payload, timeout=30
        )
    except requests.RequestException as exc:
        print(f"❌ Failed: {exc}")
        return None

    if not response.ok:
        print(f"❌ Failed: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f"✅ Job submitted: {task_id}")
    print("⏳ Webhook will notify when LLM extraction completes...")
    return task_id
|
||||
|
||||
|
||||
def demo_4_llm_webhook_with_schema():
    """Demo 4: LLM extraction with JSON schema and data in webhook (NEW in v0.7.6!).

    Returns:
        The ``task_id`` reported by the server, or ``None`` when the job
        could not be submitted.
    """
    print("\n" + "="*70)
    print("DEMO 4: LLM Extraction - Schema + Full Data in Webhook (NEW!)")
    print("="*70)
    print("Submitting LLM extraction with JSON schema...")

    schema = {
        "type": "object",
        "properties": {
            "title": {"type": "string", "description": "Page title"},
            "description": {"type": "string", "description": "Page description"},
            "main_topics": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Main topics covered"
            }
        },
        "required": ["title"]
    }

    payload = {
        "url": "https://www.python.org",
        "q": "Extract the title, description, and main topics from this website",
        # The schema is sent as a JSON-encoded string, not a nested object.
        "schema": json.dumps(schema),
        "provider": "openai/gpt-4o-mini",
        "cache": False,
        "webhook_config": {
            "webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {
                "X-Demo": "v0.7.6",
                "X-Type": "llm-with-schema"
            }
        }
    }

    try:
        # A timeout keeps the demo from hanging on an unresponsive server.
        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/llm/job", json=payload, timeout=30
        )
    except requests.RequestException as exc:
        print(f"❌ Failed: {exc}")
        return None

    if not response.ok:
        print(f"❌ Failed: {response.text}")
        return None

    task_id = response.json()['task_id']
    print(f"✅ Job submitted: {task_id}")
    print("⏳ Webhook will include structured extraction results...")
    return task_id
|
||||
|
||||
|
||||
def demo_5_global_webhook_config():
    """Demo 5: Using global webhook configuration from config.yml.

    Purely informational: prints a sample ``webhooks`` section and explains
    that jobs submitted without ``webhook_config`` use the default URL.
    """
    print("\n" + "="*70)
    print("DEMO 5: Global Webhook Configuration")
    print("="*70)
    print("💡 You can configure a default webhook URL in config.yml:")
    # The sample is nested YAML; the indentation is significant, so the
    # retry settings must sit under "retry:" and everything under "webhooks:".
    print("""
 webhooks:
   enabled: true
   default_url: "https://myapp.com/webhooks/default"
   data_in_payload: false
   retry:
     max_attempts: 5
     initial_delay_ms: 1000
     max_delay_ms: 32000
     timeout_ms: 30000
 """)
    print("Then submit jobs WITHOUT webhook_config - they'll use the default!")
    print("This is useful for consistent webhook handling across all jobs.")
|
||||
|
||||
|
||||
def demo_6_webhook_retry_logic():
    """Demo 6: Describe the webhook retry policy (exponential backoff)."""
    rule = "=" * 70
    messages = (
        "\n" + rule,
        "DEMO 6: Webhook Retry Logic",
        rule,
        "🔄 Webhook delivery uses exponential backoff retry:",
        " • Max attempts: 5",
        " • Delays: 1s → 2s → 4s → 8s → 16s",
        " • Timeout: 30s per attempt",
        " • Retries on: 5xx errors, network errors, timeouts",
        " • No retry on: 4xx client errors",
        "\nThis ensures reliable webhook delivery even with temporary failures!",
    )
    # One print per entry keeps the output identical to separate calls.
    for message in messages:
        print(message)
|
||||
|
||||
|
||||
def print_summary():
    """Print how many webhooks arrived, per job type, plus the feature recap."""
    rule = "=" * 70

    print("\n" + rule)
    print("📊 DEMO SUMMARY")
    print(rule)
    print(f"Total webhooks received: {len(received_webhooks)}")

    # Tally the stored payloads by their reported task type.
    crawl_count = 0
    llm_count = 0
    for entry in received_webhooks:
        kind = entry['task_type']
        if kind == 'crawl':
            crawl_count += 1
        elif kind == 'llm_extraction':
            llm_count += 1

    print("\nBreakdown:")
    print(f" 🕷️ Crawl jobs: {crawl_count}")
    print(f" 🤖 LLM extraction jobs: {llm_count}")

    print("\nDetails:")
    for position, entry in enumerate(received_webhooks, 1):
        if entry['task_type'] == 'crawl':
            marker = "🕷️"
        else:
            marker = "🤖"
        print(f" {position}. {marker} {entry['task_id']}: {entry['status']}")

    recap = (
        "\n" + rule,
        "✨ v0.7.6 KEY FEATURES DEMONSTRATED:",
        rule,
        "✅ Webhook support for /crawl/job",
        "✅ Webhook support for /llm/job (NEW!)",
        "✅ Notification-only mode (fetch data separately)",
        "✅ Data-in-payload mode (get full results in webhook)",
        "✅ Custom headers for authentication",
        "✅ JSON schema for structured LLM extraction",
        "✅ Exponential backoff retry for reliable delivery",
        "✅ Global webhook configuration support",
        "✅ Universal webhook handler for both job types",
        "\n💡 Benefits:",
        " • No more polling - get instant notifications",
        " • Better resource utilization",
        " • Reliable delivery with automatic retries",
        " • Consistent API across crawl and LLM jobs",
        " • Production-ready webhook infrastructure",
    )
    for text in recap:
        print(text)
|
||||
|
||||
|
||||
def main():
    """Run all demos.

    Verifies that the Crawl4AI server is reachable, starts the local webhook
    receiver in a daemon thread, submits the demo jobs, waits for their
    webhooks to arrive and prints a summary. Returns early (after printing
    start-up instructions) when the server cannot be reached.
    """
    print("\n" + "="*70)
    print("🚀 Crawl4AI v0.7.6 Release Demo")
    print("="*70)
    print("Feature: Webhook Support for Docker Job Queue API")
    print("="*70)

    # Check if server is running
    try:
        requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
    except requests.RequestException:
        # Only network-level failures mean "not running"; a bare except
        # would also swallow Ctrl+C and SystemExit.
        print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
        print("Please start Docker container:")
        print(" docker run -d -p 11235:11235 --env-file .llm.env unclecode/crawl4ai:0.7.6")
        return
    print("✅ Crawl4AI server is running")

    # Start webhook server
    print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
    webhook_thread = Thread(target=start_webhook_server, daemon=True)
    webhook_thread.start()
    time.sleep(2)  # give Flask a moment to start listening

    # Run demos
    demo_1_crawl_webhook_notification_only()
    time.sleep(5)

    demo_2_crawl_webhook_with_data()
    time.sleep(5)

    demo_3_llm_webhook_notification_only()
    time.sleep(5)

    demo_4_llm_webhook_with_schema()
    time.sleep(5)

    demo_5_global_webhook_config()
    demo_6_webhook_retry_logic()

    # Wait for webhooks
    print("\n⏳ Waiting for all webhooks to arrive...")
    time.sleep(30)

    # Print summary
    print_summary()

    print("\n" + "="*70)
    print("✅ Demo completed!")
    print("="*70)
    print("\n📚 Documentation:")
    print(" • deploy/docker/WEBHOOK_EXAMPLES.md")
    print(" • docs/examples/docker_webhook_example.py")
    print("\n🔗 Upgrade:")
    print(" docker pull unclecode/crawl4ai:0.7.6")
|
||||
|
||||
|
||||
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
655
docs/releases_review/v0.7.5_docker_hooks_demo.py
Normal file
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.5 - Docker Hooks System Complete Demonstration
|
||||
================================================================
|
||||
|
||||
This file demonstrates the NEW Docker Hooks System introduced in v0.7.5.
|
||||
|
||||
The Docker Hooks System is a completely NEW feature that provides pipeline
|
||||
customization through user-provided Python functions. It offers three approaches:
|
||||
|
||||
1. String-based hooks for REST API
|
||||
2. hooks_to_string() utility to convert functions
|
||||
3. Docker Client with automatic conversion (most convenient)
|
||||
|
||||
All three approaches are part of this NEW v0.7.5 feature!
|
||||
|
||||
Perfect for video recording and demonstration purposes.
|
||||
|
||||
Requirements:
|
||||
- Docker container running: docker run -p 11235:11235 unclecode/crawl4ai:latest
|
||||
- crawl4ai v0.7.5 installed: pip install crawl4ai==0.7.5
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
# Import Crawl4AI components
|
||||
from crawl4ai import hooks_to_string
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
# Configuration
# Base URL of the Crawl4AI Docker service used by every demo below.
DOCKER_URL = "http://localhost:11235"
# DOCKER_URL = "http://localhost:11234"
# Crawl targets; the demos pick entries from this list by index.
TEST_URLS = [
    # "https://httpbin.org/html",
    "https://www.kidocode.com",
    "https://quotes.toscrape.com",
]
|
||||
|
||||
|
||||
def print_section(title: str, description: str = ""):
    """Print a banner with *title* and, when given, *description*."""
    rule = "=" * 70
    body = [f" {title}"]
    # The description line is omitted entirely when it is empty.
    if description:
        body.append(f" {description}")

    print("\n" + rule)
    for text in body:
        print(text)
    print(rule + "\n")
|
||||
|
||||
|
||||
def check_docker_service() -> bool:
    """Check if Docker service is running.

    Returns:
        ``True`` when ``GET {DOCKER_URL}/health`` answers with HTTP 200
        within 3 seconds, ``False`` on any other status or request failure.
    """
    try:
        response = requests.get(f"{DOCKER_URL}/health", timeout=3)
    except requests.RequestException:
        # Connection errors and timeouts mean "not running". A bare except
        # would also hide KeyboardInterrupt and programming errors.
        return False
    return response.status_code == 200
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# REUSABLE HOOK LIBRARY (NEW in v0.7.5)
|
||||
# ============================================================================
|
||||
|
||||
async def performance_optimization_hook(page, context, **kwargs):
    """
    Performance Hook: abort requests for images, ads and analytics.

    Registers abort handlers on *context* and returns *page* unchanged.
    """
    print(" [Hook] 🚀 Optimizing performance - blocking images and ads...")

    def _abort(route):
        return route.abort()

    blocked_patterns = (
        # Images
        "**/*.{png,jpg,jpeg,gif,webp,svg,ico}",
        # Ads and analytics
        "**/analytics/*",
        "**/ads/*",
        "**/google-analytics.com/*",
    )
    for pattern in blocked_patterns:
        await context.route(pattern, _abort)

    print(" [Hook] ✓ Performance optimization applied")
    return page
|
||||
|
||||
|
||||
async def viewport_setup_hook(page, context, **kwargs):
    """
    Viewport Hook: resize the page to 1920x1080 and return it.
    """
    print(" [Hook] 🖥️ Setting viewport to 1920x1080...")
    viewport = {"width": 1920, "height": 1080}
    await page.set_viewport_size(viewport)
    print(" [Hook] ✓ Viewport configured")
    return page
|
||||
|
||||
|
||||
async def authentication_headers_hook(page, context, url, **kwargs):
    """
    Headers Hook: attach the demo's extra HTTP headers to *page*.

    Only the first 50 characters of *url* are echoed in the log line.
    """
    print(f" [Hook] 🔐 Adding custom headers for {url[:50]}...")

    extra_headers = {
        'X-Crawl4AI-Version': '0.7.5',
        'X-Custom-Hook': 'function-based-demo',
        'Accept-Language': 'en-US,en;q=0.9',
        'User-Agent': 'Crawl4AI/0.7.5 (Educational Demo)',
    }
    await page.set_extra_http_headers(extra_headers)

    print(" [Hook] ✓ Custom headers added")
    return page
|
||||
|
||||
|
||||
async def lazy_loading_handler_hook(page, context, **kwargs):
    """
    Content Hook: scroll through the page so lazy-loaded content appears.

    Scrolls to the bottom, the middle and back to the top, pausing after
    each step, then returns *page*.
    """
    print(" [Hook] 📜 Scrolling to load lazy content...")

    # (script to run, pause in milliseconds afterwards)
    scroll_plan = (
        ("window.scrollTo(0, document.body.scrollHeight)", 1000),      # bottom
        ("window.scrollTo(0, document.body.scrollHeight / 2)", 500),   # middle
        ("window.scrollTo(0, 0)", 500),                                # top
    )
    for script, pause_ms in scroll_plan:
        await page.evaluate(script)
        await page.wait_for_timeout(pause_ms)

    print(" [Hook] ✓ Lazy content loaded")
    return page
|
||||
|
||||
|
||||
async def page_analytics_hook(page, context, **kwargs):
    """
    Analytics Hook: evaluate a small script in the page and log its counts.

    Prints the (truncated) title plus link, image, heading and paragraph
    counts, then returns *page*.
    """
    print(" [Hook] 📊 Collecting page analytics...")

    collect_metrics_js = '''
() => ({
title: document.title,
images: document.images.length,
links: document.links.length,
scripts: document.scripts.length,
headings: document.querySelectorAll('h1, h2, h3').length,
paragraphs: document.querySelectorAll('p').length
})
'''
    metrics = await page.evaluate(collect_metrics_js)

    short_title = metrics['title'][:50]
    print(f" [Hook] 📈 Page: {short_title}...")
    counts_line = (
        f" Links: {metrics['links']}, Images: {metrics['images']}, "
        f"Headings: {metrics['headings']}, Paragraphs: {metrics['paragraphs']}"
    )
    print(counts_line)

    return page
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DEMO 1: String-Based Hooks (NEW Docker Hooks System)
|
||||
# ============================================================================
|
||||
|
||||
def demo_1_string_based_hooks():
    """
    Demonstrate string-based hooks with REST API (part of NEW Docker Hooks System)

    Sends three hooks, written as Python source strings, together with the
    first test URL to ``POST {DOCKER_URL}/crawl`` and prints the crawl result
    and whatever hook-execution information the response contains.
    """
    print_section(
        "DEMO 1: String-Based Hooks (REST API)",
        "Part of the NEW Docker Hooks System - hooks as strings"
    )

    # Define hooks as strings. Each string is Python source, so the function
    # bodies must be indented for the code to be valid.
    hooks_config = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Setting up page context...")
    # Block images for performance
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f" [String Hook] Navigating to {url[:50]}...")
    await page.set_extra_http_headers({
        'X-Crawl4AI': 'string-based-hooks',
        'X-Demo': 'v0.7.5'
    })
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print(" [String Hook] Scrolling page...")
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    return page
"""
    }

    # Prepare request payload
    payload = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_config,
            "timeout": 30
        },
        "crawler_config": {
            "cache_mode": "bypass"
        }
    }

    print(f"🎯 Target URL: {TEST_URLS[0]}")
    print(f"🔧 Configured {len(hooks_config)} string-based hooks")
    print("📡 Sending request to Docker API...\n")

    try:
        start_time = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=payload, timeout=60)
        execution_time = time.time() - start_time

        if response.status_code == 200:
            result = response.json()

            print(f"\n✅ Request successful! (took {execution_time:.2f}s)")

            # Display results
            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                html_length = len(crawl_result.get('html', ''))
                markdown_length = len(crawl_result.get('markdown', ''))

                print("\n📊 Results:")
                print(f" • HTML length: {html_length:,} characters")
                print(f" • Markdown length: {markdown_length:,} characters")
                print(f" • URL: {crawl_result.get('url')}")

                # Check hooks execution
                if 'hooks' in result:
                    hooks_info = result['hooks']
                    print("\n🎣 Hooks Execution:")
                    print(f" • Status: {hooks_info['status']['status']}")
                    print(f" • Attached hooks: {len(hooks_info['status']['attached_hooks'])}")

                    if 'summary' in hooks_info:
                        summary = hooks_info['summary']
                        print(f" • Total executions: {summary['total_executions']}")
                        print(f" • Successful: {summary['successful']}")
                        print(f" • Success rate: {summary['success_rate']:.1f}%")
            else:
                print("⚠️ Crawl completed but no results")

        else:
            print(f"❌ Request failed with status {response.status_code}")
            print(f" Error: {response.text[:200]}")

    except requests.exceptions.Timeout:
        print("⏰ Request timed out after 60 seconds")
    except Exception as e:
        # Demo boundary: report the problem and finish the demo normally.
        print(f"❌ Error: {str(e)}")

    print("\n" + "─" * 70)
    print("✓ String-based hooks demo complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DEMO 2: Function-Based Hooks with hooks_to_string() Utility
|
||||
# ============================================================================
|
||||
|
||||
def demo_2_hooks_to_string_utility():
    """
    Show hooks_to_string(): convert hook functions to source strings, preview
    one of them, and submit the converted hooks through the REST API.
    """
    divider = "─" * 70

    print_section(
        "DEMO 2: hooks_to_string() Utility (NEW! ✨)",
        "Convert Python functions to strings for REST API"
    )

    print("📦 Creating hook functions...")
    for hook_name in (
        "performance_optimization_hook",
        "viewport_setup_hook",
        "authentication_headers_hook",
        "lazy_loading_handler_hook",
    ):
        print(f" • {hook_name}")

    # Convert function objects to strings using the NEW utility
    print("\n🔄 Converting functions to strings with hooks_to_string()...")

    hooks_as_strings = hooks_to_string({
        "on_page_context_created": performance_optimization_hook,
        "before_goto": authentication_headers_hook,
        "before_retrieve_html": lazy_loading_handler_hook,
    })

    print(f"✅ Successfully converted {len(hooks_as_strings)} functions to strings")

    # Show a preview of the first converted hook
    print("\n📝 Sample converted hook (first 250 characters):")
    print(divider)
    first_hook_source = list(hooks_as_strings.values())[0]
    print(first_hook_source[:250] + "...")
    print(divider)

    # Use the converted hooks with REST API
    print("\n📡 Using converted hooks with REST API...")

    request_body = {
        "urls": [TEST_URLS[0]],
        "hooks": {
            "code": hooks_as_strings,
            "timeout": 30
        }
    }

    try:
        started = time.time()
        response = requests.post(f"{DOCKER_URL}/crawl", json=request_body, timeout=60)
        elapsed = time.time() - started

        if response.status_code != 200:
            print(f"❌ Request failed: {response.status_code}")
        else:
            result = response.json()
            print(f"\n✅ Request successful! (took {elapsed:.2f}s)")

            if result.get('results') and result['results'][0].get('success'):
                crawl_result = result['results'][0]
                print(f" • HTML length: {len(crawl_result.get('html', '')):,} characters")
                print(" • Hooks executed successfully!")

    except Exception as e:
        print(f"❌ Error: {str(e)}")

    print("\n💡 Benefits of hooks_to_string():")
    for benefit in (
        "Write hooks as regular Python functions",
        "Full IDE support (autocomplete, syntax highlighting)",
        "Type checking and linting",
        "Easy to test and debug",
        "Reusable across projects",
        "Works with any REST API client",
    ):
        print(f" ✓ {benefit}")

    print("\n" + divider)
    print("✓ hooks_to_string() utility demo complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DEMO 3: Docker Client with Automatic Conversion (RECOMMENDED! 🌟)
|
||||
# ============================================================================
|
||||
|
||||
async def demo_3_docker_client_auto_conversion():
    """
    Demonstrate Docker Client with automatic hook conversion (RECOMMENDED)

    Passes hook function objects straight to ``Crawl4aiDockerClient.crawl``
    and prints the resulting page statistics, metadata and link counts.
    """
    print_section(
        "DEMO 3: Docker Client with Auto-Conversion (RECOMMENDED! 🌟)",
        "Pass function objects directly - conversion happens automatically!"
    )

    print("🐳 Initializing Crawl4AI Docker Client...")
    client = Crawl4aiDockerClient(base_url=DOCKER_URL)

    print("✅ Client ready!\n")

    # Use our reusable hook library - just pass the function objects!
    print("📚 Using reusable hook library:")
    print(" • performance_optimization_hook")
    print(" • viewport_setup_hook")
    print(" • authentication_headers_hook")
    print(" • lazy_loading_handler_hook")
    print(" • page_analytics_hook")

    # One variable for both the announcement and the crawl, so the printed
    # target always matches the URL that is actually requested.
    target_url = TEST_URLS[0]
    print("\n🎯 Target URL: " + target_url)
    print("🚀 Starting crawl with automatic hook conversion...\n")

    try:
        start_time = time.time()

        # Pass function objects directly - NO manual conversion needed! ✨
        results = await client.crawl(
            urls=[target_url],
            hooks={
                "on_page_context_created": performance_optimization_hook,
                "before_goto": authentication_headers_hook,
                "before_retrieve_html": lazy_loading_handler_hook,
                "before_return_html": page_analytics_hook,
            },
            hooks_timeout=30
        )

        execution_time = time.time() - start_time

        print(f"\n✅ Crawl completed! (took {execution_time:.2f}s)\n")

        # Display results
        if results and results.success:
            result = results
            print("📊 Results:")
            print(f" • URL: {result.url}")
            print(f" • Success: {result.success}")
            print(f" • HTML length: {len(result.html):,} characters")
            print(f" • Markdown length: {len(result.markdown):,} characters")

            # Show metadata
            if result.metadata:
                print("\n📋 Metadata:")
                print(f" • Title: {result.metadata.get('title', 'N/A')}")
                print(f" • Description: {result.metadata.get('description', 'N/A')}")

            # Show links
            if result.links:
                internal_count = len(result.links.get('internal', []))
                external_count = len(result.links.get('external', []))
                print("\n🔗 Links Found:")
                print(f" • Internal: {internal_count}")
                print(f" • External: {external_count}")
        else:
            print("⚠️ Crawl completed but no successful results")
            if results:
                print(f" Error: {results.error_message}")

    except Exception as e:
        # Demo boundary: show the full traceback but keep the script going.
        print(f"❌ Error: {str(e)}")
        import traceback
        traceback.print_exc()

    print("\n🌟 Why Docker Client is RECOMMENDED:")
    print(" ✓ Automatic function-to-string conversion")
    print(" ✓ No manual hooks_to_string() calls needed")
    print(" ✓ Cleaner, more Pythonic code")
    print(" ✓ Full type hints and IDE support")
    print(" ✓ Built-in error handling")
    print(" ✓ Async/await support")

    print("\n" + "─" * 70)
    print("✓ Docker Client auto-conversion demo complete\n")
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# DEMO 4: Advanced Use Case - Complete Hook Pipeline
|
||||
# ============================================================================
|
||||
|
||||
async def demo_4_complete_hook_pipeline():
    """
    Run a single crawl with a hook registered at each of the 8 hook points.

    Each hook prints a "[Pipeline] k/8" progress line and returns the object
    it received (the browser for hook 1, the page for the others). The crawl
    is sent through Crawl4aiDockerClient at DOCKER_URL against TEST_URLS[0];
    the outcome and a reference list of the hook points are printed.
    Exceptions raised by the crawl are reported, not propagated.
    """
    print_section(
        "DEMO 4: Complete Hook Pipeline",
        "Using all 8 available hook points for comprehensive control",
    )

    # The hook functions below are kept exactly as written: the Docker client
    # converts function objects to strings before sending them, so their
    # names and bodies are part of what is transmitted.
    async def on_browser_created_hook(browser, **kwargs):
        """Hook 1: Called after browser is created"""
        print(" [Pipeline] 1/8 Browser created")
        return browser

    async def on_page_context_created_hook(page, context, **kwargs):
        """Hook 2: Called after page context is created"""
        print(" [Pipeline] 2/8 Page context created - setting up...")
        await page.set_viewport_size({"width": 1920, "height": 1080})
        return page

    async def on_user_agent_updated_hook(page, context, user_agent, **kwargs):
        """Hook 3: Called when user agent is updated"""
        print(f" [Pipeline] 3/8 User agent updated: {user_agent[:50]}...")
        return page

    async def before_goto_hook(page, context, url, **kwargs):
        """Hook 4: Called before navigating to URL"""
        print(f" [Pipeline] 4/8 Before navigation to: {url[:60]}...")
        return page

    async def after_goto_hook(page, context, url, response, **kwargs):
        """Hook 5: Called after navigation completes"""
        print(f" [Pipeline] 5/8 After navigation - Status: {response.status if response else 'N/A'}")
        await page.wait_for_timeout(1000)
        return page

    async def on_execution_started_hook(page, context, **kwargs):
        """Hook 6: Called when JavaScript execution starts"""
        print(" [Pipeline] 6/8 JavaScript execution started")
        return page

    async def before_retrieve_html_hook(page, context, **kwargs):
        """Hook 7: Called before retrieving HTML"""
        print(" [Pipeline] 7/8 Before HTML retrieval - scrolling...")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        return page

    async def before_return_html_hook(page, context, html, **kwargs):
        """Hook 8: Called before returning HTML"""
        print(f" [Pipeline] 8/8 Before return - HTML length: {len(html):,} chars")
        return page

    # Hook-point name -> hook function, in pipeline order.
    pipeline_hooks = {
        "on_browser_created": on_browser_created_hook,
        "on_page_context_created": on_page_context_created_hook,
        "on_user_agent_updated": on_user_agent_updated_hook,
        "before_goto": before_goto_hook,
        "after_goto": after_goto_hook,
        "on_execution_started": on_execution_started_hook,
        "before_retrieve_html": before_retrieve_html_hook,
        "before_return_html": before_return_html_hook,
    }

    target_url = TEST_URLS[0]
    print("🎯 Target URL: " + target_url)
    print("🔧 Configured ALL 8 hook points for complete pipeline control\n")

    client = Crawl4aiDockerClient(base_url=DOCKER_URL)

    try:
        print("🚀 Starting complete pipeline crawl...\n")
        started_at = time.time()

        outcome = await client.crawl(
            urls=[target_url],
            hooks=pipeline_hooks,
            hooks_timeout=45,
        )

        elapsed = time.time() - started_at

        if not (outcome and outcome.success):
            print("⚠️ Pipeline completed with warnings")
        else:
            print(f"\n✅ Complete pipeline executed successfully! (took {elapsed:.2f}s)")
            print(" • All 8 hooks executed in sequence")
            print(f" • HTML length: {len(outcome.html):,} characters")

    except Exception as exc:
        # Best-effort demo: report the failure and still print the reference list.
        print(f"❌ Error: {exc!s}")

    for line in (
        "\n📚 Available Hook Points:",
        " 1. on_browser_created - Browser initialization",
        " 2. on_page_context_created - Page context setup",
        " 3. on_user_agent_updated - User agent configuration",
        " 4. before_goto - Pre-navigation setup",
        " 5. after_goto - Post-navigation processing",
        " 6. on_execution_started - JavaScript execution start",
        " 7. before_retrieve_html - Pre-extraction processing",
        " 8. before_return_html - Final HTML processing",
        "\n" + "─" * 70,
        "✓ Complete hook pipeline demo complete\n",
    ):
        print(line)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# MAIN EXECUTION
|
||||
# ============================================================================
|
||||
|
||||
async def main():
    """
    Run the scheduled demonstrations in order and print a closing summary.

    Returns early (after printing start-up instructions) when
    check_docker_service() reports that the service is not reachable.
    A failing demo is reported with its traceback and the run continues;
    KeyboardInterrupt stops the loop but the summary is still printed.
    """
    rule = "=" * 70

    print("\n" + rule)
    print(" 🚀 Crawl4AI v0.7.5 - Docker Hooks Complete Demonstration")
    print(rule)

    # Nothing below can work without the Docker service, so check it first.
    print("\n🔍 Checking Docker service status...")
    if not check_docker_service():
        for line in (
            "❌ Docker service is not running!",
            "\n📋 To start the Docker service:",
            " docker run -p 11235:11235 unclecode/crawl4ai:latest",
            "\nPlease start the service and run this demo again.",
        ):
            print(line)
        return

    print("✅ Docker service is running!\n")

    # (title, callable, needs-await). demo_4_complete_hook_pipeline is defined
    # in this file but is not scheduled here.
    demos = [
        ("String-Based Hooks (REST API)", demo_1_string_based_hooks, False),
        ("hooks_to_string() Utility", demo_2_hooks_to_string_utility, False),
        ("Docker Client Auto-Conversion", demo_3_docker_client_auto_conversion, True),
    ]
    total = len(demos)
    banner = '🔷' * 35

    for position, (title, runner, needs_await) in enumerate(demos, 1):
        print(f"\n{banner}")
        print(f"Starting Demo {position}/{total}: {title}")
        print(f"{banner}\n")

        try:
            pending = runner()
            if needs_await:
                await pending

            print(f"✅ Demo {position} completed successfully!")

            # Shown between demos only; the prompt is printed but nothing
            # waits for input (the input() call is disabled).
            if position < total:
                print("\n⏸️ Press Enter to continue to next demo...")

        except KeyboardInterrupt:
            print("\n⏹️ Demo interrupted by user")
            break
        except Exception as exc:
            print(f"\n❌ Demo {position} failed: {exc!s}")
            import traceback
            traceback.print_exc()
            print("\nContinuing to next demo...\n")

    # Final summary
    for line in (
        "\n" + rule,
        " 🎉 All Demonstrations Complete!",
        rule,
        "\n📊 Summary of v0.7.5 Docker Hooks System:",
        "\n🆕 COMPLETELY NEW FEATURE in v0.7.5:",
        " The Docker Hooks System lets you customize the crawling pipeline",
        " with user-provided Python functions at 8 strategic points.",
        "\n✨ Three Ways to Use Docker Hooks (All NEW!):",
        " 1. String-based - Write hooks as strings for REST API",
        " 2. hooks_to_string() - Convert Python functions to strings",
        " 3. Docker Client - Automatic conversion (RECOMMENDED)",
        "\n💡 Key Benefits:",
        " ✓ Full IDE support (autocomplete, syntax highlighting)",
        " ✓ Type checking and linting",
        " ✓ Easy to test and debug",
        " ✓ Reusable across projects",
        " ✓ Complete pipeline control",
        "\n🎯 8 Hook Points Available:",
        " • on_browser_created, on_page_context_created",
        " • on_user_agent_updated, before_goto, after_goto",
        " • on_execution_started, before_retrieve_html, before_return_html",
        "\n📚 Resources:",
        " • Docs: https://docs.crawl4ai.com",
        " • GitHub: https://github.com/unclecode/crawl4ai",
        " • Discord: https://discord.gg/jP8KfhDhyN",
        "\n" + rule,
        " Happy Crawling with v0.7.5! 🕷️",
        rule + "\n",
    ):
        print(line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the demo, then drive the async main().
    for line in (
        "\n🎬 Starting Crawl4AI v0.7.5 Docker Hooks Demonstration...",
        "Press Ctrl+C anytime to exit\n",
    ):
        print(line)

    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is an expected way to leave the demo; exit quietly.
        print("\n\n👋 Demo stopped by user. Thanks for exploring Crawl4AI v0.7.5!")
    except Exception as exc:
        print(f"\n\n❌ Demo error: {exc!s}")
        import traceback
        traceback.print_exc()
|
||||
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
1516
docs/releases_review/v0.7.5_video_walkthrough.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
12
mkdocs.yml
12
mkdocs.yml
@@ -1,5 +1,4 @@
|
||||
site_name: Crawl4AI Documentation (v0.7.x)
|
||||
site_favicon: docs/md_v2/favicon.ico
|
||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
repo_url: https://github.com/unclecode/crawl4ai
|
||||
@@ -8,6 +7,7 @@ docs_dir: docs/md_v2
|
||||
|
||||
nav:
|
||||
- Home: 'index.md'
|
||||
- "📚 Complete SDK Reference": "complete-sdk-reference.md"
|
||||
- "Ask AI": "core/ask-ai.md"
|
||||
- "Quick Start": "core/quickstart.md"
|
||||
- "Code Examples": "core/examples.md"
|
||||
@@ -15,6 +15,8 @@ nav:
|
||||
- "Demo Apps": "apps/index.md"
|
||||
- "C4A-Script Editor": "apps/c4a-script/index.html"
|
||||
- "LLM Context Builder": "apps/llmtxt/index.html"
|
||||
- "Marketplace": "marketplace/index.html"
|
||||
- "Marketplace Admin": "marketplace/admin/index.html"
|
||||
- Setup & Installation:
|
||||
- "Installation": "core/installation.md"
|
||||
- "Docker Deployment": "core/docker-deployment.md"
|
||||
@@ -66,10 +68,12 @@ nav:
|
||||
- "CrawlResult": "api/crawl-result.md"
|
||||
- "Strategies": "api/strategies.md"
|
||||
- "C4A-Script Reference": "api/c4a-script-reference.md"
|
||||
- "Brand Book": "branding/index.md"
|
||||
|
||||
theme:
|
||||
name: 'terminal'
|
||||
palette: 'dark'
|
||||
favicon: favicon.ico
|
||||
custom_dir: docs/md_v2/overrides
|
||||
color_mode: 'dark'
|
||||
icon:
|
||||
@@ -98,6 +102,7 @@ extra_css:
|
||||
- assets/highlight.css
|
||||
- assets/dmvendor.css
|
||||
- assets/feedback-overrides.css
|
||||
- assets/page_actions.css
|
||||
|
||||
extra_javascript:
|
||||
- https://www.googletagmanager.com/gtag/js?id=G-58W0K2ZQ25
|
||||
@@ -106,8 +111,9 @@ extra_javascript:
|
||||
- assets/highlight_init.js
|
||||
- https://buttons.github.io/buttons.js
|
||||
- assets/toc.js
|
||||
- assets/github_stats.js
|
||||
- assets/github_stats.js
|
||||
- assets/selection_ask_ai.js
|
||||
- assets/copy_code.js
|
||||
- assets/floating_ask_ai_button.js
|
||||
- assets/mobile_menu.js
|
||||
- assets/mobile_menu.js
|
||||
- assets/page_actions.js?v=20251006
|
||||
@@ -31,7 +31,7 @@ dependencies = [
|
||||
"rank-bm25~=0.2",
|
||||
"snowballstemmer~=2.2",
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
"pyOpenSSL>=25.3.0",
|
||||
"psutil>=6.1.1",
|
||||
"PyYAML>=6.0",
|
||||
"nltk>=3.9.1",
|
||||
|
||||
@@ -19,7 +19,7 @@ rank-bm25~=0.2
|
||||
colorama~=0.4
|
||||
snowballstemmer~=2.2
|
||||
pydantic>=2.10
|
||||
pyOpenSSL>=24.3.0
|
||||
pyOpenSSL>=25.3.0
|
||||
psutil>=6.1.1
|
||||
PyYAML>=6.0
|
||||
nltk>=3.9.1
|
||||
|
||||
401
test_llm_webhook_feature.py
Normal file
401
test_llm_webhook_feature.py
Normal file
@@ -0,0 +1,401 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script to validate webhook implementation for /llm/job endpoint.
|
||||
|
||||
This tests that the /llm/job endpoint now supports webhooks
|
||||
following the same pattern as /crawl/job.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add deploy/docker to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||
|
||||
def test_llm_job_payload_model():
    """
    Check that LlmJobPayload accepts an optional webhook_config.

    Builds one payload with a full webhook_config and one without it, and
    asserts that the field is None when omitted. Returns True on success and
    False (after printing the traceback) on any exception, including a
    failed import of the project modules.
    """
    divider = "=" * 60
    print(divider)
    print("TEST 1: LlmJobPayload Model")
    print(divider)

    try:
        from job import LlmJobPayload
        from schemas import WebhookConfig
        from pydantic import ValidationError

        # Payload carrying every field, including the webhook settings.
        full_fields = {
            "url": "https://example.com",
            "q": "Extract main content",
            "schema": None,
            "cache": False,
            "provider": None,
            "webhook_config": {
                "webhook_url": "https://myapp.com/webhook",
                "webhook_data_in_payload": True,
                "webhook_headers": {"X-Secret": "token"},
            },
        }
        full_payload = LlmJobPayload(**full_fields)

        print("✅ LlmJobPayload accepts webhook_config")
        print(f" - URL: {full_payload.url}")
        print(f" - Query: {full_payload.q}")
        print(f" - Webhook URL: {full_payload.webhook_config.webhook_url}")
        print(f" - Data in payload: {full_payload.webhook_config.webhook_data_in_payload}")

        # Same model with only the two fields supplied here; webhook_config
        # must default to None.
        bare_payload = LlmJobPayload(**{
            "url": "https://example.com",
            "q": "Extract content",
        })
        assert bare_payload.webhook_config is None, "webhook_config should be optional"
        print("✅ LlmJobPayload works without webhook_config (optional)")

        return True
    except Exception as exc:
        print(f"❌ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_handle_llm_request_signature():
    """
    Check that api.handle_llm_request has a ``webhook_config`` parameter.

    Returns:
        True when the parameter exists (whatever its default), False when it
        is missing or when importing/inspecting the function raises.

    The default is reported separately: ``None`` is shown as optional, a
    parameter without any default is shown as required, and any other
    default is shown as a warning. None of these affect the return value.
    """
    print("\n" + "=" * 60)
    print("TEST 2: handle_llm_request Function Signature")
    print("=" * 60)

    try:
        from api import handle_llm_request
        import inspect

        sig = inspect.signature(handle_llm_request)
        params = list(sig.parameters)

        print(f"Function parameters: {params}")

        if 'webhook_config' not in sig.parameters:
            print("❌ handle_llm_request missing webhook_config parameter")
            return False

        print("✅ handle_llm_request has webhook_config parameter")

        default = sig.parameters['webhook_config'].default
        # Parameter.empty is a sentinel meaning "no default", i.e. the
        # parameter is required; compare by identity, not equality.
        if default is inspect.Parameter.empty:
            print("⚠️ webhook_config has no default (it is a required parameter)")
        elif default is None:
            print(f"✅ webhook_config is optional (default: {default})")
        else:
            print(f"⚠️ webhook_config default is: {default}")

        return True

    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_process_llm_extraction_signature():
    """
    Check that api.process_llm_extraction has a ``webhook_config`` parameter.

    Returns:
        True when the parameter exists (whatever its default), False when it
        is missing or when importing/inspecting the function raises.

    The default is reported separately: ``None`` is shown as optional, a
    parameter without any default is shown as required, and any other
    default is shown as a warning. None of these affect the return value.
    """
    print("\n" + "=" * 60)
    print("TEST 3: process_llm_extraction Function Signature")
    print("=" * 60)

    try:
        from api import process_llm_extraction
        import inspect

        sig = inspect.signature(process_llm_extraction)
        params = list(sig.parameters)

        print(f"Function parameters: {params}")

        if 'webhook_config' not in sig.parameters:
            print("❌ process_llm_extraction missing webhook_config parameter")
            return False

        print("✅ process_llm_extraction has webhook_config parameter")

        default = sig.parameters['webhook_config'].default
        # Parameter.empty is a sentinel meaning "no default", i.e. the
        # parameter is required; compare by identity, not equality.
        if default is inspect.Parameter.empty:
            print("⚠️ webhook_config has no default (it is a required parameter)")
        elif default is None:
            print(f"✅ webhook_config is optional (default: {default})")
        else:
            print(f"⚠️ webhook_config default is: {default}")

        return True

    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_webhook_integration_in_api():
    """
    Check, by text search, that process_llm_extraction in deploy/docker/api.py
    is wired to the webhook service.

    The text of ``async def process_llm_extraction`` (up to the next
    top-level ``async def`` or end of file) must contain the
    WebhookDeliveryService initialisation and ``task_type="llm_extraction"``.
    The number of ``notify_job_completion`` calls in it is reported; fewer
    than 3 only produces a warning.

    Returns:
        True when both required snippets are found in the function, False
        when the function or a snippet is missing, or on any exception
        (for example when api.py cannot be read).
    """
    print("\n" + "=" * 60)
    print("TEST 4: Webhook Integration in process_llm_extraction")
    print("=" * 60)

    try:
        api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        # Explicit encoding so the result does not depend on the locale.
        with open(api_file, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # Isolate the function first so every check below looks at it only,
        # not at unrelated code elsewhere in api.py.
        llm_func_start = api_content.find('async def process_llm_extraction')
        if llm_func_start == -1:
            print("❌ process_llm_extraction not found in api.py")
            return False

        llm_func_end = api_content.find('\nasync def ', llm_func_start + 1)
        if llm_func_end == -1:
            llm_func_end = len(api_content)
        llm_func_content = api_content[llm_func_start:llm_func_end]

        # Check for WebhookDeliveryService initialization
        if 'webhook_service = WebhookDeliveryService(config)' in llm_func_content:
            print("✅ process_llm_extraction initializes WebhookDeliveryService")
        else:
            print("❌ Missing WebhookDeliveryService initialization in process_llm_extraction")
            return False

        # Check for notify_job_completion calls with llm_extraction
        if 'task_type="llm_extraction"' in llm_func_content:
            print("✅ Uses correct task_type='llm_extraction' for notifications")
        else:
            print("❌ Missing task_type='llm_extraction' in webhook notifications")
            return False

        # Expect at least 3 notification calls (success + 2 failure paths);
        # a lower count is reported but does not fail the test.
        llm_notification_count = llm_func_content.count('await webhook_service.notify_job_completion')
        print(f"✅ Found {llm_notification_count} webhook notification calls in process_llm_extraction")

        if llm_notification_count >= 3:
            print("✅ Sufficient notification points (success + failure paths)")
        else:
            print(f"⚠️ Expected at least 3 notification calls, found {llm_notification_count}")

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_job_endpoint_integration():
    """
    Check, by text search, that llm_job_enqueue in deploy/docker/job.py
    extracts webhook_config from the payload and passes it on.

    The function text runs from ``async def llm_job_enqueue`` to the next
    ``\\n\\n@router`` or ``\\n\\nasync def`` marker, or to the end of the file
    when neither marker follows it.

    Returns:
        True when all four expected snippets are present in the function,
        False when the function or any snippet is missing, or on any
        exception (for example when job.py cannot be read).
    """
    print("\n" + "=" * 60)
    print("TEST 5: /llm/job Endpoint Integration")
    print("=" * 60)

    try:
        job_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'job.py')

        # Explicit encoding so the result does not depend on the locale.
        with open(job_file, 'r', encoding='utf-8') as f:
            job_content = f.read()

        # Find the llm_job_enqueue function
        llm_job_start = job_content.find('async def llm_job_enqueue')
        if llm_job_start == -1:
            print("❌ llm_job_enqueue not found in job.py")
            return False

        llm_job_end = job_content.find('\n\n@router', llm_job_start + 1)
        if llm_job_end == -1:
            llm_job_end = job_content.find('\n\nasync def', llm_job_start + 1)
        if llm_job_end == -1:
            # Last definition in the file: take everything to the end.
            # Leaving -1 here would silently drop the final character.
            llm_job_end = len(job_content)

        llm_job_func = job_content[llm_job_start:llm_job_end]

        # (snippet that must appear, message when found, message when missing)
        checks = (
            (
                'webhook_config = None',
                "✅ llm_job_enqueue initializes webhook_config variable",
                "❌ Missing webhook_config initialization",
            ),
            (
                'if payload.webhook_config:',
                "✅ llm_job_enqueue checks for payload.webhook_config",
                "❌ Missing webhook_config check",
            ),
            (
                "webhook_config = payload.webhook_config.model_dump(mode='json')",
                "✅ llm_job_enqueue converts webhook_config to dict",
                "❌ Missing webhook_config.model_dump conversion",
            ),
            (
                'webhook_config=webhook_config',
                "✅ llm_job_enqueue passes webhook_config to handle_llm_request",
                "❌ Missing webhook_config parameter in handle_llm_request call",
            ),
        )
        for snippet, found_message, missing_message in checks:
            if snippet not in llm_job_func:
                print(missing_message)
                return False
            print(found_message)

        return True
    except Exception as e:
        print(f"❌ Failed: {e}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_create_new_task_integration():
    """
    Check, by text search in deploy/docker/api.py, that create_new_task
    stores webhook_config in the task data.

    Two snippets are required inside the function text; a third check (that
    webhook_config reaches a background task) only prints a warning when it
    cannot be confirmed. Returns True when the required snippets are found,
    otherwise False; any exception is printed and also yields False.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 6: create_new_task Webhook Storage")
    print(rule)

    try:
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        with open(api_path, 'r') as handle:
            source = handle.read()

        # Function text: from its definition to the next top-level async def,
        # or to the end of the file.
        begin = source.find('async def create_new_task')
        finish = source.find('\nasync def ', begin + 1)
        if finish < 0:
            finish = len(source)
        task_func = source[begin:finish]

        if 'if webhook_config:' not in task_func:
            print("❌ Missing webhook_config check in create_new_task")
            return False
        print("✅ create_new_task checks for webhook_config")

        if 'task_data["webhook_config"] = json.dumps(webhook_config)' not in task_func:
            print("❌ Missing webhook_config storage in task_data")
            return False
        print("✅ create_new_task stores webhook_config in Redis task data")

        # Soft check: both names merely have to appear somewhere in the function.
        mentions_both = all(
            marker in task_func
            for marker in ('webhook_config', 'background_tasks.add_task')
        )
        if mentions_both:
            print("✅ create_new_task passes webhook_config to background task")
        else:
            print("⚠️ Could not verify webhook_config passed to background task")

        return True
    except Exception as exc:
        print(f"❌ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_pattern_consistency():
    """
    Compare, by text search in deploy/docker/api.py, the webhook handling of
    process_llm_extraction with that of handle_crawl_job.

    Both function texts must initialise WebhookDeliveryService, contain a
    completed-status notification, and contain a failed-status notification
    with an ``error=`` argument. Returns True when all three comparisons
    hold, False on the first mismatch or on any exception.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 7: Pattern Consistency with /crawl/job")
    print(rule)

    try:
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')

        with open(api_path, 'r') as handle:
            source = handle.read()

        def function_text(header):
            # Text from `header` to the next top-level async def (or EOF).
            begin = source.find(header)
            finish = source.find('\nasync def ', begin + 1)
            if finish == -1:
                finish = len(source)
            return source[begin:finish]

        def has_all(text, *markers):
            return all(marker in text for marker in markers)

        crawl_text = function_text('async def handle_crawl_job')
        llm_text = function_text('async def process_llm_extraction')

        print("Checking pattern consistency...")

        # Both should initialize WebhookDeliveryService
        service_init = 'webhook_service = WebhookDeliveryService(config)'
        crawl_has_service = service_init in crawl_text
        llm_has_service = service_init in llm_text
        if not (crawl_has_service and llm_has_service):
            print(f"❌ Service initialization mismatch (crawl: {crawl_has_service}, llm: {llm_has_service})")
            return False
        print("✅ Both initialize WebhookDeliveryService")

        # Both should call notify_job_completion on success
        crawl_notifies_success = has_all(crawl_text, 'status="completed"', 'notify_job_completion')
        llm_notifies_success = has_all(llm_text, 'status="completed"', 'notify_job_completion')
        if not (crawl_notifies_success and llm_notifies_success):
            print(f"❌ Success notification mismatch (crawl: {crawl_notifies_success}, llm: {llm_notifies_success})")
            return False
        print("✅ Both notify on success")

        # Both should call notify_job_completion on failure
        crawl_notifies_failure = has_all(crawl_text, 'status="failed"', 'error=')
        llm_notifies_failure = has_all(llm_text, 'status="failed"', 'error=')
        if not (crawl_notifies_failure and llm_notifies_failure):
            print(f"❌ Failure notification mismatch (crawl: {crawl_notifies_failure}, llm: {llm_notifies_failure})")
            return False
        print("✅ Both notify on failure")

        print("✅ /llm/job follows the same pattern as /crawl/job")
        return True

    except Exception as exc:
        print(f"❌ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def main():
    """
    Run every validation test in order and print a pass/fail summary.

    Returns 0 when all tests returned a truthy result, otherwise 1.
    """
    rule = "=" * 60

    print("\n🧪 LLM Job Webhook Feature Validation")
    print(rule)
    print("Testing that /llm/job now supports webhooks like /crawl/job")
    print(rule + "\n")

    # (label shown in the summary, test callable) in execution order.
    suite = (
        ("LlmJobPayload Model", test_llm_job_payload_model),
        ("handle_llm_request Signature", test_handle_llm_request_signature),
        ("process_llm_extraction Signature", test_process_llm_extraction_signature),
        ("Webhook Integration", test_webhook_integration_in_api),
        ("/llm/job Endpoint", test_job_endpoint_integration),
        ("create_new_task Storage", test_create_new_task_integration),
        ("Pattern Consistency", test_pattern_consistency),
    )
    results = [(label, check()) for label, check in suite]

    # Print summary
    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    total = len(results)
    passed = len([label for label, outcome in results if outcome])

    for label, outcome in results:
        status = "✅ PASS" if outcome else "❌ FAIL"
        print(f"{status} - {label}")

    print(f"\n{rule}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{rule}")

    if passed != total:
        print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
        return 1

    for line in (
        "\n🎉 All tests passed! /llm/job webhook feature is correctly implemented.",
        "\n📝 Summary of changes:",
        " 1. LlmJobPayload model includes webhook_config field",
        " 2. /llm/job endpoint extracts and passes webhook_config",
        " 3. handle_llm_request accepts webhook_config parameter",
        " 4. create_new_task stores webhook_config in Redis",
        " 5. process_llm_extraction sends webhook notifications",
        " 6. Follows the same pattern as /crawl/job",
    ):
        print(line)
    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the site module for interactive use and is absent under `python -S`.
    # main() returns 0 when every test passed and 1 otherwise.
    sys.exit(main())
|
||||
307
test_webhook_implementation.py
Normal file
307
test_webhook_implementation.py
Normal file
@@ -0,0 +1,307 @@
|
||||
"""
|
||||
Simple test script to validate webhook implementation without running full server.
|
||||
|
||||
This script tests:
|
||||
1. Webhook module imports and syntax
|
||||
2. WebhookDeliveryService initialization
|
||||
3. Payload construction logic
|
||||
4. Configuration parsing
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Add deploy/docker to path to import modules
|
||||
# sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||
|
||||
def test_imports():
    """
    Check that the webhook service and the webhook schemas can be imported.

    Returns True when both import steps succeed. Returns False as soon as
    one of them raises, after printing which one failed.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 1: Module Imports")
    print(rule)

    # Step 1: the delivery service module.
    try:
        from webhook import WebhookDeliveryService
        print("✅ webhook.WebhookDeliveryService imported successfully")
    except Exception as exc:
        print(f"❌ Failed to import webhook module: {exc}")
        return False

    # Step 2: the Pydantic schemas used for webhook configuration/payloads.
    try:
        from schemas import WebhookConfig, WebhookPayload
        for imported_name in ("WebhookConfig", "WebhookPayload"):
            print(f"✅ schemas.{imported_name} imported successfully")
    except Exception as exc:
        print(f"❌ Failed to import schemas: {exc}")
        return False

    return True
|
||||
|
||||
def test_webhook_service_init():
    """
    Check that WebhookDeliveryService reads its retry settings from config.

    The config gives delays and timeout in milliseconds; the service is
    expected to expose them as seconds (1000 ms -> 1.0, 32000 ms -> 32.0,
    30000 ms -> 30.0) and max_attempts as 5. Returns True when every
    assertion holds, False on any exception (including a failed import).
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 2: WebhookDeliveryService Initialization")
    print(rule)

    try:
        from webhook import WebhookDeliveryService

        # Test with default config
        retry_settings = {
            "max_attempts": 5,
            "initial_delay_ms": 1000,
            "max_delay_ms": 32000,
            "timeout_ms": 30000,
        }
        config = {
            "webhooks": {
                "enabled": True,
                "default_url": None,
                "data_in_payload": False,
                "retry": retry_settings,
                "headers": {"User-Agent": "Crawl4AI-Webhook/1.0"},
            }
        }

        service = WebhookDeliveryService(config)

        print("✅ Service initialized successfully")
        print(f" - Max attempts: {service.max_attempts}")
        print(f" - Initial delay: {service.initial_delay}s")
        print(f" - Max delay: {service.max_delay}s")
        print(f" - Timeout: {service.timeout}s")

        # Verify calculations: (attribute, expected value, failure message)
        expectations = (
            ("max_attempts", 5, "Max attempts should be 5"),
            ("initial_delay", 1.0, "Initial delay should be 1.0s"),
            ("max_delay", 32.0, "Max delay should be 32.0s"),
            ("timeout", 30.0, "Timeout should be 30.0s"),
        )
        for attribute, expected, message in expectations:
            assert getattr(service, attribute) == expected, message

        print("✅ All configuration values correct")

        return True
    except Exception as exc:
        print(f"❌ Service initialization failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_webhook_config_model():
    """
    Check WebhookConfig validation with a full, a minimal and an invalid input.

    The full and minimal configs must be accepted; "not-a-url" must raise
    pydantic's ValidationError. Returns True when all three behave that way,
    False when the invalid URL is accepted or any other exception occurs.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 3: WebhookConfig Model Validation")
    print(rule)

    try:
        from schemas import WebhookConfig
        from pydantic import ValidationError

        def describe(model):
            # Print the three fields of a parsed WebhookConfig.
            print(f" - URL: {model.webhook_url}")
            print(f" - Data in payload: {model.webhook_data_in_payload}")
            print(f" - Headers: {model.webhook_headers}")

        # Test valid config
        full = WebhookConfig(**{
            "webhook_url": "https://example.com/webhook",
            "webhook_data_in_payload": True,
            "webhook_headers": {"X-Secret": "token123"},
        })
        print("✅ Valid config accepted:")
        describe(full)

        # Test minimal config
        minimal = WebhookConfig(**{"webhook_url": "https://example.com/webhook"})
        print("✅ Minimal config accepted (defaults applied):")
        describe(minimal)

        # Test invalid URL: constructing the model must raise ValidationError.
        try:
            WebhookConfig(**{"webhook_url": "not-a-url"})
            print("❌ Invalid URL should have been rejected")
            return False
        except ValidationError:
            print("✅ Invalid URL correctly rejected")

        return True
    except Exception as exc:
        print(f"❌ Model validation test failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_payload_construction():
    """Build three sample webhook payloads and print each one as JSON.

    The payloads simulate what ``notify_job_completion`` constructs: a basic
    completion payload, a failure payload with an ``error`` field, and a
    completion payload with a ``data`` field.

    Returns:
        bool: True when every payload was built and serialized; False if
        anything raised.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("TEST 4: Payload Construction")
    print(rule)

    def utc_now():
        # ISO-8601 timestamp in UTC, taken at the moment each payload is built.
        return datetime.now(timezone.utc).isoformat()

    def dump(obj):
        print(json.dumps(obj, indent=2))

    try:
        # Basic payload, assembled from separate values.
        task_id, task_type, status = "crawl_abc123", "crawl", "completed"
        urls = ["https://example.com"]
        basic = dict(
            task_id=task_id,
            task_type=task_type,
            status=status,
            timestamp=utc_now(),
            urls=urls,
        )
        print("✅ Basic payload constructed:")
        dump(basic)

        # Failure payload: same fields plus "error".
        failed = dict(
            task_id="crawl_xyz789",
            task_type="crawl",
            status="failed",
            timestamp=utc_now(),
            urls=["https://example.com"],
            error="Connection timeout",
        )
        print("\n✅ Error payload constructed:")
        dump(failed)

        # Completion payload that embeds crawl results under "data".
        with_data = dict(
            task_id="crawl_def456",
            task_type="crawl",
            status="completed",
            timestamp=utc_now(),
            urls=["https://example.com"],
            data={
                "results": [
                    {"url": "https://example.com", "markdown": "# Example"},
                ],
            },
        )
        print("\n✅ Data payload constructed:")
        dump(with_data)

        return True
    except Exception as exc:
        print(f"❌ Payload construction failed: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_exponential_backoff():
    """Check the exponential backoff formula, including its upper bound.

    The delay for zero-based attempt ``i`` is
    ``min(initial_delay * 2 ** i, max_delay)`` with ``initial_delay = 1.0``
    and ``max_delay = 32.0``. The first five delays must be
    ``1, 2, 4, 8, 16`` seconds, and later attempts must be clamped to
    ``max_delay``.

    Returns:
        bool: True when the sequence and the clamp are correct, False if an
        assertion or any other error occurred.
    """
    print("\n" + "=" * 60)
    print("TEST 5: Exponential Backoff Calculation")
    print("=" * 60)

    try:
        initial_delay = 1.0  # 1 second
        max_delay = 32.0  # 32 seconds

        def backoff(attempt):
            return min(initial_delay * (2 ** attempt), max_delay)

        # Compute the schedule once and reuse it for printing and checking.
        actual = [backoff(i) for i in range(5)]

        print("Backoff delays for 5 attempts:")
        for number, delay in enumerate(actual, start=1):
            print(f" Attempt {number}: {delay}s")

        # Verify the sequence: 1s, 2s, 4s, 8s, 16s
        expected = [1.0, 2.0, 4.0, 8.0, 16.0]
        assert actual == expected, f"Expected {expected}, got {actual}"

        # Five attempts never reach the cap, so check the clamp explicitly.
        for attempt in (5, 6, 10):
            assert backoff(attempt) == max_delay, (
                f"Attempt {attempt + 1} should be capped at {max_delay}s, "
                f"got {backoff(attempt)}s"
            )

        print("✅ Exponential backoff sequence correct")
        return True
    except Exception as e:
        print(f"❌ Backoff calculation failed: {e}")
        # Print the traceback on failure, as the other tests in this file do.
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
def test_api_integration():
    """Check that ``deploy/docker/api.py`` is wired to the webhook module.

    This is a plain text search of the file's source, not an import. It
    looks for the ``WebhookDeliveryService`` import, its construction with
    ``config``, and a reference to ``notify_job_completion``.

    Returns:
        bool: True when all three markers are present; False when one is
        missing or the file cannot be read.
    """
    print("\n" + "=" * 60)
    print("TEST 6: API Integration")
    print("=" * 60)

    # (substring to find, message when present, message when absent)
    checks = (
        (
            'from webhook import WebhookDeliveryService',
            "✅ api.py imports WebhookDeliveryService",
            "❌ api.py missing webhook import",
        ),
        (
            'WebhookDeliveryService(config)',
            "✅ api.py initializes WebhookDeliveryService",
            "❌ api.py doesn't initialize WebhookDeliveryService",
        ),
        (
            'notify_job_completion',
            "✅ api.py calls notify_job_completion",
            "❌ api.py doesn't call notify_job_completion",
        ),
    )

    try:
        api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
        # Explicit encoding so the read does not depend on the platform's
        # locale default. NOTE(review): assumes api.py is UTF-8 — confirm.
        with open(api_path, 'r', encoding='utf-8') as f:
            api_content = f.read()

        # Stop at the first missing marker; earlier successes are still printed.
        for marker, found_msg, missing_msg in checks:
            if marker not in api_content:
                print(missing_msg)
                return False
            print(found_msg)

        return True
    except Exception as e:
        print(f"❌ API integration check failed: {e}")
        return False
|
||||
|
||||
def main():
    """Run every validation test in order and print a pass/fail summary.

    Returns:
        int: 0 when all tests passed, 1 when at least one failed.
    """
    rule = "=" * 60
    print("\n🧪 Webhook Implementation Validation Tests")
    print(rule)

    # (label shown in the summary, test callable) in execution order.
    suite = (
        ("Module Imports", test_imports),
        ("Service Initialization", test_webhook_service_init),
        ("Config Model", test_webhook_config_model),
        ("Payload Construction", test_payload_construction),
        ("Exponential Backoff", test_exponential_backoff),
        ("API Integration", test_api_integration),
    )
    results = [(label, run()) for label, run in suite]

    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    total = len(results)
    passed = len([outcome for _, outcome in results if outcome])

    for label, outcome in results:
        verdict = "✅ PASS" if outcome else "❌ FAIL"
        print(f"{verdict} - {label}")

    print(f"\n{rule}")
    print(f"Results: {passed}/{total} tests passed")
    print(f"{rule}")

    if passed != total:
        print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
        return 1

    print("\n🎉 All tests passed! Webhook implementation is valid.")
    return 0
|
||||
|
||||
if __name__ == "__main__":
    # The bare exit() builtin is injected by the `site` module for interactive
    # use and may be absent (e.g. `python -S`). Raising SystemExit sets the
    # process exit status from main() without depending on it.
    raise SystemExit(main())
|
||||
251
tests/WEBHOOK_TEST_README.md
Normal file
251
tests/WEBHOOK_TEST_README.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# Webhook Feature Test Script
|
||||
|
||||
This directory contains a comprehensive test script for the webhook feature implementation.
|
||||
|
||||
## Overview
|
||||
|
||||
The `test_webhook_feature.sh` script automates the entire process of testing the webhook feature:
|
||||
|
||||
1. ✅ Fetches and switches to the webhook feature branch
|
||||
2. ✅ Activates the virtual environment
|
||||
3. ✅ Installs all required dependencies
|
||||
4. ✅ Starts Redis server in background
|
||||
5. ✅ Starts Crawl4AI server in background
|
||||
6. ✅ Runs webhook integration test
|
||||
7. ✅ Verifies job completion via webhook
|
||||
8. ✅ Cleans up and returns to original branch
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Python 3.10+
|
||||
- Virtual environment already created (`venv/` in project root)
|
||||
- Git repository with the webhook feature branch
|
||||
- `redis-server` (script will attempt to install if missing)
|
||||
- `curl` and `lsof` commands available
|
||||
|
||||
## Usage
|
||||
|
||||
### Quick Start
|
||||
|
||||
From the project root:
|
||||
|
||||
```bash
|
||||
./tests/test_webhook_feature.sh
|
||||
```
|
||||
|
||||
Or from the tests directory:
|
||||
|
||||
```bash
|
||||
cd tests
|
||||
./test_webhook_feature.sh
|
||||
```
|
||||
|
||||
### What the Script Does
|
||||
|
||||
#### Step 1: Branch Management
|
||||
- Saves your current branch
|
||||
- Fetches the webhook feature branch from remote
|
||||
- Switches to the webhook feature branch
|
||||
|
||||
#### Step 2: Environment Setup
|
||||
- Activates your existing virtual environment
|
||||
- Installs dependencies from `deploy/docker/requirements.txt`
|
||||
- Installs Flask for the webhook receiver
|
||||
|
||||
#### Step 3: Service Startup
|
||||
- Starts Redis server on port 6379
|
||||
- Starts Crawl4AI server on port 11235
|
||||
- Waits for server health check to pass
|
||||
|
||||
#### Step 4: Webhook Test
|
||||
- Creates a webhook receiver on port 8080
|
||||
- Submits a crawl job for `https://example.com` with webhook config
|
||||
- Waits for webhook notification (60s timeout)
|
||||
- Verifies webhook payload contains expected data
|
||||
|
||||
#### Step 5: Cleanup
|
||||
- Stops webhook receiver
|
||||
- Stops Crawl4AI server
|
||||
- Stops Redis server
|
||||
- Returns to your original branch
|
||||
|
||||
## Expected Output
|
||||
|
||||
```
|
||||
[INFO] Starting webhook feature test script
|
||||
[INFO] Project root: /path/to/crawl4ai
|
||||
[INFO] Step 1: Fetching PR branch...
|
||||
[INFO] Current branch: develop
|
||||
[SUCCESS] Branch fetched
|
||||
[INFO] Step 2: Switching to branch: claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp
|
||||
[SUCCESS] Switched to webhook feature branch
|
||||
[INFO] Step 3: Activating virtual environment...
|
||||
[SUCCESS] Virtual environment activated
|
||||
[INFO] Step 4: Installing server dependencies...
|
||||
[SUCCESS] Dependencies installed
|
||||
[INFO] Step 5a: Starting Redis...
|
||||
[SUCCESS] Redis started (PID: 12345)
|
||||
[INFO] Step 5b: Starting server on port 11235...
|
||||
[INFO] Server started (PID: 12346)
|
||||
[INFO] Waiting for server to be ready...
|
||||
[SUCCESS] Server is ready!
|
||||
[INFO] Step 6: Creating webhook test script...
|
||||
[INFO] Running webhook test...
|
||||
|
||||
🚀 Submitting crawl job with webhook...
|
||||
✅ Job submitted successfully, task_id: crawl_abc123
|
||||
⏳ Waiting for webhook notification...
|
||||
|
||||
✅ Webhook received: {
|
||||
"task_id": "crawl_abc123",
|
||||
"task_type": "crawl",
|
||||
"status": "completed",
|
||||
"timestamp": "2025-10-22T00:00:00.000000+00:00",
|
||||
"urls": ["https://example.com"],
|
||||
"data": { ... }
|
||||
}
|
||||
|
||||
✅ Webhook received!
|
||||
Task ID: crawl_abc123
|
||||
Status: completed
|
||||
URLs: ['https://example.com']
|
||||
✅ Data included in webhook payload
|
||||
📄 Crawled 1 URL(s)
|
||||
- https://example.com: 1234 chars
|
||||
|
||||
🎉 Webhook test PASSED!
|
||||
|
||||
[INFO] Step 7: Verifying test results...
|
||||
[SUCCESS] ✅ Webhook test PASSED!
|
||||
[SUCCESS] All tests completed successfully! 🎉
|
||||
[INFO] Cleanup will happen automatically...
|
||||
[INFO] Starting cleanup...
|
||||
[INFO] Stopping webhook receiver...
|
||||
[INFO] Stopping server...
|
||||
[INFO] Stopping Redis...
|
||||
[INFO] Switching back to branch: develop
|
||||
[SUCCESS] Cleanup complete
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Server Failed to Start
|
||||
|
||||
If the server fails to start, check the logs:
|
||||
|
||||
```bash
|
||||
tail -100 /tmp/crawl4ai_server.log
|
||||
```
|
||||
|
||||
Common issues:
|
||||
- Port 11235 already in use: `lsof -ti:11235 | xargs kill -9`
|
||||
- Missing dependencies: Check that all packages are installed
|
||||
|
||||
### Redis Connection Failed
|
||||
|
||||
Check if Redis is running:
|
||||
|
||||
```bash
|
||||
redis-cli ping
|
||||
# Should return: PONG
|
||||
```
|
||||
|
||||
If not running:
|
||||
|
||||
```bash
|
||||
redis-server --port 6379 --daemonize yes
|
||||
```
|
||||
|
||||
### Webhook Not Received
|
||||
|
||||
The script has a 60-second timeout for webhook delivery. If the webhook isn't received:
|
||||
|
||||
1. Check server logs: `/tmp/crawl4ai_server.log`
|
||||
2. Verify webhook receiver is running on port 8080
|
||||
3. Check network connectivity between components
|
||||
|
||||
### Script Interruption
|
||||
|
||||
If the script is interrupted (Ctrl+C), cleanup happens automatically via trap. The script will:
|
||||
- Kill all background processes
|
||||
- Stop Redis
|
||||
- Return to your original branch
|
||||
|
||||
To manually cleanup if needed:
|
||||
|
||||
```bash
|
||||
# Kill processes by port
|
||||
lsof -ti:11235 | xargs kill -9 # Server
|
||||
lsof -ti:8080 | xargs kill -9 # Webhook receiver
|
||||
lsof -ti:6379 | xargs kill -9 # Redis
|
||||
|
||||
# Return to your branch
|
||||
git checkout develop # or your branch name
|
||||
```
|
||||
|
||||
## Testing Different URLs
|
||||
|
||||
To test with a different URL, change the `urls` entry of the job payload in the script, or submit a custom payload such as:
|
||||
|
||||
```python
|
||||
payload = {
|
||||
"urls": ["https://your-url-here.com"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {"cache_mode": "bypass"},
|
||||
"webhook_config": {
|
||||
"webhook_url": "http://localhost:8080/webhook",
|
||||
"webhook_data_in_payload": True
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Files Generated
|
||||
|
||||
The script creates temporary files:
|
||||
|
||||
- `/tmp/crawl4ai_server.log` - Server output logs
|
||||
- `/tmp/test_webhook.py` - Webhook test Python script
|
||||
|
||||
These are not cleaned up automatically so you can review them after the test.
|
||||
|
||||
## Exit Codes
|
||||
|
||||
- `0` - All tests passed successfully
|
||||
- `1` - Test failed (check output for details)
|
||||
|
||||
## Safety Features
|
||||
|
||||
- ✅ Automatic cleanup of background processes on exit, interrupt, or error (the temporary files under `/tmp` are kept for review)
|
||||
- ✅ Returns to original branch on completion
|
||||
- ✅ Kills all background processes
|
||||
- ✅ Comprehensive error handling
|
||||
- ✅ Colored output for easy reading
|
||||
- ✅ Detailed logging at each step
|
||||
|
||||
## Notes
|
||||
|
||||
- The script uses `set -e` to exit on any command failure
|
||||
- All background processes are tracked and cleaned up
|
||||
- The virtual environment must exist before running
|
||||
- Redis must be available (installed or installable via apt-get/brew)
|
||||
|
||||
## Integration with CI/CD
|
||||
|
||||
This script can be integrated into CI/CD pipelines:
|
||||
|
||||
```yaml
|
||||
# Example GitHub Actions
|
||||
- name: Test Webhook Feature
|
||||
run: |
|
||||
chmod +x tests/test_webhook_feature.sh
|
||||
./tests/test_webhook_feature.sh
|
||||
```
|
||||
|
||||
## Support
|
||||
|
||||
If you encounter issues:
|
||||
|
||||
1. Check the troubleshooting section above
|
||||
2. Review server logs at `/tmp/crawl4ai_server.log`
|
||||
3. Ensure all prerequisites are met
|
||||
4. Open an issue with the full output of the script
|
||||
154
tests/adaptive/test_llm_embedding.py
Normal file
154
tests/adaptive/test_llm_embedding.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||
|
||||
|
||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
    """Run one adaptive crawl with ``config`` and print a report.

    Args:
        name: Label printed in the report header.
        config: Configuration passed to ``AdaptiveCrawler``.
        url: Start URL passed to ``AdaptiveCrawler.digest``.
        query: Query passed to ``AdaptiveCrawler.digest``.

    Returns:
        The object returned by ``adaptive.digest``.
    """
    wide_rule = "=" * 60
    rule = "=" * 50

    print(f"\n{wide_rule}")
    print(f"Configuration: {name}")
    print(f"{wide_rule}")

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler, config)
        result = await adaptive.digest(start_url=url, query=query)

        print("\n" + rule)
        print("CRAWL STATISTICS")
        print(rule)
        adaptive.print_stats(detailed=False)

        print("\n" + rule)
        print("MOST RELEVANT PAGES")
        print(rule)

        # List the top five pages with a short preview of each.
        for rank, page in enumerate(adaptive.get_relevant_content(top_k=5), 1):
            print(f"\n{rank}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            text = page['content'] or ""
            if not text:
                continue
            # First 200 characters on one line, with an ellipsis if truncated.
            preview = text[:200].replace('\n', ' ')
            if len(text) > 200:
                preview += "..."
            print(f" Preview: {preview}")

        print(f"\n{rule}")
        print(f"Pages crawled: {len(result.crawled_urls)}")
        print(f"Final confidence: {adaptive.confidence:.1%}")
        print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")

        if result.metrics.get('is_irrelevant', False):
            print("⚠️ Query detected as irrelevant!")

    return result
|
||||
|
||||
|
||||
async def llm_embedding():
    """Run the embedding-strategy demo using OpenAI embeddings.

    Builds an ``LLMConfig`` for ``openai/text-embedding-3-small`` (API key
    read from the ``OPENAI_API_KEY`` environment variable), wraps it in an
    ``AdaptiveConfig`` and runs ``test_configuration`` against the Python
    asyncio documentation page.
    """
    print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
    print("=" * 60)

    # Start page for the crawl.
    start_page = "https://docs.python.org/3/library/asyncio.html"

    embedding_backend = LLMConfig(
        provider='openai/text-embedding-3-small',
        api_token=os.getenv('OPENAI_API_KEY'),
        temperature=0.7,
        max_tokens=2000,
    )

    openai_settings = AdaptiveConfig(
        strategy="embedding",
        max_pages=10,
        # Embeddings come from OpenAI.
        embedding_llm_config=embedding_backend,
        # Author's rationale: OpenAI embeddings are high quality, so the
        # settings can be stricter.
        embedding_k_exp=4.0,
        n_query_variations=12,
    )

    await test_configuration(
        "OpenAI Embeddings",
        openai_settings,
        start_page,
        "async await context managers coroutines",
    )
|
||||
|
||||
|
||||
|
||||
async def basic_adaptive_crawling():
    """Run an adaptive crawl with default settings and print a report.

    Crawls the Python asyncio documentation for the query
    ``"async await context managers coroutines"``, then prints crawl
    statistics, the five most relevant pages, and a confidence verdict.
    """
    rule = "=" * 50

    async with AsyncWebCrawler(verbose=True) as crawler:
        # No AdaptiveConfig is supplied, so AdaptiveCrawler uses its defaults.
        # Passing AdaptiveConfig(strategy="embedding") as a second argument
        # would select the embedding strategy instead.
        adaptive = AdaptiveCrawler(crawler)

        print("Starting adaptive crawl for Python async programming information...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers coroutines",
        )

        # Crawl statistics.
        print("\n" + rule)
        print("CRAWL STATISTICS")
        print(rule)
        adaptive.print_stats(detailed=False)

        # Top five pages with a short preview of each.
        print("\n" + rule)
        print("MOST RELEVANT PAGES")
        print(rule)

        for rank, page in enumerate(adaptive.get_relevant_content(top_k=5), 1):
            print(f"\n{rank}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            text = page['content'] or ""
            if not text:
                continue
            preview = text[:200].replace('\n', ' ')
            if len(text) > 200:
                preview += "..."
            print(f" Preview: {preview}")

        # Final numbers.
        print(f"\n{rule}")
        print(f"Final Confidence: {adaptive.confidence:.2%}")
        print(f"Total Pages Crawled: {len(result.crawled_urls)}")
        print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")

        # Verdict thresholds: >= 0.8 high, >= 0.6 moderate, otherwise low.
        if adaptive.confidence >= 0.8:
            verdict = "✓ High confidence - can answer detailed questions about async Python"
        elif adaptive.confidence >= 0.6:
            verdict = "~ Moderate confidence - can answer basic questions"
        else:
            verdict = "✗ Low confidence - need more information"
        print(verdict)
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs the OpenAI-embedding demo. Call basic_adaptive_crawling() here
    # instead to run the default-settings example.
    asyncio.run(llm_embedding())
|
||||
@@ -112,7 +112,7 @@ async def test_proxy_settings():
|
||||
headless=True,
|
||||
verbose=False,
|
||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
|
||||
proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test
|
||||
use_managed_browser=False,
|
||||
use_persistent_context=False,
|
||||
) as crawler:
|
||||
|
||||
372
tests/docker/test_hooks_client.py
Normal file
372
tests/docker/test_hooks_client.py
Normal file
@@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test client for demonstrating user-provided hooks in Crawl4AI Docker API
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
API_BASE_URL = "http://localhost:11234" # Adjust if needed
|
||||
|
||||
|
||||
def test_hooks_info():
    """Query ``GET /hooks/info`` and print the available hook points.

    For each hook in the response's ``available_hooks`` mapping, prints its
    parameters and description. On a non-200 response, prints the status
    code and body instead.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or does not answer within the timeout.
    """
    print("=" * 70)
    print("Testing: GET /hooks/info")
    print("=" * 70)

    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout = 30  # seconds

    response = requests.get(f"{API_BASE_URL}/hooks/info", timeout=request_timeout)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    print("Available Hook Points:")
    for hook, info in data['available_hooks'].items():
        print(f"\n{hook}:")
        print(f" Parameters: {', '.join(info['parameters'])}")
        print(f" Description: {info['description']}")
|
||||
|
||||
|
||||
def test_basic_crawl_with_hooks():
    """Submit ``POST /crawl`` with three user hooks and print the outcome.

    The hooks are sent as Python source strings keyed by hook point:
    ``on_page_context_created`` (blocks image requests),
    ``before_retrieve_html`` (scrolls to the bottom of the page) and
    ``before_goto`` (adds a custom header). The response's ``hooks``
    section (status, validation errors, summary, execution log, errors) and
    ``results`` list are printed when present.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl with hooks")
    print("=" * 70)

    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout = 120  # seconds

    # Hook bodies are Python source; their indentation is part of the code.
    hooks_code = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("Hook: Setting up page context")
    # Block images to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    print("Hook: Images blocked")
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("Hook: Before retrieving HTML")
    # Scroll to bottom to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    print("Hook: Scrolled to bottom")
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f"Hook: About to navigate to {url}")
    # Add custom headers
    await page.set_extra_http_headers({
        'X-Test-Header': 'crawl4ai-hooks-test'
    })
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 30
        }
    }

    print("Sending request with hooks...")
    response = requests.post(
        f"{API_BASE_URL}/crawl", json=payload, timeout=request_timeout
    )

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    print("\n✅ Crawl successful!")

    # Hook execution report.
    if 'hooks' in data:
        hooks_info = data['hooks']
        status = hooks_info['status']
        print("\nHooks Execution Summary:")
        print(f" Status: {status['status']}")
        print(f" Attached hooks: {', '.join(status['attached_hooks'])}")

        if status['validation_errors']:
            print("\n⚠️ Validation Errors:")
            for error in status['validation_errors']:
                print(f" - {error['hook_point']}: {error['error']}")

        if 'summary' in hooks_info:
            summary = hooks_info['summary']
            print("\nExecution Statistics:")
            print(f" Total executions: {summary['total_executions']}")
            print(f" Successful: {summary['successful']}")
            print(f" Failed: {summary['failed']}")
            print(f" Timed out: {summary['timed_out']}")
            print(f" Success rate: {summary['success_rate']:.1f}%")

        if hooks_info['execution_log']:
            print("\nExecution Log:")
            for log_entry in hooks_info['execution_log']:
                status_icon = "✅" if log_entry['status'] == 'success' else "❌"
                print(f" {status_icon} {log_entry['hook_point']}: {log_entry['status']} ({log_entry.get('execution_time', 0):.2f}s)")

        if hooks_info['errors']:
            print("\n❌ Hook Errors:")
            for error in hooks_info['errors']:
                print(f" - {error['hook_point']}: {error['error']}")

    # Per-URL crawl results.
    if 'results' in data:
        print(f"\nCrawled {len(data['results'])} URL(s)")
        for result in data['results']:
            print(f" - {result['url']}: {'✅' if result['success'] else '❌'}")
|
||||
|
||||
|
||||
def test_invalid_hook():
    """Submit deliberately broken hooks and print how the API reports them.

    Two faulty hooks are sent: ``on_page_context_created`` is a plain
    ``def`` rather than ``async def``, and ``before_retrieve_html`` awaits a
    method that does not exist on the page. The validation errors and
    runtime errors from the response's ``hooks`` section are printed, along
    with whether the response's ``success`` flag is still set.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: Invalid hook handling")
    print("=" * 70)

    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout = 120  # seconds

    # Intentionally broken hooks; indentation is part of the hook source.
    hooks_code = {
        "on_page_context_created": """
def hook(page, context):  # Missing async!
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # This will cause an error
    await page.non_existent_method()
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 5
        }
    }

    print("Sending request with invalid hooks...")
    response = requests.post(
        f"{API_BASE_URL}/crawl", json=payload, timeout=request_timeout
    )

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()

    if 'hooks' in data:
        hooks_info = data['hooks']
        print(f"\nHooks Status: {hooks_info['status']['status']}")

        if hooks_info['status']['validation_errors']:
            print("\n✅ Validation caught errors (as expected):")
            for error in hooks_info['status']['validation_errors']:
                print(f" - {error['hook_point']}: {error['error']}")

        if hooks_info['errors']:
            print("\n✅ Runtime errors handled gracefully:")
            for error in hooks_info['errors']:
                print(f" - {error['hook_point']}: {error['error']}")

    # The crawl should still succeed despite hook errors.
    if data.get('success'):
        print("\n✅ Crawl succeeded despite hook errors (error isolation working!)")
|
||||
|
||||
|
||||
def test_authentication_hook():
    """Crawl an HTTP basic-auth page using hooks to supply credentials.

    A ``before_goto`` hook sets a ``Basic`` ``Authorization`` header for
    ``user:passwd`` and an ``on_page_context_created`` hook adds a session
    cookie. The target is ``https://httpbin.org/basic-auth/user/passwd``.
    Prints the hook summary, any hook errors, and whether the crawled HTML
    mentions ``authenticated``.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: Authentication with hooks")
    print("=" * 70)

    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout = 120  # seconds

    # Hook bodies are Python source; their indentation is part of the code.
    hooks_code = {
        "before_goto": """
async def hook(page, context, url, **kwargs):
    # For httpbin.org basic auth test, set Authorization header
    import base64

    # httpbin.org/basic-auth/user/passwd expects username="user" and password="passwd"
    credentials = base64.b64encode(b"user:passwd").decode('ascii')

    await page.set_extra_http_headers({
        'Authorization': f'Basic {credentials}'
    })

    print(f"Hook: Set Authorization header for {url}")
    return page
""",
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Example: Add cookies for session tracking
    await context.add_cookies([
        {
            'name': 'session_id',
            'value': 'test_session_123',
            'domain': '.httpbin.org',
            'path': '/',
            'httpOnly': True,
            'secure': True
        }
    ])

    print("Hook: Added session cookie")
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/basic-auth/user/passwd"],
        "hooks": {
            "code": hooks_code,
            "timeout": 30
        }
    }

    print("Sending request with authentication hook...")
    response = requests.post(
        f"{API_BASE_URL}/crawl", json=payload, timeout=request_timeout
    )

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        try:
            error_data = response.json()
            print(f"Error details: {json.dumps(error_data, indent=2)}")
        except ValueError:
            # The body is not JSON (JSON decode errors subclass ValueError).
            # Narrowed from a bare ``except:`` so Ctrl+C and real bugs are
            # not swallowed.
            print(f"Error text: {response.text[:500]}")
        return

    data = response.json()
    if not data.get('success'):
        print("❌ Request failed")
        print(f"Response: {json.dumps(data, indent=2)}")
        return

    print("✅ Crawl with authentication hook successful")

    # Check if hooks executed.
    if 'hooks' in data:
        hooks_info = data['hooks']
        if hooks_info.get('summary', {}).get('successful', 0) > 0:
            print(f"✅ Authentication hooks executed: {hooks_info['summary']['successful']} successful")

        # Check for any hook errors.
        if hooks_info.get('errors'):
            print("⚠️ Hook errors:")
            for error in hooks_info['errors']:
                print(f" - {error}")

    # Check if authentication worked by looking at the first result.
    if 'results' in data and len(data['results']) > 0:
        result = data['results'][0]
        if result.get('success'):
            print("✅ Page crawled successfully (authentication worked!)")
            # httpbin.org/basic-auth returns JSON with authenticated=true when successful
            if 'authenticated' in str(result.get('html', '')):
                print("✅ Authentication confirmed in response content")
        else:
            print(f"❌ Crawl failed: {result.get('error_message', 'Unknown error')}")
|
||||
|
||||
|
||||
def test_streaming_with_hooks():
    """Stream ``POST /crawl/stream`` with a hook and print each result line.

    A ``before_retrieve_html`` hook removes every ``<img>`` element from the
    page. The ``X-Hooks-Status`` response header is printed when present,
    then each non-empty line is parsed as JSON: its ``url`` or ``status``
    field is printed, and a line that is not valid JSON is printed raw.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or the stream stalls for longer than the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl/stream with hooks")
    print("=" * 70)

    # Without a timeout, requests waits forever on a stalled stream.
    request_timeout = 120  # seconds

    # Hook body is Python source; its indentation is part of the code.
    hooks_code = {
        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    await page.evaluate("document.querySelectorAll('img').forEach(img => img.remove())")
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html", "https://httpbin.org/json"],
        "hooks": {
            "code": hooks_code,
            "timeout": 10
        }
    }

    print("Sending streaming request with hooks...")

    with requests.post(
        f"{API_BASE_URL}/crawl/stream",
        json=payload,
        stream=True,
        timeout=request_timeout,
    ) as response:
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return

        # Hook status is reported in a response header on this endpoint.
        hooks_status = response.headers.get('X-Hooks-Status')
        if hooks_status:
            print(f"Hooks Status (from header): {hooks_status}")

        print("\nStreaming results:")
        for line in response.iter_lines():
            if not line:
                continue
            try:
                result = json.loads(line)
            except json.JSONDecodeError:
                print(f" Raw: {line.decode()}")
                continue
            if 'url' in result:
                print(f" Received: {result['url']}")
            elif 'status' in result:
                print(f" Stream status: {result['status']}")
|
||||
|
||||
|
||||
def test_basic_without_hooks():
    """Submit ``POST /crawl`` for two URLs with no hooks and print the reply.

    On HTTP 200 the whole JSON response is pretty-printed; otherwise only
    the status code is printed.

    Raises:
        requests.exceptions.RequestException: If the server cannot be
            reached or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl with no hooks")
    print("=" * 70)

    # Without a timeout, requests waits forever on an unresponsive server.
    request_timeout = 120  # seconds

    payload = {
        "urls": ["https://httpbin.org/html", "https://httpbin.org/json"]
    }

    response = requests.post(
        f"{API_BASE_URL}/crawl", json=payload, timeout=request_timeout
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return

    data = response.json()
    print(f"Response: {json.dumps(data, indent=2)}")
|
||||
|
||||
|
||||
def main():
    """Run the currently enabled hook scenario and print a closing banner."""
    rule = "=" * 70
    print("🔧 Crawl4AI Docker API - Hooks Testing")
    print(rule)

    # Only the invalid-hook scenario (error handling) is enabled. The other
    # scenarios in this file are switched off: test_hooks_info,
    # test_basic_crawl_with_hooks, test_authentication_hook,
    # test_streaming_with_hooks and test_basic_without_hooks.
    test_invalid_hook()

    print("\n" + rule)
    print("✅ All tests completed!")
    print(rule)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
512
tests/docker/test_hooks_comprehensive.py
Normal file
512
tests/docker/test_hooks_comprehensive.py
Normal file
@@ -0,0 +1,512 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive test demonstrating all hook types from hooks_example.py
|
||||
adapted for the Docker API with real URLs
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
API_BASE_URL = "http://localhost:11234"
|
||||
|
||||
|
||||
def test_all_hooks_demo():
    """Exercise all eight hook points against the Docker API in one crawl.

    Builds one hook snippet per hook point, POSTs them to ``/crawl`` together
    with a small crawler config, and prints the hook execution summary, the
    per-hook execution log, and the crawl results found in the response.
    Nothing is asserted; the outcome is reported on stdout only.
    """
    print("=" * 70)
    print("Testing: All Hooks Comprehensive Demo")
    print("=" * 70)

    # Each value is Python source that is sent as-is inside the JSON payload.
    # The snippets are written flush-left so every string is valid standalone
    # source on its own.
    hooks_code = {
        "on_browser_created": """
async def hook(browser, **kwargs):
    # Hook called after browser is created
    print("[HOOK] on_browser_created - Browser is ready!")
    # Browser-level configurations would go here
    return browser
""",

        "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Hook called after a new page and context are created
    print("[HOOK] on_page_context_created - New page created!")

    # Set viewport size for consistent rendering
    await page.set_viewport_size({"width": 1920, "height": 1080})

    # Add cookies for the session (using httpbin.org domain)
    await context.add_cookies([
        {
            "name": "test_session",
            "value": "abc123xyz",
            "domain": ".httpbin.org",
            "path": "/",
            "httpOnly": True,
            "secure": True
        }
    ])

    # Block ads and tracking scripts to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif,webp,svg}", lambda route: route.abort())
    await context.route("**/analytics/*", lambda route: route.abort())
    await context.route("**/ads/*", lambda route: route.abort())

    print("[HOOK] Viewport set, cookies added, and ads blocked")
    return page
""",

        "on_user_agent_updated": """
async def hook(page, context, user_agent, **kwargs):
    # Hook called when user agent is updated
    print(f"[HOOK] on_user_agent_updated - User agent: {user_agent[:50]}...")
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    # Hook called before navigating to each URL
    print(f"[HOOK] before_goto - About to visit: {url}")

    # Add custom headers for the request
    await page.set_extra_http_headers({
        "X-Custom-Header": "crawl4ai-test",
        "Accept-Language": "en-US,en;q=0.9",
        "DNT": "1"
    })

    return page
""",

        "after_goto": """
async def hook(page, context, url, response, **kwargs):
    # Hook called after navigating to each URL
    print(f"[HOOK] after_goto - Successfully loaded: {url}")

    # Wait a moment for dynamic content to load
    await page.wait_for_timeout(1000)

    # Check if specific elements exist (with error handling)
    try:
        # For httpbin.org, wait for body element
        await page.wait_for_selector("body", timeout=2000)
        print("[HOOK] Body element found and loaded")
    except:
        print("[HOOK] Timeout waiting for body, continuing anyway")

    return page
""",

        "on_execution_started": """
async def hook(page, context, **kwargs):
    # Hook called after custom JavaScript execution
    print("[HOOK] on_execution_started - Custom JS executed!")

    # You could inject additional JavaScript here if needed
    await page.evaluate("console.log('[INJECTED] Hook JS running');")

    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # Hook called before retrieving the HTML content
    print("[HOOK] before_retrieve_html - Preparing to get HTML")

    # Scroll to bottom to trigger lazy loading
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
    await page.wait_for_timeout(500)

    # Scroll back to top
    await page.evaluate("window.scrollTo(0, 0);")
    await page.wait_for_timeout(500)

    # One more scroll to middle for good measure
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight / 2);")

    print("[HOOK] Scrolling completed for lazy-loaded content")
    return page
""",

        "before_return_html": """
async def hook(page, context, html, **kwargs):
    # Hook called before returning the HTML content
    print(f"[HOOK] before_return_html - HTML length: {len(html)} characters")

    # Log some page metrics
    metrics = await page.evaluate('''() => {
        return {
            images: document.images.length,
            links: document.links.length,
            scripts: document.scripts.length
        }
    }''')

    print(f"[HOOK] Page metrics - Images: {metrics['images']}, Links: {metrics['links']}, Scripts: {metrics['scripts']}")

    return page
"""
    }

    # Create request payload
    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 30
        },
        "crawler_config": {
            "js_code": "window.scrollTo(0, document.body.scrollHeight);",
            "wait_for": "body",
            "cache_mode": "bypass"
        }
    }

    print("\nSending request with all 8 hooks...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during the request.
    start_time = time.perf_counter()

    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely if the server accepts the connection but never answers.
    response = requests.post(
        f"{API_BASE_URL}/crawl",
        json=payload,
        timeout=(10, 300),
    )

    elapsed_time = time.perf_counter() - start_time
    print(f"Request completed in {elapsed_time:.2f} seconds")

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        try:
            error_data = response.json()
        except ValueError:
            # The body is not JSON; fall back to a bounded slice of raw text.
            print(f"Error text: {response.text[:500]}")
        else:
            print(f"Error details: {json.dumps(error_data, indent=2)}")
        return

    data = response.json()
    print("\n✅ Request successful!")

    # Check hooks execution
    if 'hooks' in data:
        hooks_info = data['hooks']
        attached_hooks = hooks_info['status']['attached_hooks']

        print("\n📊 Hooks Execution Summary:")
        print(f" Status: {hooks_info['status']['status']}")
        print(f" Attached hooks: {len(attached_hooks)}")

        for hook_name in attached_hooks:
            print(f" ✓ {hook_name}")

        if 'summary' in hooks_info:
            summary = hooks_info['summary']
            print("\n📈 Execution Statistics:")
            print(f" Total executions: {summary['total_executions']}")
            print(f" Successful: {summary['successful']}")
            print(f" Failed: {summary['failed']}")
            print(f" Timed out: {summary['timed_out']}")
            print(f" Success rate: {summary['success_rate']:.1f}%")

        if hooks_info.get('execution_log'):
            print("\n📝 Execution Log:")
            for log_entry in hooks_info['execution_log']:
                status_icon = "✅" if log_entry['status'] == 'success' else "❌"
                exec_time = log_entry.get('execution_time', 0)
                print(f" {status_icon} {log_entry['hook_point']}: {exec_time:.3f}s")

    # Check crawl results (skipped when the list is missing or empty)
    if data.get('results'):
        print("\n📄 Crawl Results:")
        for result in data['results']:
            print(f" URL: {result['url']}")
            print(f" Success: {result.get('success', False)}")
            if result.get('html'):
                print(f" HTML length: {len(result['html'])} characters")
|
||||
|
||||
|
||||
def test_authentication_flow():
    """Crawl an httpbin basic-auth endpoint using cookie and header hooks.

    Sends two hook snippets (``on_page_context_created`` and ``before_goto``)
    to ``POST /crawl`` and prints, per result, whether the returned HTML looks
    like a successful basic-auth response. Nothing is asserted.
    """
    print("\n" + "=" * 70)
    print("Testing: Authentication Flow with Multiple Hooks")
    print("=" * 70)

    # NOTE(review): the first hook touches localStorage before any navigation;
    # some browsers reject that on a blank page. Confirm against the server.
    hooks_code = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("[HOOK] Setting up authentication context")

    # Add authentication cookies
    await context.add_cookies([
        {
            "name": "auth_token",
            "value": "fake_jwt_token_here",
            "domain": ".httpbin.org",
            "path": "/",
            "httpOnly": True,
            "secure": True
        }
    ])

    # Set localStorage items (for SPA authentication)
    await page.evaluate('''
        localStorage.setItem('user_id', '12345');
        localStorage.setItem('auth_time', new Date().toISOString());
    ''')

    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f"[HOOK] Adding auth headers for {url}")

    # Add Authorization header
    import base64
    credentials = base64.b64encode(b"user:passwd").decode('ascii')

    await page.set_extra_http_headers({
        'Authorization': f'Basic {credentials}',
        'X-API-Key': 'test-api-key-123'
    })

    return page
"""
    }

    payload = {
        "urls": [
            "https://httpbin.org/basic-auth/user/passwd"
        ],
        "hooks": {
            "code": hooks_code,
            "timeout": 15
        }
    }

    print("\nTesting authentication with httpbin endpoints...")

    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely if the server accepts the connection but never answers.
    response = requests.post(
        f"{API_BASE_URL}/crawl",
        json=payload,
        timeout=(10, 300),
    )

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        return

    data = response.json()
    print("✅ Authentication test completed")

    if 'results' not in data:
        return

    for i, result in enumerate(data['results']):
        print(f"\n URL {i+1}: {result['url']}")

        if not result.get('success'):
            print(f" ❌ Failed: {result.get('error_message', 'Unknown error')}")
            continue

        # Check for authentication success indicators. ``or ''`` also covers
        # an explicit null "html" value, which would break the ``in`` tests.
        html_content = result.get('html') or ''
        if '"authenticated"' in html_content and 'true' in html_content:
            print(" ✅ Authentication successful! Basic auth worked.")
        else:
            print(" ⚠️ Page loaded but auth status unclear")
|
||||
|
||||
|
||||
def test_performance_optimization_hooks():
    """Crawl one page with hooks that block resources and strip page clutter.

    Sends ``on_page_context_created`` and ``before_retrieve_html`` snippets to
    ``POST /crawl``, times the request, and prints the size of the returned
    HTML for the first result. Nothing is asserted.
    """
    print("\n" + "=" * 70)
    print("Testing: Performance Optimization Hooks")
    print("=" * 70)

    hooks_code = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("[HOOK] Optimizing page for performance")

    # Block resource-heavy content
    await context.route("**/*.{png,jpg,jpeg,gif,webp,svg,ico}", lambda route: route.abort())
    await context.route("**/*.{woff,woff2,ttf,otf}", lambda route: route.abort())
    await context.route("**/*.{mp4,webm,ogg,mp3,wav}", lambda route: route.abort())
    await context.route("**/googletagmanager.com/*", lambda route: route.abort())
    await context.route("**/google-analytics.com/*", lambda route: route.abort())
    await context.route("**/doubleclick.net/*", lambda route: route.abort())
    await context.route("**/facebook.com/*", lambda route: route.abort())

    # Disable animations and transitions
    await page.add_style_tag(content='''
        *, *::before, *::after {
            animation-duration: 0s !important;
            animation-delay: 0s !important;
            transition-duration: 0s !important;
            transition-delay: 0s !important;
        }
    ''')

    print("[HOOK] Performance optimizations applied")
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("[HOOK] Removing unnecessary elements before extraction")

    # Remove ads, popups, and other unnecessary elements
    await page.evaluate('''() => {
        // Remove common ad containers
        const adSelectors = [
            '.ad', '.ads', '.advertisement', '[id*="ad-"]', '[class*="ad-"]',
            '.popup', '.modal', '.overlay', '.cookie-banner', '.newsletter-signup'
        ];

        adSelectors.forEach(selector => {
            document.querySelectorAll(selector).forEach(el => el.remove());
        });

        // Remove script tags to clean up HTML
        document.querySelectorAll('script').forEach(el => el.remove());

        // Remove style tags we don't need
        document.querySelectorAll('style').forEach(el => el.remove());
    }''')

    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 10
        }
    }

    print("\nTesting performance optimization hooks...")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during the request.
    start_time = time.perf_counter()

    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely if the server accepts the connection but never answers.
    response = requests.post(
        f"{API_BASE_URL}/crawl",
        json=payload,
        timeout=(10, 300),
    )

    elapsed_time = time.perf_counter() - start_time
    print(f"Request completed in {elapsed_time:.2f} seconds")

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        return

    data = response.json()
    print("✅ Performance optimization test completed")

    # Only the first result is inspected; skipped when the list is missing
    # or empty.
    if data.get('results'):
        result = data['results'][0]
        if result.get('html'):
            print(f" HTML size: {len(result['html'])} characters")
            print(" Resources blocked, ads removed, animations disabled")
|
||||
|
||||
|
||||
def test_content_extraction_hooks():
    """Crawl two httpbin pages with hooks that expand and inspect content.

    Sends ``after_goto`` and ``before_retrieve_html`` snippets to
    ``POST /crawl`` and prints the hook success count plus the URL and success
    flag of every result. Nothing is asserted.
    """
    print("\n" + "=" * 70)
    print("Testing: Content Extraction Hooks")
    print("=" * 70)

    # NOTE(review): the second snippet calls json.dumps without importing
    # json; presumably the server-side hook namespace provides it - confirm.
    hooks_code = {
        "after_goto": """
async def hook(page, context, url, response, **kwargs):
    print(f"[HOOK] Waiting for dynamic content on {url}")

    # Wait for any lazy-loaded content
    await page.wait_for_timeout(2000)

    # Trigger any "Load More" buttons
    try:
        load_more = await page.query_selector('[class*="load-more"], [class*="show-more"], button:has-text("Load More")')
        if load_more:
            await load_more.click()
            await page.wait_for_timeout(1000)
            print("[HOOK] Clicked 'Load More' button")
    except:
        pass

    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("[HOOK] Extracting structured data")

    # Extract metadata
    metadata = await page.evaluate('''() => {
        const getMeta = (name) => {
            const element = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return element ? element.getAttribute('content') : null;
        };

        return {
            title: document.title,
            description: getMeta('description') || getMeta('og:description'),
            author: getMeta('author'),
            keywords: getMeta('keywords'),
            ogTitle: getMeta('og:title'),
            ogImage: getMeta('og:image'),
            canonical: document.querySelector('link[rel="canonical"]')?.href,
            jsonLd: Array.from(document.querySelectorAll('script[type="application/ld+json"]'))
                .map(el => el.textContent).filter(Boolean)
        };
    }''')

    print(f"[HOOK] Extracted metadata: {json.dumps(metadata, indent=2)}")

    # Infinite scroll handling
    for i in range(3):
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        print(f"[HOOK] Scroll iteration {i+1}/3")

    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html", "https://httpbin.org/json"],
        "hooks": {
            "code": hooks_code,
            "timeout": 20
        }
    }

    print("\nTesting content extraction hooks...")

    # (connect, read) timeout in seconds. Without one, requests waits
    # indefinitely if the server accepts the connection but never answers.
    response = requests.post(
        f"{API_BASE_URL}/crawl",
        json=payload,
        timeout=(10, 300),
    )

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        return

    data = response.json()
    print("✅ Content extraction test completed")

    if 'hooks' in data and 'summary' in data['hooks']:
        summary = data['hooks']['summary']
        print(f" Hooks executed: {summary['successful']}/{summary['total_executions']}")

    if 'results' in data:
        for result in data['results']:
            print(f"\n URL: {result['url']}")
            print(f" Success: {result.get('success', False)}")
|
||||
|
||||
|
||||
def main():
    """Run every comprehensive hook scenario in order.

    Each scenario is wrapped individually: an exception is printed with its
    traceback and the remaining scenarios still run.
    """
    import traceback

    divider = "=" * 70

    print("🔧 Crawl4AI Docker API - Comprehensive Hooks Testing")
    print("Based on docs/examples/hooks_example.py")
    print(divider)

    scenarios = [
        ("All Hooks Demo", test_all_hooks_demo),
        ("Authentication Flow", test_authentication_flow),
        ("Performance Optimization", test_performance_optimization_hooks),
        ("Content Extraction", test_content_extraction_hooks),
    ]
    total = len(scenarios)

    for position, (label, scenario) in enumerate(scenarios, start=1):
        print(f"\n📌 Test {position}/{total}: {label}")
        try:
            scenario()
            print(f"✅ {label} completed")
        except Exception as exc:
            print(f"❌ {label} failed: {exc}")
            traceback.print_exc()

    print("\n" + divider)
    print("🎉 All comprehensive hook tests completed!")
    print(divider)


# Run the suite only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
193
tests/docker/test_hooks_utility.py
Normal file
193
tests/docker/test_hooks_utility.py
Normal file
@@ -0,0 +1,193 @@
|
||||
"""
|
||||
Test script demonstrating the hooks_to_string utility and Docker client integration.
|
||||
"""
|
||||
import asyncio
|
||||
from crawl4ai import Crawl4aiDockerClient, hooks_to_string
|
||||
|
||||
|
||||
# Define hook functions as regular Python functions
|
||||
async def auth_hook(page, context, **kwargs):
    """Install a test cookie for ``.httpbin.org`` on the context, then return the page."""
    test_cookie = {
        'name': 'test_cookie',
        'value': 'test_value',
        'domain': '.httpbin.org',
        'path': '/',
    }
    await context.add_cookies([test_cookie])
    return page
|
||||
|
||||
|
||||
async def scroll_hook(page, context, **kwargs):
    """Scroll to the bottom of the document, pause one second, return the page."""
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight)"
    await page.evaluate(scroll_to_bottom)
    # Wait so content triggered by the scroll has a chance to load.
    await page.wait_for_timeout(1000)
    return page
|
||||
|
||||
|
||||
async def viewport_hook(page, context, **kwargs):
    """Resize the page viewport to 1920x1080 and return the page."""
    full_hd = {"width": 1920, "height": 1080}
    await page.set_viewport_size(full_hd)
    return page
|
||||
|
||||
|
||||
async def test_hooks_utility():
    """Convert two hook callables with ``hooks_to_string`` and print the result.

    Prints the converted hook names and the first 200 characters of the
    ``on_page_context_created`` entry.

    Returns:
        The mapping produced by ``hooks_to_string``.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule)
    print("Testing hooks_to_string utility")
    print(heavy_rule)

    # Hook point name -> function object; converted to strings below.
    hooks_string = hooks_to_string({
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    })

    print("\n✓ Successfully converted function objects to strings")
    print(f"\n✓ Converted {len(hooks_string)} hooks:")
    for hook_name in hooks_string:
        print(f" - {hook_name}")

    preview = hooks_string["on_page_context_created"][:200]
    print("\n✓ Preview of converted hook:")
    print(light_rule)
    print(preview + "...")
    print(light_rule)

    return hooks_string
|
||||
|
||||
|
||||
async def test_docker_client_with_functions():
    """Crawl through the Docker client, passing hook callables directly.

    This block is live: it needs a Crawl4AI Docker server reachable at
    http://localhost:11234.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing Docker Client with Function Objects")
    print(rule)

    # Function objects are handed over unchanged; the client is expected to
    # convert them to source strings itself.
    hook_functions = {
        "on_page_context_created": auth_hook,
        "before_retrieve_html": scroll_hook,
    }

    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hook_functions,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")
        print(f"✓ URL: {result.url}")

    print("\n✓ Docker client accepts function objects directly")
    print("✓ Automatic conversion happens internally")
    print("✓ No manual string formatting needed!")
|
||||
|
||||
|
||||
async def test_docker_client_with_strings():
    """Crawl through the Docker client using hooks converted to strings first.

    This block is live: it needs a Crawl4AI Docker server reachable at
    http://localhost:11234.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("Testing Docker Client with String Hooks")
    print(rule)

    # Convert the callables up front so the client receives plain strings.
    hooks_string = hooks_to_string({
        "on_page_context_created": viewport_hook,
        "before_retrieve_html": scroll_hook,
    })

    async with Crawl4aiDockerClient(base_url="http://localhost:11234", verbose=True) as client:
        result = await client.crawl(
            ["https://httpbin.org/html"],
            hooks=hooks_string,
            hooks_timeout=30,
        )
        print(f"\n✓ Crawl successful: {result.success}")

    print("\n✓ Docker client also accepts pre-converted strings")
    print("✓ Backward compatible with existing code")
|
||||
|
||||
|
||||
async def show_usage_patterns():
    """Print three example snippets showing how hooks can be supplied.

    The snippets are plain text for the reader; none of them is executed.
    """
    print("\n" + "=" * 60)
    print("Usage Patterns")
    print("=" * 60)

    # (heading, snippet) pairs, printed in order with a rule between them.
    patterns = (
        (
            "\n1. Direct function usage (simplest):",
            """
async def my_hook(page, context, **kwargs):
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page

result = await client.crawl(
    ["https://example.com"],
    hooks={"on_page_context_created": my_hook}
)
""",
        ),
        (
            "\n2. Convert then use:",
            """
hooks_dict = {"on_page_context_created": my_hook}
hooks_string = hooks_to_string(hooks_dict)

result = await client.crawl(
    ["https://example.com"],
    hooks=hooks_string
)
""",
        ),
        (
            "\n3. Manual string (backward compatible):",
            """
hooks_string = {
    "on_page_context_created": '''
async def hook(page, context, **kwargs):
    await page.set_viewport_size({"width": 1920, "height": 1080})
    return page
'''
}

result = await client.crawl(
    ["https://example.com"],
    hooks=hooks_string
)
""",
        ),
    )

    for heading, snippet in patterns:
        print(heading)
        print("-" * 60)
        print(snippet)
|
||||
|
||||
|
||||
async def main():
    """Print the suite title and run the currently enabled utility checks.

    Only ``test_docker_client_with_strings`` is active. The other steps and
    the closing summary are left commented out so they can be re-enabled.
    """
    print("\n🚀 Crawl4AI Hooks Utility Test Suite\n")

    # Disabled: utility conversion check.
    # await test_hooks_utility()

    # Disabled: Docker client with function objects.
    # await test_docker_client_with_functions()

    # Enabled: Docker client with pre-converted string hooks.
    await test_docker_client_with_strings()

    # Disabled: usage pattern examples.
    # await show_usage_patterns()

    # Disabled: closing summary.
    # print("\n" + "=" * 60)
    # print("✓ All tests completed successfully!")
    # print("=" * 60)
    # print("\nKey Benefits:")
    # print(" • Write hooks as regular Python functions")
    # print(" • IDE support with autocomplete and type checking")
    # print(" • Automatic conversion to API format")
    # print(" • Backward compatible with string hooks")
    # print(" • Same utility used everywhere")
    # print("\n")


# Run the suite only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user