Compare commits
19 Commits
feature/te
...
implement-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d670dcde0a | ||
|
|
f8606f6865 | ||
|
|
52da8d72bc | ||
|
|
8b7e67566e | ||
|
|
7388baa205 | ||
|
|
897bc3a493 | ||
|
|
8a37710313 | ||
|
|
fdbcddbf1a | ||
|
|
564d437d97 | ||
|
|
9cd06ea7eb | ||
|
|
eb257c2ba3 | ||
|
|
8d364a0731 | ||
|
|
6aff0e55aa | ||
|
|
38a0742708 | ||
|
|
a720a3a9fe | ||
|
|
017144c2dd | ||
|
|
32887ea40d | ||
|
|
eea41bf1ca | ||
|
|
21c302f439 |
@@ -1,136 +0,0 @@
|
|||||||
# Makefile for Crawl4AI Telemetry Testing
|
|
||||||
# Usage: make test-telemetry, make test-unit, make test-integration, etc.
|
|
||||||
|
|
||||||
.PHONY: help test-all test-telemetry test-unit test-integration test-privacy test-performance test-slow test-coverage test-verbose clean
|
|
||||||
|
|
||||||
# Default Python executable
|
|
||||||
PYTHON := .venv/bin/python
|
|
||||||
PYTEST := $(PYTHON) -m pytest
|
|
||||||
|
|
||||||
help:
|
|
||||||
@echo "Crawl4AI Telemetry Testing Commands:"
|
|
||||||
@echo ""
|
|
||||||
@echo " test-all Run all telemetry tests"
|
|
||||||
@echo " test-telemetry Run all telemetry tests (same as test-all)"
|
|
||||||
@echo " test-unit Run unit tests only"
|
|
||||||
@echo " test-integration Run integration tests only"
|
|
||||||
@echo " test-privacy Run privacy compliance tests only"
|
|
||||||
@echo " test-performance Run performance tests only"
|
|
||||||
@echo " test-slow Run slow tests only"
|
|
||||||
@echo " test-coverage Run tests with coverage report"
|
|
||||||
@echo " test-verbose Run tests with verbose output"
|
|
||||||
@echo " test-specific TEST= Run specific test (e.g., make test-specific TEST=test_telemetry.py::TestTelemetryConfig)"
|
|
||||||
@echo " clean Clean test artifacts"
|
|
||||||
@echo ""
|
|
||||||
@echo "Environment Variables:"
|
|
||||||
@echo " CRAWL4AI_TELEMETRY_TEST_REAL=1 Enable real telemetry during tests"
|
|
||||||
@echo " PYTEST_ARGS Additional pytest arguments"
|
|
||||||
|
|
||||||
# Run all telemetry tests
|
|
||||||
test-all test-telemetry:
|
|
||||||
$(PYTEST) tests/telemetry/ -v
|
|
||||||
|
|
||||||
# Run unit tests only
|
|
||||||
test-unit:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "unit" -v
|
|
||||||
|
|
||||||
# Run integration tests only
|
|
||||||
test-integration:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "integration" -v
|
|
||||||
|
|
||||||
# Run privacy compliance tests only
|
|
||||||
test-privacy:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "privacy" -v
|
|
||||||
|
|
||||||
# Run performance tests only
|
|
||||||
test-performance:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "performance" -v
|
|
||||||
|
|
||||||
# Run slow tests only
|
|
||||||
test-slow:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "slow" -v
|
|
||||||
|
|
||||||
# Run tests with coverage
|
|
||||||
test-coverage:
|
|
||||||
$(PYTEST) tests/telemetry/ --cov=crawl4ai.telemetry --cov-report=html --cov-report=term-missing -v
|
|
||||||
|
|
||||||
# Run tests with verbose output
|
|
||||||
test-verbose:
|
|
||||||
$(PYTEST) tests/telemetry/ -vvv --tb=long
|
|
||||||
|
|
||||||
# Run specific test
|
|
||||||
test-specific:
|
|
||||||
$(PYTEST) tests/telemetry/$(TEST) -v
|
|
||||||
|
|
||||||
# Run tests excluding slow ones
|
|
||||||
test-fast:
|
|
||||||
$(PYTEST) tests/telemetry/ -m "not slow" -v
|
|
||||||
|
|
||||||
# Run tests in parallel
|
|
||||||
test-parallel:
|
|
||||||
$(PYTEST) tests/telemetry/ -n auto -v
|
|
||||||
|
|
||||||
# Clean test artifacts
|
|
||||||
clean:
|
|
||||||
rm -rf .pytest_cache/
|
|
||||||
rm -rf htmlcov/
|
|
||||||
rm -rf .coverage
|
|
||||||
find tests/ -name "*.pyc" -delete
|
|
||||||
find tests/ -name "__pycache__" -type d -exec rm -rf {} +
|
|
||||||
rm -rf tests/telemetry/__pycache__/
|
|
||||||
|
|
||||||
# Lint test files
|
|
||||||
lint-tests:
|
|
||||||
$(PYTHON) -m flake8 tests/telemetry/
|
|
||||||
$(PYTHON) -m pylint tests/telemetry/
|
|
||||||
|
|
||||||
# Type check test files
|
|
||||||
typecheck-tests:
|
|
||||||
$(PYTHON) -m mypy tests/telemetry/
|
|
||||||
|
|
||||||
# Run all quality checks
|
|
||||||
check-tests: lint-tests typecheck-tests test-unit
|
|
||||||
|
|
||||||
# Install test dependencies
|
|
||||||
install-test-deps:
|
|
||||||
$(PYTHON) -m pip install pytest pytest-asyncio pytest-mock pytest-cov pytest-xdist
|
|
||||||
|
|
||||||
# Setup development environment for testing
|
|
||||||
setup-dev:
|
|
||||||
$(PYTHON) -m pip install -e .
|
|
||||||
$(MAKE) install-test-deps
|
|
||||||
|
|
||||||
# Generate test report
|
|
||||||
test-report:
|
|
||||||
$(PYTEST) tests/telemetry/ --html=test-report.html --self-contained-html -v
|
|
||||||
|
|
||||||
# Run performance benchmarks
|
|
||||||
benchmark:
|
|
||||||
$(PYTEST) tests/telemetry/test_privacy_performance.py::TestTelemetryPerformance -v --benchmark-only
|
|
||||||
|
|
||||||
# Test different environments
|
|
||||||
test-docker-env:
|
|
||||||
CRAWL4AI_DOCKER=true $(PYTEST) tests/telemetry/ -k "docker" -v
|
|
||||||
|
|
||||||
test-cli-env:
|
|
||||||
$(PYTEST) tests/telemetry/ -k "cli" -v
|
|
||||||
|
|
||||||
# Validate telemetry implementation
|
|
||||||
validate:
|
|
||||||
@echo "Running telemetry validation suite..."
|
|
||||||
$(MAKE) test-unit
|
|
||||||
$(MAKE) test-privacy
|
|
||||||
$(MAKE) test-performance
|
|
||||||
@echo "Validation complete!"
|
|
||||||
|
|
||||||
# Debug failing tests
|
|
||||||
debug:
|
|
||||||
$(PYTEST) tests/telemetry/ --pdb -x -v
|
|
||||||
|
|
||||||
# Show test markers
|
|
||||||
show-markers:
|
|
||||||
$(PYTEST) --markers
|
|
||||||
|
|
||||||
# Show test collection (dry run)
|
|
||||||
show-tests:
|
|
||||||
$(PYTEST) tests/telemetry/ --collect-only -q
|
|
||||||
36
README.md
36
README.md
@@ -304,9 +304,9 @@ The new Docker implementation includes:
|
|||||||
### Getting Started
|
### Getting Started
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull and run the latest release
|
# Pull and run the latest release candidate
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:0.7.0
|
||||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
|
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
|
||||||
|
|
||||||
# Visit the playground at http://localhost:11235/playground
|
# Visit the playground at http://localhost:11235/playground
|
||||||
```
|
```
|
||||||
@@ -919,6 +919,36 @@ We envision a future where AI is powered by real human knowledge, ensuring data
|
|||||||
For more details, see our [full mission statement](./MISSION.md).
|
For more details, see our [full mission statement](./MISSION.md).
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
## 🌟 Current Sponsors
|
||||||
|
|
||||||
|
### 🏢 Enterprise Sponsors & Partners
|
||||||
|
|
||||||
|
Our enterprise sponsors and technology partners help scale Crawl4AI to power production-grade data pipelines.
|
||||||
|
|
||||||
|
| Company | About | Sponsorship Tier |
|
||||||
|
|------|------|----------------------------|
|
||||||
|
| <a href="https://dashboard.capsolver.com/passport/register?inviteCode=ESVSECTX5Q23" target="_blank"><picture><source width="120" media="(prefers-color-scheme: dark)" srcset="https://docs.crawl4ai.com/uploads/sponsors/20251013045338_72a71fa4ee4d2f40.png"><source width="120" media="(prefers-color-scheme: light)" srcset="https://www.capsolver.com/assets/images/logo-text.png"><img alt="Capsolver" src="https://www.capsolver.com/assets/images/logo-text.png"></picture></a> | AI-powered Captcha solving service. Supports all major Captcha types, including reCAPTCHA, Cloudflare, and more | 🥈 Silver |
|
||||||
|
| <a href="https://kipo.ai" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045751_2d54f57f117c651e.png" alt="DataSync" width="120"/></a> | Helps engineers and buyers find, compare, and source electronic & industrial parts in seconds, with specs, pricing, lead times & alternatives.| 🥇 Gold |
|
||||||
|
| <a href="https://www.kidocode.com/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013045045_bb8dace3f0440d65.svg" alt="Kidocode" width="120"/><p align="center">KidoCode</p></a> | Kidocode is a hybrid technology and entrepreneurship school for kids aged 5–18, offering both online and on-campus education. | 🥇 Gold |
|
||||||
|
| <a href="https://www.alephnull.sg/" target="_blank"><img src="https://docs.crawl4ai.com/uploads/sponsors/20251013050323_a9e8e8c4c3650421.svg" alt="Aleph null" width="120"/></a> | Singapore-based Aleph Null is Asia’s leading edtech hub, dedicated to student-centric, AI-driven education—empowering learners with the tools to thrive in a fast-changing world. | 🥇 Gold |
|
||||||
|
|
||||||
|
### 🧑🤝 Individual Sponsors
|
||||||
|
|
||||||
|
A heartfelt thanks to our individual supporters! Every contribution helps us keep our opensource mission alive and thriving!
|
||||||
|
|
||||||
|
<p align="left">
|
||||||
|
<a href="https://github.com/hafezparast"><img src="https://avatars.githubusercontent.com/u/14273305?s=60&v=4" style="border-radius:50%;" width="64px;"/></a>
|
||||||
|
<a href="https://github.com/ntohidi"><img src="https://avatars.githubusercontent.com/u/17140097?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Sjoeborg"><img src="https://avatars.githubusercontent.com/u/17451310?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/romek-rozen"><img src="https://avatars.githubusercontent.com/u/30595969?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Kourosh-Kiyani"><img src="https://avatars.githubusercontent.com/u/34105600?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/Etherdrake"><img src="https://avatars.githubusercontent.com/u/67021215?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/shaman247"><img src="https://avatars.githubusercontent.com/u/211010067?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
<a href="https://github.com/work-flow-manager"><img src="https://avatars.githubusercontent.com/u/217665461?s=60&v=4" style="border-radius:50%;"width="64px;"/></a>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
> Want to join them? [Sponsor Crawl4AI →](https://github.com/sponsors/unclecode)
|
||||||
|
|
||||||
## Star History
|
## Star History
|
||||||
|
|
||||||
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
[](https://star-history.com/#unclecode/crawl4ai&Date)
|
||||||
|
|||||||
@@ -1,190 +0,0 @@
|
|||||||
# Crawl4AI Telemetry Testing Implementation
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
This document summarizes the comprehensive testing strategy implementation for Crawl4AI's opt-in telemetry system. The implementation provides thorough test coverage across unit tests, integration tests, privacy compliance tests, and performance tests.
|
|
||||||
|
|
||||||
## Implementation Summary
|
|
||||||
|
|
||||||
### 📊 Test Statistics
|
|
||||||
- **Total Tests**: 40 tests
|
|
||||||
- **Success Rate**: 100% (40/40 passing)
|
|
||||||
- **Test Categories**: 4 categories (Unit, Integration, Privacy, Performance)
|
|
||||||
- **Code Coverage**: 51% (625 statements, 308 missing)
|
|
||||||
|
|
||||||
### 🗂️ Test Structure
|
|
||||||
|
|
||||||
#### 1. **Unit Tests** (`tests/telemetry/test_telemetry.py`)
|
|
||||||
- `TestTelemetryConfig`: Configuration management and persistence
|
|
||||||
- `TestEnvironmentDetection`: CLI, Docker, API server environment detection
|
|
||||||
- `TestTelemetryManager`: Singleton pattern and exception capture
|
|
||||||
- `TestConsentManager`: Docker default behavior and environment overrides
|
|
||||||
- `TestPublicAPI`: Public enable/disable/status functions
|
|
||||||
- `TestIntegration`: Crawler exception capture integration
|
|
||||||
|
|
||||||
#### 2. **Integration Tests** (`tests/telemetry/test_integration.py`)
|
|
||||||
- `TestTelemetryCLI`: CLI command testing (status, enable, disable)
|
|
||||||
- `TestAsyncWebCrawlerIntegration`: Real crawler integration with decorators
|
|
||||||
- `TestDockerIntegration`: Docker environment-specific behavior
|
|
||||||
- `TestTelemetryProviderIntegration`: Sentry provider initialization and fallbacks
|
|
||||||
|
|
||||||
#### 3. **Privacy & Performance Tests** (`tests/telemetry/test_privacy_performance.py`)
|
|
||||||
- `TestTelemetryPrivacy`: Data sanitization and PII protection
|
|
||||||
- `TestTelemetryPerformance`: Decorator overhead measurement
|
|
||||||
- `TestTelemetryScalability`: Multiple and concurrent exception handling
|
|
||||||
|
|
||||||
#### 4. **Hello World Test** (`tests/telemetry/test_hello_world_telemetry.py`)
|
|
||||||
- Basic telemetry functionality validation
|
|
||||||
|
|
||||||
### 🔧 Testing Infrastructure
|
|
||||||
|
|
||||||
#### **Pytest Configuration** (`pytest.ini`)
|
|
||||||
```ini
|
|
||||||
[pytest]
|
|
||||||
testpaths = tests/telemetry
|
|
||||||
markers =
|
|
||||||
unit: Unit tests
|
|
||||||
integration: Integration tests
|
|
||||||
privacy: Privacy compliance tests
|
|
||||||
performance: Performance tests
|
|
||||||
asyncio_mode = auto
|
|
||||||
```
|
|
||||||
|
|
||||||
#### **Test Fixtures** (`tests/conftest.py`)
|
|
||||||
- `temp_config_dir`: Temporary configuration directory
|
|
||||||
- `enabled_telemetry_config`: Pre-configured enabled telemetry
|
|
||||||
- `disabled_telemetry_config`: Pre-configured disabled telemetry
|
|
||||||
- `mock_sentry_provider`: Mocked Sentry provider for testing
|
|
||||||
|
|
||||||
#### **Makefile Targets** (`Makefile.telemetry`)
|
|
||||||
```makefile
|
|
||||||
test-all: Run all telemetry tests
|
|
||||||
test-unit: Run unit tests only
|
|
||||||
test-integration: Run integration tests only
|
|
||||||
test-privacy: Run privacy tests only
|
|
||||||
test-performance: Run performance tests only
|
|
||||||
test-coverage: Run tests with coverage report
|
|
||||||
test-watch: Run tests in watch mode
|
|
||||||
test-parallel: Run tests in parallel
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🎯 Key Features Tested
|
|
||||||
|
|
||||||
### Privacy Compliance
|
|
||||||
- ✅ No URLs captured in telemetry data
|
|
||||||
- ✅ No content captured in telemetry data
|
|
||||||
- ✅ No PII (personally identifiable information) captured
|
|
||||||
- ✅ Sanitized context only (error types, stack traces without content)
|
|
||||||
|
|
||||||
### Performance Impact
|
|
||||||
- ✅ Telemetry decorator overhead < 1ms
|
|
||||||
- ✅ Async decorator overhead < 1ms
|
|
||||||
- ✅ Disabled telemetry has minimal performance impact
|
|
||||||
- ✅ Configuration loading performance acceptable
|
|
||||||
- ✅ Multiple exception capture scalability
|
|
||||||
- ✅ Concurrent exception capture handling
|
|
||||||
|
|
||||||
### Integration Points
|
|
||||||
- ✅ CLI command integration (status, enable, disable)
|
|
||||||
- ✅ AsyncWebCrawler decorator integration
|
|
||||||
- ✅ Docker environment auto-detection
|
|
||||||
- ✅ Sentry provider initialization
|
|
||||||
- ✅ Graceful degradation without Sentry
|
|
||||||
- ✅ Environment variable overrides
|
|
||||||
|
|
||||||
### Core Functionality
|
|
||||||
- ✅ Configuration persistence and loading
|
|
||||||
- ✅ Consent management (Docker defaults, user prompts)
|
|
||||||
- ✅ Environment detection (CLI, Docker, Jupyter, etc.)
|
|
||||||
- ✅ Singleton pattern for TelemetryManager
|
|
||||||
- ✅ Exception capture and forwarding
|
|
||||||
- ✅ Provider abstraction (Sentry, Null)
|
|
||||||
|
|
||||||
## 🚀 Usage Examples
|
|
||||||
|
|
||||||
### Run All Tests
|
|
||||||
```bash
|
|
||||||
make -f Makefile.telemetry test-all
|
|
||||||
```
|
|
||||||
|
|
||||||
### Run Specific Test Categories
|
|
||||||
```bash
|
|
||||||
# Unit tests only
|
|
||||||
make -f Makefile.telemetry test-unit
|
|
||||||
|
|
||||||
# Integration tests only
|
|
||||||
make -f Makefile.telemetry test-integration
|
|
||||||
|
|
||||||
# Privacy tests only
|
|
||||||
make -f Makefile.telemetry test-privacy
|
|
||||||
|
|
||||||
# Performance tests only
|
|
||||||
make -f Makefile.telemetry test-performance
|
|
||||||
```
|
|
||||||
|
|
||||||
### Coverage Report
|
|
||||||
```bash
|
|
||||||
make -f Makefile.telemetry test-coverage
|
|
||||||
```
|
|
||||||
|
|
||||||
### Parallel Execution
|
|
||||||
```bash
|
|
||||||
make -f Makefile.telemetry test-parallel
|
|
||||||
```
|
|
||||||
|
|
||||||
## 📁 File Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
tests/
|
|
||||||
├── conftest.py # Shared pytest fixtures
|
|
||||||
└── telemetry/
|
|
||||||
├── test_hello_world_telemetry.py # Basic functionality test
|
|
||||||
├── test_telemetry.py # Unit tests
|
|
||||||
├── test_integration.py # Integration tests
|
|
||||||
└── test_privacy_performance.py # Privacy & performance tests
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
pytest.ini # Pytest configuration with markers
|
|
||||||
Makefile.telemetry # Convenient test execution targets
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔍 Test Isolation & Mocking
|
|
||||||
|
|
||||||
### Environment Isolation
|
|
||||||
- Tests run in isolated temporary directories
|
|
||||||
- Environment variables are properly mocked/isolated
|
|
||||||
- No interference between test runs
|
|
||||||
- Clean state for each test
|
|
||||||
|
|
||||||
### Mock Strategies
|
|
||||||
- `unittest.mock` for external dependencies
|
|
||||||
- Temporary file systems for configuration testing
|
|
||||||
- Subprocess mocking for CLI command testing
|
|
||||||
- Time measurement for performance testing
|
|
||||||
|
|
||||||
## 📈 Coverage Analysis
|
|
||||||
|
|
||||||
Current test coverage: **51%** (625 statements)
|
|
||||||
|
|
||||||
### Well-Covered Areas:
|
|
||||||
- Core configuration management (78%)
|
|
||||||
- Telemetry initialization (69%)
|
|
||||||
- Environment detection (64%)
|
|
||||||
|
|
||||||
### Areas for Future Enhancement:
|
|
||||||
- Consent management UI (20% - interactive prompts)
|
|
||||||
- Sentry provider implementation (25% - network calls)
|
|
||||||
- Base provider abstractions (49% - error handling paths)
|
|
||||||
|
|
||||||
## 🎉 Implementation Success
|
|
||||||
|
|
||||||
The comprehensive testing strategy has been **successfully implemented** with:
|
|
||||||
|
|
||||||
- ✅ **100% test pass rate** (40/40 tests passing)
|
|
||||||
- ✅ **Complete test infrastructure** (fixtures, configuration, targets)
|
|
||||||
- ✅ **Privacy compliance verification** (no PII, URLs, or content captured)
|
|
||||||
- ✅ **Performance validation** (minimal overhead confirmed)
|
|
||||||
- ✅ **Integration testing** (CLI, Docker, AsyncWebCrawler)
|
|
||||||
- ✅ **CI/CD ready** (Makefile targets for automation)
|
|
||||||
|
|
||||||
The telemetry system now has robust test coverage ensuring reliability, privacy compliance, and performance characteristics while maintaining comprehensive validation of all core functionality.
|
|
||||||
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error:
|
except Error:
|
||||||
visibility_info = await self.check_visibility(page)
|
visibility_info = await self.check_visibility(page)
|
||||||
|
|
||||||
if self.browser_config.verbose:
|
if self.browser_config.config.verbose:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
message="Body visibility info: {info}",
|
message="Body visibility info: {info}",
|
||||||
tag="DEBUG",
|
tag="DEBUG",
|
||||||
|
|||||||
@@ -49,9 +49,6 @@ from .utils import (
|
|||||||
preprocess_html_for_schema,
|
preprocess_html_for_schema,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Import telemetry
|
|
||||||
from .telemetry import capture_exception, telemetry_decorator, async_telemetry_decorator
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncWebCrawler:
|
class AsyncWebCrawler:
|
||||||
"""
|
"""
|
||||||
@@ -204,7 +201,6 @@ class AsyncWebCrawler:
|
|||||||
"""异步空上下文管理器"""
|
"""异步空上下文管理器"""
|
||||||
yield
|
yield
|
||||||
|
|
||||||
@async_telemetry_decorator
|
|
||||||
async def arun(
|
async def arun(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
@@ -434,7 +430,6 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@async_telemetry_decorator
|
|
||||||
async def aprocess_html(
|
async def aprocess_html(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
|
|||||||
@@ -1385,97 +1385,6 @@ def profiles_cmd():
|
|||||||
# Run interactive profile manager
|
# Run interactive profile manager
|
||||||
anyio.run(manage_profiles)
|
anyio.run(manage_profiles)
|
||||||
|
|
||||||
@cli.group("telemetry")
|
|
||||||
def telemetry_cmd():
|
|
||||||
"""Manage telemetry settings for Crawl4AI
|
|
||||||
|
|
||||||
Telemetry helps improve Crawl4AI by sending anonymous crash reports.
|
|
||||||
No personal data or crawled content is ever collected.
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@telemetry_cmd.command("enable")
|
|
||||||
@click.option("--email", "-e", help="Optional email for follow-up on critical issues")
|
|
||||||
@click.option("--always/--once", default=True, help="Always send errors (default) or just once")
|
|
||||||
def telemetry_enable_cmd(email: Optional[str], always: bool):
|
|
||||||
"""Enable telemetry to help improve Crawl4AI
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
crwl telemetry enable # Enable telemetry
|
|
||||||
crwl telemetry enable --email me@ex.com # Enable with email
|
|
||||||
crwl telemetry enable --once # Send only next error
|
|
||||||
"""
|
|
||||||
from crawl4ai.telemetry import enable
|
|
||||||
|
|
||||||
try:
|
|
||||||
enable(email=email, always=always, once=not always)
|
|
||||||
console.print("[green]✅ Telemetry enabled successfully[/green]")
|
|
||||||
|
|
||||||
if email:
|
|
||||||
console.print(f" Email: {email}")
|
|
||||||
console.print(f" Mode: {'Always send errors' if always else 'Send next error only'}")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
console.print(f"[red]❌ Failed to enable telemetry: {e}[/red]")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
@telemetry_cmd.command("disable")
|
|
||||||
def telemetry_disable_cmd():
|
|
||||||
"""Disable telemetry
|
|
||||||
|
|
||||||
Stop sending anonymous crash reports to help improve Crawl4AI.
|
|
||||||
"""
|
|
||||||
from crawl4ai.telemetry import disable
|
|
||||||
|
|
||||||
try:
|
|
||||||
disable()
|
|
||||||
console.print("[green]✅ Telemetry disabled successfully[/green]")
|
|
||||||
except Exception as e:
|
|
||||||
console.print(f"[red]❌ Failed to disable telemetry: {e}[/red]")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
@telemetry_cmd.command("status")
|
|
||||||
def telemetry_status_cmd():
|
|
||||||
"""Show current telemetry status
|
|
||||||
|
|
||||||
Display whether telemetry is enabled and current settings.
|
|
||||||
"""
|
|
||||||
from crawl4ai.telemetry import status
|
|
||||||
|
|
||||||
try:
|
|
||||||
info = status()
|
|
||||||
|
|
||||||
# Create status table
|
|
||||||
table = Table(title="Telemetry Status", show_header=False)
|
|
||||||
table.add_column("Setting", style="cyan")
|
|
||||||
table.add_column("Value")
|
|
||||||
|
|
||||||
# Status emoji
|
|
||||||
status_icon = "✅" if info['enabled'] else "❌"
|
|
||||||
|
|
||||||
table.add_row("Status", f"{status_icon} {'Enabled' if info['enabled'] else 'Disabled'}")
|
|
||||||
table.add_row("Consent", info['consent'].replace('_', ' ').title())
|
|
||||||
|
|
||||||
if info['email']:
|
|
||||||
table.add_row("Email", info['email'])
|
|
||||||
|
|
||||||
table.add_row("Environment", info['environment'])
|
|
||||||
table.add_row("Provider", info['provider'])
|
|
||||||
|
|
||||||
if info['errors_sent'] > 0:
|
|
||||||
table.add_row("Errors Sent", str(info['errors_sent']))
|
|
||||||
|
|
||||||
console.print(table)
|
|
||||||
|
|
||||||
# Add helpful messages
|
|
||||||
if not info['enabled']:
|
|
||||||
console.print("\n[yellow]ℹ️ Telemetry is disabled. Enable it to help improve Crawl4AI:[/yellow]")
|
|
||||||
console.print(" [dim]crwl telemetry enable[/dim]")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
console.print(f"[red]❌ Failed to get telemetry status: {e}[/red]")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
@cli.command(name="")
|
@cli.command(name="")
|
||||||
@click.argument("url", required=False)
|
@click.argument("url", required=False)
|
||||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||||
|
|||||||
@@ -1,440 +0,0 @@
|
|||||||
"""
|
|
||||||
Crawl4AI Telemetry Module.
|
|
||||||
Provides opt-in error tracking to improve stability.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
import functools
|
|
||||||
import traceback
|
|
||||||
from typing import Optional, Any, Dict, Callable, Type
|
|
||||||
from contextlib import contextmanager, asynccontextmanager
|
|
||||||
|
|
||||||
from .base import TelemetryProvider, NullProvider
|
|
||||||
from .config import TelemetryConfig, TelemetryConsent
|
|
||||||
from .consent import ConsentManager
|
|
||||||
from .environment import Environment, EnvironmentDetector
|
|
||||||
|
|
||||||
|
|
||||||
class TelemetryManager:
|
|
||||||
"""
|
|
||||||
Main telemetry manager for Crawl4AI.
|
|
||||||
Coordinates provider, config, and consent management.
|
|
||||||
"""
|
|
||||||
|
|
||||||
_instance: Optional['TelemetryManager'] = None
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
"""Initialize telemetry manager."""
|
|
||||||
self.config = TelemetryConfig()
|
|
||||||
self.consent_manager = ConsentManager(self.config)
|
|
||||||
self.environment = EnvironmentDetector.detect()
|
|
||||||
self._provider: Optional[TelemetryProvider] = None
|
|
||||||
self._initialized = False
|
|
||||||
self._error_count = 0
|
|
||||||
self._max_errors = 100 # Prevent telemetry spam
|
|
||||||
|
|
||||||
# Load provider based on config
|
|
||||||
self._setup_provider()
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_instance(cls) -> 'TelemetryManager':
|
|
||||||
"""
|
|
||||||
Get singleton instance of telemetry manager.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
TelemetryManager instance
|
|
||||||
"""
|
|
||||||
if cls._instance is None:
|
|
||||||
cls._instance = cls()
|
|
||||||
return cls._instance
|
|
||||||
|
|
||||||
def _setup_provider(self) -> None:
|
|
||||||
"""Setup telemetry provider based on configuration."""
|
|
||||||
# Update config from environment
|
|
||||||
self.config.update_from_env()
|
|
||||||
|
|
||||||
# Check if telemetry is enabled
|
|
||||||
if not self.config.is_enabled():
|
|
||||||
self._provider = NullProvider()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Try to load Sentry provider
|
|
||||||
try:
|
|
||||||
from .providers.sentry import SentryProvider
|
|
||||||
|
|
||||||
# Get Crawl4AI version for release tracking
|
|
||||||
try:
|
|
||||||
from crawl4ai import __version__
|
|
||||||
release = f"crawl4ai@{__version__}"
|
|
||||||
except ImportError:
|
|
||||||
release = "crawl4ai@unknown"
|
|
||||||
|
|
||||||
self._provider = SentryProvider(
|
|
||||||
environment=self.environment.value,
|
|
||||||
release=release
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize provider
|
|
||||||
if not self._provider.initialize():
|
|
||||||
# Fallback to null provider if init fails
|
|
||||||
self._provider = NullProvider()
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
# Sentry not installed - use null provider
|
|
||||||
self._provider = NullProvider()
|
|
||||||
|
|
||||||
self._initialized = True
|
|
||||||
|
|
||||||
def capture_exception(
|
|
||||||
self,
|
|
||||||
exception: Exception,
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Capture and send an exception.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
exception: The exception to capture
|
|
||||||
context: Optional additional context
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if exception was sent
|
|
||||||
"""
|
|
||||||
# Check error count limit
|
|
||||||
if self._error_count >= self._max_errors:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check consent on first error
|
|
||||||
if self._error_count == 0:
|
|
||||||
consent = self.consent_manager.check_and_prompt()
|
|
||||||
|
|
||||||
# Update provider if consent changed
|
|
||||||
if consent == TelemetryConsent.DENIED:
|
|
||||||
self._provider = NullProvider()
|
|
||||||
return False
|
|
||||||
elif consent in [TelemetryConsent.ONCE, TelemetryConsent.ALWAYS]:
|
|
||||||
if isinstance(self._provider, NullProvider):
|
|
||||||
self._setup_provider()
|
|
||||||
|
|
||||||
# Check if we should send this error
|
|
||||||
if not self.config.should_send_current():
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Prepare context
|
|
||||||
full_context = EnvironmentDetector.get_environment_context()
|
|
||||||
if context:
|
|
||||||
full_context.update(context)
|
|
||||||
|
|
||||||
# Add user email if available
|
|
||||||
email = self.config.get_email()
|
|
||||||
if email:
|
|
||||||
full_context['email'] = email
|
|
||||||
|
|
||||||
# Add source info
|
|
||||||
full_context['source'] = 'crawl4ai'
|
|
||||||
|
|
||||||
# Send exception
|
|
||||||
try:
|
|
||||||
if self._provider:
|
|
||||||
success = self._provider.send_exception(exception, full_context)
|
|
||||||
if success:
|
|
||||||
self._error_count += 1
|
|
||||||
return success
|
|
||||||
except Exception:
|
|
||||||
# Telemetry itself failed - ignore
|
|
||||||
pass
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def capture_message(
|
|
||||||
self,
|
|
||||||
message: str,
|
|
||||||
level: str = 'info',
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Capture a message event.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
message: Message to send
|
|
||||||
level: Message level (info, warning, error)
|
|
||||||
context: Optional context
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if message was sent
|
|
||||||
"""
|
|
||||||
if not self.config.is_enabled():
|
|
||||||
return False
|
|
||||||
|
|
||||||
payload = {
|
|
||||||
'level': level,
|
|
||||||
'message': message
|
|
||||||
}
|
|
||||||
if context:
|
|
||||||
payload.update(context)
|
|
||||||
|
|
||||||
try:
|
|
||||||
if self._provider:
|
|
||||||
return self._provider.send_event(message, payload)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def enable(
|
|
||||||
self,
|
|
||||||
email: Optional[str] = None,
|
|
||||||
always: bool = True,
|
|
||||||
once: bool = False
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Enable telemetry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
email: Optional email for follow-up
|
|
||||||
always: If True, always send errors
|
|
||||||
once: If True, send only next error
|
|
||||||
"""
|
|
||||||
if once:
|
|
||||||
consent = TelemetryConsent.ONCE
|
|
||||||
elif always:
|
|
||||||
consent = TelemetryConsent.ALWAYS
|
|
||||||
else:
|
|
||||||
consent = TelemetryConsent.ALWAYS
|
|
||||||
|
|
||||||
self.config.set_consent(consent, email)
|
|
||||||
self._setup_provider()
|
|
||||||
|
|
||||||
print("✅ Telemetry enabled")
|
|
||||||
if email:
|
|
||||||
print(f" Email: {email}")
|
|
||||||
print(f" Mode: {'once' if once else 'always'}")
|
|
||||||
|
|
||||||
def disable(self) -> None:
|
|
||||||
"""Disable telemetry."""
|
|
||||||
self.config.set_consent(TelemetryConsent.DENIED)
|
|
||||||
self._provider = NullProvider()
|
|
||||||
print("✅ Telemetry disabled")
|
|
||||||
|
|
||||||
def status(self) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Get telemetry status.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with status information
|
|
||||||
"""
|
|
||||||
return {
|
|
||||||
'enabled': self.config.is_enabled(),
|
|
||||||
'consent': self.config.get_consent().value,
|
|
||||||
'email': self.config.get_email(),
|
|
||||||
'environment': self.environment.value,
|
|
||||||
'provider': type(self._provider).__name__ if self._provider else 'None',
|
|
||||||
'errors_sent': self._error_count
|
|
||||||
}
|
|
||||||
|
|
||||||
def flush(self) -> None:
|
|
||||||
"""Flush any pending telemetry data."""
|
|
||||||
if self._provider:
|
|
||||||
self._provider.flush()
|
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
|
||||||
"""Shutdown telemetry."""
|
|
||||||
if self._provider:
|
|
||||||
self._provider.shutdown()
|
|
||||||
|
|
||||||
|
|
||||||
# Global instance
|
|
||||||
_telemetry_manager: Optional[TelemetryManager] = None
|
|
||||||
|
|
||||||
|
|
||||||
def get_telemetry() -> TelemetryManager:
|
|
||||||
"""
|
|
||||||
Get global telemetry manager instance.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
TelemetryManager instance
|
|
||||||
"""
|
|
||||||
global _telemetry_manager
|
|
||||||
if _telemetry_manager is None:
|
|
||||||
_telemetry_manager = TelemetryManager.get_instance()
|
|
||||||
return _telemetry_manager
|
|
||||||
|
|
||||||
|
|
||||||
def capture_exception(
|
|
||||||
exception: Exception,
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Capture an exception for telemetry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
exception: Exception to capture
|
|
||||||
context: Optional context
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sent successfully
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
return get_telemetry().capture_exception(exception, context)
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def telemetry_decorator(func: Callable) -> Callable:
|
|
||||||
"""
|
|
||||||
Decorator to capture exceptions from a function.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
func: Function to wrap
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Wrapped function
|
|
||||||
"""
|
|
||||||
@functools.wraps(func)
|
|
||||||
def wrapper(*args, **kwargs):
|
|
||||||
try:
|
|
||||||
return func(*args, **kwargs)
|
|
||||||
except Exception as e:
|
|
||||||
# Capture exception
|
|
||||||
capture_exception(e, {
|
|
||||||
'function': func.__name__,
|
|
||||||
'module': func.__module__
|
|
||||||
})
|
|
||||||
# Re-raise the exception
|
|
||||||
raise
|
|
||||||
|
|
||||||
return wrapper
|
|
||||||
|
|
||||||
|
|
||||||
def async_telemetry_decorator(func: Callable) -> Callable:
|
|
||||||
"""
|
|
||||||
Decorator to capture exceptions from an async function.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
func: Async function to wrap
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Wrapped async function
|
|
||||||
"""
|
|
||||||
@functools.wraps(func)
|
|
||||||
async def wrapper(*args, **kwargs):
|
|
||||||
try:
|
|
||||||
return await func(*args, **kwargs)
|
|
||||||
except Exception as e:
|
|
||||||
# Capture exception
|
|
||||||
capture_exception(e, {
|
|
||||||
'function': func.__name__,
|
|
||||||
'module': func.__module__
|
|
||||||
})
|
|
||||||
# Re-raise the exception
|
|
||||||
raise
|
|
||||||
|
|
||||||
return wrapper
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
|
||||||
def telemetry_context(operation: str):
|
|
||||||
"""
|
|
||||||
Context manager for capturing exceptions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
operation: Name of the operation
|
|
||||||
|
|
||||||
Example:
|
|
||||||
with telemetry_context("web_crawl"):
|
|
||||||
# Your code here
|
|
||||||
pass
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
yield
|
|
||||||
except Exception as e:
|
|
||||||
capture_exception(e, {'operation': operation})
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def async_telemetry_context(operation: str):
|
|
||||||
"""
|
|
||||||
Async context manager for capturing exceptions in async code.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
operation: Name of the operation
|
|
||||||
|
|
||||||
Example:
|
|
||||||
async with async_telemetry_context("async_crawl"):
|
|
||||||
# Your async code here
|
|
||||||
await something()
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
yield
|
|
||||||
except Exception as e:
|
|
||||||
capture_exception(e, {'operation': operation})
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def install_exception_handler():
|
|
||||||
"""Install global exception handler for uncaught exceptions."""
|
|
||||||
original_hook = sys.excepthook
|
|
||||||
|
|
||||||
def telemetry_exception_hook(exc_type, exc_value, exc_traceback):
|
|
||||||
"""Custom exception hook with telemetry."""
|
|
||||||
# Don't capture KeyboardInterrupt
|
|
||||||
if not issubclass(exc_type, KeyboardInterrupt):
|
|
||||||
capture_exception(exc_value, {
|
|
||||||
'uncaught': True,
|
|
||||||
'type': exc_type.__name__
|
|
||||||
})
|
|
||||||
|
|
||||||
# Call original hook
|
|
||||||
original_hook(exc_type, exc_value, exc_traceback)
|
|
||||||
|
|
||||||
sys.excepthook = telemetry_exception_hook
|
|
||||||
|
|
||||||
|
|
||||||
# Public API
|
|
||||||
def enable(email: Optional[str] = None, always: bool = True, once: bool = False) -> None:
|
|
||||||
"""
|
|
||||||
Enable telemetry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
email: Optional email for follow-up
|
|
||||||
always: If True, always send errors (default)
|
|
||||||
once: If True, send only the next error
|
|
||||||
"""
|
|
||||||
get_telemetry().enable(email=email, always=always, once=once)
|
|
||||||
|
|
||||||
|
|
||||||
def disable() -> None:
|
|
||||||
"""Disable telemetry."""
|
|
||||||
get_telemetry().disable()
|
|
||||||
|
|
||||||
|
|
||||||
def status() -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Get telemetry status.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with status information
|
|
||||||
"""
|
|
||||||
return get_telemetry().status()
|
|
||||||
|
|
||||||
|
|
||||||
# Auto-install exception handler on import
|
|
||||||
# (Only for main library usage, not for Docker/API)
|
|
||||||
if EnvironmentDetector.detect() not in [Environment.DOCKER, Environment.API_SERVER]:
|
|
||||||
install_exception_handler()
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
'TelemetryManager',
|
|
||||||
'get_telemetry',
|
|
||||||
'capture_exception',
|
|
||||||
'telemetry_decorator',
|
|
||||||
'async_telemetry_decorator',
|
|
||||||
'telemetry_context',
|
|
||||||
'async_telemetry_context',
|
|
||||||
'enable',
|
|
||||||
'disable',
|
|
||||||
'status',
|
|
||||||
]
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
"""
|
|
||||||
Base telemetry provider interface for Crawl4AI.
|
|
||||||
Provides abstraction for different telemetry backends.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from typing import Dict, Any, Optional, Union
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
|
|
||||||
class TelemetryProvider(ABC):
|
|
||||||
"""Abstract base class for telemetry providers."""
|
|
||||||
|
|
||||||
def __init__(self, **kwargs):
|
|
||||||
"""Initialize the provider with optional configuration."""
|
|
||||||
self.config = kwargs
|
|
||||||
self._initialized = False
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def initialize(self) -> bool:
|
|
||||||
"""
|
|
||||||
Initialize the telemetry provider.
|
|
||||||
Returns True if initialization successful, False otherwise.
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def send_exception(
|
|
||||||
self,
|
|
||||||
exc: Exception,
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Send an exception to the telemetry backend.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
exc: The exception to report
|
|
||||||
context: Optional context data (email, environment, etc.)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sent successfully, False otherwise
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def send_event(
|
|
||||||
self,
|
|
||||||
event_name: str,
|
|
||||||
payload: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Send a generic telemetry event.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
event_name: Name of the event
|
|
||||||
payload: Optional event data
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sent successfully, False otherwise
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def flush(self) -> None:
|
|
||||||
"""Flush any pending telemetry data."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def shutdown(self) -> None:
|
|
||||||
"""Clean shutdown of the provider."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
def sanitize_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Remove sensitive information from telemetry data.
|
|
||||||
Override in subclasses for custom sanitization.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
data: Raw data dictionary
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Sanitized data dictionary
|
|
||||||
"""
|
|
||||||
# Default implementation - remove common sensitive fields
|
|
||||||
sensitive_keys = {
|
|
||||||
'password', 'token', 'api_key', 'secret', 'credential',
|
|
||||||
'auth', 'authorization', 'cookie', 'session'
|
|
||||||
}
|
|
||||||
|
|
||||||
def _sanitize_dict(d: Dict) -> Dict:
|
|
||||||
sanitized = {}
|
|
||||||
for key, value in d.items():
|
|
||||||
key_lower = key.lower()
|
|
||||||
if any(sensitive in key_lower for sensitive in sensitive_keys):
|
|
||||||
sanitized[key] = '[REDACTED]'
|
|
||||||
elif isinstance(value, dict):
|
|
||||||
sanitized[key] = _sanitize_dict(value)
|
|
||||||
elif isinstance(value, list):
|
|
||||||
sanitized[key] = [
|
|
||||||
_sanitize_dict(item) if isinstance(item, dict) else item
|
|
||||||
for item in value
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
sanitized[key] = value
|
|
||||||
return sanitized
|
|
||||||
|
|
||||||
return _sanitize_dict(data) if isinstance(data, dict) else data
|
|
||||||
|
|
||||||
|
|
||||||
class NullProvider(TelemetryProvider):
|
|
||||||
"""No-op provider for when telemetry is disabled."""
|
|
||||||
|
|
||||||
def initialize(self) -> bool:
|
|
||||||
"""No initialization needed for null provider."""
|
|
||||||
self._initialized = True
|
|
||||||
return True
|
|
||||||
|
|
||||||
def send_exception(
|
|
||||||
self,
|
|
||||||
exc: Exception,
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""No-op exception sending."""
|
|
||||||
return True
|
|
||||||
|
|
||||||
def send_event(
|
|
||||||
self,
|
|
||||||
event_name: str,
|
|
||||||
payload: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""No-op event sending."""
|
|
||||||
return True
|
|
||||||
|
|
||||||
def flush(self) -> None:
|
|
||||||
"""No-op flush."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
|
||||||
"""No-op shutdown."""
|
|
||||||
pass
|
|
||||||
@@ -1,196 +0,0 @@
|
|||||||
"""
|
|
||||||
Configuration management for Crawl4AI telemetry.
|
|
||||||
Handles user preferences and persistence.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
|
|
||||||
class TelemetryConsent(Enum):
|
|
||||||
"""Telemetry consent levels."""
|
|
||||||
NOT_SET = "not_set"
|
|
||||||
DENIED = "denied"
|
|
||||||
ONCE = "once" # Send current error only
|
|
||||||
ALWAYS = "always" # Send all errors
|
|
||||||
|
|
||||||
|
|
||||||
class TelemetryConfig:
|
|
||||||
"""Manages telemetry configuration and persistence."""
|
|
||||||
|
|
||||||
def __init__(self, config_dir: Optional[Path] = None):
|
|
||||||
"""
|
|
||||||
Initialize configuration manager.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config_dir: Optional custom config directory
|
|
||||||
"""
|
|
||||||
if config_dir:
|
|
||||||
self.config_dir = config_dir
|
|
||||||
else:
|
|
||||||
# Default to ~/.crawl4ai/
|
|
||||||
self.config_dir = Path.home() / '.crawl4ai'
|
|
||||||
|
|
||||||
self.config_file = self.config_dir / 'config.json'
|
|
||||||
self._config: Dict[str, Any] = {}
|
|
||||||
self._load_config()
|
|
||||||
|
|
||||||
def _ensure_config_dir(self) -> None:
|
|
||||||
"""Ensure configuration directory exists."""
|
|
||||||
self.config_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
def _load_config(self) -> None:
|
|
||||||
"""Load configuration from disk."""
|
|
||||||
if self.config_file.exists():
|
|
||||||
try:
|
|
||||||
with open(self.config_file, 'r') as f:
|
|
||||||
self._config = json.load(f)
|
|
||||||
except (json.JSONDecodeError, IOError):
|
|
||||||
# Corrupted or inaccessible config - start fresh
|
|
||||||
self._config = {}
|
|
||||||
else:
|
|
||||||
self._config = {}
|
|
||||||
|
|
||||||
def _save_config(self) -> bool:
|
|
||||||
"""
|
|
||||||
Save configuration to disk.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if saved successfully
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
self._ensure_config_dir()
|
|
||||||
|
|
||||||
# Write to temporary file first
|
|
||||||
temp_file = self.config_file.with_suffix('.tmp')
|
|
||||||
with open(temp_file, 'w') as f:
|
|
||||||
json.dump(self._config, f, indent=2)
|
|
||||||
|
|
||||||
# Atomic rename
|
|
||||||
temp_file.replace(self.config_file)
|
|
||||||
return True
|
|
||||||
|
|
||||||
except (IOError, OSError):
|
|
||||||
return False
|
|
||||||
|
|
||||||
def get_telemetry_settings(self) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Get current telemetry settings.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with telemetry settings
|
|
||||||
"""
|
|
||||||
return self._config.get('telemetry', {
|
|
||||||
'consent': TelemetryConsent.NOT_SET.value,
|
|
||||||
'email': None
|
|
||||||
})
|
|
||||||
|
|
||||||
def get_consent(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Get current consent status.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
TelemetryConsent enum value
|
|
||||||
"""
|
|
||||||
settings = self.get_telemetry_settings()
|
|
||||||
consent_value = settings.get('consent', TelemetryConsent.NOT_SET.value)
|
|
||||||
|
|
||||||
# Handle legacy boolean values
|
|
||||||
if isinstance(consent_value, bool):
|
|
||||||
consent_value = TelemetryConsent.ALWAYS.value if consent_value else TelemetryConsent.DENIED.value
|
|
||||||
|
|
||||||
try:
|
|
||||||
return TelemetryConsent(consent_value)
|
|
||||||
except ValueError:
|
|
||||||
return TelemetryConsent.NOT_SET
|
|
||||||
|
|
||||||
def set_consent(
|
|
||||||
self,
|
|
||||||
consent: TelemetryConsent,
|
|
||||||
email: Optional[str] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Set telemetry consent and optional email.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
consent: Consent level
|
|
||||||
email: Optional email for follow-up
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if saved successfully
|
|
||||||
"""
|
|
||||||
if 'telemetry' not in self._config:
|
|
||||||
self._config['telemetry'] = {}
|
|
||||||
|
|
||||||
self._config['telemetry']['consent'] = consent.value
|
|
||||||
|
|
||||||
# Only update email if provided
|
|
||||||
if email is not None:
|
|
||||||
self._config['telemetry']['email'] = email
|
|
||||||
|
|
||||||
return self._save_config()
|
|
||||||
|
|
||||||
def get_email(self) -> Optional[str]:
|
|
||||||
"""
|
|
||||||
Get stored email if any.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Email address or None
|
|
||||||
"""
|
|
||||||
settings = self.get_telemetry_settings()
|
|
||||||
return settings.get('email')
|
|
||||||
|
|
||||||
def is_enabled(self) -> bool:
|
|
||||||
"""
|
|
||||||
Check if telemetry is enabled.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if telemetry should send data
|
|
||||||
"""
|
|
||||||
consent = self.get_consent()
|
|
||||||
return consent in [TelemetryConsent.ONCE, TelemetryConsent.ALWAYS]
|
|
||||||
|
|
||||||
def should_send_current(self) -> bool:
|
|
||||||
"""
|
|
||||||
Check if current error should be sent.
|
|
||||||
Used for one-time consent.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if current error should be sent
|
|
||||||
"""
|
|
||||||
consent = self.get_consent()
|
|
||||||
if consent == TelemetryConsent.ONCE:
|
|
||||||
# After sending once, reset to NOT_SET
|
|
||||||
self.set_consent(TelemetryConsent.NOT_SET)
|
|
||||||
return True
|
|
||||||
return consent == TelemetryConsent.ALWAYS
|
|
||||||
|
|
||||||
def clear(self) -> bool:
|
|
||||||
"""
|
|
||||||
Clear all telemetry settings.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if cleared successfully
|
|
||||||
"""
|
|
||||||
if 'telemetry' in self._config:
|
|
||||||
del self._config['telemetry']
|
|
||||||
return self._save_config()
|
|
||||||
return True
|
|
||||||
|
|
||||||
def update_from_env(self) -> None:
|
|
||||||
"""Update configuration from environment variables."""
|
|
||||||
# Check for telemetry disable flag
|
|
||||||
if os.environ.get('CRAWL4AI_TELEMETRY') == '0':
|
|
||||||
self.set_consent(TelemetryConsent.DENIED)
|
|
||||||
|
|
||||||
# Check for email override
|
|
||||||
env_email = os.environ.get('CRAWL4AI_TELEMETRY_EMAIL')
|
|
||||||
if env_email and self.is_enabled():
|
|
||||||
current_settings = self.get_telemetry_settings()
|
|
||||||
self.set_consent(
|
|
||||||
TelemetryConsent(current_settings['consent']),
|
|
||||||
email=env_email
|
|
||||||
)
|
|
||||||
@@ -1,314 +0,0 @@
|
|||||||
"""
|
|
||||||
User consent handling for Crawl4AI telemetry.
|
|
||||||
Provides interactive prompts for different environments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from typing import Optional, Tuple
|
|
||||||
from .config import TelemetryConsent, TelemetryConfig
|
|
||||||
from .environment import Environment, EnvironmentDetector
|
|
||||||
|
|
||||||
|
|
||||||
class ConsentManager:
|
|
||||||
"""Manages user consent for telemetry."""
|
|
||||||
|
|
||||||
def __init__(self, config: Optional[TelemetryConfig] = None):
|
|
||||||
"""
|
|
||||||
Initialize consent manager.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config: Optional TelemetryConfig instance
|
|
||||||
"""
|
|
||||||
self.config = config or TelemetryConfig()
|
|
||||||
self.environment = EnvironmentDetector.detect()
|
|
||||||
|
|
||||||
def check_and_prompt(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Check consent status and prompt if needed.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Current consent status
|
|
||||||
"""
|
|
||||||
current_consent = self.config.get_consent()
|
|
||||||
|
|
||||||
# If already set, return current value
|
|
||||||
if current_consent != TelemetryConsent.NOT_SET:
|
|
||||||
return current_consent
|
|
||||||
|
|
||||||
# Docker/API server: default enabled (check env var)
|
|
||||||
if self.environment in [Environment.DOCKER, Environment.API_SERVER]:
|
|
||||||
return self._handle_docker_consent()
|
|
||||||
|
|
||||||
# Interactive environments: prompt user
|
|
||||||
if EnvironmentDetector.is_interactive():
|
|
||||||
return self._prompt_for_consent()
|
|
||||||
|
|
||||||
# Non-interactive: default disabled
|
|
||||||
return TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
def _handle_docker_consent(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Handle consent in Docker environment.
|
|
||||||
Default enabled unless disabled via env var.
|
|
||||||
"""
|
|
||||||
import os
|
|
||||||
|
|
||||||
if os.environ.get('CRAWL4AI_TELEMETRY') == '0':
|
|
||||||
self.config.set_consent(TelemetryConsent.DENIED)
|
|
||||||
return TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
# Default enabled for Docker
|
|
||||||
self.config.set_consent(TelemetryConsent.ALWAYS)
|
|
||||||
return TelemetryConsent.ALWAYS
|
|
||||||
|
|
||||||
def _prompt_for_consent(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Prompt user for consent based on environment.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
User's consent choice
|
|
||||||
"""
|
|
||||||
if self.environment == Environment.CLI:
|
|
||||||
return self._cli_prompt()
|
|
||||||
elif self.environment in [Environment.JUPYTER, Environment.COLAB]:
|
|
||||||
return self._notebook_prompt()
|
|
||||||
else:
|
|
||||||
return TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
def _cli_prompt(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Show CLI prompt for consent.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
User's consent choice
|
|
||||||
"""
|
|
||||||
print("\n" + "="*60)
|
|
||||||
print("🚨 Crawl4AI Error Detection")
|
|
||||||
print("="*60)
|
|
||||||
print("\nWe noticed an error occurred. Help improve Crawl4AI by")
|
|
||||||
print("sending anonymous crash reports?")
|
|
||||||
print("\n[1] Yes, send this error only")
|
|
||||||
print("[2] Yes, always send errors")
|
|
||||||
print("[3] No, don't send")
|
|
||||||
print("\n" + "-"*60)
|
|
||||||
|
|
||||||
# Get choice
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
choice = input("Your choice (1/2/3): ").strip()
|
|
||||||
if choice == '1':
|
|
||||||
consent = TelemetryConsent.ONCE
|
|
||||||
break
|
|
||||||
elif choice == '2':
|
|
||||||
consent = TelemetryConsent.ALWAYS
|
|
||||||
break
|
|
||||||
elif choice == '3':
|
|
||||||
consent = TelemetryConsent.DENIED
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
print("Please enter 1, 2, or 3")
|
|
||||||
except (KeyboardInterrupt, EOFError):
|
|
||||||
# User cancelled - treat as denial
|
|
||||||
consent = TelemetryConsent.DENIED
|
|
||||||
break
|
|
||||||
|
|
||||||
# Optional email
|
|
||||||
email = None
|
|
||||||
if consent != TelemetryConsent.DENIED:
|
|
||||||
print("\nOptional: Enter email for follow-up (or press Enter to skip):")
|
|
||||||
try:
|
|
||||||
email_input = input("Email: ").strip()
|
|
||||||
if email_input and '@' in email_input:
|
|
||||||
email = email_input
|
|
||||||
except (KeyboardInterrupt, EOFError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Save choice
|
|
||||||
self.config.set_consent(consent, email)
|
|
||||||
|
|
||||||
if consent != TelemetryConsent.DENIED:
|
|
||||||
print("\n✅ Thank you for helping improve Crawl4AI!")
|
|
||||||
else:
|
|
||||||
print("\n✅ Telemetry disabled. You can enable it anytime with:")
|
|
||||||
print(" crawl4ai telemetry enable")
|
|
||||||
|
|
||||||
print("="*60 + "\n")
|
|
||||||
|
|
||||||
return consent
|
|
||||||
|
|
||||||
def _notebook_prompt(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Show notebook prompt for consent.
|
|
||||||
Uses widgets if available, falls back to print + code.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
User's consent choice
|
|
||||||
"""
|
|
||||||
if EnvironmentDetector.supports_widgets():
|
|
||||||
return self._widget_prompt()
|
|
||||||
else:
|
|
||||||
return self._notebook_fallback_prompt()
|
|
||||||
|
|
||||||
def _widget_prompt(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Show interactive widget prompt in Jupyter/Colab.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
User's consent choice
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
import ipywidgets as widgets
|
|
||||||
from IPython.display import display, HTML
|
|
||||||
|
|
||||||
# Create styled HTML
|
|
||||||
html = HTML("""
|
|
||||||
<div style="padding: 15px; border: 2px solid #ff6b6b; border-radius: 8px; background: #fff5f5;">
|
|
||||||
<h3 style="color: #c92a2a; margin-top: 0;">🚨 Crawl4AI Error Detected</h3>
|
|
||||||
<p style="color: #495057;">Help us improve by sending anonymous crash reports?</p>
|
|
||||||
</div>
|
|
||||||
""")
|
|
||||||
display(html)
|
|
||||||
|
|
||||||
# Create buttons
|
|
||||||
btn_once = widgets.Button(
|
|
||||||
description='Send this error',
|
|
||||||
button_style='info',
|
|
||||||
icon='check'
|
|
||||||
)
|
|
||||||
btn_always = widgets.Button(
|
|
||||||
description='Always send',
|
|
||||||
button_style='success',
|
|
||||||
icon='check-circle'
|
|
||||||
)
|
|
||||||
btn_never = widgets.Button(
|
|
||||||
description='Don\'t send',
|
|
||||||
button_style='danger',
|
|
||||||
icon='times'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Email input
|
|
||||||
email_input = widgets.Text(
|
|
||||||
placeholder='Optional: your@email.com',
|
|
||||||
description='Email:',
|
|
||||||
style={'description_width': 'initial'}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Output area for feedback
|
|
||||||
output = widgets.Output()
|
|
||||||
|
|
||||||
# Container
|
|
||||||
button_box = widgets.HBox([btn_once, btn_always, btn_never])
|
|
||||||
container = widgets.VBox([button_box, email_input, output])
|
|
||||||
|
|
||||||
# Variable to store choice
|
|
||||||
consent_choice = {'value': None}
|
|
||||||
|
|
||||||
def on_button_click(btn):
|
|
||||||
"""Handle button click."""
|
|
||||||
with output:
|
|
||||||
output.clear_output()
|
|
||||||
|
|
||||||
if btn == btn_once:
|
|
||||||
consent_choice['value'] = TelemetryConsent.ONCE
|
|
||||||
print("✅ Sending this error only")
|
|
||||||
elif btn == btn_always:
|
|
||||||
consent_choice['value'] = TelemetryConsent.ALWAYS
|
|
||||||
print("✅ Always sending errors")
|
|
||||||
else:
|
|
||||||
consent_choice['value'] = TelemetryConsent.DENIED
|
|
||||||
print("✅ Telemetry disabled")
|
|
||||||
|
|
||||||
# Save with email if provided
|
|
||||||
email = email_input.value.strip() if email_input.value else None
|
|
||||||
self.config.set_consent(consent_choice['value'], email)
|
|
||||||
|
|
||||||
# Disable buttons after choice
|
|
||||||
btn_once.disabled = True
|
|
||||||
btn_always.disabled = True
|
|
||||||
btn_never.disabled = True
|
|
||||||
email_input.disabled = True
|
|
||||||
|
|
||||||
# Attach handlers
|
|
||||||
btn_once.on_click(on_button_click)
|
|
||||||
btn_always.on_click(on_button_click)
|
|
||||||
btn_never.on_click(on_button_click)
|
|
||||||
|
|
||||||
# Display widget
|
|
||||||
display(container)
|
|
||||||
|
|
||||||
# Wait for user choice (in notebook, this is non-blocking)
|
|
||||||
# Return NOT_SET for now, actual choice will be saved via callback
|
|
||||||
return consent_choice.get('value', TelemetryConsent.NOT_SET)
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
# Fallback if widgets fail
|
|
||||||
return self._notebook_fallback_prompt()
|
|
||||||
|
|
||||||
def _notebook_fallback_prompt(self) -> TelemetryConsent:
|
|
||||||
"""
|
|
||||||
Fallback prompt for notebooks without widget support.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
User's consent choice (defaults to DENIED)
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
from IPython.display import display, Markdown
|
|
||||||
|
|
||||||
markdown_content = """
|
|
||||||
### 🚨 Crawl4AI Error Detected
|
|
||||||
|
|
||||||
Help us improve by sending anonymous crash reports.
|
|
||||||
|
|
||||||
**Telemetry is currently OFF.** To enable, run:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import crawl4ai
|
|
||||||
crawl4ai.telemetry.enable(email="your@email.com", always=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
To send just this error:
|
|
||||||
```python
|
|
||||||
crawl4ai.telemetry.enable(once=True)
|
|
||||||
```
|
|
||||||
|
|
||||||
To keep telemetry disabled:
|
|
||||||
```python
|
|
||||||
crawl4ai.telemetry.disable()
|
|
||||||
```
|
|
||||||
"""
|
|
||||||
|
|
||||||
display(Markdown(markdown_content))
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
# Pure print fallback
|
|
||||||
print("\n" + "="*60)
|
|
||||||
print("🚨 Crawl4AI Error Detected")
|
|
||||||
print("="*60)
|
|
||||||
print("\nTelemetry is OFF. To enable, run:")
|
|
||||||
print("\nimport crawl4ai")
|
|
||||||
print('crawl4ai.telemetry.enable(email="you@example.com", always=True)')
|
|
||||||
print("\n" + "="*60)
|
|
||||||
|
|
||||||
# Default to disabled in fallback mode
|
|
||||||
return TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
def force_prompt(self) -> Tuple[TelemetryConsent, Optional[str]]:
|
|
||||||
"""
|
|
||||||
Force a consent prompt regardless of current settings.
|
|
||||||
Used for manual telemetry configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (consent choice, optional email)
|
|
||||||
"""
|
|
||||||
# Temporarily reset consent to force prompt
|
|
||||||
original_consent = self.config.get_consent()
|
|
||||||
self.config.set_consent(TelemetryConsent.NOT_SET)
|
|
||||||
|
|
||||||
try:
|
|
||||||
new_consent = self._prompt_for_consent()
|
|
||||||
email = self.config.get_email()
|
|
||||||
return new_consent, email
|
|
||||||
except Exception:
|
|
||||||
# Restore original on error
|
|
||||||
self.config.set_consent(original_consent)
|
|
||||||
raise
|
|
||||||
@@ -1,199 +0,0 @@
|
|||||||
"""
|
|
||||||
Environment detection for Crawl4AI telemetry.
|
|
||||||
Detects whether we're running in CLI, Docker, Jupyter, etc.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from enum import Enum
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
class Environment(Enum):
|
|
||||||
"""Detected runtime environment."""
|
|
||||||
CLI = "cli"
|
|
||||||
DOCKER = "docker"
|
|
||||||
JUPYTER = "jupyter"
|
|
||||||
COLAB = "colab"
|
|
||||||
API_SERVER = "api_server"
|
|
||||||
UNKNOWN = "unknown"
|
|
||||||
|
|
||||||
|
|
||||||
class EnvironmentDetector:
|
|
||||||
"""Detects the current runtime environment."""
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def detect() -> Environment:
|
|
||||||
"""
|
|
||||||
Detect current runtime environment.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Environment enum value
|
|
||||||
"""
|
|
||||||
# Check for Docker
|
|
||||||
if EnvironmentDetector._is_docker():
|
|
||||||
# Further check if it's API server
|
|
||||||
if EnvironmentDetector._is_api_server():
|
|
||||||
return Environment.API_SERVER
|
|
||||||
return Environment.DOCKER
|
|
||||||
|
|
||||||
# Check for Google Colab
|
|
||||||
if EnvironmentDetector._is_colab():
|
|
||||||
return Environment.COLAB
|
|
||||||
|
|
||||||
# Check for Jupyter
|
|
||||||
if EnvironmentDetector._is_jupyter():
|
|
||||||
return Environment.JUPYTER
|
|
||||||
|
|
||||||
# Check for CLI
|
|
||||||
if EnvironmentDetector._is_cli():
|
|
||||||
return Environment.CLI
|
|
||||||
|
|
||||||
return Environment.UNKNOWN
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_docker() -> bool:
|
|
||||||
"""Check if running inside Docker container."""
|
|
||||||
# Check for Docker-specific files
|
|
||||||
if os.path.exists('/.dockerenv'):
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Check cgroup for docker signature
|
|
||||||
try:
|
|
||||||
with open('/proc/1/cgroup', 'r') as f:
|
|
||||||
return 'docker' in f.read()
|
|
||||||
except (IOError, OSError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Check environment variable (if set in Dockerfile)
|
|
||||||
return os.environ.get('CRAWL4AI_DOCKER', '').lower() == 'true'
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_api_server() -> bool:
|
|
||||||
"""Check if running as API server."""
|
|
||||||
# Check for API server indicators
|
|
||||||
return (
|
|
||||||
os.environ.get('CRAWL4AI_API_SERVER', '').lower() == 'true' or
|
|
||||||
'deploy/docker/server.py' in ' '.join(sys.argv) or
|
|
||||||
'deploy/docker/api.py' in ' '.join(sys.argv)
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_jupyter() -> bool:
|
|
||||||
"""Check if running in Jupyter notebook."""
|
|
||||||
try:
|
|
||||||
# Check for IPython
|
|
||||||
from IPython import get_ipython
|
|
||||||
ipython = get_ipython()
|
|
||||||
|
|
||||||
if ipython is None:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check for notebook kernel
|
|
||||||
if 'IPKernelApp' in ipython.config:
|
|
||||||
return True
|
|
||||||
|
|
||||||
# Check for Jupyter-specific attributes
|
|
||||||
if hasattr(ipython, 'kernel'):
|
|
||||||
return True
|
|
||||||
|
|
||||||
except (ImportError, AttributeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_colab() -> bool:
|
|
||||||
"""Check if running in Google Colab."""
|
|
||||||
try:
|
|
||||||
import google.colab
|
|
||||||
return True
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Alternative check
|
|
||||||
return 'COLAB_GPU' in os.environ or 'COLAB_TPU_ADDR' in os.environ
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_cli() -> bool:
|
|
||||||
"""Check if running from command line."""
|
|
||||||
# Check if we have a terminal
|
|
||||||
return (
|
|
||||||
hasattr(sys, 'ps1') or
|
|
||||||
sys.stdin.isatty() or
|
|
||||||
bool(os.environ.get('TERM'))
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def is_interactive() -> bool:
|
|
||||||
"""
|
|
||||||
Check if environment supports interactive prompts.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if interactive prompts are supported
|
|
||||||
"""
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
|
|
||||||
# Docker/API server are non-interactive
|
|
||||||
if env in [Environment.DOCKER, Environment.API_SERVER]:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# CLI with TTY is interactive
|
|
||||||
if env == Environment.CLI:
|
|
||||||
return sys.stdin.isatty()
|
|
||||||
|
|
||||||
# Jupyter/Colab can be interactive with widgets
|
|
||||||
if env in [Environment.JUPYTER, Environment.COLAB]:
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def supports_widgets() -> bool:
|
|
||||||
"""
|
|
||||||
Check if environment supports IPython widgets.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if widgets are supported
|
|
||||||
"""
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
|
|
||||||
if env not in [Environment.JUPYTER, Environment.COLAB]:
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
import ipywidgets
|
|
||||||
from IPython.display import display
|
|
||||||
return True
|
|
||||||
except ImportError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_environment_context() -> dict:
|
|
||||||
"""
|
|
||||||
Get environment context for telemetry.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dictionary with environment information
|
|
||||||
"""
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
|
|
||||||
context = {
|
|
||||||
'environment_type': env.value,
|
|
||||||
'python_version': f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}",
|
|
||||||
'platform': sys.platform,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add environment-specific context
|
|
||||||
if env == Environment.DOCKER:
|
|
||||||
context['docker'] = True
|
|
||||||
context['container_id'] = os.environ.get('HOSTNAME', 'unknown')
|
|
||||||
|
|
||||||
elif env == Environment.COLAB:
|
|
||||||
context['colab'] = True
|
|
||||||
context['gpu'] = bool(os.environ.get('COLAB_GPU'))
|
|
||||||
|
|
||||||
elif env == Environment.JUPYTER:
|
|
||||||
context['jupyter'] = True
|
|
||||||
|
|
||||||
return context
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
"""
|
|
||||||
Telemetry providers for Crawl4AI.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from ..base import TelemetryProvider, NullProvider
|
|
||||||
|
|
||||||
__all__ = ['TelemetryProvider', 'NullProvider']
|
|
||||||
|
|
||||||
# Try to import Sentry provider if available
|
|
||||||
try:
|
|
||||||
from .sentry import SentryProvider
|
|
||||||
__all__.append('SentryProvider')
|
|
||||||
except ImportError:
|
|
||||||
# Sentry SDK not installed
|
|
||||||
pass
|
|
||||||
@@ -1,234 +0,0 @@
|
|||||||
"""
|
|
||||||
Sentry telemetry provider for Crawl4AI.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
from typing import Dict, Any, Optional
|
|
||||||
from ..base import TelemetryProvider
|
|
||||||
|
|
||||||
# Hardcoded DSN for Crawl4AI project
|
|
||||||
# This is safe to embed as it's the public part of the DSN
|
|
||||||
# TODO: Replace with actual Crawl4AI Sentry project DSN before release
|
|
||||||
# Format: "https://<public_key>@<organization>.ingest.sentry.io/<project_id>"
|
|
||||||
DEFAULT_SENTRY_DSN = "https://your-public-key@sentry.io/your-project-id"
|
|
||||||
|
|
||||||
|
|
||||||
class SentryProvider(TelemetryProvider):
|
|
||||||
"""Sentry implementation of telemetry provider."""
|
|
||||||
|
|
||||||
def __init__(self, dsn: Optional[str] = None, **kwargs):
|
|
||||||
"""
|
|
||||||
Initialize Sentry provider.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dsn: Optional DSN override (for testing/development)
|
|
||||||
**kwargs: Additional Sentry configuration
|
|
||||||
"""
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
|
|
||||||
# Allow DSN override via environment variable or parameter
|
|
||||||
self.dsn = (
|
|
||||||
dsn or
|
|
||||||
os.environ.get('CRAWL4AI_SENTRY_DSN') or
|
|
||||||
DEFAULT_SENTRY_DSN
|
|
||||||
)
|
|
||||||
|
|
||||||
self._sentry_sdk = None
|
|
||||||
self.environment = kwargs.get('environment', 'production')
|
|
||||||
self.release = kwargs.get('release', None)
|
|
||||||
|
|
||||||
def initialize(self) -> bool:
|
|
||||||
"""Initialize Sentry SDK."""
|
|
||||||
try:
|
|
||||||
import sentry_sdk
|
|
||||||
from sentry_sdk.integrations.stdlib import StdlibIntegration
|
|
||||||
from sentry_sdk.integrations.excepthook import ExcepthookIntegration
|
|
||||||
|
|
||||||
# Initialize Sentry with minimal integrations
|
|
||||||
sentry_sdk.init(
|
|
||||||
dsn=self.dsn,
|
|
||||||
|
|
||||||
environment=self.environment,
|
|
||||||
release=self.release,
|
|
||||||
|
|
||||||
# Performance monitoring disabled by default
|
|
||||||
traces_sample_rate=0.0,
|
|
||||||
|
|
||||||
# Only capture errors, not transactions
|
|
||||||
# profiles_sample_rate=0.0,
|
|
||||||
|
|
||||||
# Minimal integrations
|
|
||||||
integrations=[
|
|
||||||
StdlibIntegration(),
|
|
||||||
ExcepthookIntegration(always_run=False),
|
|
||||||
],
|
|
||||||
|
|
||||||
# Privacy settings
|
|
||||||
send_default_pii=False,
|
|
||||||
attach_stacktrace=True,
|
|
||||||
|
|
||||||
# Before send hook for additional sanitization
|
|
||||||
before_send=self._before_send,
|
|
||||||
|
|
||||||
# Disable automatic breadcrumbs
|
|
||||||
max_breadcrumbs=0,
|
|
||||||
|
|
||||||
# Disable request data collection
|
|
||||||
# request_bodies='never',
|
|
||||||
|
|
||||||
# # Custom transport options
|
|
||||||
# transport_options={
|
|
||||||
# 'keepalive': True,
|
|
||||||
# },
|
|
||||||
)
|
|
||||||
|
|
||||||
self._sentry_sdk = sentry_sdk
|
|
||||||
self._initialized = True
|
|
||||||
return True
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
# Sentry SDK not installed
|
|
||||||
return False
|
|
||||||
except Exception:
|
|
||||||
# Initialization failed silently
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _before_send(self, event: Dict[str, Any], hint: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
Process event before sending to Sentry.
|
|
||||||
Provides additional privacy protection.
|
|
||||||
"""
|
|
||||||
# Remove sensitive data
|
|
||||||
if 'request' in event:
|
|
||||||
event['request'] = self._sanitize_request(event['request'])
|
|
||||||
|
|
||||||
# Remove local variables that might contain sensitive data
|
|
||||||
if 'exception' in event and 'values' in event['exception']:
|
|
||||||
for exc in event['exception']['values']:
|
|
||||||
if 'stacktrace' in exc and 'frames' in exc['stacktrace']:
|
|
||||||
for frame in exc['stacktrace']['frames']:
|
|
||||||
# Remove local variables from frames
|
|
||||||
frame.pop('vars', None)
|
|
||||||
|
|
||||||
# Apply general sanitization
|
|
||||||
event = self.sanitize_data(event)
|
|
||||||
|
|
||||||
return event
|
|
||||||
|
|
||||||
def _sanitize_request(self, request_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
"""Sanitize request data to remove sensitive information."""
|
|
||||||
sanitized = request_data.copy()
|
|
||||||
|
|
||||||
# Remove sensitive fields
|
|
||||||
sensitive_fields = ['cookies', 'headers', 'data', 'query_string', 'env']
|
|
||||||
for field in sensitive_fields:
|
|
||||||
if field in sanitized:
|
|
||||||
sanitized[field] = '[REDACTED]'
|
|
||||||
|
|
||||||
# Keep only safe fields
|
|
||||||
safe_fields = ['method', 'url']
|
|
||||||
return {k: v for k, v in sanitized.items() if k in safe_fields}
|
|
||||||
|
|
||||||
def send_exception(
|
|
||||||
self,
|
|
||||||
exc: Exception,
|
|
||||||
context: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Send exception to Sentry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
exc: Exception to report
|
|
||||||
context: Optional context (email, environment info)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sent successfully
|
|
||||||
"""
|
|
||||||
if not self._initialized:
|
|
||||||
if not self.initialize():
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
if self._sentry_sdk:
|
|
||||||
with self._sentry_sdk.push_scope() as scope:
|
|
||||||
# Add user context if email provided
|
|
||||||
if context and 'email' in context:
|
|
||||||
scope.set_user({'email': context['email']})
|
|
||||||
|
|
||||||
# Add additional context
|
|
||||||
if context:
|
|
||||||
for key, value in context.items():
|
|
||||||
if key != 'email':
|
|
||||||
scope.set_context(key, value)
|
|
||||||
|
|
||||||
# Add tags for filtering
|
|
||||||
scope.set_tag('source', context.get('source', 'unknown'))
|
|
||||||
scope.set_tag('environment_type', context.get('environment_type', 'unknown'))
|
|
||||||
|
|
||||||
# Capture the exception
|
|
||||||
self._sentry_sdk.capture_exception(exc)
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
# Silently fail - telemetry should never crash the app
|
|
||||||
return False
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def send_event(
|
|
||||||
self,
|
|
||||||
event_name: str,
|
|
||||||
payload: Optional[Dict[str, Any]] = None
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Send custom event to Sentry.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
event_name: Name of the event
|
|
||||||
payload: Event data
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sent successfully
|
|
||||||
"""
|
|
||||||
if not self._initialized:
|
|
||||||
if not self.initialize():
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
if self._sentry_sdk:
|
|
||||||
# Sanitize payload
|
|
||||||
safe_payload = self.sanitize_data(payload) if payload else {}
|
|
||||||
|
|
||||||
# Send as a message with extra data
|
|
||||||
self._sentry_sdk.capture_message(
|
|
||||||
event_name,
|
|
||||||
level='info',
|
|
||||||
extras=safe_payload
|
|
||||||
)
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def flush(self) -> None:
|
|
||||||
"""Flush pending events to Sentry."""
|
|
||||||
if self._initialized and self._sentry_sdk:
|
|
||||||
try:
|
|
||||||
self._sentry_sdk.flush(timeout=2.0)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def shutdown(self) -> None:
|
|
||||||
"""Shutdown Sentry client."""
|
|
||||||
if self._initialized and self._sentry_sdk:
|
|
||||||
try:
|
|
||||||
self._sentry_sdk.flush(timeout=2.0)
|
|
||||||
# Note: sentry_sdk doesn't have a shutdown method
|
|
||||||
# Flush is sufficient for cleanup
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
self._initialized = False
|
|
||||||
@@ -12,6 +12,7 @@
|
|||||||
- [Python SDK](#python-sdk)
|
- [Python SDK](#python-sdk)
|
||||||
- [Understanding Request Schema](#understanding-request-schema)
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
- [REST API Examples](#rest-api-examples)
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [Asynchronous Jobs with Webhooks](#asynchronous-jobs-with-webhooks)
|
||||||
- [Additional API Endpoints](#additional-api-endpoints)
|
- [Additional API Endpoints](#additional-api-endpoints)
|
||||||
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
- [HTML Extraction Endpoint](#html-extraction-endpoint)
|
||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
@@ -648,6 +649,146 @@ async def test_stream_crawl(token: str = None): # Made token optional
|
|||||||
# asyncio.run(test_stream_crawl())
|
# asyncio.run(test_stream_crawl())
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Asynchronous Jobs with Webhooks
|
||||||
|
|
||||||
|
For long-running crawls or when you want to avoid keeping connections open, use the job queue endpoints. Instead of polling for results, configure a webhook to receive notifications when jobs complete.
|
||||||
|
|
||||||
|
#### Why Use Jobs & Webhooks?
|
||||||
|
|
||||||
|
- **No Polling Required** - Get notified when crawls complete instead of constantly checking status
|
||||||
|
- **Better Resource Usage** - Free up client connections while jobs run in the background
|
||||||
|
- **Scalable Architecture** - Ideal for high-volume crawling with TypeScript/Node.js clients or microservices
|
||||||
|
- **Reliable Delivery** - Automatic retry with exponential backoff (5 attempts: 1s → 2s → 4s → 8s → 16s)
|
||||||
|
|
||||||
|
#### How It Works
|
||||||
|
|
||||||
|
1. **Submit Job** → POST to `/crawl/job` with optional `webhook_config`
|
||||||
|
2. **Get Task ID** → Receive a `task_id` immediately
|
||||||
|
3. **Job Runs** → Crawl executes in the background
|
||||||
|
4. **Webhook Fired** → Server POSTs completion notification to your webhook URL
|
||||||
|
5. **Fetch Results** → If data wasn't included in webhook, GET `/crawl/job/{task_id}`
|
||||||
|
|
||||||
|
#### Quick Example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit a crawl job with webhook notification
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
|
||||||
|
# Response: {"task_id": "crawl_a1b2c3d4"}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Your webhook receives:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then fetch the results:
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Include Data in Webhook
|
||||||
|
|
||||||
|
Set `webhook_data_in_payload: true` to receive the full crawl results directly in the webhook:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Your webhook receives the complete data:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"markdown": "...",
|
||||||
|
"html": "...",
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Webhook Authentication
|
||||||
|
|
||||||
|
Add custom headers for authentication:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "your-secret-token",
|
||||||
|
"X-Service-ID": "crawl4ai-prod"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Global Default Webhook
|
||||||
|
|
||||||
|
Configure a default webhook URL in `config.yml` for all jobs:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default"
|
||||||
|
data_in_payload: false
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000
|
||||||
|
```
|
||||||
|
|
||||||
|
Now jobs without `webhook_config` automatically use the default webhook.
|
||||||
|
|
||||||
|
#### Job Status Polling (Without Webhooks)
|
||||||
|
|
||||||
|
If you prefer polling instead of webhooks, just omit `webhook_config`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Submit job
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"urls": ["https://example.com"]}'
|
||||||
|
# Response: {"task_id": "crawl_xyz"}
|
||||||
|
|
||||||
|
# Poll for status
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_xyz
|
||||||
|
```
|
||||||
|
|
||||||
|
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
|
||||||
|
|
||||||
|
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Metrics & Monitoring
|
## Metrics & Monitoring
|
||||||
@@ -827,10 +968,11 @@ We're here to help you succeed with Crawl4AI! Here's how to get support:
|
|||||||
|
|
||||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
- Building and running the Docker container
|
- Building and running the Docker container
|
||||||
- Configuring the environment
|
- Configuring the environment
|
||||||
- Using the interactive playground for testing
|
- Using the interactive playground for testing
|
||||||
- Making API requests with proper typing
|
- Making API requests with proper typing
|
||||||
- Using the Python SDK
|
- Using the Python SDK
|
||||||
|
- Asynchronous job queues with webhook notifications
|
||||||
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
|
||||||
- Connecting via the Model Context Protocol (MCP)
|
- Connecting via the Model Context Protocol (MCP)
|
||||||
- Monitoring your deployment
|
- Monitoring your deployment
|
||||||
|
|||||||
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
378
deploy/docker/WEBHOOK_EXAMPLES.md
Normal file
@@ -0,0 +1,378 @@
|
|||||||
|
# Webhook Feature Examples
|
||||||
|
|
||||||
|
This document provides examples of how to use the webhook feature for crawl jobs in Crawl4AI.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The webhook feature allows you to receive notifications when crawl jobs complete, eliminating the need for polling. Webhooks are sent with exponential backoff retry logic to ensure reliable delivery.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Global Configuration (config.yml)
|
||||||
|
|
||||||
|
You can configure default webhook settings in `config.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: null # Optional: default webhook URL for all jobs
|
||||||
|
data_in_payload: false # Optional: default behavior for including data
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000 # 30s timeout per webhook call
|
||||||
|
headers: # Optional: default headers to include
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Usage Examples
|
||||||
|
|
||||||
|
### Example 1: Basic Webhook (Notification Only)
|
||||||
|
|
||||||
|
Send a webhook notification without including the crawl data in the payload.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Your webhook handler should then fetch the results:
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/crawl/job/crawl_a1b2c3d4
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 2: Webhook with Data Included
|
||||||
|
|
||||||
|
Include the full crawl results in the webhook payload.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"markdown": "...",
|
||||||
|
"html": "...",
|
||||||
|
"links": {...},
|
||||||
|
"metadata": {...}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 3: Webhook with Custom Headers
|
||||||
|
|
||||||
|
Include custom headers for authentication or identification.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/crawl-complete",
|
||||||
|
"webhook_data_in_payload": false,
|
||||||
|
"webhook_headers": {
|
||||||
|
"X-Webhook-Secret": "my-secret-token",
|
||||||
|
"X-Service-ID": "crawl4ai-production"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
The webhook will be sent with these additional headers plus the default headers from config.
|
||||||
|
|
||||||
|
### Example 4: Failure Notification
|
||||||
|
|
||||||
|
When a crawl job fails, a webhook is sent with error details.
|
||||||
|
|
||||||
|
**Webhook Payload on Failure:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "crawl_a1b2c3d4",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout after 30s"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Example 5: Using Global Default Webhook
|
||||||
|
|
||||||
|
If you set a `default_url` in config.yml, jobs without webhook_config will use it:
|
||||||
|
|
||||||
|
**config.yml:**
|
||||||
|
```yaml
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: "https://myapp.com/webhooks/default"
|
||||||
|
data_in_payload: false
|
||||||
|
```
|
||||||
|
|
||||||
|
**Request (no webhook_config needed):**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/crawl/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"urls": ["https://example.com"]
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
The webhook will be sent to the default URL configured in config.yml.
|
||||||
|
|
||||||
|
### Example 6: LLM Extraction Job with Webhook
|
||||||
|
|
||||||
|
Use webhooks with the LLM extraction endpoint for asynchronous processing.
|
||||||
|
|
||||||
|
**Request:**
|
||||||
|
```bash
|
||||||
|
curl -X POST http://localhost:11235/llm/job \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"url": "https://example.com/article",
|
||||||
|
"q": "Extract the article title, author, and publication date",
|
||||||
|
"schema": "{\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}, \"author\": {\"type\": \"string\"}, \"date\": {\"type\": \"string\"}}}",
|
||||||
|
"cache": false,
|
||||||
|
"provider": "openai/gpt-4o-mini",
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
||||||
|
"webhook_data_in_payload": true
|
||||||
|
}
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432_12345"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Webhook Payload Received:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432_12345",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"data": {
|
||||||
|
"extracted_content": {
|
||||||
|
"title": "Understanding Web Scraping",
|
||||||
|
"author": "John Doe",
|
||||||
|
"date": "2025-10-21"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Webhook Handler Example
|
||||||
|
|
||||||
|
Here's a simple Python Flask webhook handler that supports both crawl and LLM extraction jobs:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
import requests
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route('/webhooks/crawl-complete', methods=['POST'])
|
||||||
|
def handle_crawl_webhook():
|
||||||
|
payload = request.json
|
||||||
|
|
||||||
|
task_id = payload['task_id']
|
||||||
|
task_type = payload['task_type']
|
||||||
|
status = payload['status']
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
# If data not in payload, fetch it
|
||||||
|
if 'data' not in payload:
|
||||||
|
# Determine endpoint based on task type
|
||||||
|
endpoint = 'crawl' if task_type == 'crawl' else 'llm'
|
||||||
|
response = requests.get(f'http://localhost:11235/{endpoint}/job/{task_id}')
|
||||||
|
data = response.json()
|
||||||
|
else:
|
||||||
|
data = payload['data']
|
||||||
|
|
||||||
|
# Process based on task type
|
||||||
|
if task_type == 'crawl':
|
||||||
|
print(f"Processing crawl results for {task_id}")
|
||||||
|
# Handle crawl results
|
||||||
|
results = data.get('results', [])
|
||||||
|
for result in results:
|
||||||
|
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
||||||
|
|
||||||
|
elif task_type == 'llm_extraction':
|
||||||
|
print(f"Processing LLM extraction for {task_id}")
|
||||||
|
# Handle LLM extraction
|
||||||
|
# Note: Webhook sends 'extracted_content', API returns 'result'
|
||||||
|
extracted = data.get('extracted_content', data.get('result', {}))
|
||||||
|
print(f" - Extracted: {extracted}")
|
||||||
|
|
||||||
|
# Your business logic here...
|
||||||
|
|
||||||
|
elif status == 'failed':
|
||||||
|
error = payload.get('error', 'Unknown error')
|
||||||
|
print(f"{task_type} job {task_id} failed: {error}")
|
||||||
|
# Handle failure...
|
||||||
|
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(port=8080)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Retry Logic
|
||||||
|
|
||||||
|
The webhook delivery service uses exponential backoff retry logic:
|
||||||
|
|
||||||
|
- **Attempts:** Up to 5 attempts by default
|
||||||
|
- **Delays:** 1s → 2s → 4s → 8s → 16s
|
||||||
|
- **Timeout:** 30 seconds per attempt
|
||||||
|
- **Retry Conditions:**
|
||||||
|
- Server errors (5xx status codes)
|
||||||
|
- Network errors
|
||||||
|
- Timeouts
|
||||||
|
- **No Retry:**
|
||||||
|
- Client errors (4xx status codes)
|
||||||
|
- Successful delivery (2xx status codes)
|
||||||
|
|
||||||
|
## Benefits
|
||||||
|
|
||||||
|
1. **No Polling Required** - Eliminates constant API calls to check job status
|
||||||
|
2. **Real-time Notifications** - Immediate notification when jobs complete
|
||||||
|
3. **Reliable Delivery** - Exponential backoff ensures webhooks are delivered
|
||||||
|
4. **Flexible** - Choose between notification-only or full data delivery
|
||||||
|
5. **Secure** - Support for custom headers for authentication
|
||||||
|
6. **Configurable** - Global defaults or per-job configuration
|
||||||
|
7. **Universal Support** - Works with both `/crawl/job` and `/llm/job` endpoints
|
||||||
|
|
||||||
|
## TypeScript Client Example
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
interface WebhookConfig {
|
||||||
|
webhook_url: string;
|
||||||
|
webhook_data_in_payload?: boolean;
|
||||||
|
webhook_headers?: Record<string, string>;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface CrawlJobRequest {
|
||||||
|
urls: string[];
|
||||||
|
browser_config?: Record<string, any>;
|
||||||
|
crawler_config?: Record<string, any>;
|
||||||
|
webhook_config?: WebhookConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface LLMJobRequest {
|
||||||
|
url: string;
|
||||||
|
q: string;
|
||||||
|
schema?: string;
|
||||||
|
cache?: boolean;
|
||||||
|
provider?: string;
|
||||||
|
webhook_config?: WebhookConfig;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function createCrawlJob(request: CrawlJobRequest) {
|
||||||
|
const response = await fetch('http://localhost:11235/crawl/job', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify(request)
|
||||||
|
});
|
||||||
|
|
||||||
|
const { task_id } = await response.json();
|
||||||
|
return task_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function createLLMJob(request: LLMJobRequest) {
|
||||||
|
const response = await fetch('http://localhost:11235/llm/job', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: { 'Content-Type': 'application/json' },
|
||||||
|
body: JSON.stringify(request)
|
||||||
|
});
|
||||||
|
|
||||||
|
const { task_id } = await response.json();
|
||||||
|
return task_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Usage - Crawl Job
|
||||||
|
const crawlTaskId = await createCrawlJob({
|
||||||
|
urls: ['https://example.com'],
|
||||||
|
webhook_config: {
|
||||||
|
webhook_url: 'https://myapp.com/webhooks/crawl-complete',
|
||||||
|
webhook_data_in_payload: false,
|
||||||
|
webhook_headers: {
|
||||||
|
'X-Webhook-Secret': 'my-secret'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Usage - LLM Extraction Job
|
||||||
|
const llmTaskId = await createLLMJob({
|
||||||
|
url: 'https://example.com/article',
|
||||||
|
q: 'Extract the main points from this article',
|
||||||
|
provider: 'openai/gpt-4o-mini',
|
||||||
|
webhook_config: {
|
||||||
|
webhook_url: 'https://myapp.com/webhooks/llm-complete',
|
||||||
|
webhook_data_in_payload: true,
|
||||||
|
webhook_headers: {
|
||||||
|
'X-Webhook-Secret': 'my-secret'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring and Debugging
|
||||||
|
|
||||||
|
Webhook delivery attempts are logged at INFO level:
|
||||||
|
- Successful deliveries
|
||||||
|
- Retry attempts with delays
|
||||||
|
- Final failures after max attempts
|
||||||
|
|
||||||
|
Check the application logs for webhook delivery status:
|
||||||
|
```bash
|
||||||
|
docker logs crawl4ai-container | grep -i webhook
|
||||||
|
```
|
||||||
@@ -44,6 +44,7 @@ from utils import (
|
|||||||
get_llm_api_key,
|
get_llm_api_key,
|
||||||
validate_llm_provider
|
validate_llm_provider
|
||||||
)
|
)
|
||||||
|
from webhook import WebhookDeliveryService
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
|
|
||||||
@@ -115,9 +116,13 @@ async def process_llm_extraction(
|
|||||||
instruction: str,
|
instruction: str,
|
||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
|
# Initialize webhook service
|
||||||
|
webhook_service = WebhookDeliveryService(config)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Validate provider
|
# Validate provider
|
||||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||||
@@ -126,6 +131,16 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": error_msg
|
"error": error_msg
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=error_msg
|
||||||
|
)
|
||||||
return
|
return
|
||||||
api_key = get_llm_api_key(config, provider)
|
api_key = get_llm_api_key(config, provider)
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
@@ -154,17 +169,40 @@ async def process_llm_extraction(
|
|||||||
"status": TaskStatus.FAILED,
|
"status": TaskStatus.FAILED,
|
||||||
"error": result.error_message
|
"error": result.error_message
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=result.error_message
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content = json.loads(result.extracted_content)
|
content = json.loads(result.extracted_content)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
content = result.extracted_content
|
content = result.extracted_content
|
||||||
|
|
||||||
|
result_data = {"extracted_content": content}
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(content)
|
"result": json.dumps(content)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on successful completion
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="completed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
result=result_data
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -172,6 +210,16 @@ async def process_llm_extraction(
|
|||||||
"error": str(e)
|
"error": str(e)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="llm_extraction",
|
||||||
|
status="failed",
|
||||||
|
urls=[url],
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=str(e)
|
||||||
|
)
|
||||||
|
|
||||||
async def handle_markdown_request(
|
async def handle_markdown_request(
|
||||||
url: str,
|
url: str,
|
||||||
filter_type: FilterType,
|
filter_type: FilterType,
|
||||||
@@ -248,7 +296,8 @@ async def handle_llm_request(
|
|||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Handle LLM extraction requests."""
|
"""Handle LLM extraction requests."""
|
||||||
base_url = get_base_url(request)
|
base_url = get_base_url(request)
|
||||||
@@ -279,7 +328,8 @@ async def handle_llm_request(
|
|||||||
cache,
|
cache,
|
||||||
base_url,
|
base_url,
|
||||||
config,
|
config,
|
||||||
provider
|
provider,
|
||||||
|
webhook_config
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -324,7 +374,8 @@ async def create_new_task(
|
|||||||
cache: str,
|
cache: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
webhook_config: Optional[Dict] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Create and initialize a new task."""
|
"""Create and initialize a new task."""
|
||||||
decoded_url = unquote(input_path)
|
decoded_url = unquote(input_path)
|
||||||
@@ -333,12 +384,18 @@ async def create_new_task(
|
|||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
|
||||||
|
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
task_data = {
|
||||||
"status": TaskStatus.PROCESSING,
|
"status": TaskStatus.PROCESSING,
|
||||||
"created_at": datetime.now().isoformat(),
|
"created_at": datetime.now().isoformat(),
|
||||||
"url": decoded_url
|
"url": decoded_url
|
||||||
})
|
}
|
||||||
|
|
||||||
|
# Store webhook config if provided
|
||||||
|
if webhook_config:
|
||||||
|
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||||
|
|
||||||
|
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||||
|
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
process_llm_extraction,
|
process_llm_extraction,
|
||||||
@@ -349,7 +406,8 @@ async def create_new_task(
|
|||||||
query,
|
query,
|
||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
provider
|
provider,
|
||||||
|
webhook_config
|
||||||
)
|
)
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
@@ -567,6 +625,7 @@ async def handle_crawl_job(
|
|||||||
browser_config: Dict,
|
browser_config: Dict,
|
||||||
crawler_config: Dict,
|
crawler_config: Dict,
|
||||||
config: Dict,
|
config: Dict,
|
||||||
|
webhook_config: Optional[Dict] = None,
|
||||||
) -> Dict:
|
) -> Dict:
|
||||||
"""
|
"""
|
||||||
Fire-and-forget version of handle_crawl_request.
|
Fire-and-forget version of handle_crawl_request.
|
||||||
@@ -574,13 +633,24 @@ async def handle_crawl_job(
|
|||||||
lets /crawl/job/{task_id} polling fetch the result.
|
lets /crawl/job/{task_id} polling fetch the result.
|
||||||
"""
|
"""
|
||||||
task_id = f"crawl_{uuid4().hex[:8]}"
|
task_id = f"crawl_{uuid4().hex[:8]}"
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
|
||||||
|
# Store task data in Redis
|
||||||
|
task_data = {
|
||||||
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent
|
||||||
"created_at": datetime.utcnow().isoformat(),
|
"created_at": datetime.utcnow().isoformat(),
|
||||||
"url": json.dumps(urls), # store list as JSON string
|
"url": json.dumps(urls), # store list as JSON string
|
||||||
"result": "",
|
"result": "",
|
||||||
"error": "",
|
"error": "",
|
||||||
})
|
}
|
||||||
|
|
||||||
|
# Store webhook config if provided
|
||||||
|
if webhook_config:
|
||||||
|
task_data["webhook_config"] = json.dumps(webhook_config)
|
||||||
|
|
||||||
|
await redis.hset(f"task:{task_id}", mapping=task_data)
|
||||||
|
|
||||||
|
# Initialize webhook service
|
||||||
|
webhook_service = WebhookDeliveryService(config)
|
||||||
|
|
||||||
async def _runner():
|
async def _runner():
|
||||||
try:
|
try:
|
||||||
@@ -594,6 +664,17 @@ async def handle_crawl_job(
|
|||||||
"status": TaskStatus.COMPLETED,
|
"status": TaskStatus.COMPLETED,
|
||||||
"result": json.dumps(result),
|
"result": json.dumps(result),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on successful completion
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="crawl",
|
||||||
|
status="completed",
|
||||||
|
urls=urls,
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
result=result
|
||||||
|
)
|
||||||
|
|
||||||
await asyncio.sleep(5) # Give Redis time to process the update
|
await asyncio.sleep(5) # Give Redis time to process the update
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
await redis.hset(f"task:{task_id}", mapping={
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
@@ -601,5 +682,15 @@ async def handle_crawl_job(
|
|||||||
"error": str(exc),
|
"error": str(exc),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Send webhook notification on failure
|
||||||
|
await webhook_service.notify_job_completion(
|
||||||
|
task_id=task_id,
|
||||||
|
task_type="crawl",
|
||||||
|
status="failed",
|
||||||
|
urls=urls,
|
||||||
|
webhook_config=webhook_config,
|
||||||
|
error=str(exc)
|
||||||
|
)
|
||||||
|
|
||||||
background_tasks.add_task(_runner)
|
background_tasks.add_task(_runner)
|
||||||
return {"task_id": task_id}
|
return {"task_id": task_id}
|
||||||
@@ -88,4 +88,17 @@ observability:
|
|||||||
enabled: True
|
enabled: True
|
||||||
endpoint: "/metrics"
|
endpoint: "/metrics"
|
||||||
health_check:
|
health_check:
|
||||||
endpoint: "/health"
|
endpoint: "/health"
|
||||||
|
|
||||||
|
# Webhook Configuration
|
||||||
|
webhooks:
|
||||||
|
enabled: true
|
||||||
|
default_url: null # Optional: default webhook URL for all jobs
|
||||||
|
data_in_payload: false # Optional: default behavior for including data
|
||||||
|
retry:
|
||||||
|
max_attempts: 5
|
||||||
|
initial_delay_ms: 1000 # 1s, 2s, 4s, 8s, 16s exponential backoff
|
||||||
|
max_delay_ms: 32000
|
||||||
|
timeout_ms: 30000 # 30s timeout per webhook call
|
||||||
|
headers: # Optional: default headers to include
|
||||||
|
User-Agent: "Crawl4AI-Webhook/1.0"
|
||||||
@@ -12,6 +12,7 @@ from api import (
|
|||||||
handle_crawl_job,
|
handle_crawl_job,
|
||||||
handle_task_status,
|
handle_task_status,
|
||||||
)
|
)
|
||||||
|
from schemas import WebhookConfig
|
||||||
|
|
||||||
# ------------- dependency placeholders -------------
|
# ------------- dependency placeholders -------------
|
||||||
_redis = None # will be injected from server.py
|
_redis = None # will be injected from server.py
|
||||||
@@ -37,12 +38,14 @@ class LlmJobPayload(BaseModel):
|
|||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None
|
||||||
|
webhook_config: Optional[WebhookConfig] = None
|
||||||
|
|
||||||
|
|
||||||
class CrawlJobPayload(BaseModel):
|
class CrawlJobPayload(BaseModel):
|
||||||
urls: list[HttpUrl]
|
urls: list[HttpUrl]
|
||||||
browser_config: Dict = {}
|
browser_config: Dict = {}
|
||||||
crawler_config: Dict = {}
|
crawler_config: Dict = {}
|
||||||
|
webhook_config: Optional[WebhookConfig] = None
|
||||||
|
|
||||||
|
|
||||||
# ---------- LLM job ---------------------------------------------------------
|
# ---------- LLM job ---------------------------------------------------------
|
||||||
@@ -53,6 +56,10 @@ async def llm_job_enqueue(
|
|||||||
request: Request,
|
request: Request,
|
||||||
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
_td: Dict = Depends(lambda: _token_dep()), # late-bound dep
|
||||||
):
|
):
|
||||||
|
webhook_config = None
|
||||||
|
if payload.webhook_config:
|
||||||
|
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||||
|
|
||||||
return await handle_llm_request(
|
return await handle_llm_request(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -63,6 +70,7 @@ async def llm_job_enqueue(
|
|||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
provider=payload.provider,
|
provider=payload.provider,
|
||||||
|
webhook_config=webhook_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -72,7 +80,7 @@ async def llm_job_status(
|
|||||||
task_id: str,
|
task_id: str,
|
||||||
_td: Dict = Depends(lambda: _token_dep())
|
_td: Dict = Depends(lambda: _token_dep())
|
||||||
):
|
):
|
||||||
return await handle_task_status(_redis, task_id)
|
return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
|
||||||
|
|
||||||
|
|
||||||
# ---------- CRAWL job -------------------------------------------------------
|
# ---------- CRAWL job -------------------------------------------------------
|
||||||
@@ -82,6 +90,10 @@ async def crawl_job_enqueue(
|
|||||||
background_tasks: BackgroundTasks,
|
background_tasks: BackgroundTasks,
|
||||||
_td: Dict = Depends(lambda: _token_dep()),
|
_td: Dict = Depends(lambda: _token_dep()),
|
||||||
):
|
):
|
||||||
|
webhook_config = None
|
||||||
|
if payload.webhook_config:
|
||||||
|
webhook_config = payload.webhook_config.model_dump(mode='json')
|
||||||
|
|
||||||
return await handle_crawl_job(
|
return await handle_crawl_job(
|
||||||
_redis,
|
_redis,
|
||||||
background_tasks,
|
background_tasks,
|
||||||
@@ -89,6 +101,7 @@ async def crawl_job_enqueue(
|
|||||||
payload.browser_config,
|
payload.browser_config,
|
||||||
payload.crawler_config,
|
payload.crawler_config,
|
||||||
config=_config,
|
config=_config,
|
||||||
|
webhook_config=webhook_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ pydantic>=2.11
|
|||||||
rank-bm25==0.2.2
|
rank-bm25==0.2.2
|
||||||
anyio==4.9.0
|
anyio==4.9.0
|
||||||
PyJWT==2.10.1
|
PyJWT==2.10.1
|
||||||
mcp>=1.6.0
|
mcp>=1.18.0
|
||||||
websockets>=15.0.1
|
websockets>=15.0.1
|
||||||
httpx[http2]>=0.27.2
|
httpx[http2]>=0.27.2
|
||||||
sentry-sdk>=2.0.0
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
from typing import List, Optional, Dict
|
from typing import List, Optional, Dict
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, HttpUrl
|
||||||
from utils import FilterType
|
from utils import FilterType
|
||||||
|
|
||||||
|
|
||||||
@@ -39,4 +39,22 @@ class JSEndpointRequest(BaseModel):
|
|||||||
scripts: List[str] = Field(
|
scripts: List[str] = Field(
|
||||||
...,
|
...,
|
||||||
description="List of separated JavaScript snippets to execute"
|
description="List of separated JavaScript snippets to execute"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookConfig(BaseModel):
|
||||||
|
"""Configuration for webhook notifications."""
|
||||||
|
webhook_url: HttpUrl
|
||||||
|
webhook_data_in_payload: bool = False
|
||||||
|
webhook_headers: Optional[Dict[str, str]] = None
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookPayload(BaseModel):
|
||||||
|
"""Payload sent to webhook endpoints."""
|
||||||
|
task_id: str
|
||||||
|
task_type: str # "crawl", "llm_extraction", etc.
|
||||||
|
status: str # "completed" or "failed"
|
||||||
|
timestamp: str # ISO 8601 format
|
||||||
|
urls: List[str]
|
||||||
|
error: Optional[str] = None
|
||||||
|
data: Optional[Dict] = None # Included only if webhook_data_in_payload=True
|
||||||
@@ -74,32 +74,6 @@ setup_logging(config)
|
|||||||
|
|
||||||
__version__ = "0.5.1-d1"
|
__version__ = "0.5.1-d1"
|
||||||
|
|
||||||
# ───────────────────── telemetry setup ────────────────────────
|
|
||||||
# Docker/API server telemetry: enabled by default unless CRAWL4AI_TELEMETRY=0
|
|
||||||
import os as _os
|
|
||||||
if _os.environ.get('CRAWL4AI_TELEMETRY') != '0':
|
|
||||||
# Set environment variable to indicate we're in API server mode
|
|
||||||
_os.environ['CRAWL4AI_API_SERVER'] = 'true'
|
|
||||||
|
|
||||||
# Import and enable telemetry for Docker/API environment
|
|
||||||
from crawl4ai.telemetry import enable as enable_telemetry
|
|
||||||
from crawl4ai.telemetry import capture_exception
|
|
||||||
|
|
||||||
# Enable telemetry automatically in Docker mode
|
|
||||||
enable_telemetry(always=True)
|
|
||||||
|
|
||||||
import logging
|
|
||||||
telemetry_logger = logging.getLogger("telemetry")
|
|
||||||
telemetry_logger.info("✅ Telemetry enabled for Docker/API server")
|
|
||||||
else:
|
|
||||||
# Define no-op for capture_exception if telemetry is disabled
|
|
||||||
def capture_exception(exc, context=None):
|
|
||||||
pass
|
|
||||||
|
|
||||||
import logging
|
|
||||||
telemetry_logger = logging.getLogger("telemetry")
|
|
||||||
telemetry_logger.info("❌ Telemetry disabled via CRAWL4AI_TELEMETRY=0")
|
|
||||||
|
|
||||||
# ── global page semaphore (hard cap) ─────────────────────────
|
# ── global page semaphore (hard cap) ─────────────────────────
|
||||||
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
|
MAX_PAGES = config["crawler"]["pool"].get("max_pages", 30)
|
||||||
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
|
GLOBAL_SEM = asyncio.Semaphore(MAX_PAGES)
|
||||||
|
|||||||
159
deploy/docker/webhook.py
Normal file
159
deploy/docker/webhook.py
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
"""
|
||||||
|
Webhook delivery service for Crawl4AI.
|
||||||
|
|
||||||
|
This module provides webhook notification functionality with exponential backoff retry logic.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
import logging
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class WebhookDeliveryService:
|
||||||
|
"""Handles webhook delivery with exponential backoff retry logic."""
|
||||||
|
|
||||||
|
def __init__(self, config: Dict):
|
||||||
|
"""
|
||||||
|
Initialize the webhook delivery service.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Application configuration dictionary containing webhook settings
|
||||||
|
"""
|
||||||
|
self.config = config.get("webhooks", {})
|
||||||
|
self.max_attempts = self.config.get("retry", {}).get("max_attempts", 5)
|
||||||
|
self.initial_delay = self.config.get("retry", {}).get("initial_delay_ms", 1000) / 1000
|
||||||
|
self.max_delay = self.config.get("retry", {}).get("max_delay_ms", 32000) / 1000
|
||||||
|
self.timeout = self.config.get("retry", {}).get("timeout_ms", 30000) / 1000
|
||||||
|
|
||||||
|
async def send_webhook(
|
||||||
|
self,
|
||||||
|
webhook_url: str,
|
||||||
|
payload: Dict,
|
||||||
|
headers: Optional[Dict[str, str]] = None
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Send webhook with exponential backoff retry logic.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
webhook_url: The URL to send the webhook to
|
||||||
|
payload: The JSON payload to send
|
||||||
|
headers: Optional custom headers
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if delivered successfully, False otherwise
|
||||||
|
"""
|
||||||
|
default_headers = self.config.get("headers", {})
|
||||||
|
merged_headers = {**default_headers, **(headers or {})}
|
||||||
|
merged_headers["Content-Type"] = "application/json"
|
||||||
|
|
||||||
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
||||||
|
for attempt in range(self.max_attempts):
|
||||||
|
try:
|
||||||
|
logger.info(
|
||||||
|
f"Sending webhook (attempt {attempt + 1}/{self.max_attempts}) to {webhook_url}"
|
||||||
|
)
|
||||||
|
|
||||||
|
response = await client.post(
|
||||||
|
webhook_url,
|
||||||
|
json=payload,
|
||||||
|
headers=merged_headers
|
||||||
|
)
|
||||||
|
|
||||||
|
# Success or client error (don't retry client errors)
|
||||||
|
if response.status_code < 500:
|
||||||
|
if 200 <= response.status_code < 300:
|
||||||
|
logger.info(f"Webhook delivered successfully to {webhook_url}")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"Webhook rejected with status {response.status_code}: {response.text[:200]}"
|
||||||
|
)
|
||||||
|
return False # Client error - don't retry
|
||||||
|
|
||||||
|
# Server error - retry with backoff
|
||||||
|
logger.warning(
|
||||||
|
f"Webhook failed with status {response.status_code}, will retry"
|
||||||
|
)
|
||||||
|
|
||||||
|
except httpx.TimeoutException as exc:
|
||||||
|
logger.error(f"Webhook timeout (attempt {attempt + 1}): {exc}")
|
||||||
|
except httpx.RequestError as exc:
|
||||||
|
logger.error(f"Webhook request error (attempt {attempt + 1}): {exc}")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.error(f"Webhook delivery error (attempt {attempt + 1}): {exc}")
|
||||||
|
|
||||||
|
# Calculate exponential backoff delay
|
||||||
|
if attempt < self.max_attempts - 1:
|
||||||
|
delay = min(self.initial_delay * (2 ** attempt), self.max_delay)
|
||||||
|
logger.info(f"Retrying in {delay}s...")
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
|
||||||
|
logger.error(
|
||||||
|
f"Webhook delivery failed after {self.max_attempts} attempts to {webhook_url}"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def notify_job_completion(
|
||||||
|
self,
|
||||||
|
task_id: str,
|
||||||
|
task_type: str,
|
||||||
|
status: str,
|
||||||
|
urls: list,
|
||||||
|
webhook_config: Optional[Dict],
|
||||||
|
result: Optional[Dict] = None,
|
||||||
|
error: Optional[str] = None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Notify webhook of job completion.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task_id: The task identifier
|
||||||
|
task_type: Type of task (e.g., "crawl", "llm_extraction")
|
||||||
|
status: Task status ("completed" or "failed")
|
||||||
|
urls: List of URLs that were crawled
|
||||||
|
webhook_config: Webhook configuration from the job request
|
||||||
|
result: Optional crawl result data
|
||||||
|
error: Optional error message if failed
|
||||||
|
"""
|
||||||
|
# Determine webhook URL
|
||||||
|
webhook_url = None
|
||||||
|
data_in_payload = self.config.get("data_in_payload", False)
|
||||||
|
custom_headers = None
|
||||||
|
|
||||||
|
if webhook_config:
|
||||||
|
webhook_url = webhook_config.get("webhook_url")
|
||||||
|
data_in_payload = webhook_config.get("webhook_data_in_payload", data_in_payload)
|
||||||
|
custom_headers = webhook_config.get("webhook_headers")
|
||||||
|
|
||||||
|
if not webhook_url:
|
||||||
|
webhook_url = self.config.get("default_url")
|
||||||
|
|
||||||
|
if not webhook_url:
|
||||||
|
logger.debug("No webhook URL configured, skipping notification")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Check if webhooks are enabled
|
||||||
|
if not self.config.get("enabled", True):
|
||||||
|
logger.debug("Webhooks are disabled, skipping notification")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Build payload
|
||||||
|
payload = {
|
||||||
|
"task_id": task_id,
|
||||||
|
"task_type": task_type,
|
||||||
|
"status": status,
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"urls": urls
|
||||||
|
}
|
||||||
|
|
||||||
|
if error:
|
||||||
|
payload["error"] = error
|
||||||
|
|
||||||
|
if data_in_payload and result:
|
||||||
|
payload["data"] = result
|
||||||
|
|
||||||
|
# Send webhook (fire and forget - don't block on completion)
|
||||||
|
await self.send_webhook(webhook_url, payload, custom_headers)
|
||||||
461
docs/examples/docker_webhook_example.py
Normal file
461
docs/examples/docker_webhook_example.py
Normal file
@@ -0,0 +1,461 @@
|
|||||||
|
"""
|
||||||
|
Docker Webhook Example for Crawl4AI
|
||||||
|
|
||||||
|
This example demonstrates how to use webhooks with the Crawl4AI job queue API.
|
||||||
|
Instead of polling for results, webhooks notify your application when jobs complete.
|
||||||
|
|
||||||
|
Supports both:
|
||||||
|
- /crawl/job - Raw crawling with markdown extraction
|
||||||
|
- /llm/job - LLM-powered content extraction
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
1. Crawl4AI Docker container running on localhost:11234
|
||||||
|
2. Flask installed: pip install flask requests
|
||||||
|
3. LLM API key configured in .llm.env (for LLM extraction examples)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
1. Run this script: python docker_webhook_example.py
|
||||||
|
2. The webhook server will start on http://localhost:8080
|
||||||
|
3. Jobs will be submitted and webhooks will be received automatically
|
||||||
|
"""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CRAWL4AI_BASE_URL = "http://localhost:11234"
|
||||||
|
WEBHOOK_BASE_URL = "http://localhost:8080" # Your webhook receiver URL
|
||||||
|
|
||||||
|
# Initialize Flask app for webhook receiver
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
# Store received webhook data for demonstration
|
||||||
|
received_webhooks = []
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/webhooks/crawl-complete', methods=['POST'])
|
||||||
|
def handle_crawl_webhook():
|
||||||
|
"""
|
||||||
|
Webhook handler that receives notifications when crawl jobs complete.
|
||||||
|
|
||||||
|
Payload structure:
|
||||||
|
{
|
||||||
|
"task_id": "crawl_abc123",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed" or "failed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "error message" (only if failed),
|
||||||
|
"data": {...} (only if webhook_data_in_payload=True)
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
payload = request.json
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"📬 Webhook received for task: {payload['task_id']}")
|
||||||
|
print(f" Status: {payload['status']}")
|
||||||
|
print(f" Timestamp: {payload['timestamp']}")
|
||||||
|
print(f" URLs: {payload['urls']}")
|
||||||
|
|
||||||
|
if payload['status'] == 'completed':
|
||||||
|
# If data is in payload, process it directly
|
||||||
|
if 'data' in payload:
|
||||||
|
print(f" ✅ Data included in webhook")
|
||||||
|
data = payload['data']
|
||||||
|
# Process the crawl results here
|
||||||
|
for result in data.get('results', []):
|
||||||
|
print(f" - Crawled: {result.get('url')}")
|
||||||
|
print(f" - Markdown length: {len(result.get('markdown', ''))}")
|
||||||
|
else:
|
||||||
|
# Fetch results from API if not included
|
||||||
|
print(f" 📥 Fetching results from API...")
|
||||||
|
task_id = payload['task_id']
|
||||||
|
result_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
|
||||||
|
if result_response.ok:
|
||||||
|
data = result_response.json()
|
||||||
|
print(f" ✅ Results fetched successfully")
|
||||||
|
# Process the crawl results here
|
||||||
|
for result in data['result'].get('results', []):
|
||||||
|
print(f" - Crawled: {result.get('url')}")
|
||||||
|
print(f" - Markdown length: {len(result.get('markdown', ''))}")
|
||||||
|
|
||||||
|
elif payload['status'] == 'failed':
|
||||||
|
print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")
|
||||||
|
|
||||||
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
# Store webhook for demonstration
|
||||||
|
received_webhooks.append(payload)
|
||||||
|
|
||||||
|
# Return 200 OK to acknowledge receipt
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/webhooks/llm-complete', methods=['POST'])
|
||||||
|
def handle_llm_webhook():
|
||||||
|
"""
|
||||||
|
Webhook handler that receives notifications when LLM extraction jobs complete.
|
||||||
|
|
||||||
|
Payload structure:
|
||||||
|
{
|
||||||
|
"task_id": "llm_1698765432_12345",
|
||||||
|
"task_type": "llm_extraction",
|
||||||
|
"status": "completed" or "failed",
|
||||||
|
"timestamp": "2025-10-21T10:30:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com/article"],
|
||||||
|
"error": "error message" (only if failed),
|
||||||
|
"data": {"extracted_content": {...}} (only if webhook_data_in_payload=True)
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
payload = request.json
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print(f"🤖 LLM Webhook received for task: {payload['task_id']}")
|
||||||
|
print(f" Task Type: {payload['task_type']}")
|
||||||
|
print(f" Status: {payload['status']}")
|
||||||
|
print(f" Timestamp: {payload['timestamp']}")
|
||||||
|
print(f" URL: {payload['urls'][0]}")
|
||||||
|
|
||||||
|
if payload['status'] == 'completed':
|
||||||
|
# If data is in payload, process it directly
|
||||||
|
if 'data' in payload:
|
||||||
|
print(f" ✅ Data included in webhook")
|
||||||
|
data = payload['data']
|
||||||
|
# Webhook wraps extracted content in 'extracted_content' field
|
||||||
|
extracted = data.get('extracted_content', {})
|
||||||
|
print(f" - Extracted content:")
|
||||||
|
print(f" {json.dumps(extracted, indent=8)}")
|
||||||
|
else:
|
||||||
|
# Fetch results from API if not included
|
||||||
|
print(f" 📥 Fetching results from API...")
|
||||||
|
task_id = payload['task_id']
|
||||||
|
result_response = requests.get(f"{CRAWL4AI_BASE_URL}/llm/job/{task_id}")
|
||||||
|
if result_response.ok:
|
||||||
|
data = result_response.json()
|
||||||
|
print(f" ✅ Results fetched successfully")
|
||||||
|
# API returns unwrapped content in 'result' field
|
||||||
|
extracted = data['result']
|
||||||
|
print(f" - Extracted content:")
|
||||||
|
print(f" {json.dumps(extracted, indent=8)}")
|
||||||
|
|
||||||
|
elif payload['status'] == 'failed':
|
||||||
|
print(f" ❌ Job failed: {payload.get('error', 'Unknown error')}")
|
||||||
|
|
||||||
|
print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
# Store webhook for demonstration
|
||||||
|
received_webhooks.append(payload)
|
||||||
|
|
||||||
|
# Return 200 OK to acknowledge receipt
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
|
||||||
|
def start_webhook_server():
|
||||||
|
"""Start the Flask webhook server in a separate thread"""
|
||||||
|
app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
|
||||||
|
def submit_crawl_job_with_webhook(urls, webhook_url, include_data=False):
|
||||||
|
"""
|
||||||
|
Submit a crawl job with webhook notification.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: List of URLs to crawl
|
||||||
|
webhook_url: URL to receive webhook notifications
|
||||||
|
include_data: Whether to include full results in webhook payload
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
task_id: The job's task identifier
|
||||||
|
"""
|
||||||
|
payload = {
|
||||||
|
"urls": urls,
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": webhook_url,
|
||||||
|
"webhook_data_in_payload": include_data,
|
||||||
|
# Optional: Add custom headers for authentication
|
||||||
|
# "webhook_headers": {
|
||||||
|
# "X-Webhook-Secret": "your-secret-token"
|
||||||
|
# }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"\n🚀 Submitting crawl job...")
|
||||||
|
print(f" URLs: {urls}")
|
||||||
|
print(f" Webhook: {webhook_url}")
|
||||||
|
print(f" Include data: {include_data}")
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{CRAWL4AI_BASE_URL}/crawl/job",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
data = response.json()
|
||||||
|
task_id = data['task_id']
|
||||||
|
print(f" ✅ Job submitted successfully")
|
||||||
|
print(f" Task ID: {task_id}")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f" ❌ Failed to submit job: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def submit_llm_job_with_webhook(url, query, webhook_url, include_data=False, schema=None, provider=None):
|
||||||
|
"""
|
||||||
|
Submit an LLM extraction job with webhook notification.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: URL to extract content from
|
||||||
|
query: Instruction for the LLM (e.g., "Extract article title and author")
|
||||||
|
webhook_url: URL to receive webhook notifications
|
||||||
|
include_data: Whether to include full results in webhook payload
|
||||||
|
schema: Optional JSON schema for structured extraction
|
||||||
|
provider: Optional LLM provider (e.g., "openai/gpt-4o-mini")
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
task_id: The job's task identifier
|
||||||
|
"""
|
||||||
|
payload = {
|
||||||
|
"url": url,
|
||||||
|
"q": query,
|
||||||
|
"cache": False,
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": webhook_url,
|
||||||
|
"webhook_data_in_payload": include_data,
|
||||||
|
# Optional: Add custom headers for authentication
|
||||||
|
# "webhook_headers": {
|
||||||
|
# "X-Webhook-Secret": "your-secret-token"
|
||||||
|
# }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if schema:
|
||||||
|
payload["schema"] = schema
|
||||||
|
|
||||||
|
if provider:
|
||||||
|
payload["provider"] = provider
|
||||||
|
|
||||||
|
print(f"\n🤖 Submitting LLM extraction job...")
|
||||||
|
print(f" URL: {url}")
|
||||||
|
print(f" Query: {query}")
|
||||||
|
print(f" Webhook: {webhook_url}")
|
||||||
|
print(f" Include data: {include_data}")
|
||||||
|
if provider:
|
||||||
|
print(f" Provider: {provider}")
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{CRAWL4AI_BASE_URL}/llm/job",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
data = response.json()
|
||||||
|
task_id = data['task_id']
|
||||||
|
print(f" ✅ Job submitted successfully")
|
||||||
|
print(f" Task ID: {task_id}")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f" ❌ Failed to submit job: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def submit_job_without_webhook(urls):
|
||||||
|
"""
|
||||||
|
Submit a job without webhook (traditional polling approach).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
urls: List of URLs to crawl
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
task_id: The job's task identifier
|
||||||
|
"""
|
||||||
|
payload = {
|
||||||
|
"urls": urls,
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"}
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"\n🚀 Submitting crawl job (without webhook)...")
|
||||||
|
print(f" URLs: {urls}")
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{CRAWL4AI_BASE_URL}/crawl/job",
|
||||||
|
json=payload
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
data = response.json()
|
||||||
|
task_id = data['task_id']
|
||||||
|
print(f" ✅ Job submitted successfully")
|
||||||
|
print(f" Task ID: {task_id}")
|
||||||
|
return task_id
|
||||||
|
else:
|
||||||
|
print(f" ❌ Failed to submit job: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def poll_job_status(task_id, timeout=60):
|
||||||
|
"""
|
||||||
|
Poll for job status (used when webhook is not configured).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task_id: The job's task identifier
|
||||||
|
timeout: Maximum time to wait in seconds
|
||||||
|
"""
|
||||||
|
print(f"\n⏳ Polling for job status...")
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
while time.time() - start_time < timeout:
|
||||||
|
response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
|
||||||
|
|
||||||
|
if response.ok:
|
||||||
|
data = response.json()
|
||||||
|
status = data.get('status', 'unknown')
|
||||||
|
|
||||||
|
if status == 'completed':
|
||||||
|
print(f" ✅ Job completed!")
|
||||||
|
return data
|
||||||
|
elif status == 'failed':
|
||||||
|
print(f" ❌ Job failed: {data.get('error', 'Unknown error')}")
|
||||||
|
return data
|
||||||
|
else:
|
||||||
|
print(f" ⏳ Status: {status}, waiting...")
|
||||||
|
time.sleep(2)
|
||||||
|
else:
|
||||||
|
print(f" ❌ Failed to get status: {response.text}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
print(f" ⏰ Timeout reached")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Run the webhook demonstration"""
|
||||||
|
|
||||||
|
# Check if Crawl4AI is running
|
||||||
|
try:
|
||||||
|
health = requests.get(f"{CRAWL4AI_BASE_URL}/health", timeout=5)
|
||||||
|
print(f"✅ Crawl4AI is running: {health.json()}")
|
||||||
|
except:
|
||||||
|
print(f"❌ Cannot connect to Crawl4AI at {CRAWL4AI_BASE_URL}")
|
||||||
|
print(" Please make sure Docker container is running:")
|
||||||
|
print(" docker run -d -p 11234:11234 --name crawl4ai unclecode/crawl4ai:latest")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Start webhook server in background thread
|
||||||
|
print(f"\n🌐 Starting webhook server at {WEBHOOK_BASE_URL}...")
|
||||||
|
webhook_thread = Thread(target=start_webhook_server, daemon=True)
|
||||||
|
webhook_thread.start()
|
||||||
|
time.sleep(2) # Give server time to start
|
||||||
|
|
||||||
|
# Example 1: Job with webhook (notification only, fetch data separately)
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Example 1: Webhook Notification Only")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
task_id_1 = submit_crawl_job_with_webhook(
|
||||||
|
urls=["https://example.com"],
|
||||||
|
webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
|
||||||
|
include_data=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Example 2: Job with webhook (data included in payload)
|
||||||
|
time.sleep(5) # Wait a bit between requests
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Example 2: Webhook with Full Data")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
task_id_2 = submit_crawl_job_with_webhook(
|
||||||
|
urls=["https://www.python.org"],
|
||||||
|
webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/crawl-complete",
|
||||||
|
include_data=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Example 3: LLM extraction with webhook (notification only)
|
||||||
|
time.sleep(5) # Wait a bit between requests
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Example 3: LLM Extraction with Webhook (Notification Only)")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
task_id_3 = submit_llm_job_with_webhook(
|
||||||
|
url="https://www.example.com",
|
||||||
|
query="Extract the main heading and description from this page.",
|
||||||
|
webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
|
||||||
|
include_data=False,
|
||||||
|
provider="openai/gpt-4o-mini"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Example 4: LLM extraction with webhook (data included + schema)
|
||||||
|
time.sleep(5) # Wait a bit between requests
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Example 4: LLM Extraction with Schema and Full Data")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
|
||||||
|
# Define a schema for structured extraction
|
||||||
|
schema = json.dumps({
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": {"type": "string", "description": "Page title"},
|
||||||
|
"description": {"type": "string", "description": "Page description"}
|
||||||
|
},
|
||||||
|
"required": ["title"]
|
||||||
|
})
|
||||||
|
|
||||||
|
task_id_4 = submit_llm_job_with_webhook(
|
||||||
|
url="https://www.python.org",
|
||||||
|
query="Extract the title and description of this website",
|
||||||
|
webhook_url=f"{WEBHOOK_BASE_URL}/webhooks/llm-complete",
|
||||||
|
include_data=True,
|
||||||
|
schema=schema,
|
||||||
|
provider="openai/gpt-4o-mini"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Example 5: Traditional polling (no webhook)
|
||||||
|
time.sleep(5) # Wait a bit between requests
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Example 5: Traditional Polling (No Webhook)")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
task_id_5 = submit_job_without_webhook(
|
||||||
|
urls=["https://github.com"]
|
||||||
|
)
|
||||||
|
if task_id_5:
|
||||||
|
result = poll_job_status(task_id_5)
|
||||||
|
if result and result.get('status') == 'completed':
|
||||||
|
print(f" ✅ Results retrieved via polling")
|
||||||
|
|
||||||
|
# Wait for webhooks to arrive
|
||||||
|
print(f"\n⏳ Waiting for webhooks to be received...")
|
||||||
|
time.sleep(30) # Give jobs time to complete and webhooks to arrive (longer for LLM)
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print(f"\n{'='*60}")
|
||||||
|
print("Summary")
|
||||||
|
print(f"{'='*60}")
|
||||||
|
print(f"Total webhooks received: {len(received_webhooks)}")
|
||||||
|
|
||||||
|
crawl_webhooks = [w for w in received_webhooks if w['task_type'] == 'crawl']
|
||||||
|
llm_webhooks = [w for w in received_webhooks if w['task_type'] == 'llm_extraction']
|
||||||
|
|
||||||
|
print(f"\n📊 Breakdown:")
|
||||||
|
print(f" - Crawl webhooks: {len(crawl_webhooks)}")
|
||||||
|
print(f" - LLM extraction webhooks: {len(llm_webhooks)}")
|
||||||
|
|
||||||
|
print(f"\n📋 Details:")
|
||||||
|
for i, webhook in enumerate(received_webhooks, 1):
|
||||||
|
task_type = webhook['task_type']
|
||||||
|
icon = "🕷️" if task_type == "crawl" else "🤖"
|
||||||
|
print(f"{i}. {icon} Task {webhook['task_id']}: {webhook['status']} ({task_type})")
|
||||||
|
|
||||||
|
print(f"\n✅ Demo completed!")
|
||||||
|
print(f"\n💡 Pro tips:")
|
||||||
|
print(f" - In production, your webhook URL should be publicly accessible")
|
||||||
|
print(f" (e.g., https://myapp.com/webhooks) or use ngrok for testing")
|
||||||
|
print(f" - Both /crawl/job and /llm/job support the same webhook configuration")
|
||||||
|
print(f" - Use webhook_data_in_payload=true to get results directly in the webhook")
|
||||||
|
print(f" - LLM jobs may take longer, adjust timeouts accordingly")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -1,242 +0,0 @@
|
|||||||
# Telemetry
|
|
||||||
|
|
||||||
Crawl4AI includes **opt-in telemetry** to help improve stability by capturing anonymous crash reports. No personal data or crawled content is ever collected.
|
|
||||||
|
|
||||||
!!! info "Privacy First"
|
|
||||||
Telemetry is completely optional and respects your privacy. Only exception information is collected - no URLs, no personal data, no crawled content.
|
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
- **Privacy-first**: Only exceptions and crashes are reported
|
|
||||||
- **Opt-in by default**: You control when telemetry is enabled (except in Docker where it's on by default)
|
|
||||||
- **No PII**: No URLs, request data, or personal information is collected
|
|
||||||
- **Provider-agnostic**: Currently uses Sentry, but designed to support multiple backends
|
|
||||||
|
|
||||||
## Installation
|
|
||||||
|
|
||||||
Telemetry requires the optional Sentry SDK:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install with telemetry support
|
|
||||||
pip install crawl4ai[telemetry]
|
|
||||||
|
|
||||||
# Or install Sentry SDK separately
|
|
||||||
pip install sentry-sdk>=2.0.0
|
|
||||||
```
|
|
||||||
|
|
||||||
## Environments
|
|
||||||
|
|
||||||
### 1. Python Library & CLI
|
|
||||||
|
|
||||||
On first exception, you'll see an interactive prompt:
|
|
||||||
|
|
||||||
```
|
|
||||||
🚨 Crawl4AI Error Detection
|
|
||||||
==============================================================
|
|
||||||
We noticed an error occurred. Help improve Crawl4AI by
|
|
||||||
sending anonymous crash reports?
|
|
||||||
|
|
||||||
[1] Yes, send this error only
|
|
||||||
[2] Yes, always send errors
|
|
||||||
[3] No, don't send
|
|
||||||
|
|
||||||
Your choice (1/2/3):
|
|
||||||
```
|
|
||||||
|
|
||||||
Control via CLI:
|
|
||||||
```bash
|
|
||||||
# Enable telemetry
|
|
||||||
crwl telemetry enable
|
|
||||||
crwl telemetry enable --email you@example.com
|
|
||||||
|
|
||||||
# Disable telemetry
|
|
||||||
crwl telemetry disable
|
|
||||||
|
|
||||||
# Check status
|
|
||||||
crwl telemetry status
|
|
||||||
```
|
|
||||||
|
|
||||||
### 2. Docker / API Server
|
|
||||||
|
|
||||||
!!! warning "Default Enabled in Docker"
|
|
||||||
Telemetry is **enabled by default** in Docker environments to help identify container-specific issues. This is different from the CLI where it's opt-in.
|
|
||||||
|
|
||||||
To disable:
|
|
||||||
```bash
|
|
||||||
# Via environment variable
|
|
||||||
docker run -e CRAWL4AI_TELEMETRY=0 ...
|
|
||||||
|
|
||||||
# In docker-compose.yml
|
|
||||||
environment:
|
|
||||||
- CRAWL4AI_TELEMETRY=0
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Jupyter / Google Colab
|
|
||||||
|
|
||||||
In notebooks, you'll see an interactive widget (if available) or a code snippet:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import crawl4ai
|
|
||||||
|
|
||||||
# Enable telemetry
|
|
||||||
crawl4ai.telemetry.enable(email="you@example.com", always=True)
|
|
||||||
|
|
||||||
# Send only next error
|
|
||||||
crawl4ai.telemetry.enable(once=True)
|
|
||||||
|
|
||||||
# Disable telemetry
|
|
||||||
crawl4ai.telemetry.disable()
|
|
||||||
|
|
||||||
# Check status
|
|
||||||
crawl4ai.telemetry.status()
|
|
||||||
```
|
|
||||||
|
|
||||||
## Python API
|
|
||||||
|
|
||||||
### Basic Usage
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import telemetry
|
|
||||||
|
|
||||||
# Enable/disable telemetry
|
|
||||||
telemetry.enable(email="optional@email.com", always=True)
|
|
||||||
telemetry.disable()
|
|
||||||
|
|
||||||
# Check current status
|
|
||||||
status = telemetry.status()
|
|
||||||
print(f"Telemetry enabled: {status['enabled']}")
|
|
||||||
print(f"Consent: {status['consent']}")
|
|
||||||
```
|
|
||||||
|
|
||||||
### Manual Exception Capture
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.telemetry import capture_exception
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Your code here
|
|
||||||
risky_operation()
|
|
||||||
except Exception as e:
|
|
||||||
# Manually capture exception with context
|
|
||||||
capture_exception(e, {
|
|
||||||
'operation': 'custom_crawler',
|
|
||||||
'url': 'https://example.com' # Will be sanitized
|
|
||||||
})
|
|
||||||
raise
|
|
||||||
```
|
|
||||||
|
|
||||||
### Decorator Pattern
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.telemetry import telemetry_decorator
|
|
||||||
|
|
||||||
@telemetry_decorator
|
|
||||||
def my_crawler_function():
|
|
||||||
# Exceptions will be automatically captured
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
### Context Manager
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.telemetry import telemetry_context
|
|
||||||
|
|
||||||
with telemetry_context("data_extraction"):
|
|
||||||
# Any exceptions in this block will be captured
|
|
||||||
result = extract_data(html)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Configuration
|
|
||||||
|
|
||||||
Settings are stored in `~/.crawl4ai/config.json`:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"telemetry": {
|
|
||||||
"consent": "always",
|
|
||||||
"email": "user@example.com"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Consent levels:
|
|
||||||
- `"not_set"` - No decision made yet
|
|
||||||
- `"denied"` - Telemetry disabled
|
|
||||||
- `"once"` - Send current error only
|
|
||||||
- `"always"` - Always send errors
|
|
||||||
|
|
||||||
## Environment Variables
|
|
||||||
|
|
||||||
- `CRAWL4AI_TELEMETRY=0` - Disable telemetry (overrides config)
|
|
||||||
- `CRAWL4AI_TELEMETRY_EMAIL=email@example.com` - Set email for follow-up
|
|
||||||
- `CRAWL4AI_SENTRY_DSN=https://...` - Override default DSN (for maintainers)
|
|
||||||
|
|
||||||
## What's Collected
|
|
||||||
|
|
||||||
### Collected ✅
|
|
||||||
- Exception type and traceback
|
|
||||||
- Crawl4AI version
|
|
||||||
- Python version
|
|
||||||
- Operating system
|
|
||||||
- Environment type (CLI, Docker, Jupyter)
|
|
||||||
- Optional email (if provided)
|
|
||||||
|
|
||||||
### NOT Collected ❌
|
|
||||||
- URLs being crawled
|
|
||||||
- HTML content
|
|
||||||
- Request/response data
|
|
||||||
- Cookies or authentication tokens
|
|
||||||
- IP addresses
|
|
||||||
- Any personally identifiable information
|
|
||||||
|
|
||||||
## Provider Architecture
|
|
||||||
|
|
||||||
Telemetry is designed to be provider-agnostic:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai.telemetry.base import TelemetryProvider
|
|
||||||
|
|
||||||
class CustomProvider(TelemetryProvider):
|
|
||||||
def send_exception(self, exc, context=None):
|
|
||||||
# Your implementation
|
|
||||||
pass
|
|
||||||
```
|
|
||||||
|
|
||||||
## FAQ
|
|
||||||
|
|
||||||
### Q: Can I completely disable telemetry?
|
|
||||||
A: Yes! Use `crwl telemetry disable` or set `CRAWL4AI_TELEMETRY=0`
|
|
||||||
|
|
||||||
### Q: Is telemetry required?
|
|
||||||
A: No, it's completely optional (except enabled by default in Docker)
|
|
||||||
|
|
||||||
### Q: What if I don't install sentry-sdk?
|
|
||||||
A: Telemetry will gracefully degrade to a no-op state
|
|
||||||
|
|
||||||
### Q: Can I see what's being sent?
|
|
||||||
A: Yes, check the source code in `crawl4ai/telemetry/`
|
|
||||||
|
|
||||||
### Q: How do I remove my email?
|
|
||||||
A: Delete `~/.crawl4ai/config.json` or edit it to remove the email field
|
|
||||||
|
|
||||||
## Privacy Commitment
|
|
||||||
|
|
||||||
1. **Transparency**: All telemetry code is open source
|
|
||||||
2. **Control**: You can enable/disable at any time
|
|
||||||
3. **Minimal**: Only crash data, no user content
|
|
||||||
4. **Secure**: Data transmitted over HTTPS to Sentry
|
|
||||||
5. **Anonymous**: No tracking or user identification
|
|
||||||
|
|
||||||
## Contributing
|
|
||||||
|
|
||||||
Help improve telemetry:
|
|
||||||
- Report issues with telemetry itself
|
|
||||||
- Suggest privacy improvements
|
|
||||||
- Add new provider backends
|
|
||||||
|
|
||||||
## Support
|
|
||||||
|
|
||||||
If you have concerns about telemetry:
|
|
||||||
- Open an issue on GitHub
|
|
||||||
- Email the maintainers
|
|
||||||
- Review the code in `crawl4ai/telemetry/`
|
|
||||||
@@ -35,7 +35,6 @@ nav:
|
|||||||
- "Page Interaction": "core/page-interaction.md"
|
- "Page Interaction": "core/page-interaction.md"
|
||||||
- "Content Selection": "core/content-selection.md"
|
- "Content Selection": "core/content-selection.md"
|
||||||
- "Cache Modes": "core/cache-modes.md"
|
- "Cache Modes": "core/cache-modes.md"
|
||||||
- "Telemetry": "core/telemetry.md"
|
|
||||||
- "Local Files & Raw HTML": "core/local-files.md"
|
- "Local Files & Raw HTML": "core/local-files.md"
|
||||||
- "Link & Media": "core/link-media.md"
|
- "Link & Media": "core/link-media.md"
|
||||||
- Advanced:
|
- Advanced:
|
||||||
|
|||||||
@@ -64,7 +64,6 @@ torch = ["torch", "nltk", "scikit-learn"]
|
|||||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||||
sync = ["selenium"]
|
sync = ["selenium"]
|
||||||
telemetry = ["sentry-sdk>=2.0.0", "ipywidgets>=8.0.0"]
|
|
||||||
all = [
|
all = [
|
||||||
"PyPDF2",
|
"PyPDF2",
|
||||||
"torch",
|
"torch",
|
||||||
@@ -73,9 +72,7 @@ all = [
|
|||||||
"transformers",
|
"transformers",
|
||||||
"tokenizers",
|
"tokenizers",
|
||||||
"sentence-transformers",
|
"sentence-transformers",
|
||||||
"selenium",
|
"selenium"
|
||||||
"sentry-sdk>=2.0.0",
|
|
||||||
"ipywidgets>=8.0.0"
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|||||||
16
pytest.ini
16
pytest.ini
@@ -1,16 +0,0 @@
|
|||||||
[pytest]
|
|
||||||
testpaths = tests
|
|
||||||
python_paths = .
|
|
||||||
addopts = --maxfail=1 --disable-warnings -q --tb=short -v
|
|
||||||
asyncio_mode = auto
|
|
||||||
markers =
|
|
||||||
slow: marks tests as slow (deselect with '-m "not slow"')
|
|
||||||
integration: marks tests as integration tests
|
|
||||||
unit: marks tests as unit tests
|
|
||||||
privacy: marks tests related to privacy compliance
|
|
||||||
performance: marks tests related to performance
|
|
||||||
filterwarnings =
|
|
||||||
ignore::DeprecationWarning
|
|
||||||
ignore::PendingDeprecationWarning
|
|
||||||
env =
|
|
||||||
CRAWL4AI_TEST_MODE=1
|
|
||||||
401
test_llm_webhook_feature.py
Normal file
401
test_llm_webhook_feature.py
Normal file
@@ -0,0 +1,401 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script to validate webhook implementation for /llm/job endpoint.
|
||||||
|
|
||||||
|
This tests that the /llm/job endpoint now supports webhooks
|
||||||
|
following the same pattern as /crawl/job.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Add deploy/docker to path
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||||
|
|
||||||
|
def test_llm_job_payload_model():
|
||||||
|
"""Test that LlmJobPayload includes webhook_config field"""
|
||||||
|
print("=" * 60)
|
||||||
|
print("TEST 1: LlmJobPayload Model")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from job import LlmJobPayload
|
||||||
|
from schemas import WebhookConfig
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
# Test with webhook_config
|
||||||
|
payload_dict = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"q": "Extract main content",
|
||||||
|
"schema": None,
|
||||||
|
"cache": False,
|
||||||
|
"provider": None,
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "https://myapp.com/webhook",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {"X-Secret": "token"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
payload = LlmJobPayload(**payload_dict)
|
||||||
|
|
||||||
|
print(f"✅ LlmJobPayload accepts webhook_config")
|
||||||
|
print(f" - URL: {payload.url}")
|
||||||
|
print(f" - Query: {payload.q}")
|
||||||
|
print(f" - Webhook URL: {payload.webhook_config.webhook_url}")
|
||||||
|
print(f" - Data in payload: {payload.webhook_config.webhook_data_in_payload}")
|
||||||
|
|
||||||
|
# Test without webhook_config (should be optional)
|
||||||
|
minimal_payload = {
|
||||||
|
"url": "https://example.com",
|
||||||
|
"q": "Extract content"
|
||||||
|
}
|
||||||
|
|
||||||
|
payload2 = LlmJobPayload(**minimal_payload)
|
||||||
|
assert payload2.webhook_config is None, "webhook_config should be optional"
|
||||||
|
print(f"✅ LlmJobPayload works without webhook_config (optional)")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_handle_llm_request_signature():
|
||||||
|
"""Test that handle_llm_request accepts webhook_config parameter"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 2: handle_llm_request Function Signature")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from api import handle_llm_request
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
sig = inspect.signature(handle_llm_request)
|
||||||
|
params = list(sig.parameters.keys())
|
||||||
|
|
||||||
|
print(f"Function parameters: {params}")
|
||||||
|
|
||||||
|
if 'webhook_config' in params:
|
||||||
|
print(f"✅ handle_llm_request has webhook_config parameter")
|
||||||
|
|
||||||
|
# Check that it's optional with default None
|
||||||
|
webhook_param = sig.parameters['webhook_config']
|
||||||
|
if webhook_param.default is None or webhook_param.default == inspect.Parameter.empty:
|
||||||
|
print(f"✅ webhook_config is optional (default: {webhook_param.default})")
|
||||||
|
else:
|
||||||
|
print(f"⚠️ webhook_config default is: {webhook_param.default}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ handle_llm_request missing webhook_config parameter")
|
||||||
|
return False
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_process_llm_extraction_signature():
|
||||||
|
"""Test that process_llm_extraction accepts webhook_config parameter"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 3: process_llm_extraction Function Signature")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from api import process_llm_extraction
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
sig = inspect.signature(process_llm_extraction)
|
||||||
|
params = list(sig.parameters.keys())
|
||||||
|
|
||||||
|
print(f"Function parameters: {params}")
|
||||||
|
|
||||||
|
if 'webhook_config' in params:
|
||||||
|
print(f"✅ process_llm_extraction has webhook_config parameter")
|
||||||
|
|
||||||
|
webhook_param = sig.parameters['webhook_config']
|
||||||
|
if webhook_param.default is None or webhook_param.default == inspect.Parameter.empty:
|
||||||
|
print(f"✅ webhook_config is optional (default: {webhook_param.default})")
|
||||||
|
else:
|
||||||
|
print(f"⚠️ webhook_config default is: {webhook_param.default}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"❌ process_llm_extraction missing webhook_config parameter")
|
||||||
|
return False
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_webhook_integration_in_api():
|
||||||
|
"""Test that api.py properly integrates webhook notifications"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 4: Webhook Integration in process_llm_extraction")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
|
||||||
|
|
||||||
|
with open(api_file, 'r') as f:
|
||||||
|
api_content = f.read()
|
||||||
|
|
||||||
|
# Check for WebhookDeliveryService initialization
|
||||||
|
if 'webhook_service = WebhookDeliveryService(config)' in api_content:
|
||||||
|
print("✅ process_llm_extraction initializes WebhookDeliveryService")
|
||||||
|
else:
|
||||||
|
print("❌ Missing WebhookDeliveryService initialization in process_llm_extraction")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check for notify_job_completion calls with llm_extraction
|
||||||
|
if 'task_type="llm_extraction"' in api_content:
|
||||||
|
print("✅ Uses correct task_type='llm_extraction' for notifications")
|
||||||
|
else:
|
||||||
|
print("❌ Missing task_type='llm_extraction' in webhook notifications")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Count webhook notification calls (should have at least 3: success + 2 failure paths)
|
||||||
|
notification_count = api_content.count('await webhook_service.notify_job_completion')
|
||||||
|
# Find only in process_llm_extraction function
|
||||||
|
llm_func_start = api_content.find('async def process_llm_extraction')
|
||||||
|
llm_func_end = api_content.find('\nasync def ', llm_func_start + 1)
|
||||||
|
if llm_func_end == -1:
|
||||||
|
llm_func_end = len(api_content)
|
||||||
|
|
||||||
|
llm_func_content = api_content[llm_func_start:llm_func_end]
|
||||||
|
llm_notification_count = llm_func_content.count('await webhook_service.notify_job_completion')
|
||||||
|
|
||||||
|
print(f"✅ Found {llm_notification_count} webhook notification calls in process_llm_extraction")
|
||||||
|
|
||||||
|
if llm_notification_count >= 3:
|
||||||
|
print(f"✅ Sufficient notification points (success + failure paths)")
|
||||||
|
else:
|
||||||
|
print(f"⚠️ Expected at least 3 notification calls, found {llm_notification_count}")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_job_endpoint_integration():
|
||||||
|
"""Test that /llm/job endpoint extracts and passes webhook_config"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 5: /llm/job Endpoint Integration")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
job_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'job.py')
|
||||||
|
|
||||||
|
with open(job_file, 'r') as f:
|
||||||
|
job_content = f.read()
|
||||||
|
|
||||||
|
# Find the llm_job_enqueue function
|
||||||
|
llm_job_start = job_content.find('async def llm_job_enqueue')
|
||||||
|
llm_job_end = job_content.find('\n\n@router', llm_job_start + 1)
|
||||||
|
if llm_job_end == -1:
|
||||||
|
llm_job_end = job_content.find('\n\nasync def', llm_job_start + 1)
|
||||||
|
|
||||||
|
llm_job_func = job_content[llm_job_start:llm_job_end]
|
||||||
|
|
||||||
|
# Check for webhook_config extraction
|
||||||
|
if 'webhook_config = None' in llm_job_func:
|
||||||
|
print("✅ llm_job_enqueue initializes webhook_config variable")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config initialization")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'if payload.webhook_config:' in llm_job_func:
|
||||||
|
print("✅ llm_job_enqueue checks for payload.webhook_config")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config check")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'webhook_config = payload.webhook_config.model_dump(mode=\'json\')' in llm_job_func:
|
||||||
|
print("✅ llm_job_enqueue converts webhook_config to dict")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config.model_dump conversion")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'webhook_config=webhook_config' in llm_job_func:
|
||||||
|
print("✅ llm_job_enqueue passes webhook_config to handle_llm_request")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config parameter in handle_llm_request call")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_create_new_task_integration():
|
||||||
|
"""Test that create_new_task stores webhook_config in Redis"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 6: create_new_task Webhook Storage")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
|
||||||
|
|
||||||
|
with open(api_file, 'r') as f:
|
||||||
|
api_content = f.read()
|
||||||
|
|
||||||
|
# Find create_new_task function
|
||||||
|
create_task_start = api_content.find('async def create_new_task')
|
||||||
|
create_task_end = api_content.find('\nasync def ', create_task_start + 1)
|
||||||
|
if create_task_end == -1:
|
||||||
|
create_task_end = len(api_content)
|
||||||
|
|
||||||
|
create_task_func = api_content[create_task_start:create_task_end]
|
||||||
|
|
||||||
|
# Check for webhook_config storage
|
||||||
|
if 'if webhook_config:' in create_task_func:
|
||||||
|
print("✅ create_new_task checks for webhook_config")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config check in create_new_task")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'task_data["webhook_config"] = json.dumps(webhook_config)' in create_task_func:
|
||||||
|
print("✅ create_new_task stores webhook_config in Redis task data")
|
||||||
|
else:
|
||||||
|
print("❌ Missing webhook_config storage in task_data")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check that webhook_config is passed to process_llm_extraction
|
||||||
|
if 'webhook_config' in create_task_func and 'background_tasks.add_task' in create_task_func:
|
||||||
|
print("✅ create_new_task passes webhook_config to background task")
|
||||||
|
else:
|
||||||
|
print("⚠️ Could not verify webhook_config passed to background task")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_pattern_consistency():
|
||||||
|
"""Test that /llm/job follows the same pattern as /crawl/job"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 7: Pattern Consistency with /crawl/job")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
api_file = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
|
||||||
|
|
||||||
|
with open(api_file, 'r') as f:
|
||||||
|
api_content = f.read()
|
||||||
|
|
||||||
|
# Find handle_crawl_job to compare pattern
|
||||||
|
crawl_job_start = api_content.find('async def handle_crawl_job')
|
||||||
|
crawl_job_end = api_content.find('\nasync def ', crawl_job_start + 1)
|
||||||
|
if crawl_job_end == -1:
|
||||||
|
crawl_job_end = len(api_content)
|
||||||
|
crawl_job_func = api_content[crawl_job_start:crawl_job_end]
|
||||||
|
|
||||||
|
# Find process_llm_extraction
|
||||||
|
llm_extract_start = api_content.find('async def process_llm_extraction')
|
||||||
|
llm_extract_end = api_content.find('\nasync def ', llm_extract_start + 1)
|
||||||
|
if llm_extract_end == -1:
|
||||||
|
llm_extract_end = len(api_content)
|
||||||
|
llm_extract_func = api_content[llm_extract_start:llm_extract_end]
|
||||||
|
|
||||||
|
print("Checking pattern consistency...")
|
||||||
|
|
||||||
|
# Both should initialize WebhookDeliveryService
|
||||||
|
crawl_has_service = 'webhook_service = WebhookDeliveryService(config)' in crawl_job_func
|
||||||
|
llm_has_service = 'webhook_service = WebhookDeliveryService(config)' in llm_extract_func
|
||||||
|
|
||||||
|
if crawl_has_service and llm_has_service:
|
||||||
|
print("✅ Both initialize WebhookDeliveryService")
|
||||||
|
else:
|
||||||
|
print(f"❌ Service initialization mismatch (crawl: {crawl_has_service}, llm: {llm_has_service})")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Both should call notify_job_completion on success
|
||||||
|
crawl_notifies_success = 'status="completed"' in crawl_job_func and 'notify_job_completion' in crawl_job_func
|
||||||
|
llm_notifies_success = 'status="completed"' in llm_extract_func and 'notify_job_completion' in llm_extract_func
|
||||||
|
|
||||||
|
if crawl_notifies_success and llm_notifies_success:
|
||||||
|
print("✅ Both notify on success")
|
||||||
|
else:
|
||||||
|
print(f"❌ Success notification mismatch (crawl: {crawl_notifies_success}, llm: {llm_notifies_success})")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Both should call notify_job_completion on failure
|
||||||
|
crawl_notifies_failure = 'status="failed"' in crawl_job_func and 'error=' in crawl_job_func
|
||||||
|
llm_notifies_failure = 'status="failed"' in llm_extract_func and 'error=' in llm_extract_func
|
||||||
|
|
||||||
|
if crawl_notifies_failure and llm_notifies_failure:
|
||||||
|
print("✅ Both notify on failure")
|
||||||
|
else:
|
||||||
|
print(f"❌ Failure notification mismatch (crawl: {crawl_notifies_failure}, llm: {llm_notifies_failure})")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("✅ /llm/job follows the same pattern as /crawl/job")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Run all tests"""
|
||||||
|
print("\n🧪 LLM Job Webhook Feature Validation")
|
||||||
|
print("=" * 60)
|
||||||
|
print("Testing that /llm/job now supports webhooks like /crawl/job")
|
||||||
|
print("=" * 60 + "\n")
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Run all tests
|
||||||
|
results.append(("LlmJobPayload Model", test_llm_job_payload_model()))
|
||||||
|
results.append(("handle_llm_request Signature", test_handle_llm_request_signature()))
|
||||||
|
results.append(("process_llm_extraction Signature", test_process_llm_extraction_signature()))
|
||||||
|
results.append(("Webhook Integration", test_webhook_integration_in_api()))
|
||||||
|
results.append(("/llm/job Endpoint", test_job_endpoint_integration()))
|
||||||
|
results.append(("create_new_task Storage", test_create_new_task_integration()))
|
||||||
|
results.append(("Pattern Consistency", test_pattern_consistency()))
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST SUMMARY")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
passed = sum(1 for _, result in results if result)
|
||||||
|
total = len(results)
|
||||||
|
|
||||||
|
for test_name, result in results:
|
||||||
|
status = "✅ PASS" if result else "❌ FAIL"
|
||||||
|
print(f"{status} - {test_name}")
|
||||||
|
|
||||||
|
print(f"\n{'=' * 60}")
|
||||||
|
print(f"Results: {passed}/{total} tests passed")
|
||||||
|
print(f"{'=' * 60}")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
print("\n🎉 All tests passed! /llm/job webhook feature is correctly implemented.")
|
||||||
|
print("\n📝 Summary of changes:")
|
||||||
|
print(" 1. LlmJobPayload model includes webhook_config field")
|
||||||
|
print(" 2. /llm/job endpoint extracts and passes webhook_config")
|
||||||
|
print(" 3. handle_llm_request accepts webhook_config parameter")
|
||||||
|
print(" 4. create_new_task stores webhook_config in Redis")
|
||||||
|
print(" 5. process_llm_extraction sends webhook notifications")
|
||||||
|
print(" 6. Follows the same pattern as /crawl/job")
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
exit(main())
|
||||||
307
test_webhook_implementation.py
Normal file
307
test_webhook_implementation.py
Normal file
@@ -0,0 +1,307 @@
|
|||||||
|
"""
|
||||||
|
Simple test script to validate webhook implementation without running full server.
|
||||||
|
|
||||||
|
This script tests:
|
||||||
|
1. Webhook module imports and syntax
|
||||||
|
2. WebhookDeliveryService initialization
|
||||||
|
3. Payload construction logic
|
||||||
|
4. Configuration parsing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
# Add deploy/docker to path to import modules
|
||||||
|
# sys.path.insert(0, '/home/user/crawl4ai/deploy/docker')
|
||||||
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'deploy', 'docker'))
|
||||||
|
|
||||||
|
def test_imports():
|
||||||
|
"""Test that all webhook-related modules can be imported"""
|
||||||
|
print("=" * 60)
|
||||||
|
print("TEST 1: Module Imports")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from webhook import WebhookDeliveryService
|
||||||
|
print("✅ webhook.WebhookDeliveryService imported successfully")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed to import webhook module: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from schemas import WebhookConfig, WebhookPayload
|
||||||
|
print("✅ schemas.WebhookConfig imported successfully")
|
||||||
|
print("✅ schemas.WebhookPayload imported successfully")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Failed to import schemas: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
def test_webhook_service_init():
|
||||||
|
"""Test WebhookDeliveryService initialization"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 2: WebhookDeliveryService Initialization")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from webhook import WebhookDeliveryService
|
||||||
|
|
||||||
|
# Test with default config
|
||||||
|
config = {
|
||||||
|
"webhooks": {
|
||||||
|
"enabled": True,
|
||||||
|
"default_url": None,
|
||||||
|
"data_in_payload": False,
|
||||||
|
"retry": {
|
||||||
|
"max_attempts": 5,
|
||||||
|
"initial_delay_ms": 1000,
|
||||||
|
"max_delay_ms": 32000,
|
||||||
|
"timeout_ms": 30000
|
||||||
|
},
|
||||||
|
"headers": {
|
||||||
|
"User-Agent": "Crawl4AI-Webhook/1.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
service = WebhookDeliveryService(config)
|
||||||
|
|
||||||
|
print(f"✅ Service initialized successfully")
|
||||||
|
print(f" - Max attempts: {service.max_attempts}")
|
||||||
|
print(f" - Initial delay: {service.initial_delay}s")
|
||||||
|
print(f" - Max delay: {service.max_delay}s")
|
||||||
|
print(f" - Timeout: {service.timeout}s")
|
||||||
|
|
||||||
|
# Verify calculations
|
||||||
|
assert service.max_attempts == 5, "Max attempts should be 5"
|
||||||
|
assert service.initial_delay == 1.0, "Initial delay should be 1.0s"
|
||||||
|
assert service.max_delay == 32.0, "Max delay should be 32.0s"
|
||||||
|
assert service.timeout == 30.0, "Timeout should be 30.0s"
|
||||||
|
|
||||||
|
print("✅ All configuration values correct")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Service initialization failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_webhook_config_model():
|
||||||
|
"""Test WebhookConfig Pydantic model"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 3: WebhookConfig Model Validation")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from schemas import WebhookConfig
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
# Test valid config
|
||||||
|
valid_config = {
|
||||||
|
"webhook_url": "https://example.com/webhook",
|
||||||
|
"webhook_data_in_payload": True,
|
||||||
|
"webhook_headers": {"X-Secret": "token123"}
|
||||||
|
}
|
||||||
|
|
||||||
|
config = WebhookConfig(**valid_config)
|
||||||
|
print(f"✅ Valid config accepted:")
|
||||||
|
print(f" - URL: {config.webhook_url}")
|
||||||
|
print(f" - Data in payload: {config.webhook_data_in_payload}")
|
||||||
|
print(f" - Headers: {config.webhook_headers}")
|
||||||
|
|
||||||
|
# Test minimal config
|
||||||
|
minimal_config = {
|
||||||
|
"webhook_url": "https://example.com/webhook"
|
||||||
|
}
|
||||||
|
|
||||||
|
config2 = WebhookConfig(**minimal_config)
|
||||||
|
print(f"✅ Minimal config accepted (defaults applied):")
|
||||||
|
print(f" - URL: {config2.webhook_url}")
|
||||||
|
print(f" - Data in payload: {config2.webhook_data_in_payload}")
|
||||||
|
print(f" - Headers: {config2.webhook_headers}")
|
||||||
|
|
||||||
|
# Test invalid URL
|
||||||
|
try:
|
||||||
|
invalid_config = {
|
||||||
|
"webhook_url": "not-a-url"
|
||||||
|
}
|
||||||
|
config3 = WebhookConfig(**invalid_config)
|
||||||
|
print(f"❌ Invalid URL should have been rejected")
|
||||||
|
return False
|
||||||
|
except ValidationError as e:
|
||||||
|
print(f"✅ Invalid URL correctly rejected")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Model validation test failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_payload_construction():
|
||||||
|
"""Test webhook payload construction logic"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 4: Payload Construction")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Simulate payload construction from notify_job_completion
|
||||||
|
task_id = "crawl_abc123"
|
||||||
|
task_type = "crawl"
|
||||||
|
status = "completed"
|
||||||
|
urls = ["https://example.com"]
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"task_id": task_id,
|
||||||
|
"task_type": task_type,
|
||||||
|
"status": status,
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"urls": urls
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"✅ Basic payload constructed:")
|
||||||
|
print(json.dumps(payload, indent=2))
|
||||||
|
|
||||||
|
# Test with error
|
||||||
|
error_payload = {
|
||||||
|
"task_id": "crawl_xyz789",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "failed",
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"error": "Connection timeout"
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"\n✅ Error payload constructed:")
|
||||||
|
print(json.dumps(error_payload, indent=2))
|
||||||
|
|
||||||
|
# Test with data
|
||||||
|
data_payload = {
|
||||||
|
"task_id": "crawl_def456",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": {
|
||||||
|
"results": [
|
||||||
|
{"url": "https://example.com", "markdown": "# Example"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
print(f"\n✅ Data payload constructed:")
|
||||||
|
print(json.dumps(data_payload, indent=2))
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Payload construction failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_exponential_backoff():
|
||||||
|
"""Test exponential backoff calculation"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 5: Exponential Backoff Calculation")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
initial_delay = 1.0 # 1 second
|
||||||
|
max_delay = 32.0 # 32 seconds
|
||||||
|
|
||||||
|
print("Backoff delays for 5 attempts:")
|
||||||
|
for attempt in range(5):
|
||||||
|
delay = min(initial_delay * (2 ** attempt), max_delay)
|
||||||
|
print(f" Attempt {attempt + 1}: {delay}s")
|
||||||
|
|
||||||
|
# Verify the sequence: 1s, 2s, 4s, 8s, 16s
|
||||||
|
expected = [1.0, 2.0, 4.0, 8.0, 16.0]
|
||||||
|
actual = [min(initial_delay * (2 ** i), max_delay) for i in range(5)]
|
||||||
|
|
||||||
|
assert actual == expected, f"Expected {expected}, got {actual}"
|
||||||
|
print("✅ Exponential backoff sequence correct")
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Backoff calculation failed: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def test_api_integration():
|
||||||
|
"""Test that api.py imports webhook module correctly"""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST 6: API Integration")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Check if api.py can import webhook module
|
||||||
|
api_path = os.path.join(os.path.dirname(__file__), 'deploy', 'docker', 'api.py')
|
||||||
|
with open(api_path, 'r') as f:
|
||||||
|
api_content = f.read()
|
||||||
|
|
||||||
|
if 'from webhook import WebhookDeliveryService' in api_content:
|
||||||
|
print("✅ api.py imports WebhookDeliveryService")
|
||||||
|
else:
|
||||||
|
print("❌ api.py missing webhook import")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'WebhookDeliveryService(config)' in api_content:
|
||||||
|
print("✅ api.py initializes WebhookDeliveryService")
|
||||||
|
else:
|
||||||
|
print("❌ api.py doesn't initialize WebhookDeliveryService")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if 'notify_job_completion' in api_content:
|
||||||
|
print("✅ api.py calls notify_job_completion")
|
||||||
|
else:
|
||||||
|
print("❌ api.py doesn't call notify_job_completion")
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ API integration check failed: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Run all tests"""
|
||||||
|
print("\n🧪 Webhook Implementation Validation Tests")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
results.append(("Module Imports", test_imports()))
|
||||||
|
results.append(("Service Initialization", test_webhook_service_init()))
|
||||||
|
results.append(("Config Model", test_webhook_config_model()))
|
||||||
|
results.append(("Payload Construction", test_payload_construction()))
|
||||||
|
results.append(("Exponential Backoff", test_exponential_backoff()))
|
||||||
|
results.append(("API Integration", test_api_integration()))
|
||||||
|
|
||||||
|
# Print summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("TEST SUMMARY")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
passed = sum(1 for _, result in results if result)
|
||||||
|
total = len(results)
|
||||||
|
|
||||||
|
for test_name, result in results:
|
||||||
|
status = "✅ PASS" if result else "❌ FAIL"
|
||||||
|
print(f"{status} - {test_name}")
|
||||||
|
|
||||||
|
print(f"\n{'=' * 60}")
|
||||||
|
print(f"Results: {passed}/{total} tests passed")
|
||||||
|
print(f"{'=' * 60}")
|
||||||
|
|
||||||
|
if passed == total:
|
||||||
|
print("\n🎉 All tests passed! Webhook implementation is valid.")
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
print(f"\n⚠️ {total - passed} test(s) failed. Please review the output above.")
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
exit(main())
|
||||||
251
tests/WEBHOOK_TEST_README.md
Normal file
251
tests/WEBHOOK_TEST_README.md
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
# Webhook Feature Test Script
|
||||||
|
|
||||||
|
This directory contains a comprehensive test script for the webhook feature implementation.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The `test_webhook_feature.sh` script automates the entire process of testing the webhook feature:
|
||||||
|
|
||||||
|
1. ✅ Fetches and switches to the webhook feature branch
|
||||||
|
2. ✅ Activates the virtual environment
|
||||||
|
3. ✅ Installs all required dependencies
|
||||||
|
4. ✅ Starts Redis server in background
|
||||||
|
5. ✅ Starts Crawl4AI server in background
|
||||||
|
6. ✅ Runs webhook integration test
|
||||||
|
7. ✅ Verifies job completion via webhook
|
||||||
|
8. ✅ Cleans up and returns to original branch
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- Python 3.10+
|
||||||
|
- Virtual environment already created (`venv/` in project root)
|
||||||
|
- Git repository with the webhook feature branch
|
||||||
|
- `redis-server` (script will attempt to install if missing)
|
||||||
|
- `curl` and `lsof` commands available
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
From the project root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./tests/test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
Or from the tests directory:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd tests
|
||||||
|
./test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### What the Script Does
|
||||||
|
|
||||||
|
#### Step 1: Branch Management
|
||||||
|
- Saves your current branch
|
||||||
|
- Fetches the webhook feature branch from remote
|
||||||
|
- Switches to the webhook feature branch
|
||||||
|
|
||||||
|
#### Step 2: Environment Setup
|
||||||
|
- Activates your existing virtual environment
|
||||||
|
- Installs dependencies from `deploy/docker/requirements.txt`
|
||||||
|
- Installs Flask for the webhook receiver
|
||||||
|
|
||||||
|
#### Step 3: Service Startup
|
||||||
|
- Starts Redis server on port 6379
|
||||||
|
- Starts Crawl4AI server on port 11235
|
||||||
|
- Waits for server health check to pass
|
||||||
|
|
||||||
|
#### Step 4: Webhook Test
|
||||||
|
- Creates a webhook receiver on port 8080
|
||||||
|
- Submits a crawl job for `https://example.com` with webhook config
|
||||||
|
- Waits for webhook notification (60s timeout)
|
||||||
|
- Verifies webhook payload contains expected data
|
||||||
|
|
||||||
|
#### Step 5: Cleanup
|
||||||
|
- Stops webhook receiver
|
||||||
|
- Stops Crawl4AI server
|
||||||
|
- Stops Redis server
|
||||||
|
- Returns to your original branch
|
||||||
|
|
||||||
|
## Expected Output
|
||||||
|
|
||||||
|
```
|
||||||
|
[INFO] Starting webhook feature test script
|
||||||
|
[INFO] Project root: /path/to/crawl4ai
|
||||||
|
[INFO] Step 1: Fetching PR branch...
|
||||||
|
[INFO] Current branch: develop
|
||||||
|
[SUCCESS] Branch fetched
|
||||||
|
[INFO] Step 2: Switching to branch: claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp
|
||||||
|
[SUCCESS] Switched to webhook feature branch
|
||||||
|
[INFO] Step 3: Activating virtual environment...
|
||||||
|
[SUCCESS] Virtual environment activated
|
||||||
|
[INFO] Step 4: Installing server dependencies...
|
||||||
|
[SUCCESS] Dependencies installed
|
||||||
|
[INFO] Step 5a: Starting Redis...
|
||||||
|
[SUCCESS] Redis started (PID: 12345)
|
||||||
|
[INFO] Step 5b: Starting server on port 11235...
|
||||||
|
[INFO] Server started (PID: 12346)
|
||||||
|
[INFO] Waiting for server to be ready...
|
||||||
|
[SUCCESS] Server is ready!
|
||||||
|
[INFO] Step 6: Creating webhook test script...
|
||||||
|
[INFO] Running webhook test...
|
||||||
|
|
||||||
|
🚀 Submitting crawl job with webhook...
|
||||||
|
✅ Job submitted successfully, task_id: crawl_abc123
|
||||||
|
⏳ Waiting for webhook notification...
|
||||||
|
|
||||||
|
✅ Webhook received: {
|
||||||
|
"task_id": "crawl_abc123",
|
||||||
|
"task_type": "crawl",
|
||||||
|
"status": "completed",
|
||||||
|
"timestamp": "2025-10-22T00:00:00.000000+00:00",
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"data": { ... }
|
||||||
|
}
|
||||||
|
|
||||||
|
✅ Webhook received!
|
||||||
|
Task ID: crawl_abc123
|
||||||
|
Status: completed
|
||||||
|
URLs: ['https://example.com']
|
||||||
|
✅ Data included in webhook payload
|
||||||
|
📄 Crawled 1 URL(s)
|
||||||
|
- https://example.com: 1234 chars
|
||||||
|
|
||||||
|
🎉 Webhook test PASSED!
|
||||||
|
|
||||||
|
[INFO] Step 7: Verifying test results...
|
||||||
|
[SUCCESS] ✅ Webhook test PASSED!
|
||||||
|
[SUCCESS] All tests completed successfully! 🎉
|
||||||
|
[INFO] Cleanup will happen automatically...
|
||||||
|
[INFO] Starting cleanup...
|
||||||
|
[INFO] Stopping webhook receiver...
|
||||||
|
[INFO] Stopping server...
|
||||||
|
[INFO] Stopping Redis...
|
||||||
|
[INFO] Switching back to branch: develop
|
||||||
|
[SUCCESS] Cleanup complete
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Server Failed to Start
|
||||||
|
|
||||||
|
If the server fails to start, check the logs:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
tail -100 /tmp/crawl4ai_server.log
|
||||||
|
```
|
||||||
|
|
||||||
|
Common issues:
|
||||||
|
- Port 11235 already in use: `lsof -ti:11235 | xargs kill -9`
|
||||||
|
- Missing dependencies: Check that all packages are installed
|
||||||
|
|
||||||
|
### Redis Connection Failed
|
||||||
|
|
||||||
|
Check if Redis is running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
redis-cli ping
|
||||||
|
# Should return: PONG
|
||||||
|
```
|
||||||
|
|
||||||
|
If not running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
redis-server --port 6379 --daemonize yes
|
||||||
|
```
|
||||||
|
|
||||||
|
### Webhook Not Received
|
||||||
|
|
||||||
|
The script has a 60-second timeout for webhook delivery. If the webhook isn't received:
|
||||||
|
|
||||||
|
1. Check server logs: `/tmp/crawl4ai_server.log`
|
||||||
|
2. Verify webhook receiver is running on port 8080
|
||||||
|
3. Check network connectivity between components
|
||||||
|
|
||||||
|
### Script Interruption
|
||||||
|
|
||||||
|
If the script is interrupted (Ctrl+C), cleanup happens automatically via trap. The script will:
|
||||||
|
- Kill all background processes
|
||||||
|
- Stop Redis
|
||||||
|
- Return to your original branch
|
||||||
|
|
||||||
|
To manually cleanup if needed:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Kill processes by port
|
||||||
|
lsof -ti:11235 | xargs kill -9 # Server
|
||||||
|
lsof -ti:8080 | xargs kill -9 # Webhook receiver
|
||||||
|
lsof -ti:6379 | xargs kill -9 # Redis
|
||||||
|
|
||||||
|
# Return to your branch
|
||||||
|
git checkout develop # or your branch name
|
||||||
|
```
|
||||||
|
|
||||||
|
## Testing Different URLs
|
||||||
|
|
||||||
|
To test with a different URL, modify the script or create a custom test:
|
||||||
|
|
||||||
|
```python
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://your-url-here.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": "http://localhost:8080/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Files Generated
|
||||||
|
|
||||||
|
The script creates temporary files:
|
||||||
|
|
||||||
|
- `/tmp/crawl4ai_server.log` - Server output logs
|
||||||
|
- `/tmp/test_webhook.py` - Webhook test Python script
|
||||||
|
|
||||||
|
These are not cleaned up automatically so you can review them after the test.
|
||||||
|
|
||||||
|
## Exit Codes
|
||||||
|
|
||||||
|
- `0` - All tests passed successfully
|
||||||
|
- `1` - Test failed (check output for details)
|
||||||
|
|
||||||
|
## Safety Features
|
||||||
|
|
||||||
|
- ✅ Automatic cleanup on exit, interrupt, or error
|
||||||
|
- ✅ Returns to original branch on completion
|
||||||
|
- ✅ Kills all background processes
|
||||||
|
- ✅ Comprehensive error handling
|
||||||
|
- ✅ Colored output for easy reading
|
||||||
|
- ✅ Detailed logging at each step
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- The script uses `set -e` to exit on any command failure
|
||||||
|
- All background processes are tracked and cleaned up
|
||||||
|
- The virtual environment must exist before running
|
||||||
|
- Redis must be available (installed or installable via apt-get/brew)
|
||||||
|
|
||||||
|
## Integration with CI/CD
|
||||||
|
|
||||||
|
This script can be integrated into CI/CD pipelines:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Example GitHub Actions
|
||||||
|
- name: Test Webhook Feature
|
||||||
|
run: |
|
||||||
|
chmod +x tests/test_webhook_feature.sh
|
||||||
|
./tests/test_webhook_feature.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
## Support
|
||||||
|
|
||||||
|
If you encounter issues:
|
||||||
|
|
||||||
|
1. Check the troubleshooting section above
|
||||||
|
2. Review server logs at `/tmp/crawl4ai_server.log`
|
||||||
|
3. Ensure all prerequisites are met
|
||||||
|
4. Open an issue with the full output of the script
|
||||||
@@ -1,151 +0,0 @@
|
|||||||
"""
|
|
||||||
Shared pytest fixtures for Crawl4AI tests.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import tempfile
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import Mock, patch
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConfig, TelemetryConsent
|
|
||||||
from crawl4ai.telemetry.environment import Environment
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def temp_config_dir():
|
|
||||||
"""Provide a temporary directory for telemetry config testing."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
yield Path(tmpdir)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_telemetry_config(temp_config_dir):
|
|
||||||
"""Provide a mocked telemetry config for testing."""
|
|
||||||
config = TelemetryConfig(config_dir=temp_config_dir)
|
|
||||||
yield config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def clean_environment():
|
|
||||||
"""Clean environment variables before and after test."""
|
|
||||||
# Store original environment
|
|
||||||
original_env = os.environ.copy()
|
|
||||||
|
|
||||||
# Clean telemetry-related env vars
|
|
||||||
telemetry_vars = [
|
|
||||||
'CRAWL4AI_TELEMETRY',
|
|
||||||
'CRAWL4AI_DOCKER',
|
|
||||||
'CRAWL4AI_API_SERVER',
|
|
||||||
'CRAWL4AI_TEST_MODE'
|
|
||||||
]
|
|
||||||
|
|
||||||
for var in telemetry_vars:
|
|
||||||
if var in os.environ:
|
|
||||||
del os.environ[var]
|
|
||||||
|
|
||||||
# Set test mode
|
|
||||||
os.environ['CRAWL4AI_TEST_MODE'] = '1'
|
|
||||||
|
|
||||||
yield
|
|
||||||
|
|
||||||
# Restore original environment
|
|
||||||
os.environ.clear()
|
|
||||||
os.environ.update(original_env)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_sentry_provider():
|
|
||||||
"""Provide a mocked Sentry provider for testing."""
|
|
||||||
with patch('crawl4ai.telemetry.providers.sentry.SentryProvider') as mock:
|
|
||||||
provider_instance = Mock()
|
|
||||||
provider_instance.initialize.return_value = True
|
|
||||||
provider_instance.send_exception.return_value = True
|
|
||||||
provider_instance.is_initialized = True
|
|
||||||
mock.return_value = provider_instance
|
|
||||||
yield provider_instance
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def enabled_telemetry_config(temp_config_dir): # noqa: F811
|
|
||||||
"""Provide a telemetry config with telemetry enabled."""
|
|
||||||
config = Mock()
|
|
||||||
config.get_consent.return_value = TelemetryConsent.ALWAYS
|
|
||||||
config.is_enabled.return_value = True
|
|
||||||
config.should_send_current.return_value = True
|
|
||||||
config.get_email.return_value = "test@example.com"
|
|
||||||
config.update_from_env.return_value = None
|
|
||||||
yield config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def disabled_telemetry_config(temp_config_dir): # noqa: F811
|
|
||||||
"""Provide a telemetry config with telemetry disabled."""
|
|
||||||
config = Mock()
|
|
||||||
config.get_consent.return_value = TelemetryConsent.DENIED
|
|
||||||
config.is_enabled.return_value = False
|
|
||||||
config.should_send_current.return_value = False
|
|
||||||
config.update_from_env.return_value = None
|
|
||||||
yield config
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def docker_environment():
|
|
||||||
"""Mock Docker environment detection."""
|
|
||||||
with patch('crawl4ai.telemetry.environment.EnvironmentDetector.detect', return_value=Environment.DOCKER):
|
|
||||||
yield
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def cli_environment():
|
|
||||||
"""Mock CLI environment detection."""
|
|
||||||
with patch('crawl4ai.telemetry.environment.EnvironmentDetector.detect', return_value=Environment.CLI):
|
|
||||||
with patch('sys.stdin.isatty', return_value=True):
|
|
||||||
yield
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def jupyter_environment():
|
|
||||||
"""Mock Jupyter environment detection."""
|
|
||||||
with patch('crawl4ai.telemetry.environment.EnvironmentDetector.detect', return_value=Environment.JUPYTER):
|
|
||||||
yield
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def reset_telemetry_singleton():
|
|
||||||
"""Reset telemetry singleton between tests."""
|
|
||||||
from crawl4ai.telemetry import TelemetryManager
|
|
||||||
# Reset the singleton instance
|
|
||||||
if hasattr(TelemetryManager, '_instance'):
|
|
||||||
TelemetryManager._instance = None # noqa: SLF001
|
|
||||||
yield
|
|
||||||
# Clean up after test
|
|
||||||
if hasattr(TelemetryManager, '_instance'):
|
|
||||||
TelemetryManager._instance = None # noqa: SLF001
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def sample_exception():
|
|
||||||
"""Provide a sample exception for testing."""
|
|
||||||
try:
|
|
||||||
raise ValueError("Test exception for telemetry")
|
|
||||||
except ValueError as e:
|
|
||||||
return e
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def privacy_test_data():
|
|
||||||
"""Provide test data that should NOT be captured by telemetry."""
|
|
||||||
return {
|
|
||||||
'url': 'https://example.com/private-page',
|
|
||||||
'content': 'This is private content that should not be sent',
|
|
||||||
'user_data': {
|
|
||||||
'email': 'user@private.com',
|
|
||||||
'password': 'secret123',
|
|
||||||
'api_key': 'sk-1234567890abcdef'
|
|
||||||
},
|
|
||||||
'pii': {
|
|
||||||
'ssn': '123-45-6789',
|
|
||||||
'phone': '+1-555-123-4567',
|
|
||||||
'address': '123 Main St, Anytown, USA'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,64 +0,0 @@
|
|||||||
"""
|
|
||||||
Test configuration and utilities for telemetry testing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_configure(config): # noqa: ARG001
|
|
||||||
"""Configure pytest for telemetry tests."""
|
|
||||||
# Add custom markers
|
|
||||||
config.addinivalue_line("markers", "unit: Unit tests")
|
|
||||||
config.addinivalue_line("markers", "integration: Integration tests")
|
|
||||||
config.addinivalue_line("markers", "privacy: Privacy compliance tests")
|
|
||||||
config.addinivalue_line("markers", "performance: Performance tests")
|
|
||||||
config.addinivalue_line("markers", "slow: Slow running tests")
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_collection_modifyitems(config, items): # noqa: ARG001
|
|
||||||
"""Modify test collection to add markers automatically."""
|
|
||||||
for item in items:
|
|
||||||
# Add markers based on test location and name
|
|
||||||
if "telemetry" in str(item.fspath):
|
|
||||||
if "integration" in item.name or "test_integration" in str(item.fspath):
|
|
||||||
item.add_marker(pytest.mark.integration)
|
|
||||||
elif "privacy" in item.name or "performance" in item.name:
|
|
||||||
if "privacy" in item.name:
|
|
||||||
item.add_marker(pytest.mark.privacy)
|
|
||||||
if "performance" in item.name:
|
|
||||||
item.add_marker(pytest.mark.performance)
|
|
||||||
else:
|
|
||||||
item.add_marker(pytest.mark.unit)
|
|
||||||
|
|
||||||
# Mark slow tests
|
|
||||||
if "slow" in item.name or any(mark.name == "slow" for mark in item.iter_markers()):
|
|
||||||
item.add_marker(pytest.mark.slow)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
|
||||||
def setup_test_environment():
|
|
||||||
"""Set up test environment variables."""
|
|
||||||
# Ensure we're in test mode
|
|
||||||
os.environ['CRAWL4AI_TEST_MODE'] = '1'
|
|
||||||
|
|
||||||
# Disable actual telemetry during tests unless explicitly enabled
|
|
||||||
if 'CRAWL4AI_TELEMETRY_TEST_REAL' not in os.environ:
|
|
||||||
os.environ['CRAWL4AI_TELEMETRY'] = '0'
|
|
||||||
|
|
||||||
yield
|
|
||||||
|
|
||||||
# Clean up after tests
|
|
||||||
test_vars = ['CRAWL4AI_TEST_MODE', 'CRAWL4AI_TELEMETRY_TEST_REAL']
|
|
||||||
for var in test_vars:
|
|
||||||
if var in os.environ:
|
|
||||||
del os.environ[var]
|
|
||||||
|
|
||||||
|
|
||||||
def pytest_report_header(config): # noqa: ARG001
|
|
||||||
"""Add information to pytest header."""
|
|
||||||
return [
|
|
||||||
"Crawl4AI Telemetry Tests",
|
|
||||||
f"Test mode: {'ENABLED' if os.environ.get('CRAWL4AI_TEST_MODE') else 'DISABLED'}",
|
|
||||||
f"Real telemetry: {'ENABLED' if os.environ.get('CRAWL4AI_TELEMETRY_TEST_REAL') else 'DISABLED'}"
|
|
||||||
]
|
|
||||||
@@ -1,216 +0,0 @@
|
|||||||
"""
|
|
||||||
Integration tests for telemetry CLI commands.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
from unittest.mock import patch, Mock
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
|
||||||
class TestTelemetryCLI:
|
|
||||||
"""Test telemetry CLI commands integration."""
|
|
||||||
|
|
||||||
def test_telemetry_status_command(self, clean_environment, temp_config_dir):
|
|
||||||
"""Test the telemetry status CLI command."""
|
|
||||||
# Import with mocked config
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig') as MockConfig:
|
|
||||||
mock_config = Mock()
|
|
||||||
mock_config.get_consent.return_value = 'not_set'
|
|
||||||
mock_config.is_enabled.return_value = False
|
|
||||||
MockConfig.return_value = mock_config
|
|
||||||
|
|
||||||
from crawl4ai.cli import main
|
|
||||||
|
|
||||||
# Test status command
|
|
||||||
with patch('sys.argv', ['crawl4ai', 'telemetry', 'status']):
|
|
||||||
try:
|
|
||||||
main()
|
|
||||||
except SystemExit:
|
|
||||||
pass # CLI commands often call sys.exit()
|
|
||||||
|
|
||||||
def test_telemetry_enable_command(self, clean_environment, temp_config_dir):
|
|
||||||
"""Test the telemetry enable CLI command."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig') as MockConfig:
|
|
||||||
mock_config = Mock()
|
|
||||||
MockConfig.return_value = mock_config
|
|
||||||
|
|
||||||
from crawl4ai.cli import main
|
|
||||||
|
|
||||||
# Test enable command
|
|
||||||
with patch('sys.argv', ['crawl4ai', 'telemetry', 'enable', '--email', 'test@example.com']):
|
|
||||||
try:
|
|
||||||
main()
|
|
||||||
except SystemExit:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def test_telemetry_disable_command(self, clean_environment, temp_config_dir):
|
|
||||||
"""Test the telemetry disable CLI command."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig') as MockConfig:
|
|
||||||
mock_config = Mock()
|
|
||||||
MockConfig.return_value = mock_config
|
|
||||||
|
|
||||||
from crawl4ai.cli import main
|
|
||||||
|
|
||||||
# Test disable command
|
|
||||||
with patch('sys.argv', ['crawl4ai', 'telemetry', 'disable']):
|
|
||||||
try:
|
|
||||||
main()
|
|
||||||
except SystemExit:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@pytest.mark.slow
|
|
||||||
def test_cli_subprocess_integration(self, temp_config_dir):
|
|
||||||
"""Test CLI commands as subprocess calls."""
|
|
||||||
env = os.environ.copy()
|
|
||||||
env['CRAWL4AI_CONFIG_DIR'] = str(temp_config_dir)
|
|
||||||
|
|
||||||
# Test status command via subprocess
|
|
||||||
try:
|
|
||||||
result = subprocess.run(
|
|
||||||
[sys.executable, '-m', 'crawl4ai.cli', 'telemetry', 'status'],
|
|
||||||
env=env,
|
|
||||||
capture_output=True,
|
|
||||||
text=True,
|
|
||||||
timeout=10
|
|
||||||
)
|
|
||||||
# Should not crash, regardless of exit code
|
|
||||||
assert result.returncode in [0, 1] # May return 1 if not configured
|
|
||||||
except subprocess.TimeoutExpired:
|
|
||||||
pytest.skip("CLI command timed out")
|
|
||||||
except FileNotFoundError:
|
|
||||||
pytest.skip("CLI module not found")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
|
||||||
class TestAsyncWebCrawlerIntegration:
|
|
||||||
"""Test AsyncWebCrawler telemetry integration."""
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_telemetry_decorator(self, enabled_telemetry_config, mock_sentry_provider):
|
|
||||||
"""Test that AsyncWebCrawler methods are decorated with telemetry."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
# Check if the arun method has telemetry decoration
|
|
||||||
crawler = AsyncWebCrawler()
|
|
||||||
assert hasattr(crawler.arun, '__wrapped__') or callable(crawler.arun)
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_exception_capture_integration(self, enabled_telemetry_config, mock_sentry_provider):
|
|
||||||
"""Test that exceptions in AsyncWebCrawler are captured."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
with patch('crawl4ai.telemetry.capture_exception') as _mock_capture:
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
try:
|
|
||||||
# This should cause an exception
|
|
||||||
await crawler.arun(url="invalid://url")
|
|
||||||
except Exception:
|
|
||||||
pass # We expect this to fail
|
|
||||||
|
|
||||||
# The decorator should have attempted to capture the exception
|
|
||||||
# Note: This might not always be called depending on where the exception occurs
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_with_disabled_telemetry(self, disabled_telemetry_config):
|
|
||||||
"""Test that AsyncWebCrawler works normally with disabled telemetry."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=disabled_telemetry_config):
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
# Should work normally even with telemetry disabled
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
assert crawler is not None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
|
||||||
class TestDockerIntegration:
|
|
||||||
"""Test Docker environment telemetry integration."""
|
|
||||||
|
|
||||||
def test_docker_environment_detection(self, docker_environment, temp_config_dir):
|
|
||||||
"""Test that Docker environment is detected correctly."""
|
|
||||||
from crawl4ai.telemetry.environment import EnvironmentDetector
|
|
||||||
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
from crawl4ai.telemetry.environment import Environment
|
|
||||||
assert env == Environment.DOCKER
|
|
||||||
|
|
||||||
def test_docker_default_telemetry_enabled(self, temp_config_dir):
|
|
||||||
"""Test that telemetry is enabled by default in Docker."""
|
|
||||||
from crawl4ai.telemetry.environment import Environment
|
|
||||||
|
|
||||||
# Clear any existing environment variables that might interfere
|
|
||||||
with patch.dict(os.environ, {}, clear=True):
|
|
||||||
# Set only the Docker environment variable
|
|
||||||
os.environ['CRAWL4AI_DOCKER'] = 'true'
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.environment.EnvironmentDetector.detect', return_value=Environment.DOCKER):
|
|
||||||
from crawl4ai.telemetry.consent import ConsentManager
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConfig, TelemetryConsent
|
|
||||||
|
|
||||||
config = TelemetryConfig(config_dir=temp_config_dir)
|
|
||||||
consent_manager = ConsentManager(config)
|
|
||||||
|
|
||||||
# Should set consent to ALWAYS for Docker
|
|
||||||
consent_manager.check_and_prompt()
|
|
||||||
assert config.get_consent() == TelemetryConsent.ALWAYS
|
|
||||||
|
|
||||||
def test_docker_telemetry_can_be_disabled(self, temp_config_dir):
|
|
||||||
"""Test that Docker telemetry can be disabled via environment variable."""
|
|
||||||
from crawl4ai.telemetry.environment import Environment
|
|
||||||
|
|
||||||
with patch.dict(os.environ, {'CRAWL4AI_TELEMETRY': '0', 'CRAWL4AI_DOCKER': 'true'}):
|
|
||||||
with patch('crawl4ai.telemetry.environment.EnvironmentDetector.detect', return_value=Environment.DOCKER):
|
|
||||||
from crawl4ai.telemetry.consent import ConsentManager
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConfig, TelemetryConsent
|
|
||||||
|
|
||||||
config = TelemetryConfig(config_dir=temp_config_dir)
|
|
||||||
consent_manager = ConsentManager(config)
|
|
||||||
|
|
||||||
# Should set consent to DENIED when env var is 0
|
|
||||||
consent_manager.check_and_prompt()
|
|
||||||
assert config.get_consent() == TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.integration
|
|
||||||
class TestTelemetryProviderIntegration:
|
|
||||||
"""Test telemetry provider integration."""
|
|
||||||
|
|
||||||
def test_sentry_provider_initialization(self, enabled_telemetry_config):
|
|
||||||
"""Test that Sentry provider initializes correctly."""
|
|
||||||
try:
|
|
||||||
from crawl4ai.telemetry.providers.sentry import SentryProvider
|
|
||||||
|
|
||||||
provider = SentryProvider()
|
|
||||||
# Should not crash during initialization
|
|
||||||
assert provider is not None
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
pytest.skip("Sentry provider not available")
|
|
||||||
|
|
||||||
def test_null_provider_fallback(self, disabled_telemetry_config):
|
|
||||||
"""Test that NullProvider is used when telemetry is disabled."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=disabled_telemetry_config):
|
|
||||||
from crawl4ai.telemetry import TelemetryManager
|
|
||||||
from crawl4ai.telemetry.base import NullProvider
|
|
||||||
|
|
||||||
manager = TelemetryManager()
|
|
||||||
assert isinstance(manager._provider, NullProvider) # noqa: SLF001
|
|
||||||
|
|
||||||
def test_graceful_degradation_without_sentry(self, enabled_telemetry_config):
|
|
||||||
"""Test graceful degradation when sentry-sdk is not available."""
|
|
||||||
with patch.dict('sys.modules', {'sentry_sdk': None}):
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
from crawl4ai.telemetry import TelemetryManager
|
|
||||||
from crawl4ai.telemetry.base import NullProvider
|
|
||||||
|
|
||||||
# Should fall back to NullProvider when Sentry is not available
|
|
||||||
manager = TelemetryManager()
|
|
||||||
assert isinstance(manager._provider, NullProvider) # noqa: SLF001
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pytest.main([__file__, "-v"])
|
|
||||||
@@ -1,283 +0,0 @@
|
|||||||
"""
|
|
||||||
Privacy and performance tests for telemetry system.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import time
|
|
||||||
import asyncio
|
|
||||||
from unittest.mock import patch
|
|
||||||
from crawl4ai.telemetry import telemetry_decorator, async_telemetry_decorator, TelemetryManager
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.privacy
|
|
||||||
class TestTelemetryPrivacy:
|
|
||||||
"""Test privacy compliance of telemetry system."""
|
|
||||||
|
|
||||||
def test_no_url_captured(self, enabled_telemetry_config, mock_sentry_provider, privacy_test_data):
|
|
||||||
"""Test that URLs are not captured in telemetry data."""
|
|
||||||
# Ensure config is properly set for sending
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
# Mock the provider directly in the manager
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
# Create exception with URL in context
|
|
||||||
exception = ValueError("Test error")
|
|
||||||
context = {'url': privacy_test_data['url']}
|
|
||||||
|
|
||||||
manager.capture_exception(exception, context)
|
|
||||||
|
|
||||||
# Verify that the provider was called
|
|
||||||
mock_sentry_provider.send_exception.assert_called_once()
|
|
||||||
call_args = mock_sentry_provider.send_exception.call_args
|
|
||||||
|
|
||||||
# Verify that context was passed to the provider (filtering happens in provider)
|
|
||||||
assert len(call_args) >= 2
|
|
||||||
|
|
||||||
def test_no_content_captured(self, enabled_telemetry_config, mock_sentry_provider, privacy_test_data):
|
|
||||||
"""Test that crawled content is not captured."""
|
|
||||||
# Ensure config is properly set
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
exception = ValueError("Test error")
|
|
||||||
context = {
|
|
||||||
'content': privacy_test_data['content'],
|
|
||||||
'html': '<html><body>Private content</body></html>',
|
|
||||||
'text': 'Extracted private text'
|
|
||||||
}
|
|
||||||
|
|
||||||
manager.capture_exception(exception, context)
|
|
||||||
|
|
||||||
mock_sentry_provider.send_exception.assert_called_once()
|
|
||||||
call_args = mock_sentry_provider.send_exception.call_args
|
|
||||||
|
|
||||||
# Verify that the provider was called (actual filtering would happen in provider)
|
|
||||||
assert len(call_args) >= 2
|
|
||||||
|
|
||||||
def test_no_pii_captured(self, enabled_telemetry_config, mock_sentry_provider, privacy_test_data):
|
|
||||||
"""Test that PII is not captured in telemetry."""
|
|
||||||
# Ensure config is properly set
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
exception = ValueError("Test error")
|
|
||||||
context = privacy_test_data['user_data'].copy()
|
|
||||||
context.update(privacy_test_data['pii'])
|
|
||||||
|
|
||||||
manager.capture_exception(exception, context)
|
|
||||||
|
|
||||||
mock_sentry_provider.send_exception.assert_called_once()
|
|
||||||
call_args = mock_sentry_provider.send_exception.call_args
|
|
||||||
|
|
||||||
# Verify that the provider was called (actual filtering would happen in provider)
|
|
||||||
assert len(call_args) >= 2
|
|
||||||
|
|
||||||
def test_sanitized_context_captured(self, enabled_telemetry_config, mock_sentry_provider):
|
|
||||||
"""Test that only safe context is captured."""
|
|
||||||
# Ensure config is properly set
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
exception = ValueError("Test error")
|
|
||||||
context = {
|
|
||||||
'operation': 'crawl', # Safe to capture
|
|
||||||
'status_code': 404, # Safe to capture
|
|
||||||
'retry_count': 3, # Safe to capture
|
|
||||||
'user_email': 'secret@example.com', # Should be in context (not filtered at this level)
|
|
||||||
'content': 'private content' # Should be in context (not filtered at this level)
|
|
||||||
}
|
|
||||||
|
|
||||||
manager.capture_exception(exception, context)
|
|
||||||
|
|
||||||
mock_sentry_provider.send_exception.assert_called_once()
|
|
||||||
call_args = mock_sentry_provider.send_exception.call_args
|
|
||||||
|
|
||||||
# Get the actual arguments passed to the mock
|
|
||||||
args, kwargs = call_args
|
|
||||||
assert len(args) >= 2, f"Expected at least 2 args, got {len(args)}"
|
|
||||||
|
|
||||||
# The second argument should be the context
|
|
||||||
captured_context = args[1]
|
|
||||||
|
|
||||||
# The basic context should be present (this tests the manager, not the provider filtering)
|
|
||||||
assert 'operation' in captured_context, f"operation not found in {captured_context}"
|
|
||||||
assert captured_context.get('operation') == 'crawl'
|
|
||||||
assert captured_context.get('status_code') == 404
|
|
||||||
assert captured_context.get('retry_count') == 3
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.performance
|
|
||||||
class TestTelemetryPerformance:
|
|
||||||
"""Test performance impact of telemetry system."""
|
|
||||||
|
|
||||||
def test_decorator_overhead_sync(self, enabled_telemetry_config, mock_sentry_provider): # noqa: ARG002
|
|
||||||
"""Test performance overhead of sync telemetry decorator."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
|
|
||||||
@telemetry_decorator
|
|
||||||
def test_function():
|
|
||||||
"""Test function with telemetry decorator."""
|
|
||||||
time.sleep(0.001) # Simulate small amount of work
|
|
||||||
return "success"
|
|
||||||
|
|
||||||
# Measure time with telemetry
|
|
||||||
start_time = time.time()
|
|
||||||
for _ in range(100):
|
|
||||||
test_function()
|
|
||||||
telemetry_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Telemetry should add minimal overhead
|
|
||||||
assert telemetry_time < 1.0 # Should complete 100 calls in under 1 second
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_decorator_overhead_async(self, enabled_telemetry_config, mock_sentry_provider): # noqa: ARG002
|
|
||||||
"""Test performance overhead of async telemetry decorator."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
|
|
||||||
@async_telemetry_decorator
|
|
||||||
async def test_async_function():
|
|
||||||
"""Test async function with telemetry decorator."""
|
|
||||||
await asyncio.sleep(0.001) # Simulate small amount of async work
|
|
||||||
return "success"
|
|
||||||
|
|
||||||
# Measure time with telemetry
|
|
||||||
start_time = time.time()
|
|
||||||
tasks = [test_async_function() for _ in range(100)]
|
|
||||||
await asyncio.gather(*tasks)
|
|
||||||
telemetry_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Telemetry should add minimal overhead to async operations
|
|
||||||
assert telemetry_time < 2.0 # Should complete 100 async calls in under 2 seconds
|
|
||||||
|
|
||||||
def test_disabled_telemetry_performance(self, disabled_telemetry_config):
|
|
||||||
"""Test that disabled telemetry has zero overhead."""
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=disabled_telemetry_config):
|
|
||||||
|
|
||||||
@telemetry_decorator
|
|
||||||
def test_function():
|
|
||||||
"""Test function with disabled telemetry."""
|
|
||||||
time.sleep(0.001)
|
|
||||||
return "success"
|
|
||||||
|
|
||||||
# Measure time with disabled telemetry
|
|
||||||
start_time = time.time()
|
|
||||||
for _ in range(100):
|
|
||||||
test_function()
|
|
||||||
disabled_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Should be very fast when disabled
|
|
||||||
assert disabled_time < 0.5 # Should be faster than enabled telemetry
|
|
||||||
|
|
||||||
def test_telemetry_manager_initialization_performance(self):
|
|
||||||
"""Test that TelemetryManager initializes quickly."""
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
# Initialize multiple managers (should use singleton)
|
|
||||||
for _ in range(10):
|
|
||||||
TelemetryManager.get_instance()
|
|
||||||
|
|
||||||
init_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Initialization should be fast
|
|
||||||
assert init_time < 0.1 # Should initialize in under 100ms
|
|
||||||
|
|
||||||
def test_config_loading_performance(self, temp_config_dir):
|
|
||||||
"""Test that config loading is fast."""
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConfig
|
|
||||||
|
|
||||||
# Create config with some data
|
|
||||||
config = TelemetryConfig(config_dir=temp_config_dir)
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConsent
|
|
||||||
config.set_consent(TelemetryConsent.ALWAYS, email="test@example.com")
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
# Load config multiple times
|
|
||||||
for _ in range(100):
|
|
||||||
new_config = TelemetryConfig(config_dir=temp_config_dir)
|
|
||||||
new_config.get_consent()
|
|
||||||
|
|
||||||
load_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Config loading should be fast
|
|
||||||
assert load_time < 0.5 # Should load 100 times in under 500ms
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.performance
|
|
||||||
class TestTelemetryScalability:
|
|
||||||
"""Test telemetry system scalability."""
|
|
||||||
|
|
||||||
def test_multiple_exception_capture(self, enabled_telemetry_config, mock_sentry_provider):
|
|
||||||
"""Test capturing multiple exceptions in sequence."""
|
|
||||||
# Ensure config is properly set
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
# Capture many exceptions
|
|
||||||
for i in range(50):
|
|
||||||
exception = ValueError(f"Test error {i}")
|
|
||||||
manager.capture_exception(exception, {'operation': f'test_{i}'})
|
|
||||||
|
|
||||||
capture_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Should handle multiple exceptions efficiently
|
|
||||||
assert capture_time < 1.0 # Should capture 50 exceptions in under 1 second
|
|
||||||
assert mock_sentry_provider.send_exception.call_count <= 50 # May be less due to consent checks
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_concurrent_exception_capture(self, enabled_telemetry_config, mock_sentry_provider): # noqa: ARG002
|
|
||||||
"""Test concurrent exception capture performance."""
|
|
||||||
# Ensure config is properly set
|
|
||||||
enabled_telemetry_config.is_enabled.return_value = True
|
|
||||||
enabled_telemetry_config.should_send_current.return_value = True
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig', return_value=enabled_telemetry_config):
|
|
||||||
manager = TelemetryManager()
|
|
||||||
manager._provider = mock_sentry_provider # noqa: SLF001
|
|
||||||
manager._initialized = True # noqa: SLF001
|
|
||||||
|
|
||||||
async def capture_exception_async(i):
|
|
||||||
exception = ValueError(f"Concurrent error {i}")
|
|
||||||
return manager.capture_exception(exception, {'operation': f'concurrent_{i}'})
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
|
|
||||||
# Capture exceptions concurrently
|
|
||||||
tasks = [capture_exception_async(i) for i in range(20)]
|
|
||||||
await asyncio.gather(*tasks)
|
|
||||||
|
|
||||||
capture_time = time.time() - start_time
|
|
||||||
|
|
||||||
# Should handle concurrent exceptions efficiently
|
|
||||||
assert capture_time < 1.0 # Should capture 20 concurrent exceptions in under 1 second
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pytest.main([__file__, "-v"])
|
|
||||||
@@ -1,241 +0,0 @@
|
|||||||
"""
|
|
||||||
Tests for Crawl4AI telemetry functionality.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import os
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
import json
|
|
||||||
from unittest.mock import Mock, patch, MagicMock
|
|
||||||
|
|
||||||
from crawl4ai.telemetry import (
|
|
||||||
TelemetryManager,
|
|
||||||
capture_exception,
|
|
||||||
enable,
|
|
||||||
disable,
|
|
||||||
status
|
|
||||||
)
|
|
||||||
from crawl4ai.telemetry.config import TelemetryConfig, TelemetryConsent
|
|
||||||
from crawl4ai.telemetry.environment import Environment, EnvironmentDetector
|
|
||||||
from crawl4ai.telemetry.base import NullProvider
|
|
||||||
from crawl4ai.telemetry.consent import ConsentManager
|
|
||||||
|
|
||||||
|
|
||||||
class TestTelemetryConfig:
|
|
||||||
"""Test telemetry configuration management."""
|
|
||||||
|
|
||||||
def test_config_initialization(self):
|
|
||||||
"""Test config initialization with custom directory."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
config = TelemetryConfig(config_dir=Path(tmpdir))
|
|
||||||
assert config.config_dir == Path(tmpdir)
|
|
||||||
assert config.get_consent() == TelemetryConsent.NOT_SET
|
|
||||||
|
|
||||||
def test_consent_persistence(self):
|
|
||||||
"""Test that consent is saved and loaded correctly."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
config = TelemetryConfig(config_dir=Path(tmpdir))
|
|
||||||
|
|
||||||
# Set consent
|
|
||||||
config.set_consent(TelemetryConsent.ALWAYS, email="test@example.com")
|
|
||||||
|
|
||||||
# Create new config instance to test persistence
|
|
||||||
config2 = TelemetryConfig(config_dir=Path(tmpdir))
|
|
||||||
assert config2.get_consent() == TelemetryConsent.ALWAYS
|
|
||||||
assert config2.get_email() == "test@example.com"
|
|
||||||
|
|
||||||
def test_environment_variable_override(self):
|
|
||||||
"""Test that environment variables override config."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
config = TelemetryConfig(config_dir=Path(tmpdir))
|
|
||||||
config.set_consent(TelemetryConsent.ALWAYS)
|
|
||||||
|
|
||||||
# Set environment variable to disable
|
|
||||||
os.environ['CRAWL4AI_TELEMETRY'] = '0'
|
|
||||||
try:
|
|
||||||
config.update_from_env()
|
|
||||||
assert config.get_consent() == TelemetryConsent.DENIED
|
|
||||||
finally:
|
|
||||||
del os.environ['CRAWL4AI_TELEMETRY']
|
|
||||||
|
|
||||||
|
|
||||||
class TestEnvironmentDetection:
|
|
||||||
"""Test environment detection functionality."""
|
|
||||||
|
|
||||||
def test_cli_detection(self):
|
|
||||||
"""Test CLI environment detection."""
|
|
||||||
# Mock sys.stdin.isatty
|
|
||||||
with patch('sys.stdin.isatty', return_value=True):
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
# Should detect as CLI in most test environments
|
|
||||||
assert env in [Environment.CLI, Environment.UNKNOWN]
|
|
||||||
|
|
||||||
def test_docker_detection(self):
|
|
||||||
"""Test Docker environment detection."""
|
|
||||||
# Mock Docker environment
|
|
||||||
with patch.dict(os.environ, {'CRAWL4AI_DOCKER': 'true'}):
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
assert env == Environment.DOCKER
|
|
||||||
|
|
||||||
def test_api_server_detection(self):
|
|
||||||
"""Test API server detection."""
|
|
||||||
with patch.dict(os.environ, {'CRAWL4AI_API_SERVER': 'true', 'CRAWL4AI_DOCKER': 'true'}):
|
|
||||||
env = EnvironmentDetector.detect()
|
|
||||||
assert env == Environment.API_SERVER
|
|
||||||
|
|
||||||
|
|
||||||
class TestTelemetryManager:
|
|
||||||
"""Test the main telemetry manager."""
|
|
||||||
|
|
||||||
def test_singleton_pattern(self):
|
|
||||||
"""Test that TelemetryManager is a singleton."""
|
|
||||||
manager1 = TelemetryManager.get_instance()
|
|
||||||
manager2 = TelemetryManager.get_instance()
|
|
||||||
assert manager1 is manager2
|
|
||||||
|
|
||||||
def test_exception_capture(self):
|
|
||||||
"""Test exception capture functionality."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
# Create manager with custom config dir
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig') as MockConfig:
|
|
||||||
mock_config = Mock()
|
|
||||||
mock_config.get_consent.return_value = TelemetryConsent.ALWAYS
|
|
||||||
mock_config.is_enabled.return_value = True
|
|
||||||
mock_config.should_send_current.return_value = True
|
|
||||||
mock_config.get_email.return_value = "test@example.com"
|
|
||||||
mock_config.update_from_env.return_value = None
|
|
||||||
MockConfig.return_value = mock_config
|
|
||||||
|
|
||||||
# Mock the provider setup
|
|
||||||
with patch('crawl4ai.telemetry.providers.sentry.SentryProvider') as MockSentryProvider:
|
|
||||||
mock_provider = Mock()
|
|
||||||
mock_provider.initialize.return_value = True
|
|
||||||
mock_provider.send_exception.return_value = True
|
|
||||||
MockSentryProvider.return_value = mock_provider
|
|
||||||
|
|
||||||
manager = TelemetryManager()
|
|
||||||
|
|
||||||
# Test exception capture
|
|
||||||
test_exception = ValueError("Test error")
|
|
||||||
result = manager.capture_exception(test_exception, {'test': 'context'})
|
|
||||||
|
|
||||||
# Verify the exception was processed
|
|
||||||
assert mock_config.should_send_current.called
|
|
||||||
|
|
||||||
def test_null_provider_when_disabled(self):
|
|
||||||
"""Test that NullProvider is used when telemetry is disabled."""
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
|
||||||
with patch('crawl4ai.telemetry.TelemetryConfig') as MockConfig:
|
|
||||||
mock_config = Mock()
|
|
||||||
mock_config.get_consent.return_value = TelemetryConsent.DENIED
|
|
||||||
mock_config.is_enabled.return_value = False
|
|
||||||
MockConfig.return_value = mock_config
|
|
||||||
|
|
||||||
manager = TelemetryManager()
|
|
||||||
assert isinstance(manager._provider, NullProvider)
|
|
||||||
|
|
||||||
|
|
||||||
class TestConsentManager:
|
|
||||||
"""Test consent management functionality."""
|
|
||||||
|
|
||||||
def test_docker_default_enabled(self):
|
|
||||||
"""Test that Docker environment has telemetry enabled by default."""
|
|
||||||
with patch('crawl4ai.telemetry.consent.EnvironmentDetector.detect', return_value=Environment.DOCKER):
|
|
||||||
with patch('os.environ.get') as mock_env_get:
|
|
||||||
# Mock os.environ.get to return None for CRAWL4AI_TELEMETRY
|
|
||||||
mock_env_get.return_value = None
|
|
||||||
|
|
||||||
config = Mock()
|
|
||||||
config.get_consent.return_value = TelemetryConsent.NOT_SET
|
|
||||||
|
|
||||||
consent_manager = ConsentManager(config)
|
|
||||||
consent_manager.check_and_prompt()
|
|
||||||
|
|
||||||
# Should be enabled by default in Docker
|
|
||||||
assert config.set_consent.called
|
|
||||||
assert config.set_consent.call_args[0][0] == TelemetryConsent.ALWAYS
|
|
||||||
|
|
||||||
def test_docker_disabled_by_env(self):
|
|
||||||
"""Test that Docker telemetry can be disabled via environment variable."""
|
|
||||||
with patch('crawl4ai.telemetry.consent.EnvironmentDetector.detect', return_value=Environment.DOCKER):
|
|
||||||
with patch.dict(os.environ, {'CRAWL4AI_TELEMETRY': '0'}):
|
|
||||||
config = Mock()
|
|
||||||
config.get_consent.return_value = TelemetryConsent.NOT_SET
|
|
||||||
|
|
||||||
consent_manager = ConsentManager(config)
|
|
||||||
consent = consent_manager.check_and_prompt()
|
|
||||||
|
|
||||||
# Should be disabled
|
|
||||||
assert config.set_consent.called
|
|
||||||
assert config.set_consent.call_args[0][0] == TelemetryConsent.DENIED
|
|
||||||
|
|
||||||
|
|
||||||
class TestPublicAPI:
|
|
||||||
"""Test the public API functions."""
|
|
||||||
|
|
||||||
@patch('crawl4ai.telemetry.get_telemetry')
|
|
||||||
def test_enable_function(self, mock_get_telemetry):
|
|
||||||
"""Test the enable() function."""
|
|
||||||
mock_manager = Mock()
|
|
||||||
mock_get_telemetry.return_value = mock_manager
|
|
||||||
|
|
||||||
enable(email="test@example.com", always=True)
|
|
||||||
|
|
||||||
mock_manager.enable.assert_called_once_with(
|
|
||||||
email="test@example.com",
|
|
||||||
always=True,
|
|
||||||
once=False
|
|
||||||
)
|
|
||||||
|
|
||||||
@patch('crawl4ai.telemetry.get_telemetry')
|
|
||||||
def test_disable_function(self, mock_get_telemetry):
|
|
||||||
"""Test the disable() function."""
|
|
||||||
mock_manager = Mock()
|
|
||||||
mock_get_telemetry.return_value = mock_manager
|
|
||||||
|
|
||||||
disable()
|
|
||||||
|
|
||||||
mock_manager.disable.assert_called_once()
|
|
||||||
|
|
||||||
@patch('crawl4ai.telemetry.get_telemetry')
|
|
||||||
def test_status_function(self, mock_get_telemetry):
|
|
||||||
"""Test the status() function."""
|
|
||||||
mock_manager = Mock()
|
|
||||||
mock_manager.status.return_value = {
|
|
||||||
'enabled': True,
|
|
||||||
'consent': 'always',
|
|
||||||
'email': 'test@example.com'
|
|
||||||
}
|
|
||||||
mock_get_telemetry.return_value = mock_manager
|
|
||||||
|
|
||||||
result = status()
|
|
||||||
|
|
||||||
assert result['enabled'] is True
|
|
||||||
assert result['consent'] == 'always'
|
|
||||||
assert result['email'] == 'test@example.com'
|
|
||||||
|
|
||||||
|
|
||||||
class TestIntegration:
|
|
||||||
"""Integration tests for telemetry with AsyncWebCrawler."""
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_crawler_exception_capture(self):
|
|
||||||
"""Test that AsyncWebCrawler captures exceptions."""
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
with patch('crawl4ai.telemetry.capture_exception') as mock_capture:
|
|
||||||
# This should trigger an exception for invalid URL
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
try:
|
|
||||||
# Use an invalid URL that will cause an error
|
|
||||||
result = await crawler.arun(url="not-a-valid-url")
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Check if exception was captured (may not be called if error is handled)
|
|
||||||
# This is more of a smoke test to ensure the integration doesn't break
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
pytest.main([__file__, "-v"])
|
|
||||||
305
tests/test_webhook_feature.sh
Executable file
305
tests/test_webhook_feature.sh
Executable file
@@ -0,0 +1,305 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Webhook Feature Test Script
|
||||||
|
#
|
||||||
|
# This script tests the webhook feature implementation by:
|
||||||
|
# 1. Switching to the webhook feature branch
|
||||||
|
# 2. Installing dependencies
|
||||||
|
# 3. Starting the server
|
||||||
|
# 4. Running webhook tests
|
||||||
|
# 5. Cleaning up and returning to original branch
|
||||||
|
#
|
||||||
|
# Usage: ./test_webhook_feature.sh
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
set -e # Exit on error
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
BLUE='\033[0;34m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
BRANCH_NAME="claude/implement-webhook-crawl-feature-011CULZY1Jy8N5MUkZqXkRVp"
|
||||||
|
VENV_PATH="venv"
|
||||||
|
SERVER_PORT=11235
|
||||||
|
WEBHOOK_PORT=8080
|
||||||
|
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
||||||
|
|
||||||
|
# PID files for cleanup
|
||||||
|
REDIS_PID=""
|
||||||
|
SERVER_PID=""
|
||||||
|
WEBHOOK_PID=""
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Utility Functions
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
log_info() {
|
||||||
|
echo -e "${BLUE}[INFO]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_success() {
|
||||||
|
echo -e "${GREEN}[SUCCESS]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_warning() {
|
||||||
|
echo -e "${YELLOW}[WARNING]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
log_error() {
|
||||||
|
echo -e "${RED}[ERROR]${NC} $1"
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup() {
|
||||||
|
log_info "Starting cleanup..."
|
||||||
|
|
||||||
|
# Kill webhook receiver if running
|
||||||
|
if [ ! -z "$WEBHOOK_PID" ] && kill -0 $WEBHOOK_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping webhook receiver (PID: $WEBHOOK_PID)..."
|
||||||
|
kill $WEBHOOK_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Kill server if running
|
||||||
|
if [ ! -z "$SERVER_PID" ] && kill -0 $SERVER_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping server (PID: $SERVER_PID)..."
|
||||||
|
kill $SERVER_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Kill Redis if running
|
||||||
|
if [ ! -z "$REDIS_PID" ] && kill -0 $REDIS_PID 2>/dev/null; then
|
||||||
|
log_info "Stopping Redis (PID: $REDIS_PID)..."
|
||||||
|
kill $REDIS_PID 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Also kill by port if PIDs didn't work
|
||||||
|
lsof -ti:$SERVER_PORT | xargs kill -9 2>/dev/null || true
|
||||||
|
lsof -ti:$WEBHOOK_PORT | xargs kill -9 2>/dev/null || true
|
||||||
|
lsof -ti:6379 | xargs kill -9 2>/dev/null || true
|
||||||
|
|
||||||
|
# Return to original branch
|
||||||
|
if [ ! -z "$ORIGINAL_BRANCH" ]; then
|
||||||
|
log_info "Switching back to branch: $ORIGINAL_BRANCH"
|
||||||
|
git checkout $ORIGINAL_BRANCH 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
|
log_success "Cleanup complete"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set trap to cleanup on exit
|
||||||
|
trap cleanup EXIT INT TERM
|
||||||
|
|
||||||
|
#############################################################################
|
||||||
|
# Main Script
|
||||||
|
#############################################################################
|
||||||
|
|
||||||
|
log_info "Starting webhook feature test script"
|
||||||
|
log_info "Project root: $PROJECT_ROOT"
|
||||||
|
|
||||||
|
cd "$PROJECT_ROOT"
|
||||||
|
|
||||||
|
# Step 1: Save current branch and fetch PR
|
||||||
|
log_info "Step 1: Fetching PR branch..."
|
||||||
|
ORIGINAL_BRANCH=$(git rev-parse --abbrev-ref HEAD)
|
||||||
|
log_info "Current branch: $ORIGINAL_BRANCH"
|
||||||
|
|
||||||
|
git fetch origin $BRANCH_NAME
|
||||||
|
log_success "Branch fetched"
|
||||||
|
|
||||||
|
# Step 2: Switch to new branch
|
||||||
|
log_info "Step 2: Switching to branch: $BRANCH_NAME"
|
||||||
|
git checkout $BRANCH_NAME
|
||||||
|
log_success "Switched to webhook feature branch"
|
||||||
|
|
||||||
|
# Step 3: Activate virtual environment
|
||||||
|
log_info "Step 3: Activating virtual environment..."
|
||||||
|
if [ ! -d "$VENV_PATH" ]; then
|
||||||
|
log_error "Virtual environment not found at $VENV_PATH"
|
||||||
|
log_info "Creating virtual environment..."
|
||||||
|
python3 -m venv $VENV_PATH
|
||||||
|
fi
|
||||||
|
|
||||||
|
source $VENV_PATH/bin/activate
|
||||||
|
log_success "Virtual environment activated: $(which python)"
|
||||||
|
|
||||||
|
# Step 4: Install server dependencies
|
||||||
|
log_info "Step 4: Installing server dependencies..."
|
||||||
|
pip install -q -r deploy/docker/requirements.txt
|
||||||
|
log_success "Dependencies installed"
|
||||||
|
|
||||||
|
# Check if Redis is available
|
||||||
|
log_info "Checking Redis availability..."
|
||||||
|
if ! command -v redis-server &> /dev/null; then
|
||||||
|
log_warning "Redis not found, attempting to install..."
|
||||||
|
if command -v apt-get &> /dev/null; then
|
||||||
|
sudo apt-get update && sudo apt-get install -y redis-server
|
||||||
|
elif command -v brew &> /dev/null; then
|
||||||
|
brew install redis
|
||||||
|
else
|
||||||
|
log_error "Cannot install Redis automatically. Please install Redis manually."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Step 5: Start Redis in background
|
||||||
|
log_info "Step 5a: Starting Redis..."
|
||||||
|
redis-server --port 6379 --daemonize yes
|
||||||
|
sleep 2
|
||||||
|
REDIS_PID=$(pgrep redis-server)
|
||||||
|
log_success "Redis started (PID: $REDIS_PID)"
|
||||||
|
|
||||||
|
# Step 5b: Start server in background
|
||||||
|
log_info "Step 5b: Starting server on port $SERVER_PORT..."
|
||||||
|
cd deploy/docker
|
||||||
|
|
||||||
|
# Start server in background
|
||||||
|
python3 -m uvicorn server:app --host 0.0.0.0 --port $SERVER_PORT > /tmp/crawl4ai_server.log 2>&1 &
|
||||||
|
SERVER_PID=$!
|
||||||
|
cd "$PROJECT_ROOT"
|
||||||
|
|
||||||
|
log_info "Server started (PID: $SERVER_PID)"
|
||||||
|
|
||||||
|
# Wait for server to be ready
|
||||||
|
log_info "Waiting for server to be ready..."
|
||||||
|
for i in {1..30}; do
|
||||||
|
if curl -s http://localhost:$SERVER_PORT/health > /dev/null 2>&1; then
|
||||||
|
log_success "Server is ready!"
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
if [ $i -eq 30 ]; then
|
||||||
|
log_error "Server failed to start within 30 seconds"
|
||||||
|
log_info "Server logs:"
|
||||||
|
tail -50 /tmp/crawl4ai_server.log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo -n "."
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
echo ""
|
||||||
|
|
||||||
|
# Step 6: Create and run webhook test
|
||||||
|
log_info "Step 6: Creating webhook test script..."
|
||||||
|
|
||||||
|
cat > /tmp/test_webhook.py << 'PYTHON_SCRIPT'
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
CRAWL4AI_BASE_URL = "http://localhost:11235"
|
||||||
|
WEBHOOK_BASE_URL = "http://localhost:8080"
|
||||||
|
|
||||||
|
# Flask app for webhook receiver
|
||||||
|
app = Flask(__name__)
|
||||||
|
webhook_received = Event()
|
||||||
|
webhook_data = {}
|
||||||
|
|
||||||
|
@app.route('/webhook', methods=['POST'])
|
||||||
|
def handle_webhook():
|
||||||
|
global webhook_data
|
||||||
|
webhook_data = request.json
|
||||||
|
webhook_received.set()
|
||||||
|
print(f"\n✅ Webhook received: {json.dumps(webhook_data, indent=2)}")
|
||||||
|
return jsonify({"status": "received"}), 200
|
||||||
|
|
||||||
|
def start_webhook_server():
|
||||||
|
app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
# Start webhook server in background
|
||||||
|
webhook_thread = Thread(target=start_webhook_server, daemon=True)
|
||||||
|
webhook_thread.start()
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
print("🚀 Submitting crawl job with webhook...")
|
||||||
|
|
||||||
|
# Submit job with webhook
|
||||||
|
payload = {
|
||||||
|
"urls": ["https://example.com"],
|
||||||
|
"browser_config": {"headless": True},
|
||||||
|
"crawler_config": {"cache_mode": "bypass"},
|
||||||
|
"webhook_config": {
|
||||||
|
"webhook_url": f"{WEBHOOK_BASE_URL}/webhook",
|
||||||
|
"webhook_data_in_payload": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
f"{CRAWL4AI_BASE_URL}/crawl/job",
|
||||||
|
json=payload,
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if not response.ok:
|
||||||
|
print(f"❌ Failed to submit job: {response.text}")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
task_id = response.json()['task_id']
|
||||||
|
print(f"✅ Job submitted successfully, task_id: {task_id}")
|
||||||
|
|
||||||
|
# Wait for webhook (with timeout)
|
||||||
|
print("⏳ Waiting for webhook notification...")
|
||||||
|
if webhook_received.wait(timeout=60):
|
||||||
|
print(f"✅ Webhook received!")
|
||||||
|
print(f" Task ID: {webhook_data.get('task_id')}")
|
||||||
|
print(f" Status: {webhook_data.get('status')}")
|
||||||
|
print(f" URLs: {webhook_data.get('urls')}")
|
||||||
|
|
||||||
|
if webhook_data.get('status') == 'completed':
|
||||||
|
if 'data' in webhook_data:
|
||||||
|
print(f" ✅ Data included in webhook payload")
|
||||||
|
results = webhook_data['data'].get('results', [])
|
||||||
|
if results:
|
||||||
|
print(f" 📄 Crawled {len(results)} URL(s)")
|
||||||
|
for result in results:
|
||||||
|
print(f" - {result.get('url')}: {len(result.get('markdown', ''))} chars")
|
||||||
|
print("\n🎉 Webhook test PASSED!")
|
||||||
|
exit(0)
|
||||||
|
else:
|
||||||
|
print(f" ❌ Job failed: {webhook_data.get('error')}")
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
print("❌ Webhook not received within 60 seconds")
|
||||||
|
# Try polling as fallback
|
||||||
|
print("⏳ Trying to poll job status...")
|
||||||
|
for i in range(10):
|
||||||
|
status_response = requests.get(f"{CRAWL4AI_BASE_URL}/crawl/job/{task_id}")
|
||||||
|
if status_response.ok:
|
||||||
|
status = status_response.json()
|
||||||
|
print(f" Status: {status.get('status')}")
|
||||||
|
if status.get('status') in ['completed', 'failed']:
|
||||||
|
break
|
||||||
|
time.sleep(2)
|
||||||
|
exit(1)
|
||||||
|
PYTHON_SCRIPT
|
||||||
|
|
||||||
|
# Install Flask for webhook receiver
|
||||||
|
pip install -q flask
|
||||||
|
|
||||||
|
# Run the webhook test
|
||||||
|
log_info "Running webhook test..."
|
||||||
|
python3 /tmp/test_webhook.py &
|
||||||
|
WEBHOOK_PID=$!
|
||||||
|
|
||||||
|
# Wait for test to complete
|
||||||
|
wait $WEBHOOK_PID
|
||||||
|
TEST_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# Step 7: Verify results
|
||||||
|
log_info "Step 7: Verifying test results..."
|
||||||
|
if [ $TEST_EXIT_CODE -eq 0 ]; then
|
||||||
|
log_success "✅ Webhook test PASSED!"
|
||||||
|
else
|
||||||
|
log_error "❌ Webhook test FAILED (exit code: $TEST_EXIT_CODE)"
|
||||||
|
log_info "Server logs:"
|
||||||
|
tail -100 /tmp/crawl4ai_server.log
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Step 8: Cleanup happens automatically via trap
|
||||||
|
log_success "All tests completed successfully! 🎉"
|
||||||
|
log_info "Cleanup will happen automatically..."
|
||||||
Reference in New Issue
Block a user