Compare commits
28 Commits
next
...
release/v0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9f9ea3bb3b | ||
|
|
d58b93c207 | ||
|
|
e2b4705010 | ||
|
|
4a1abd5086 | ||
|
|
04258cd4f2 | ||
|
|
84e462d9f8 | ||
|
|
9546773a07 | ||
|
|
66a979ad11 | ||
|
|
0c31e91b53 | ||
|
|
1b6a31f88f | ||
|
|
b8c261780f | ||
|
|
db6ad7a79d | ||
|
|
004d514f33 | ||
|
|
3a9e2c716e | ||
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
dd5ee752cf | ||
|
|
bde1bba6a2 | ||
|
|
ee25c771d8 |
141
.github/workflows/release.yml
vendored
Normal file
141
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
name: Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
- '!test-v*' # Exclude test tags
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: actions/create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
release_name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
name: Test Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'test-v*'
|
||||
|
||||
jobs:
|
||||
test-release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Testing with version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to Test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to Test PyPI..."
|
||||
twine upload --repository testpypi dist/* || {
|
||||
if [ $? -eq 1 ]; then
|
||||
echo "⚠️ Upload failed - likely version already exists on Test PyPI"
|
||||
echo "Continuing anyway for test purposes..."
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "✅ Test PyPI step complete"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Docker test images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:test-latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||
@@ -216,7 +216,7 @@ Under certain assumptions about link preview accuracy:
|
||||
|
||||
### 8.1 Core Components
|
||||
|
||||
1. **AdaptiveCrawlResult**: Maintains crawl history and metrics
|
||||
1. **CrawlState**: Maintains crawl history and metrics
|
||||
2. **AdaptiveConfig**: Configuration parameters
|
||||
3. **CrawlStrategy**: Pluggable strategy interface
|
||||
4. **AdaptiveCrawler**: Main orchestrator
|
||||
|
||||
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
||||
|
||||
[✨ Check out latest update v0.7.0](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
|
||||
@@ -73,7 +73,7 @@ from .async_url_seeder import AsyncUrlSeeder
|
||||
from .adaptive_crawler import (
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult,
|
||||
CrawlState,
|
||||
CrawlStrategy,
|
||||
StatisticalStrategy
|
||||
)
|
||||
@@ -108,7 +108,7 @@ __all__ = [
|
||||
# Adaptive Crawler
|
||||
"AdaptiveCrawler",
|
||||
"AdaptiveConfig",
|
||||
"AdaptiveCrawlResult",
|
||||
"CrawlState",
|
||||
"CrawlStrategy",
|
||||
"StatisticalStrategy",
|
||||
"DeepCrawlStrategy",
|
||||
|
||||
1847
crawl4ai/adaptive_crawler copy.py
Normal file
1847
crawl4ai/adaptive_crawler copy.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -24,7 +24,7 @@ from crawl4ai.models import Link, CrawlResult
|
||||
import numpy as np
|
||||
|
||||
@dataclass
|
||||
class AdaptiveCrawlResult:
|
||||
class CrawlState:
|
||||
"""Tracks the current state of adaptive crawling"""
|
||||
crawled_urls: Set[str] = field(default_factory=set)
|
||||
knowledge_base: List[CrawlResult] = field(default_factory=list)
|
||||
@@ -80,7 +80,7 @@ class AdaptiveCrawlResult:
|
||||
json.dump(state_dict, f, indent=2)
|
||||
|
||||
@classmethod
|
||||
def load(cls, path: Union[str, Path]) -> 'AdaptiveCrawlResult':
|
||||
def load(cls, path: Union[str, Path]) -> 'CrawlState':
|
||||
"""Load state from disk"""
|
||||
path = Path(path)
|
||||
with open(path, 'r') as f:
|
||||
@@ -256,22 +256,22 @@ class CrawlStrategy(ABC):
|
||||
"""Abstract base class for crawling strategies"""
|
||||
|
||||
@abstractmethod
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate overall confidence that we have sufficient information"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Rank pending links by expected information gain"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Determine if crawling should stop"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update state with new crawl results"""
|
||||
pass
|
||||
|
||||
@@ -284,7 +284,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
self.bm25_k1 = 1.2 # BM25 parameter
|
||||
self.bm25_b = 0.75 # BM25 parameter
|
||||
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate confidence using coverage, consistency, and saturation"""
|
||||
if not state.knowledge_base:
|
||||
return 0.0
|
||||
@@ -303,7 +303,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return confidence
|
||||
|
||||
def _calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_coverage(self, state: CrawlState) -> float:
|
||||
"""Coverage scoring - measures query term presence across knowledge base
|
||||
|
||||
Returns a score between 0 and 1, where:
|
||||
@@ -344,7 +344,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
# This helps differentiate between partial and good coverage
|
||||
return min(1.0, math.sqrt(coverage))
|
||||
|
||||
def _calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_consistency(self, state: CrawlState) -> float:
|
||||
"""Information overlap between pages - high overlap suggests coherent topic coverage"""
|
||||
if len(state.knowledge_base) < 2:
|
||||
return 1.0 # Single or no documents are perfectly consistent
|
||||
@@ -371,7 +371,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return consistency
|
||||
|
||||
def _calculate_saturation(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_saturation(self, state: CrawlState) -> float:
|
||||
"""Diminishing returns indicator - are we still discovering new information?"""
|
||||
if not state.new_terms_history:
|
||||
return 0.0
|
||||
@@ -388,7 +388,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return max(0.0, min(saturation, 1.0))
|
||||
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Rank links by expected information gain"""
|
||||
scored_links = []
|
||||
|
||||
@@ -415,7 +415,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return scored_links
|
||||
|
||||
def _calculate_relevance(self, link: Link, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_relevance(self, link: Link, state: CrawlState) -> float:
|
||||
"""BM25 relevance score between link preview and query"""
|
||||
if not state.query or not link:
|
||||
return 0.0
|
||||
@@ -447,7 +447,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
overlap = len(query_terms & link_terms) / len(query_terms)
|
||||
return overlap
|
||||
|
||||
def _calculate_novelty(self, link: Link, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_novelty(self, link: Link, state: CrawlState) -> float:
|
||||
"""Estimate how much new information this link might provide"""
|
||||
if not state.knowledge_base:
|
||||
return 1.0 # First links are maximally novel
|
||||
@@ -502,7 +502,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return min(score, 1.0)
|
||||
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Determine if crawling should stop"""
|
||||
# Check confidence threshold
|
||||
confidence = state.metrics.get('confidence', 0.0)
|
||||
@@ -523,7 +523,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return False
|
||||
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update state with new crawl results"""
|
||||
for result in new_results:
|
||||
# Track new terms
|
||||
@@ -921,7 +921,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return sorted(scored_links, key=lambda x: x[1], reverse=True)
|
||||
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Coverage-based learning score (0–1)."""
|
||||
# Guard clauses
|
||||
if state.kb_embeddings is None or state.query_embeddings is None:
|
||||
@@ -951,7 +951,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
|
||||
|
||||
# async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
# async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
# """Calculate learning score for adaptive crawling (used for stopping)"""
|
||||
#
|
||||
|
||||
@@ -1021,7 +1021,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
# # For stopping criteria, return learning score
|
||||
# return float(learning_score)
|
||||
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Main entry point for link ranking"""
|
||||
# Store config for use in other methods
|
||||
self.config = config
|
||||
@@ -1052,7 +1052,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
state.kb_embeddings
|
||||
)
|
||||
|
||||
async def validate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def validate_coverage(self, state: CrawlState) -> float:
|
||||
"""Validate coverage using held-out queries with caching"""
|
||||
if not hasattr(self, '_validation_queries') or not self._validation_queries:
|
||||
return state.metrics.get('confidence', 0.0)
|
||||
@@ -1088,7 +1088,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return validation_confidence
|
||||
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Stop based on learning curve convergence"""
|
||||
confidence = state.metrics.get('confidence', 0.0)
|
||||
|
||||
@@ -1139,7 +1139,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return False
|
||||
|
||||
def get_quality_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
def get_quality_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate quality-based confidence score for display"""
|
||||
learning_score = state.metrics.get('learning_score', 0.0)
|
||||
validation_score = state.metrics.get('validation_confidence', 0.0)
|
||||
@@ -1166,7 +1166,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return confidence
|
||||
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update embeddings and coverage metrics with deduplication"""
|
||||
from .utils import get_text_embeddings
|
||||
|
||||
@@ -1246,7 +1246,7 @@ class AdaptiveCrawler:
|
||||
self.strategy = self._create_strategy(self.config.strategy)
|
||||
|
||||
# Initialize state
|
||||
self.state: Optional[AdaptiveCrawlResult] = None
|
||||
self.state: Optional[CrawlState] = None
|
||||
|
||||
# Track if we own the crawler (for cleanup)
|
||||
self._owns_crawler = crawler is None
|
||||
@@ -1266,14 +1266,14 @@ class AdaptiveCrawler:
|
||||
async def digest(self,
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[str] = None) -> AdaptiveCrawlResult:
|
||||
resume_from: Optional[str] = None) -> CrawlState:
|
||||
"""Main entry point for adaptive crawling"""
|
||||
# Initialize or resume state
|
||||
if resume_from:
|
||||
self.state = AdaptiveCrawlResult.load(resume_from)
|
||||
self.state = CrawlState.load(resume_from)
|
||||
self.state.query = query # Update query in case it changed
|
||||
else:
|
||||
self.state = AdaptiveCrawlResult(
|
||||
self.state = CrawlState(
|
||||
crawled_urls=set(),
|
||||
knowledge_base=[],
|
||||
pending_links=[],
|
||||
@@ -1803,7 +1803,7 @@ class AdaptiveCrawler:
|
||||
|
||||
# Initialize state if needed
|
||||
if not self.state:
|
||||
self.state = AdaptiveCrawlResult()
|
||||
self.state = CrawlState()
|
||||
|
||||
# Add imported results
|
||||
self.state.knowledge_base.extend(imported_results)
|
||||
|
||||
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
except Error:
|
||||
visibility_info = await self.check_visibility(page)
|
||||
|
||||
if self.browser_config.config.verbose:
|
||||
if self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
message="Body visibility info: {info}",
|
||||
tag="DEBUG",
|
||||
|
||||
@@ -47,7 +47,6 @@ from .utils import (
|
||||
get_error_context,
|
||||
RobotsParser,
|
||||
preprocess_html_for_schema,
|
||||
should_crawl_based_on_head,
|
||||
)
|
||||
|
||||
|
||||
@@ -269,56 +268,31 @@ class AsyncWebCrawler:
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if cached_result:
|
||||
# Check if SMART mode requires validation
|
||||
if cache_context.cache_mode == CacheMode.SMART:
|
||||
# Perform HEAD check to see if content has changed
|
||||
user_agent = self.crawler_strategy.user_agent if hasattr(self.crawler_strategy, 'user_agent') else "Mozilla/5.0"
|
||||
should_crawl, reason = await should_crawl_based_on_head(
|
||||
url=url,
|
||||
cached_headers=cached_result.response_headers or {},
|
||||
user_agent=user_agent,
|
||||
timeout=5
|
||||
)
|
||||
|
||||
if should_crawl:
|
||||
self.logger.info(
|
||||
f"SMART cache: {reason} - Re-crawling {url}",
|
||||
tag="SMART"
|
||||
)
|
||||
cached_result = None # Force re-crawl
|
||||
else:
|
||||
self.logger.info(
|
||||
f"SMART cache: {reason} - Using cache for {url}",
|
||||
tag="SMART"
|
||||
)
|
||||
|
||||
# Process cached result if still valid
|
||||
if cached_result:
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(
|
||||
cached_result.extracted_content or ""
|
||||
)
|
||||
extracted_content = (
|
||||
None
|
||||
if not extracted_content or extracted_content == "[]"
|
||||
else extracted_content
|
||||
)
|
||||
# If a screenshot is requested but it's not in the cache, set cached_result to None
|
||||
screenshot_data = cached_result.screenshot
|
||||
pdf_data = cached_result.pdf
|
||||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||||
if config.screenshot and not screenshot_data:
|
||||
cached_result = None
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(
|
||||
cached_result.extracted_content or ""
|
||||
)
|
||||
extracted_content = (
|
||||
None
|
||||
if not extracted_content or extracted_content == "[]"
|
||||
else extracted_content
|
||||
)
|
||||
# If a screenshot is requested but it's not in the cache, set cached_result to None
|
||||
screenshot_data = cached_result.screenshot
|
||||
pdf_data = cached_result.pdf
|
||||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||||
if config.screenshot and not screenshot_data:
|
||||
cached_result = None
|
||||
|
||||
if config.pdf and not pdf_data:
|
||||
cached_result = None
|
||||
if config.pdf and not pdf_data:
|
||||
cached_result = None
|
||||
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="FETCH",
|
||||
)
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="FETCH",
|
||||
)
|
||||
|
||||
# Update proxy configuration from rotation strategy if available
|
||||
if config and config.proxy_rotation_strategy:
|
||||
@@ -528,9 +502,12 @@ class AsyncWebCrawler:
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
tables = media.pop("tables", [])
|
||||
links = result.links.model_dump()
|
||||
# media = result.media.model_dump()
|
||||
# tables = media.pop("tables", [])
|
||||
# links = result.links.model_dump()
|
||||
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
|
||||
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
|
||||
metadata = result.metadata
|
||||
|
||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||
|
||||
@@ -11,7 +11,6 @@ class CacheMode(Enum):
|
||||
- READ_ONLY: Only read from cache, don't write
|
||||
- WRITE_ONLY: Only write to cache, don't read
|
||||
- BYPASS: Bypass cache for this operation
|
||||
- SMART: Validate cache with HEAD request before using
|
||||
"""
|
||||
|
||||
ENABLED = "enabled"
|
||||
@@ -19,7 +18,6 @@ class CacheMode(Enum):
|
||||
READ_ONLY = "read_only"
|
||||
WRITE_ONLY = "write_only"
|
||||
BYPASS = "bypass"
|
||||
SMART = "smart"
|
||||
|
||||
|
||||
class CacheContext:
|
||||
@@ -64,14 +62,14 @@ class CacheContext:
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED, READ_ONLY, or SMART, return True.
|
||||
2. If cache_mode is ENABLED or READ_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be read, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY, CacheMode.SMART]
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
|
||||
|
||||
def should_write(self) -> bool:
|
||||
"""
|
||||
@@ -79,14 +77,14 @@ class CacheContext:
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED, WRITE_ONLY, or SMART, return True.
|
||||
2. If cache_mode is ENABLED or WRITE_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be written, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY, CacheMode.SMART]
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
|
||||
|
||||
@property
|
||||
def display_url(self) -> str:
|
||||
|
||||
@@ -27,7 +27,10 @@ from crawl4ai import (
|
||||
PruningContentFilter,
|
||||
BrowserProfiler,
|
||||
DefaultMarkdownGenerator,
|
||||
LLMConfig
|
||||
LLMConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
DFSDeepCrawlStrategy,
|
||||
BestFirstCrawlingStrategy,
|
||||
)
|
||||
from crawl4ai.config import USER_SETTINGS
|
||||
from litellm import completion
|
||||
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl a website and extract content
|
||||
|
||||
Simple Usage:
|
||||
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
|
||||
|
||||
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
||||
|
||||
# Handle deep crawling configuration
|
||||
if deep_crawl:
|
||||
if deep_crawl == "bfs":
|
||||
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "dfs":
|
||||
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "best-first":
|
||||
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
if verbose:
|
||||
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
|
||||
|
||||
config = get_global_config()
|
||||
|
||||
browser_cfg.verbose = config.get("VERBOSE", False)
|
||||
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
|
||||
verbose
|
||||
)
|
||||
|
||||
# Handle deep crawl results (list) vs single result
|
||||
if isinstance(result, list):
|
||||
if len(result) == 0:
|
||||
click.echo("No results found during deep crawling")
|
||||
return
|
||||
# Use the first result for question answering and output
|
||||
main_result = result[0]
|
||||
all_results = result
|
||||
else:
|
||||
# Single result from regular crawling
|
||||
main_result = result
|
||||
all_results = [result]
|
||||
|
||||
# Handle question
|
||||
if question:
|
||||
provider, token = setup_llm_config()
|
||||
markdown = result.markdown.raw_markdown
|
||||
markdown = main_result.markdown.raw_markdown
|
||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||
return
|
||||
|
||||
# Handle output
|
||||
if not output_file:
|
||||
if output == "all":
|
||||
click.echo(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
click.echo(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
click.echo(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
print(result.extracted_content)
|
||||
extracted_items = json.loads(result.extracted_content)
|
||||
print(main_result.extracted_content)
|
||||
extracted_items = json.loads(main_result.extracted_content)
|
||||
click.echo(json.dumps(extracted_items, indent=2))
|
||||
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(result.markdown.raw_markdown)
|
||||
click.echo(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(result.markdown.fit_markdown)
|
||||
click.echo(main_result.markdown.fit_markdown)
|
||||
else:
|
||||
if output == "all":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
f.write(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
f.write(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.extracted_content)
|
||||
f.write(main_result.extracted_content)
|
||||
elif output in ["markdown", "md"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.raw_markdown)
|
||||
f.write(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.fit_markdown)
|
||||
f.write(main_result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
@@ -1354,9 +1401,11 @@ def profiles_cmd():
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl4AI CLI - Web content extraction tool
|
||||
|
||||
Simple Usage:
|
||||
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
||||
bypass_cache=bypass_cache,
|
||||
question=question,
|
||||
verbose=verbose,
|
||||
profile=profile
|
||||
profile=profile,
|
||||
deep_crawl=deep_crawl,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
def main():
|
||||
|
||||
@@ -1088,147 +1088,111 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
@staticmethod
|
||||
def generate_schema(
|
||||
html: str,
|
||||
*,
|
||||
schema_type: str = "CSS", # "CSS" or "XPATH"
|
||||
query: str | None = None,
|
||||
target_json_example: str | None = None,
|
||||
last_instruction: str | None = None, # extra “IMPORTANT” notes
|
||||
llm_config: "LLMConfig" = create_llm_config(),
|
||||
token_usages: Optional[list["TokenUsage"]] = None,
|
||||
prompt: str | None = None,
|
||||
**kwargs,
|
||||
schema_type: str = "CSS", # or XPATH
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llm_config: 'LLMConfig' = create_llm_config(),
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
**kwargs
|
||||
) -> dict:
|
||||
"""
|
||||
Produce a JSON extraction schema from raw HTML.
|
||||
|
||||
- If `query` is given, the task section echoes it.
|
||||
- If no `query` but `target_json_example` exists,
|
||||
we instruct the model to fit the schema to that example.
|
||||
- If neither is provided, we ask the model to detect
|
||||
the most obvious repeating data and build a schema.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A schema compliant with JsonElementExtractionStrategy.
|
||||
Generate extraction schema from HTML content and optional query.
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to analyze
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
provider (str): Legacy Parameter. LLM provider to use
|
||||
api_token (str): Legacy Parameter. API token for LLM provider
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
prompt (str, optional): Custom prompt template to use
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
"""
|
||||
import json, re, textwrap
|
||||
from .prompts import JSON_SCHEMA_BUILDER, JSON_SCHEMA_BUILDER_XPATH
|
||||
from .prompts import JSON_SCHEMA_BUILDER
|
||||
from .utils import perform_completion_with_backoff
|
||||
|
||||
# ─── basic validation ────────────────────────────────────
|
||||
if not html or not html.strip():
|
||||
raise ValueError("html must be non-empty")
|
||||
if schema_type not in {"CSS", "XPATH"}:
|
||||
raise ValueError("schema_type must be 'CSS' or 'XPATH'")
|
||||
for name, msg in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals().get(name) is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {msg}")
|
||||
|
||||
# ─── prompt selection ────────────────────────────────────
|
||||
prompt_template = (
|
||||
prompt
|
||||
if prompt is not None
|
||||
else (JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH)
|
||||
)
|
||||
|
||||
# ─── derive task description ─────────────────────────────
|
||||
if query:
|
||||
task_line = query.strip()
|
||||
elif target_json_example:
|
||||
task_line = (
|
||||
"Use the example JSON below to infer all required fields, "
|
||||
"then generate a schema that extracts matching data."
|
||||
)
|
||||
else:
|
||||
task_line = (
|
||||
"Detect the most obvious repeating data on this page and "
|
||||
"generate a schema that captures it completely."
|
||||
)
|
||||
|
||||
# ─── build user prompt body ──────────────────────────────
|
||||
html_clean = re.sub(r"\s{2,}", " ", textwrap.dedent(html).strip())
|
||||
|
||||
parts: list[str] = [
|
||||
f"{prompt_template}",
|
||||
"\n\n## Extracted HTML\n"
|
||||
"==================== Beginning of Html ====================\n",
|
||||
html_clean,
|
||||
"\n==================== End of Html ====================\n",
|
||||
]
|
||||
|
||||
if target_json_example:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Example of end result\n",
|
||||
target_json_example.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
if last_instruction:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Important\n",
|
||||
last_instruction.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Task:\n",
|
||||
task_line,
|
||||
]
|
||||
)
|
||||
|
||||
user_message = {"role": "user", "content": "".join(parts)}
|
||||
|
||||
# slim system message, JSON_SCHEMA_BUILDER already holds heavy guidance
|
||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals()[name] is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||
|
||||
# Use default or custom prompt
|
||||
prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
|
||||
|
||||
# Build the prompt
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You generate reliable JSON schemas for structured extraction. "
|
||||
"Return valid JSON only."
|
||||
),
|
||||
"role": "system",
|
||||
"content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
|
||||
|
||||
Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
|
||||
|
||||
# Schema main keys:
|
||||
- name: This is the name of the schema.
|
||||
- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
|
||||
- baseFields: This is a list of fields that you extract from the base element itself.
|
||||
- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute".
|
||||
|
||||
# Extra Context:
|
||||
In this context, the following items may or may not be present:
|
||||
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
|
||||
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
|
||||
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.
|
||||
|
||||
# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
|
||||
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.
|
||||
|
||||
# What are the instructions and details for this schema generation?
|
||||
{prompt_template}"""
|
||||
}
|
||||
|
||||
user_message = {
|
||||
"role": "user",
|
||||
"content": f"""
|
||||
HTML to analyze:
|
||||
```html
|
||||
{html}
|
||||
```
|
||||
"""
|
||||
}
|
||||
|
||||
# ─── call LLM ─────────────────────────────────────────────
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join(
|
||||
[system_message["content"], user_message["content"]]
|
||||
),
|
||||
json_response=True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs,
|
||||
)
|
||||
if query:
|
||||
user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
|
||||
if target_json_example:
|
||||
user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
|
||||
|
||||
# ─── token usage accounting ──────────────────────────────
|
||||
if token_usages is not None and hasattr(response, "usage"):
|
||||
token_usages.append(
|
||||
TokenUsage(
|
||||
completion_tokens=getattr(response.usage, "completion_tokens", 0),
|
||||
prompt_tokens=getattr(response.usage, "prompt_tokens", 0),
|
||||
total_tokens=getattr(response.usage, "total_tokens", 0),
|
||||
)
|
||||
)
|
||||
if query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
|
||||
elif not query and target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
|
||||
elif not query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
||||
|
||||
user_message["content"] += """IMPORTANT:
|
||||
0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
||||
1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
|
||||
2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
|
||||
3/ Do not use Regex as much as possible.
|
||||
|
||||
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
||||
"""
|
||||
|
||||
# ─── parse and validate JSON answer ──────────────────────
|
||||
try:
|
||||
schema = json.loads(response.choices[0].message.content)
|
||||
except Exception as exc:
|
||||
raise ValueError(f"LLM returned invalid JSON: {exc}") from exc
|
||||
|
||||
required = {"name", "baseSelector", "fields"}
|
||||
if not required.issubset(schema):
|
||||
missing = required - set(schema)
|
||||
raise ValueError(f"Generated schema missing required keys: {missing}")
|
||||
|
||||
return schema
|
||||
|
||||
|
||||
# Call LLM with backoff handling
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||
json_response = True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
|
||||
# Extract and return schema
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
|
||||
@@ -1056,7 +1056,7 @@ Your output must:
|
||||
</output_requirements>
|
||||
"""
|
||||
|
||||
GENERATE_SCRIPT_PROMPT = r"""You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
GENERATE_SCRIPT_PROMPT = """You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
|
||||
Your scripts run **before the crawl** to handle dynamic content, user interactions, and other obstacles. You are a master of two tools: raw **JavaScript** and the high-level **Crawl4ai Script (c4a)**.
|
||||
|
||||
|
||||
@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
|
||||
# Default: use sentence-transformers
|
||||
else:
|
||||
# Lazy load to avoid importing heavy libraries unless needed
|
||||
from sentence_transformers import SentenceTransformer
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for local embeddings. "
|
||||
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||
)
|
||||
|
||||
# Cache the model in function attribute to avoid reloading
|
||||
if not hasattr(get_text_embeddings, '_models'):
|
||||
@@ -3387,90 +3393,3 @@ def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
||||
"""Calculate cosine distance (1 - similarity) between two vectors"""
|
||||
return 1 - cosine_similarity(vec1, vec2)
|
||||
|
||||
|
||||
async def should_crawl_based_on_head(
    url: str,
    cached_headers: Dict[str, str],
    user_agent: str = "Mozilla/5.0",
    timeout: int = 5
) -> tuple[bool, str]:
    """
    Check if content has changed using a conditional HEAD request.

    The request carries ``If-None-Match`` / ``If-Modified-Since`` built from
    the cached headers when those are present. The response is then checked,
    in order, for: a 304 status, a matching Content-Digest, a matching strong
    ETag, and a Last-Modified that is not newer than the cached one. Any of
    these means "unchanged". A differing Content-Length, or no conclusive
    header at all, means "changed".

    Header names are compared case-insensitively on both the cached and the
    fresh side.

    Args:
        url: The URL to check
        cached_headers: The cached response headers from previous crawl
        user_agent: User agent string to use for the HEAD request
        timeout: Timeout in seconds for the HEAD request

    Returns:
        Tuple of (should_crawl: bool, reason: str)
        - should_crawl: True if content has changed and should be re-crawled, False otherwise
        - reason: Explanation of the decision
    """
    import email.utils

    if not cached_headers:
        return True, "No cached headers available, must crawl"

    # HTTP header names are case-insensitive; normalise the cached side so the
    # lowercase lookups below work however the cache stored the names.
    cached = {str(name).lower(): value for name, value in cached_headers.items()}

    headers = {
        "Accept-Encoding": "identity",
        "User-Agent": user_agent,
        "Want-Content-Digest": "sha-256",  # Request RFC 9530 digest
    }

    # Add conditional headers if available in cache
    if cached.get("etag"):
        headers["If-None-Match"] = cached["etag"]
    if cached.get("last-modified"):
        headers["If-Modified-Since"] = cached["last-modified"]

    try:
        async with aiohttp.ClientSession() as session:
            async with session.head(
                url,
                headers=headers,
                timeout=aiohttp.ClientTimeout(total=timeout),
                allow_redirects=True
            ) as response:
                # 304 Not Modified - content hasn't changed
                if response.status == 304:
                    return False, "304 Not Modified - Content unchanged"

                # Lower-case the response header names as well. Converting the
                # response headers with plain dict() keeps the casing sent by
                # the server (e.g. "ETag"), so lowercase lookups would miss.
                # setdefault keeps the first value of a repeated header.
                new_headers: Dict[str, str] = {}
                for name, value in response.headers.items():
                    new_headers.setdefault(name.lower(), value)

                # Check Content-Digest (most reliable)
                if new_headers.get("content-digest") and cached.get("content-digest"):
                    if new_headers["content-digest"] == cached["content-digest"]:
                        return False, "Content-Digest matches - Content unchanged"

                # Check strong ETag
                if new_headers.get("etag") and cached.get("etag"):
                    # Strong ETags start with '"' (weak ones start with W/)
                    if (new_headers["etag"].startswith('"') and
                            new_headers["etag"] == cached["etag"]):
                        return False, "Strong ETag matches - Content unchanged"

                # Check Last-Modified
                if new_headers.get("last-modified") and cached.get("last-modified"):
                    try:
                        new_lm = email.utils.parsedate_to_datetime(new_headers["last-modified"])
                        cached_lm = email.utils.parsedate_to_datetime(cached["last-modified"])
                        if new_lm <= cached_lm:
                            return False, "Last-Modified not newer - Content unchanged"
                    except (TypeError, ValueError):
                        # Unparseable date, or naive vs. aware datetimes that
                        # cannot be compared: treat as inconclusive and fall
                        # through to the remaining checks.
                        pass

                # Content-Length changed is a positive signal
                if (new_headers.get("content-length") and cached.get("content-length") and
                        new_headers["content-length"] != cached["content-length"]):
                    return True, f"Content-Length changed ({cached['content-length']} -> {new_headers['content-length']})"

                # Default: assume content has changed
                return True, "No definitive cache headers matched - Assuming content changed"

    except Exception as e:
        # On error, assume content has changed (safe default)
        return True, f"HEAD request failed: {str(e)} - Assuming content changed"
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
server_memory_mb = _get_memory_mb()
|
||||
result_dict = result.model_dump()
|
||||
result_dict['server_memory_mb'] = server_memory_mb
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||
yield data.encode('utf-8')
|
||||
@@ -443,10 +447,19 @@ async def handle_crawl_request(
|
||||
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
|
||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||
|
||||
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results],
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
|
||||
@@ -9,7 +9,7 @@ import asyncio
|
||||
import re
|
||||
from typing import List, Dict, Set
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, Link
|
||||
from crawl4ai.adaptive_crawler import CrawlState, Link
|
||||
import math
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ class APIDocumentationStrategy:
|
||||
r'/legal/'
|
||||
]
|
||||
|
||||
def score_link(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
def score_link(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
"""Custom link scoring for API documentation"""
|
||||
score = 1.0
|
||||
url = link.href.lower()
|
||||
@@ -77,7 +77,7 @@ class APIDocumentationStrategy:
|
||||
|
||||
return score
|
||||
|
||||
def calculate_api_coverage(self, state: AdaptiveCrawlResult, query: str) -> Dict[str, float]:
|
||||
def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
|
||||
"""Calculate specialized coverage metrics for API documentation"""
|
||||
metrics = {
|
||||
'endpoint_coverage': 0.0,
|
||||
|
||||
@@ -8,8 +8,6 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
from crawl4ai.prompts import GENERATE_SCRIPT_PROMPT
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
"""
|
||||
SMART Cache Mode Example for Crawl4AI
|
||||
|
||||
This example demonstrates how to use the SMART cache mode to intelligently
|
||||
validate cached content before using it. SMART mode can save 70-95% bandwidth
|
||||
on unchanged content while ensuring you always get fresh data when it changes.
|
||||
|
||||
SMART Cache Mode: Only Crawl When Changes
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
|
||||
async def basic_smart_cache_example():
    """Crawl one URL with the cache enabled, then again in SMART mode.

    Prints the HTML size of both crawls, the wall-clock duration of the
    SMART crawl, and whether both crawls returned identical HTML.
    """
    print("=== Basic SMART Cache Example ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        target = "https://example.com"

        # Step 1: a regular cached crawl so a cache entry exists.
        print("1. Initial crawl to cache the content:")
        cached_run = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        first = await crawler.arun(url=target, config=cached_run)
        first_size = len(first.html)
        print(f" Initial crawl: {first_size} bytes\n")

        # Step 2: the same URL in SMART mode, timed.
        print("2. SMART mode crawl (should use cache for static content):")
        smart_run = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        began = time.time()
        second = await crawler.arun(url=target, config=smart_run)
        took = time.time() - began
        second_size = len(second.html)
        same_html = first.html == second.html
        print(f" SMART crawl: {second_size} bytes in {took:.2f}s")
        print(f" Content identical: {same_html}\n")
|
||||
|
||||
|
||||
async def news_site_monitoring():
    """Poll Hacker News three times in SMART mode and report size changes.

    Each crawl's HTML length is compared with the previous crawl's length.
    Consecutive checks are separated by a 10-second pause.
    """
    print("=== News Site Monitoring Example ===\n")

    total_checks = 3
    async with AsyncWebCrawler(verbose=True) as crawler:
        smart_run = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        target = "https://news.ycombinator.com"

        print("Monitoring Hacker News for changes...\n")

        last_size = 0
        for check_no in range(1, total_checks + 1):
            page = await crawler.arun(url=target, config=smart_run)
            size = len(page.html)

            if check_no == 1:
                print(f"Check {check_no}: Initial crawl - {size} bytes")
            elif size != last_size:
                print(f"Check {check_no}: Content changed! {last_size} -> {size} bytes")
            else:
                print(f"Check {check_no}: Content unchanged - {size} bytes")

            last_size = size

            # No pause is needed once the final check has run.
            if check_no < total_checks:
                print(" Waiting 10 seconds before next check...")
                await asyncio.sleep(10)

    print()
|
||||
|
||||
|
||||
async def compare_cache_modes():
    """Time the same URL under the ENABLED, BYPASS and SMART cache modes.

    The cache is populated first with an ENABLED crawl. Each mode is then run
    once, and its elapsed time and HTML size are printed.
    """
    print("=== Cache Mode Comparison ===\n")

    async with AsyncWebCrawler(verbose=False) as crawler:
        target = "https://www.wikipedia.org"

        # Warm-up crawl so a cache entry exists before the comparison.
        warmup = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        await crawler.arun(url=target, config=warmup)
        print("Cache populated.\n")

        # (mode, label) pairs, run in this order.
        labelled_modes = (
            (CacheMode.ENABLED, "ENABLED (always uses cache if available)"),
            (CacheMode.BYPASS, "BYPASS (never uses cache)"),
            (CacheMode.SMART, "SMART (validates cache before using)"),
        )

        for cache_mode, label in labelled_modes:
            run_config = CrawlerRunConfig(cache_mode=cache_mode)
            began = time.time()
            outcome = await crawler.arun(url=target, config=run_config)
            took = time.time() - began

            print(f"{label}:")
            print(f" Time: {took:.2f}s")
            print(f" Size: {len(outcome.html)} bytes\n")
|
||||
|
||||
|
||||
async def dynamic_content_example():
    """Show how SMART mode handles dynamic content.

    Crawls https://httpbin.org/uuid once with the cache enabled and once in
    SMART mode, extracts the UUID from each response with a regex, and prints
    whether the two UUIDs differ. The comparison is printed only when a UUID
    was found in both responses.
    """
    import re

    print("=== Dynamic Content Example ===\n")

    # Same pattern for both responses: captures the value of the "uuid" key.
    uuid_pattern = r'"uuid":\s*"([^"]+)"'

    async with AsyncWebCrawler(verbose=True) as crawler:
        # URL that returns different content each time
        dynamic_url = "https://httpbin.org/uuid"

        print("Testing with dynamic content (changes every request):\n")

        # First crawl
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result1 = await crawler.arun(url=dynamic_url, config=config)

        # Extract UUID from the response
        uuid1 = re.search(uuid_pattern, result1.html)
        if uuid1:
            print(f"1. First crawl UUID: {uuid1.group(1)}")

        # SMART mode crawl - should detect change and re-crawl
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        result2 = await crawler.arun(url=dynamic_url, config=smart_config)

        uuid2 = re.search(uuid_pattern, result2.html)
        if uuid2:
            print(f"2. SMART crawl UUID: {uuid2.group(1)}")

        # Compare only when both matches exist: re.search returns None on a
        # miss, and calling .group() on None raises AttributeError.
        if uuid1 and uuid2:
            print(f" Different UUIDs: {uuid1.group(1) != uuid2.group(1)} (should be True)")
|
||||
|
||||
|
||||
async def bandwidth_savings_demo():
    """Demonstrate bandwidth savings with SMART mode.

    Crawls a fixed list of URLs with the cache enabled, summing the HTML
    sizes, then crawls the same URLs again in SMART mode and prints a
    summary line.
    """
    print("=== Bandwidth Savings Demo ===\n")

    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://www.python.org",
            "https://docs.python.org/3/",
        ]

        print("Crawling multiple URLs twice to show bandwidth savings:\n")

        # First pass: Cache all URLs
        print("First pass - Caching all URLs:")
        total_bytes_pass1 = 0
        config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)

        for url in urls:
            result = await crawler.arun(url=url, config=config)
            size = len(result.html)
            total_bytes_pass1 += size
            print(f" {url}: {size} bytes")

        print(f"\nTotal downloaded in first pass: {total_bytes_pass1} bytes")

        # Second pass: Use SMART mode
        print("\nSecond pass - Using SMART mode:")
        smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

        for url in urls:
            result = await crawler.arun(url=url, config=smart_config)
            # NOTE(review): this label depends only on the truthiness of the
            # returned result object, not on whether the cache was actually
            # used. Presumably it always prints "cache"; confirm and replace
            # with a real cache-hit indicator if the result exposes one.
            print(f" {url}: Using {'cache' if result else 'fresh crawl'}")

        # NOTE(review): the figure below is the first-pass total and is
        # printed unconditionally; it is an upper bound, not a measurement.
        print(f"\nBandwidth saved: ~{total_bytes_pass1} bytes (only HEAD requests sent)")
|
||||
|
||||
|
||||
async def main():
    """Execute every SMART cache demo, one after another."""
    demos = (
        basic_smart_cache_example,
        news_site_monitoring,
        compare_cache_modes,
        dynamic_content_example,
        bandwidth_savings_demo,
    )
    divider = "\n" + "=" * 50 + "\n"

    for run_demo in demos:
        await run_demo()
        print(divider)
        # Short pause before moving on to the next demo.
        await asyncio.sleep(2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Print a short banner describing the demos, then run them all.
    banner = """
Crawl4AI SMART Cache Mode Examples
==================================

These examples demonstrate the SMART cache mode that intelligently
validates cached content using HEAD requests before deciding whether
to use cache or perform a fresh crawl.

"""
    print(banner)
    asyncio.run(main())
|
||||
@@ -130,7 +130,7 @@ Factors:
|
||||
|
||||
```python
|
||||
class CustomLinkScorer:
|
||||
def score(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
# Prioritize specific URL patterns
|
||||
if "/api/reference/" in link.href:
|
||||
return 2.0 # Double the score
|
||||
@@ -325,17 +325,17 @@ with open("crawl_analysis.json", "w") as f:
|
||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
||||
|
||||
class DomainSpecificStrategy(BaseStrategy):
|
||||
def calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_coverage(self, state: CrawlState) -> float:
|
||||
# Custom coverage calculation
|
||||
# e.g., weight certain terms more heavily
|
||||
pass
|
||||
|
||||
def calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_consistency(self, state: CrawlState) -> float:
|
||||
# Custom consistency logic
|
||||
# e.g., domain-specific validation
|
||||
pass
|
||||
|
||||
def rank_links(self, links: List[Link], state: AdaptiveCrawlResult) -> List[Link]:
|
||||
def rank_links(self, links: List[Link], state: CrawlState) -> List[Link]:
|
||||
# Custom link ranking
|
||||
# e.g., prioritize specific URL patterns
|
||||
pass
|
||||
@@ -359,7 +359,7 @@ class HybridStrategy(BaseStrategy):
|
||||
URLPatternStrategy()
|
||||
]
|
||||
|
||||
def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_confidence(self, state: CrawlState) -> float:
|
||||
# Weighted combination of strategies
|
||||
scores = [s.calculate_confidence(state) for s in self.strategies]
|
||||
weights = [0.5, 0.3, 0.2]
|
||||
|
||||
@@ -27,7 +27,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> AdaptiveCrawlResult
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
@@ -38,7 +38,7 @@ async def digest(
|
||||
|
||||
#### Returns
|
||||
|
||||
- **AdaptiveCrawlResult**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
- **CrawlState**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
|
||||
#### Example
|
||||
|
||||
@@ -92,7 +92,7 @@ Access to the current crawl state.
|
||||
|
||||
```python
|
||||
@property
|
||||
def state(self) -> AdaptiveCrawlResult
|
||||
def state(self) -> CrawlState
|
||||
```
|
||||
|
||||
## Methods
|
||||
|
||||
@@ -9,7 +9,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> AdaptiveCrawlResult
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
## Parameters
|
||||
@@ -31,7 +31,7 @@ async def digest(
|
||||
|
||||
## Return Value
|
||||
|
||||
Returns an `AdaptiveCrawlResult` object containing:
|
||||
Returns a `CrawlState` object containing:
|
||||
|
||||
- **crawled_urls** (`Set[str]`): All URLs that have been crawled
|
||||
- **knowledge_base** (`List[CrawlResult]`): Collection of crawled pages with content
|
||||
|
||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
||||
|
||||
Want to learn by doing? We've got you covered:
|
||||
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||
|
||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
||||
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
### Running the Tutorial Locally
|
||||
|
||||
|
||||
@@ -19,7 +19,6 @@ The new system uses a single `CacheMode` enum:
|
||||
- `CacheMode.READ_ONLY`: Only read from cache
|
||||
- `CacheMode.WRITE_ONLY`: Only write to cache
|
||||
- `CacheMode.BYPASS`: Skip cache for this operation
|
||||
- `CacheMode.SMART`: **NEW** - Intelligently validate cache with HEAD requests
|
||||
|
||||
## Migration Example
|
||||
|
||||
@@ -73,128 +72,4 @@ if __name__ == "__main__":
|
||||
| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` |
|
||||
| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`|
|
||||
| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` |
|
||||
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
|
||||
|
||||
## SMART Cache Mode: Only Crawl When Content Changes
|
||||
|
||||
Starting from version 0.7.1, Crawl4AI introduces the **SMART cache mode** - an intelligent caching strategy that validates cached content before using it. This mode uses HTTP HEAD requests to check if content has changed, potentially saving 70-95% bandwidth on unchanged content.
|
||||
|
||||
### How SMART Mode Works
|
||||
|
||||
When you use `CacheMode.SMART`, Crawl4AI:
|
||||
|
||||
1. **Retrieves cached content** (if available)
|
||||
2. **Sends a HEAD request** with conditional headers (ETag, Last-Modified)
|
||||
3. **Validates the response**:
|
||||
- If server returns `304 Not Modified` → uses cache
|
||||
- If content changed → performs fresh crawl
|
||||
- If headers indicate changes → performs fresh crawl
|
||||
|
||||
### Benefits
|
||||
|
||||
- **Bandwidth Efficient**: Only downloads full content when necessary
|
||||
- **Always Fresh**: Ensures you get the latest content when it changes
|
||||
- **Cost Effective**: Reduces API calls and bandwidth usage
|
||||
- **Intelligent**: Uses multiple signals to detect changes (ETag, Last-Modified, Content-Length)
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def smart_crawl():
    """Crawl one page with the cache enabled, then again in SMART mode."""
    target = "https://example.com"

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Initial crawl: populates the cache.
        caching_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        first = await crawler.arun(url=target, config=caching_config)
        print(f"First crawl: {len(first.html)} bytes")

        # Follow-up crawl: SMART mode decides between cache and re-crawl.
        validating_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        second = await crawler.arun(url=target, config=validating_config)
        print(f"SMART crawl: {len(second.html)} bytes (from cache if unchanged)")
|
||||
|
||||
asyncio.run(smart_crawl())
|
||||
```
|
||||
|
||||
### When to Use SMART Mode
|
||||
|
||||
SMART mode is ideal for:
|
||||
|
||||
- **Periodic crawling** of websites that update irregularly
|
||||
- **News sites** where you want fresh content but avoid re-downloading unchanged pages
|
||||
- **API endpoints** that provide proper caching headers
|
||||
- **Large-scale crawling** where bandwidth costs are significant
|
||||
|
||||
### How It Detects Changes
|
||||
|
||||
SMART mode checks these signals in order:
|
||||
|
||||
1. **304 Not Modified** status (most reliable)
|
||||
2. **Content-Digest** header (RFC 9530)
|
||||
3. **Strong ETag** comparison
|
||||
4. **Last-Modified** timestamp
|
||||
5. **Content-Length** changes (as a hint)
|
||||
|
||||
### Example: News Site Monitoring
|
||||
|
||||
```python
|
||||
async def monitor_news_site():
    """Poll a news page three times in SMART mode, five minutes apart.

    The pause is only taken between checks; the original example also slept
    for five minutes after the final check, delaying exit for no benefit.
    """
    total_checks = 3

    async with AsyncWebCrawler(verbose=True) as crawler:
        config = CrawlerRunConfig(cache_mode=CacheMode.SMART)

        # Check multiple times
        for i in range(total_checks):
            result = await crawler.arun(
                url="https://news.ycombinator.com",
                config=config
            )

            # SMART mode will only re-crawl if content changed
            print(f"Check {i+1}: Retrieved {len(result.html)} bytes")

            if i < total_checks - 1:
                await asyncio.sleep(300)  # Wait 5 minutes before the next check
|
||||
|
||||
asyncio.run(monitor_news_site())
|
||||
```
|
||||
|
||||
### Understanding SMART Mode Logs
|
||||
|
||||
When using SMART mode with `verbose=True`, you'll see informative logs:
|
||||
|
||||
```
|
||||
[SMART] ℹ SMART cache: 304 Not Modified - Content unchanged - Using cache for https://example.com
|
||||
[SMART] ℹ SMART cache: Content-Length changed (12345 -> 12789) - Re-crawling https://example.com
|
||||
[SMART] ℹ SMART cache: No definitive cache headers matched - Assuming content changed - Re-crawling https://example.com
|
||||
```
|
||||
|
||||
### Limitations
|
||||
|
||||
- Some servers don't properly support HEAD requests
|
||||
- Dynamic content without proper cache headers will always be re-crawled
|
||||
- Content changes must be reflected in HTTP headers for detection
|
||||
|
||||
### Advanced Example
|
||||
|
||||
For a complete example demonstrating SMART mode with both static and dynamic content, check out `docs/examples/smart_cache.py`.
|
||||
|
||||
## Cache Mode Reference
|
||||
|
||||
| Mode | Read from Cache | Write to Cache | Use Case |
|
||||
|------|----------------|----------------|----------|
|
||||
| `ENABLED` | ✓ | ✓ | Normal operation |
|
||||
| `DISABLED` | ✗ | ✗ | No caching needed |
|
||||
| `READ_ONLY` | ✓ | ✗ | Use existing cache only |
|
||||
| `WRITE_ONLY` | ✗ | ✓ | Refresh cache only |
|
||||
| `BYPASS` | ✗ | ✗ | Skip cache for this request |
|
||||
| `SMART` | ✓* | ✓ | Validate before using cache |
|
||||
|
||||
*SMART mode reads from cache but validates it first with a HEAD request.
|
||||
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
|
||||
@@ -37,12 +37,6 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
|
||||
## Caching & Performance
|
||||
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| SMART Cache Mode | Demonstrates the intelligent SMART cache mode that validates cached content using HEAD requests, saving 70-95% bandwidth while ensuring fresh content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/smart_cache.py) |
|
||||
|
||||
## Extraction Strategies
|
||||
|
||||
| Example | Description | Link |
|
||||
|
||||
@@ -79,7 +79,7 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`. For intelligent caching that validates content before using cache, use the new `CacheMode.SMART` - it saves bandwidth while ensuring fresh content.
|
||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`
|
||||
|
||||
We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
||||
|
||||
|
||||
@@ -30,6 +30,8 @@ dependencies = [
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
"psutil>=6.1.1",
|
||||
"nltk>=3.9.1",
|
||||
"playwright",
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx>=0.27.2",
|
||||
@@ -57,20 +59,20 @@ classifiers = [
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
pdf = ["pypdf>=3.0.0"] # PyPDF2 is deprecated, use pypdf instead
|
||||
torch = ["torch>=2.0.0", "nltk>=3.9.1", "scikit-learn>=1.3.0"]
|
||||
transformer = ["transformers>=4.34.0", "tokenizers>=0.15.0", "sentence-transformers>=2.2.0"]
|
||||
cosine = ["torch>=2.0.0", "transformers>=4.34.0", "nltk>=3.9.1", "sentence-transformers>=2.2.0"]
|
||||
sync = ["selenium>=4.0.0"]
|
||||
pdf = ["PyPDF2"]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"pypdf>=3.0.0",
|
||||
"torch>=2.0.0",
|
||||
"nltk>=3.9.1",
|
||||
"scikit-learn>=1.3.0",
|
||||
"transformers>=4.34.0",
|
||||
"tokenizers>=0.15.0",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"selenium>=4.0.0"
|
||||
"PyPDF2",
|
||||
"torch",
|
||||
"nltk",
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"sentence-transformers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -24,7 +24,6 @@ cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
sentence-transformers>=2.2.0
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult
|
||||
CrawlState
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ import math
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, StatisticalStrategy
|
||||
from crawl4ai.adaptive_crawler import CrawlState, StatisticalStrategy
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ class ConfidenceTestHarness:
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize state
|
||||
state = AdaptiveCrawlResult(query=self.query)
|
||||
state = CrawlState(query=self.query)
|
||||
|
||||
# Create crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -107,7 +107,7 @@ class ConfidenceTestHarness:
|
||||
|
||||
state.metrics['prev_confidence'] = confidence
|
||||
|
||||
def _debug_coverage_calculation(self, state: AdaptiveCrawlResult, query_terms: List[str]):
|
||||
def _debug_coverage_calculation(self, state: CrawlState, query_terms: List[str]):
|
||||
"""Debug coverage calculation step by step"""
|
||||
coverage_score = 0.0
|
||||
max_possible_score = 0.0
|
||||
@@ -136,7 +136,7 @@ class ConfidenceTestHarness:
|
||||
new_coverage = self._calculate_coverage_new(state, query_terms)
|
||||
print(f" → New Coverage: {new_coverage:.3f}")
|
||||
|
||||
def _calculate_coverage_new(self, state: AdaptiveCrawlResult, query_terms: List[str]) -> float:
|
||||
def _calculate_coverage_new(self, state: CrawlState, query_terms: List[str]) -> float:
|
||||
"""New coverage calculation without IDF"""
|
||||
if not query_terms or state.total_documents == 0:
|
||||
return 0.0
|
||||
|
||||
@@ -15,7 +15,7 @@ import os
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import EmbeddingStrategy, AdaptiveCrawlResult
|
||||
from crawl4ai.adaptive_crawler import EmbeddingStrategy, CrawlState
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ async def test_embedding_performance():
|
||||
strategy.config = config
|
||||
|
||||
# Initialize state
|
||||
state = AdaptiveCrawlResult()
|
||||
state = CrawlState()
|
||||
state.query = "async await coroutines event loops tasks"
|
||||
|
||||
# Start performance monitoring
|
||||
|
||||
@@ -20,7 +20,7 @@ from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult
|
||||
CrawlState
|
||||
)
|
||||
|
||||
console = Console()
|
||||
|
||||
@@ -1,211 +0,0 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import email.utils
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    A HEAD request is sent with ``If-None-Match`` / ``If-Modified-Since`` set
    from ``cache`` when available. The response is then checked, in order,
    for: a 304 status, a matching Content-Digest, a matching strong ETag, a
    Last-Modified that is not newer than the cached one, and finally
    Content-Length (which never proves the content is unchanged).

    Args:
        url: The URL to check
        cache: Previous cache data containing etag, last_modified, digest,
            content_length. ``content_length`` may be an int or a numeric
            string; both are compared numerically.

    Returns:
        True if the page has changed and should be crawled, False otherwise.
        Any error while checking also returns True.
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
    }

    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

            # 304 Not Modified - content hasn't changed
            if response.status_code == 304:
                print(f"✓ 304 Not Modified - No need to crawl {url}")
                return False

            h = response.headers

            # Check Content-Digest (most reliable)
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                print(f"✓ Content-Digest matches - No need to crawl {url}")
                return False

            # Check strong ETag (weak ETags start with W/ and are skipped)
            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                print(f"✓ Strong ETag matches - No need to crawl {url}")
                return False

            # Check Last-Modified
            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    not_newer = lm_new <= lm_old
                except (TypeError, ValueError):
                    # Unparseable date, or naive/aware datetimes that cannot
                    # be compared: treat this signal as unavailable.
                    not_newer = False
                if not_newer:
                    print(f"✓ Last-Modified not newer - No need to crawl {url}")
                    return False

            # Check Content-Length (weakest signal - only as a hint, not definitive)
            # Note: Same content length doesn't mean same content!
            # This should only be used when no other signals are available
            if h.get("content-length") and cache.get("content_length"):
                try:
                    new_length = int(h["content-length"])
                    old_length = int(cache["content_length"])
                except (TypeError, ValueError):
                    # Malformed length: fall through to the generic verdict.
                    new_length = old_length = None
                if new_length is not None:
                    if new_length != old_length:
                        print(f"✗ Content-Length changed - Should crawl {url}")
                    else:
                        print(f"⚠️ Content-Length unchanged but content might have changed - Should crawl {url}")
                    return True  # When in doubt, crawl!

            print(f"✗ Content has changed - Should crawl {url}")
            return True

    except Exception as e:
        print(f"✗ Error checking {url}: {e}")
        return True  # On error, assume we should crawl
|
||||
|
||||
|
||||
async def crawl_page(url: str) -> Dict[str, str]:
    """
    Fetch a page with GET and collect its cache-related response headers.

    Returns a dict holding whichever of ``etag``, ``last_modified``,
    ``digest`` and ``content_length`` the response provided
    (``content_length`` is stored as an int).
    """
    print(f"\n🕷️ Crawling {url}...")

    # (response header, cache key, label used in the log line)
    tracked_headers = (
        ("etag", "etag", "ETag"),
        ("last-modified", "last_modified", "Last-Modified"),
        ("content-digest", "digest", "Content-Digest"),
        ("content-length", "content_length", "Content-Length"),
    )

    async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
        response = await client.get(url)

        stored = {}
        for header_name, cache_key, label in tracked_headers:
            raw_value = response.headers.get(header_name)
            if not raw_value:
                continue
            if cache_key == "content_length":
                stored[cache_key] = int(raw_value)
            else:
                stored[cache_key] = raw_value
            print(f" Stored {label}: {raw_value}")

        print(f" Response size: {len(response.content)} bytes")
        return stored
|
||||
|
||||
|
||||
async def test_static_site():
    """Crawl example.com once, then confirm a re-crawl is judged unnecessary."""
    rule = "=" * 60
    print(rule)
    print("Testing with static site: example.com")
    print(rule)

    url = "https://example.com"

    # The initial crawl always happens and yields the cache headers.
    cache = await crawl_page(url)

    # Give the server a moment before asking again.
    await asyncio.sleep(2)

    # A static page is expected to come back as unchanged.
    print("\n📊 Checking if we need to re-crawl...")
    if await should_crawl(url, cache):
        print("❌ Unexpected: Static content flagged as changed")
    else:
        print("✅ Correctly identified: No need to re-crawl static content")
|
||||
|
||||
|
||||
async def test_dynamic_site():
    """Run the crawl / re-check cycle against several frequently changing URLs."""
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with dynamic sites")
    print(rule)

    # Endpoints whose responses are expected to differ between requests.
    dynamic_sites = (
        "https://api.github.com/",  # GitHub API root (changes with rate limit info)
        "https://worldtimeapi.org/api/timezone/UTC",  # Current time API
        "https://httpbin.org/uuid",  # Generates new UUID each request
    )

    for url in dynamic_sites:
        print(f"\n🔄 Testing {url}")
        try:
            # Initial crawl, a short wait, then the change check.
            cache = await crawl_page(url)
            await asyncio.sleep(2)

            print("\n📊 Checking if we need to re-crawl...")
            changed = await should_crawl(url, cache)

            verdict = (
                "✅ Correctly identified: Dynamic content has changed"
                if changed
                else "⚠️ Note: Dynamic content appears unchanged (might have caching)"
            )
            print(verdict)
        except Exception as e:
            # One failing site must not stop the remaining ones.
            print(f"❌ Error testing {url}: {e}")
|
||||
|
||||
|
||||
async def test_conditional_get():
    """Test ETag validation against an endpoint with a known ETag.

    Only ``should_crawl`` is exercised here (a HEAD request carrying
    ``If-None-Match``); no conditional GET is issued. The cached ETag is the
    one the URL is built from, so the expected outcome is "no crawl needed".
    The other outcome is reported as a failure instead of a second success.
    """
    print("\n" + "=" * 60)
    print("Testing conditional GET scenario")
    print("=" * 60)

    url = "https://httpbin.org/etag/test-etag-123"

    # Simulate a scenario where we have an ETag
    cache = {"etag": '"test-etag-123"'}

    print(f"Testing with cached ETag: {cache['etag']}")
    needs_crawl = await should_crawl(url, cache)

    if not needs_crawl:
        print("✅ ETag matched - no crawl needed")
    else:
        # NOTE(review): assumes httpbin honours If-None-Match on HEAD for
        # /etag/{etag}, as its documentation describes - confirm.
        print("❌ Unexpected: cached ETag was not recognised - crawl flagged as needed")
|
||||
|
||||
|
||||
async def main():
    """Run each HEAD-based change-detection scenario in order."""
    print("🚀 Starting HEAD request change detection tests\n")

    scenarios = (test_static_site, test_dynamic_site, test_conditional_get)
    for scenario in scenarios:
        await scenario()

    print("\n✨ All tests completed!")


if __name__ == "__main__":
    # Script entry point.
    asyncio.run(main())
|
||||
@@ -1,186 +0,0 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import email.utils
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
    """
    Check if a URL should be crawled based on HEAD request headers.

    Sends a HEAD request (conditional when ``cache`` holds ``etag`` or
    ``last_modified``), prints the status and headers received, and returns
    False only when the response shows the content is unchanged: a 304
    status, a matching Content-Digest, a matching strong ETag, or a
    Last-Modified that is not newer than the cached value. Every other
    outcome, including any error, returns True.
    """
    if cache is None:
        cache = {}

    headers = {
        "Accept-Encoding": "identity",
        "Want-Content-Digest": "sha-256",
        "User-Agent": "Mozilla/5.0 (compatible; crawl4ai/1.0)"
    }

    if cache.get("etag"):
        headers["If-None-Match"] = cache["etag"]
    if cache.get("last_modified"):
        headers["If-Modified-Since"] = cache["last_modified"]

    try:
        async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
            response = await client.head(url, headers=headers)

            print(f"\nHEAD Response Status: {response.status_code}")
            print(f"Headers received: {dict(response.headers)}")

            # 304 Not Modified
            if response.status_code == 304:
                return False

            h = response.headers

            # Check headers in order of reliability
            if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
                return False

            if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
                return False

            if h.get("last-modified") and cache.get("last_modified"):
                try:
                    lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
                    lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
                    not_newer = lm_new <= lm_old
                except (TypeError, ValueError):
                    # Unparseable date, or naive/aware datetimes that cannot
                    # be compared: treat this signal as unavailable.
                    not_newer = False
                if not_newer:
                    return False

            # Content-Length is the weakest signal: a different length means
            # the content changed, and an equal length proves nothing, so both
            # cases end in a crawl. The former comparison could never change
            # the result and has been dropped.
            return True

    except Exception as e:
        print(f"Error during HEAD request: {e}")
        return True
|
||||
|
||||
|
||||
async def test_with_changing_content():
    """Check change detection before and after a 1-second cache window elapses."""
    rule = "=" * 60
    print(rule)
    print("Testing with real changing content")
    print(rule)

    # httpbin endpoint that asks clients to cache the response for 1 second.
    url = "https://httpbin.org/cache/1"  # Cache for 1 second

    print(f"\n1️⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url)

    # Keep only the validators the server actually sent.
    cache = {
        cache_key: response1.headers[header_name]
        for cache_key, header_name in (("etag", "etag"), ("last_modified", "last-modified"))
        if response1.headers.get(header_name)
    }
    print(f"Cached ETag: {cache.get('etag', 'None')}")
    print(f"Cached Last-Modified: {cache.get('last_modified', 'None')}")

    # Immediate re-check (should not need crawl).
    print("\n2️⃣ Checking immediately after first request...")
    verdict = "NEED TO CRAWL" if await should_crawl(url, cache) else "NO NEED TO CRAWL"
    print(f"Result: {verdict}")

    # Let the cache window pass.
    print("\n⏳ Waiting 2 seconds for cache to expire...")
    await asyncio.sleep(2)

    # Re-check after expiry (should need crawl now).
    print("\n3️⃣ Checking after cache expiry...")
    verdict = "NEED TO CRAWL" if await should_crawl(url, cache) else "NO NEED TO CRAWL"
    print(f"Result: {verdict}")
|
||||
|
||||
|
||||
async def test_news_website():
    """Crawl the BBC front page once, then re-check it three times, 5 seconds apart."""
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with news website (BBC)")
    print(rule)

    url = "https://www.bbc.com"

    print(f"\n1️⃣ First crawl of {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url)

    cache = {}
    received = response1.headers

    etag = received.get("etag")
    if etag:
        cache["etag"] = etag
        # ETags can be long; only the first 50 characters are shown.
        print(f"Stored ETag: {etag[:50]}...")

    last_modified = received.get("last-modified")
    if last_modified:
        cache["last_modified"] = last_modified
        print(f"Stored Last-Modified: {last_modified}")

    content_length = received.get("content-length")
    if content_length:
        cache["content_length"] = int(content_length)
        print(f"Stored Content-Length: {content_length}")

    # Re-checks are numbered from 2; the initial crawl counts as check 1.
    for check_number in range(2, 5):
        await asyncio.sleep(5)
        print(f"\n📊 Check #{check_number} - {datetime.now().strftime('%H:%M:%S')}")
        changed = await should_crawl(url, cache)
        verdict = "NEED TO CRAWL ✓" if changed else "NO NEED TO CRAWL ✗"
        print(f"Result: {verdict}")
|
||||
|
||||
|
||||
async def test_api_endpoint():
    """Fetch a GitHub API resource, store its validators, then re-check it once."""
    rule = "=" * 60
    print("\n" + rule)
    print("Testing with GitHub API")
    print(rule)

    # GitHub user API (updates when user data changes)
    url = "https://api.github.com/users/github"
    request_headers = {"User-Agent": "crawl4ai-test"}

    print(f"\n1️⃣ First request to {url}")
    async with httpx.AsyncClient() as client:
        response1 = await client.get(url, headers=request_headers)

    cache = {}
    received = response1.headers

    etag = received.get("etag")
    if etag:
        cache["etag"] = etag
        print(f"Stored ETag: {etag}")

    last_modified = received.get("last-modified")
    if last_modified:
        cache["last_modified"] = last_modified
        print(f"Stored Last-Modified: {last_modified}")

    # Show how much of the rate-limit budget remains.
    print(f"Rate Limit Remaining: {received.get('x-ratelimit-remaining', 'N/A')}")

    # Ask whether the resource changed since the first request.
    print("\n2️⃣ Checking if content changed...")
    changed = await should_crawl(url, cache)
    verdict = "NEED TO CRAWL" if changed else "NO NEED TO CRAWL (content unchanged)"
    print(f"Result: {verdict}")
|
||||
|
||||
|
||||
async def main():
    """Run each real-website change-detection scenario in order."""
    print("🚀 Testing HEAD request change detection with real websites\n")

    scenarios = (test_with_changing_content, test_news_website, test_api_endpoint)
    for scenario in scenarios:
        await scenario()

    print("\n✨ All tests completed!")


if __name__ == "__main__":
    # Script entry point.
    asyncio.run(main())
|
||||
@@ -1,196 +0,0 @@
|
||||
"""
|
||||
Test SMART cache mode functionality in crawl4ai.
|
||||
|
||||
This test demonstrates:
|
||||
1. Initial crawl with caching enabled
|
||||
2. Re-crawl with SMART mode on static content (should use cache)
|
||||
3. Re-crawl with SMART mode on dynamic content (should re-crawl)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
async def test_smart_cache_mode():
    """Exercise SMART cache mode against static, dynamic and news URLs.

    Every URL is first crawled with ``CacheMode.ENABLED`` and then crawled
    again with ``CacheMode.SMART``. Sizes, timings and simple change
    indicators are printed for manual inspection; nothing is asserted.
    """
    print("=" * 60)
    print("Testing SMART Cache Mode")
    print("=" * 60)

    # URLs for testing
    static_url = "https://example.com"  # Rarely changes
    dynamic_url = "https://httpbin.org/uuid"  # Changes every request

    async with AsyncWebCrawler(verbose=True) as crawler:

        # Test 1: Initial crawl with caching enabled
        print("\n1️⃣ Initial crawl with ENABLED cache mode")
        print("-" * 40)

        # Crawl static URL
        config_static = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            verbose=True,
        )
        result_static_1 = await crawler.arun(url=static_url, config=config_static)
        print(f"✓ Static URL crawled: {len(result_static_1.html)} bytes")
        print(f" Response headers: {list(result_static_1.response_headers.keys())[:5]}...")

        # Crawl dynamic URL
        config_dynamic = CrawlerRunConfig(
            cache_mode=CacheMode.ENABLED,
            verbose=True,
        )
        result_dynamic_1 = await crawler.arun(url=dynamic_url, config=config_dynamic)
        print(f"✓ Dynamic URL crawled: {len(result_dynamic_1.html)} bytes")
        dynamic_content_1 = result_dynamic_1.html

        # Wait a bit
        await asyncio.sleep(2)

        # Test 2: Re-crawl static URL with SMART mode
        print("\n2️⃣ Re-crawl static URL with SMART cache mode")
        print("-" * 40)

        config_smart = CrawlerRunConfig(
            cache_mode=CacheMode.SMART,  # This will be our new mode
            verbose=True,
        )

        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        result_static_2 = await crawler.arun(url=static_url, config=config_smart)
        elapsed = time.perf_counter() - start_time

        print(f"✓ Static URL with SMART mode completed in {elapsed:.2f}s")
        print(" Should use cache (content unchanged)")
        print(f" HTML length: {len(result_static_2.html)} bytes")

        # Test 3: Re-crawl dynamic URL with SMART mode
        print("\n3️⃣ Re-crawl dynamic URL with SMART cache mode")
        print("-" * 40)

        start_time = time.perf_counter()
        result_dynamic_2 = await crawler.arun(url=dynamic_url, config=config_smart)
        elapsed = time.perf_counter() - start_time
        dynamic_content_2 = result_dynamic_2.html

        print(f"✓ Dynamic URL with SMART mode completed in {elapsed:.2f}s")
        print(" Should re-crawl (content changes every request)")
        print(f" HTML length: {len(result_dynamic_2.html)} bytes")
        print(f" Content changed: {dynamic_content_1 != dynamic_content_2}")

        # Test 4: Test with a news website (content changes frequently)
        print("\n4️⃣ Testing with news website")
        print("-" * 40)

        news_url = "https://news.ycombinator.com"

        # First crawl
        result_news_1 = await crawler.arun(
            url=news_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED),
        )
        print(f"✓ News site initial crawl: {len(result_news_1.html)} bytes")

        # Wait a bit
        await asyncio.sleep(5)

        # Re-crawl with SMART mode
        start_time = time.perf_counter()
        result_news_2 = await crawler.arun(
            url=news_url,
            config=CrawlerRunConfig(cache_mode=CacheMode.SMART),
        )
        elapsed = time.perf_counter() - start_time

        print(f"✓ News site SMART mode completed in {elapsed:.2f}s")
        print(f" Content length changed: {len(result_news_1.html) != len(result_news_2.html)}")

        # Summary
        print("\n" + "=" * 60)
        print("Summary")
        print("=" * 60)
        print("✅ SMART cache mode should:")
        print(" - Use cache for static content (example.com)")
        print(" - Re-crawl dynamic content (httpbin.org/uuid)")
        print(" - Make intelligent decisions based on HEAD requests")
        print(" - Save bandwidth on unchanged content")
|
||||
|
||||
|
||||
async def test_smart_cache_edge_cases():
    """Probe SMART cache mode with URLs that may lack HEAD support or cache headers.

    Each URL is crawled once with ``CacheMode.ENABLED`` and once with
    ``CacheMode.SMART``; the ``success`` flag of the second crawl is printed.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Testing SMART Cache Mode Edge Cases")
    print(banner)

    # One entry per scenario: (heading, URL, prefix of the result line).
    scenarios = (
        (
            "\n🔧 Testing URL with potential HEAD issues",
            # Some servers don't handle HEAD well
            "https://httpbin.org/status/200",
            "✓ Handled potentially problematic URL: ",
        ),
        (
            "\n🔧 Testing URL with no cache headers",
            "https://httpbin.org/html",
            "✓ Handled URL with no cache headers: ",
        ),
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        for heading, target_url, outcome_prefix in scenarios:
            print(heading)
            print("-" * 40)

            # Initial crawl with regular caching
            await crawler.arun(
                url=target_url,
                config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED),
            )

            # Repeat the crawl in SMART mode and report whether it succeeded
            outcome = await crawler.arun(
                url=target_url,
                config=CrawlerRunConfig(cache_mode=CacheMode.SMART),
            )
            print(f"{outcome_prefix}{outcome.success}")
|
||||
|
||||
|
||||
async def main():
    """Run all tests; on failure print the error and traceback instead of raising."""
    try:
        # Main scenario first, then the edge cases.
        for suite in (test_smart_cache_mode, test_smart_cache_edge_cases):
            await suite()

        print("\n✨ All tests completed!")
    except Exception as exc:
        import traceback

        print(f"\n❌ Error during testing: {exc}")
        traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Warn up front: the run depends on CacheMode.SMART existing.
    print("⚠️ Note: This test expects CacheMode.SMART to be implemented")
    print("⚠️ It will fail with AttributeError until the feature is added\n")

    asyncio.run(main())
|
||||
@@ -1,69 +0,0 @@
|
||||
"""
|
||||
Simple test for SMART cache mode functionality.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
import time
|
||||
|
||||
|
||||
async def test_smart_cache():
    """Test SMART cache mode with a simple example.

    Crawls a static URL and a dynamic URL first with ``CacheMode.ENABLED`` and
    then with ``CacheMode.SMART``, printing sizes, timing and whether the
    dynamic content differed. Nothing is asserted; output is for manual review.
    """
    print("Testing SMART Cache Mode")
    print("-" * 40)

    # Test URL
    url = "https://example.com"

    async with AsyncWebCrawler(verbose=True) as crawler:
        # First crawl with normal caching
        print("\n1. Initial crawl with ENABLED mode:")
        config1 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result1 = await crawler.arun(url=url, config=config1)
        print(f" Crawled: {len(result1.html)} bytes")
        print(f" Headers: {list(result1.response_headers.keys())[:3]}...")

        # Wait a moment
        await asyncio.sleep(2)

        # Re-crawl with SMART mode
        print("\n2. Re-crawl with SMART mode:")
        config2 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        # perf_counter() is monotonic, so the measured interval cannot be
        # skewed by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        result2 = await crawler.arun(url=url, config=config2)
        elapsed = time.perf_counter() - start

        print(f" Time: {elapsed:.2f}s")
        print(f" Result: {len(result2.html)} bytes")
        print(" Should use cache (content unchanged)")

        # Test with dynamic content
        print("\n3. Testing with dynamic URL:")
        dynamic_url = "https://httpbin.org/uuid"

        # First crawl
        config3 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
        result3 = await crawler.arun(url=dynamic_url, config=config3)
        content1 = result3.html

        # Re-crawl with SMART
        config4 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
        result4 = await crawler.arun(url=dynamic_url, config=config4)
        content2 = result4.html

        print(f" Content changed: {content1 != content2}")
        print(" Should re-crawl (dynamic content)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Show the first sys.path entry and the available cache modes before running.
    print(f"Python path: {sys.path[0]}")
    print(f"CacheMode values: {[mode.value for mode in CacheMode]}")
    print()
    asyncio.run(test_smart_cache())
|
||||
Reference in New Issue
Block a user