Compare commits
106 Commits
feature/na
...
release/v0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
48647300b4 | ||
|
|
9f9ea3bb3b | ||
|
|
d58b93c207 | ||
|
|
e2b4705010 | ||
|
|
4a1abd5086 | ||
|
|
04258cd4f2 | ||
|
|
84e462d9f8 | ||
|
|
9546773a07 | ||
|
|
66a979ad11 | ||
|
|
0c31e91b53 | ||
|
|
1b6a31f88f | ||
|
|
b8c261780f | ||
|
|
db6ad7a79d | ||
|
|
004d514f33 | ||
|
|
3a9e2c716e | ||
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
dd5ee752cf | ||
|
|
bde1bba6a2 | ||
|
|
14f690d751 | ||
|
|
7b9ba3015f | ||
|
|
0c8bb742b7 | ||
|
|
ba2ed53ff1 | ||
|
|
a93efcb650 | ||
|
|
8794852a26 | ||
|
|
fb25a4a769 | ||
|
|
afe852935e | ||
|
|
0ebce590f8 | ||
|
|
026e96a2df | ||
|
|
36429a63de | ||
|
|
a3d41c7951 | ||
|
|
fee4c5c783 | ||
|
|
0f210f6e02 | ||
|
|
1a73fb60db | ||
|
|
74705c1f67 | ||
|
|
048d9b0f5b | ||
|
|
02f3127ded | ||
|
|
414f16e975 | ||
|
|
b7a6e02236 | ||
|
|
9332326457 | ||
|
|
6cd34b3157 | ||
|
|
871d4f1158 | ||
|
|
dc85481180 | ||
|
|
5d9213a0e9 | ||
|
|
4679ee023d | ||
|
|
f9b7090084 | ||
|
|
9442597f81 | ||
|
|
74b06d4b80 | ||
|
|
b4bb0ccea0 | ||
|
|
5ac19a61d7 | ||
|
|
022cc2d92a | ||
|
|
fcc2abe4db | ||
|
|
cc95d3abd4 | ||
|
|
5ce3e682f3 | ||
|
|
28125c1980 | ||
|
|
773ed7b281 | ||
|
|
58c1e17170 | ||
|
|
b55e27d2ef | ||
|
|
3d46d89759 | ||
|
|
da8f0dbb93 | ||
|
|
33a0c7a17a | ||
|
|
984524ca1c | ||
|
|
cb8d581e47 | ||
|
|
a55c2b3f88 | ||
|
|
ce09648af1 | ||
|
|
a97654270b | ||
|
|
b4fc60a555 | ||
|
|
137ac014fb | ||
|
|
faa98eefbc | ||
|
|
22725ca87b | ||
|
|
e0fbd2b0a0 | ||
|
|
32966bea11 | ||
|
|
a3b0cab52a | ||
|
|
137556b3dc | ||
|
|
260e2dc347 | ||
|
|
25d97d56e4 | ||
|
|
98a56e6e01 | ||
|
|
1af3d1c2e0 | ||
|
|
c1041b9bbe | ||
|
|
f6e25e2a6b | ||
|
|
ee93acbd06 | ||
|
|
2b17f234f8 | ||
|
|
eebb8c84f0 | ||
|
|
12783fabda | ||
|
|
39e3b792a1 | ||
|
|
e0cd3e10de | ||
|
|
1d6a2b9979 | ||
|
|
039be1b1ce | ||
|
|
53245e4e0e | ||
|
|
094201ab2a | ||
|
|
14a31456ef | ||
|
|
0886153d6a | ||
|
|
0ec3c4a788 | ||
|
|
05085b6e3d | ||
|
|
1f3b1251d0 | ||
|
|
7b9aabc64a | ||
|
|
27af4cc27b |
13
.github/workflows/main.yml
vendored
13
.github/workflows/main.yml
vendored
@@ -9,16 +9,26 @@ on:
|
||||
types: [opened]
|
||||
discussion:
|
||||
types: [created]
|
||||
watch:
|
||||
types: [started]
|
||||
|
||||
jobs:
|
||||
notify-discord:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Send to Google Apps Script (Stars only)
|
||||
if: github.event_name == 'watch'
|
||||
run: |
|
||||
curl -fSs -X POST "${{ secrets.GOOGLE_SCRIPT_ENDPOINT }}" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"url":"${{ github.event.sender.html_url }}"}'
|
||||
- name: Set webhook based on event type
|
||||
id: set-webhook
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "discussion" ]; then
|
||||
echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ github.event_name }}" == "watch" ]; then
|
||||
echo "webhook=${{ secrets.DISCORD_STAR_GAZERS }}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
@@ -31,5 +41,6 @@ jobs:
|
||||
args: |
|
||||
${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) ||
|
||||
github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
|
||||
github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
|
||||
github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
|
||||
github.event_name == 'watch' && format('⭐ {0} starred Crawl4AI 🥳! Check out their profile: {1}', github.event.sender.login, github.event.sender.html_url) ||
|
||||
format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
|
||||
|
||||
141
.github/workflows/release.yml
vendored
Normal file
141
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,141 @@
|
||||
name: Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
- '!test-v*' # Exclude test tags
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: actions/create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
release_name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
name: Test Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'test-v*'
|
||||
|
||||
jobs:
|
||||
test-release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Testing with version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to Test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to Test PyPI..."
|
||||
twine upload --repository testpypi dist/* || {
|
||||
if [ $? -eq 1 ]; then
|
||||
echo "⚠️ Upload failed - likely version already exists on Test PyPI"
|
||||
echo "Continuing anyway for test purposes..."
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "✅ Test PyPI step complete"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Docker test images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:test-latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,3 +1,6 @@
|
||||
# Scripts folder (private tools)
|
||||
.scripts/
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
FROM python:3.12-slim-bookworm AS build
|
||||
|
||||
# C4ai version
|
||||
ARG C4AI_VER=0.6.0
|
||||
ARG C4AI_VER=0.7.0-r1
|
||||
ENV C4AI_VERSION=$C4AI_VER
|
||||
LABEL c4ai.version=$C4AI_VER
|
||||
|
||||
|
||||
320
PROGRESSIVE_CRAWLING.md
Normal file
320
PROGRESSIVE_CRAWLING.md
Normal file
@@ -0,0 +1,320 @@
|
||||
# Progressive Web Crawling with Adaptive Information Foraging
|
||||
|
||||
## Abstract
|
||||
|
||||
This paper presents a novel approach to web crawling that adaptively determines when sufficient information has been gathered to answer a given query. Unlike traditional exhaustive crawling methods, our Progressive Information Sufficiency (PIS) framework uses statistical measures to balance information completeness against crawling efficiency. We introduce a multi-strategy architecture supporting pure statistical, embedding-enhanced, and LLM-assisted approaches, with theoretical guarantees on convergence and practical evaluation methods using synthetic datasets.
|
||||
|
||||
## 1. Introduction
|
||||
|
||||
Traditional web crawling approaches follow predetermined patterns (breadth-first, depth-first) without consideration for information sufficiency. This work addresses the fundamental question: *"When do we have enough information to answer a query and similar queries in its domain?"*
|
||||
|
||||
We formalize this as an optimal stopping problem in information foraging, introducing metrics for coverage, consistency, and saturation that enable crawlers to make intelligent decisions about when to stop crawling and which links to follow.
|
||||
|
||||
## 2. Problem Formulation
|
||||
|
||||
### 2.1 Definitions
|
||||
|
||||
Let:
|
||||
- **K** = {d₁, d₂, ..., dₙ} be the current knowledge base (crawled documents)
|
||||
- **Q** be the user query
|
||||
- **L** = {l₁, l₂, ..., lₘ} be available links with preview metadata
|
||||
- **θ** be the confidence threshold for information sufficiency
|
||||
|
||||
### 2.2 Objectives
|
||||
|
||||
1. **Minimize** |K| (number of crawled pages)
|
||||
2. **Maximize** P(answers(Q) | K) (probability of answering Q given K)
|
||||
3. **Ensure** coverage of Q's domain (similar queries)
|
||||
|
||||
## 3. Mathematical Framework
|
||||
|
||||
### 3.1 Information Sufficiency Metric
|
||||
|
||||
We define Information Sufficiency as:
|
||||
|
||||
```
|
||||
IS(K, Q) = min(Coverage(K, Q), Consistency(K, Q), 1 - Redundancy(K)) × DomainCoverage(K, Q)
|
||||
```
|
||||
|
||||
### 3.2 Coverage Score
|
||||
|
||||
Coverage measures how well current knowledge covers query terms and related concepts:
|
||||
|
||||
```
|
||||
Coverage(K, Q) = Σ(t ∈ Q) log(df(t, K) + 1) × idf(t) / |Q|
|
||||
```
|
||||
|
||||
Where:
|
||||
- df(t, K) = document frequency of term t in knowledge base K
|
||||
- idf(t) = inverse document frequency weight
|
||||
|
||||
### 3.3 Consistency Score
|
||||
|
||||
Consistency measures information coherence across documents:
|
||||
|
||||
```
|
||||
Consistency(K, Q) = 1 - Var(answers from random subsets of K)
|
||||
```
|
||||
|
||||
This captures the principle that sufficient knowledge should provide stable answers regardless of document subset.
|
||||
|
||||
### 3.4 Saturation Score
|
||||
|
||||
Saturation detects diminishing returns:
|
||||
|
||||
```
|
||||
Saturation(K) = 1 - (ΔInfo(Kₙ) / ΔInfo(K₁))
|
||||
```
|
||||
|
||||
Where ΔInfo represents marginal information gain from the nth crawl.
|
||||
|
||||
### 3.5 Link Value Prediction
|
||||
|
||||
Expected information gain from uncrawled links:
|
||||
|
||||
```
|
||||
ExpectedGain(l) = Relevance(l, Q) × Novelty(l, K) × Authority(l)
|
||||
```
|
||||
|
||||
Components:
|
||||
- **Relevance**: BM25(preview_text, Q)
|
||||
- **Novelty**: 1 - max_similarity(preview, K)
|
||||
- **Authority**: f(url_structure, domain_metrics)
|
||||
|
||||
## 4. Algorithmic Approach
|
||||
|
||||
### 4.1 Progressive Crawling Algorithm
|
||||
|
||||
```
|
||||
Algorithm: ProgressiveCrawl(start_url, query, θ)
|
||||
K ← ∅
|
||||
crawled ← {start_url}
|
||||
pending ← extract_links(crawl(start_url))
|
||||
|
||||
while IS(K, Q) < θ and |crawled| < max_pages:
|
||||
candidates ← rank_by_expected_gain(pending, Q, K)
|
||||
if max(ExpectedGain(candidates)) < min_gain:
|
||||
break // Diminishing returns
|
||||
|
||||
to_crawl ← top_k(candidates)
|
||||
new_docs ← parallel_crawl(to_crawl)
|
||||
K ← K ∪ new_docs
|
||||
crawled ← crawled ∪ to_crawl
|
||||
pending ← extract_new_links(new_docs) - crawled
|
||||
|
||||
return K
|
||||
```
|
||||
|
||||
### 4.2 Stopping Criteria
|
||||
|
||||
Crawling terminates when:
|
||||
1. IS(K, Q) ≥ θ (sufficient information)
|
||||
2. d(IS)/d(crawls) < ε (plateau reached)
|
||||
3. |crawled| ≥ max_pages (resource limit)
|
||||
4. max(ExpectedGain) < min_gain (no promising links)
|
||||
|
||||
## 5. Multi-Strategy Architecture
|
||||
|
||||
### 5.1 Strategy Pattern Design
|
||||
|
||||
```
|
||||
AbstractStrategy
|
||||
├── StatisticalStrategy (no LLM, no embeddings)
|
||||
├── EmbeddingStrategy (with semantic similarity)
|
||||
└── LLMStrategy (with language model assistance)
|
||||
```
|
||||
|
||||
### 5.2 Statistical Strategy
|
||||
|
||||
Pure statistical approach using:
|
||||
- BM25 for relevance scoring
|
||||
- Term frequency analysis for coverage
|
||||
- Graph structure for authority
|
||||
- No external models required
|
||||
|
||||
**Advantages**: Fast, no API costs, works offline
|
||||
**Best for**: Technical documentation, specific terminology
|
||||
|
||||
### 5.3 Embedding Strategy (Implemented)
|
||||
|
||||
Semantic understanding through embeddings:
|
||||
- Query expansion into semantic variations
|
||||
- Coverage mapping in embedding space
|
||||
- Gap-driven link selection
|
||||
- Validation-based stopping criteria
|
||||
|
||||
**Mathematical Framework**:
|
||||
```
|
||||
Coverage(K, Q) = mean(max_similarity(q, K) for q in Q_expanded)
|
||||
Gap(q) = 1 - max_similarity(q, K)
|
||||
LinkScore(l) = Σ(Gap(q) × relevance(l, q)) × (1 - redundancy(l, K))
|
||||
```
|
||||
|
||||
**Key Parameters**:
|
||||
- `embedding_k_exp`: Exponential decay factor for distance-to-score mapping
|
||||
- `embedding_coverage_radius`: Distance threshold for query coverage
|
||||
- `embedding_min_confidence_threshold`: Minimum relevance threshold
|
||||
|
||||
**Advantages**: Semantic understanding, handles ambiguity, detects irrelevance
|
||||
**Best for**: Research queries, conceptual topics, diverse content
|
||||
|
||||
### 5.4 Progressive Enhancement Path
|
||||
|
||||
1. **Level 0**: Statistical only (implemented)
|
||||
2. **Level 1**: + Embeddings for semantic similarity (implemented)
|
||||
3. **Level 2**: + LLM for query understanding (future)
|
||||
|
||||
## 6. Evaluation Methodology
|
||||
|
||||
### 6.1 Synthetic Dataset Generation
|
||||
|
||||
Using LLM to create evaluation data:
|
||||
|
||||
```python
|
||||
def generate_synthetic_dataset(domain_url):
|
||||
# 1. Fully crawl domain
|
||||
full_knowledge = exhaustive_crawl(domain_url)
|
||||
|
||||
# 2. Generate answerable queries
|
||||
queries = llm_generate_queries(full_knowledge)
|
||||
|
||||
# 3. Create query variations
|
||||
for q in queries:
|
||||
variations = generate_variations(q) # synonyms, sub/super queries
|
||||
|
||||
return queries, variations, full_knowledge
|
||||
```
|
||||
|
||||
### 6.2 Evaluation Metrics
|
||||
|
||||
1. **Efficiency**: Information gained / Pages crawled
|
||||
2. **Completeness**: Answerable queries / Total queries
|
||||
3. **Redundancy**: 1 - (Unique information / Total information)
|
||||
4. **Convergence Rate**: Pages to 95% completeness
|
||||
|
||||
### 6.3 Ablation Studies
|
||||
|
||||
- Impact of each score component (coverage, consistency, saturation)
|
||||
- Sensitivity to threshold parameters
|
||||
- Performance across different domain types
|
||||
|
||||
## 7. Theoretical Properties
|
||||
|
||||
### 7.1 Convergence Guarantee
|
||||
|
||||
**Theorem**: For finite websites, ProgressiveCrawl converges to IS(K, Q) ≥ θ or exhausts all reachable pages.
|
||||
|
||||
**Proof sketch**: IS(K, Q) is monotonically non-decreasing with each crawl, bounded above by 1.
|
||||
|
||||
### 7.2 Optimality
|
||||
|
||||
Under certain assumptions about link preview accuracy:
|
||||
- Expected crawls ≤ 2 × optimal_crawls
|
||||
- Approximation ratio improves with preview quality
|
||||
|
||||
## 8. Implementation Design
|
||||
|
||||
### 8.1 Core Components
|
||||
|
||||
1. **CrawlState**: Maintains crawl history and metrics
|
||||
2. **AdaptiveConfig**: Configuration parameters
|
||||
3. **CrawlStrategy**: Pluggable strategy interface
|
||||
4. **AdaptiveCrawler**: Main orchestrator
|
||||
|
||||
### 8.2 Integration with Crawl4AI
|
||||
|
||||
- Wraps existing AsyncWebCrawler
|
||||
- Leverages link preview functionality
|
||||
- Maintains backward compatibility
|
||||
|
||||
### 8.3 Persistence
|
||||
|
||||
Knowledge base serialization for:
|
||||
- Resumable crawls
|
||||
- Knowledge sharing
|
||||
- Offline analysis
|
||||
|
||||
## 9. Future Directions
|
||||
|
||||
### 9.1 Advanced Scoring
|
||||
|
||||
- Temporal information value
|
||||
- Multi-query optimization
|
||||
- Active learning from user feedback
|
||||
|
||||
### 9.2 Distributed Crawling
|
||||
|
||||
- Collaborative knowledge building
|
||||
- Federated information sufficiency
|
||||
|
||||
### 9.3 Domain Adaptation
|
||||
|
||||
- Transfer learning across domains
|
||||
- Meta-learning for threshold selection
|
||||
|
||||
## 10. Conclusion
|
||||
|
||||
Progressive crawling with adaptive information foraging provides a principled approach to efficient web information extraction. By combining coverage, consistency, and saturation metrics, we can determine information sufficiency without ground truth labels. The multi-strategy architecture allows graceful enhancement from pure statistical to LLM-assisted approaches based on requirements and resources.
|
||||
|
||||
## References
|
||||
|
||||
1. Manning, C. D., Raghavan, P., & Schütze, H. (2008). Introduction to Information Retrieval. Cambridge University Press.
|
||||
|
||||
2. Robertson, S., & Zaragoza, H. (2009). The Probabilistic Relevance Framework: BM25 and Beyond. Foundations and Trends in Information Retrieval.
|
||||
|
||||
3. Pirolli, P., & Card, S. (1999). Information Foraging. Psychological Review, 106(4), 643-675.
|
||||
|
||||
4. Dasgupta, S. (2005). Analysis of a greedy active learning strategy. Advances in Neural Information Processing Systems.
|
||||
|
||||
## Appendix A: Implementation Pseudocode
|
||||
|
||||
```python
|
||||
class StatisticalStrategy:
|
||||
def calculate_confidence(self, state):
|
||||
coverage = self.calculate_coverage(state)
|
||||
consistency = self.calculate_consistency(state)
|
||||
saturation = self.calculate_saturation(state)
|
||||
return min(coverage, consistency, saturation)
|
||||
|
||||
def calculate_coverage(self, state):
|
||||
# BM25-based term coverage
|
||||
term_scores = []
|
||||
for term in state.query.split():
|
||||
df = state.document_frequencies.get(term, 0)
|
||||
idf = self.idf_cache.get(term, 1.0)
|
||||
term_scores.append(log(df + 1) * idf)
|
||||
return mean(term_scores) / max_possible_score
|
||||
|
||||
def rank_links(self, state):
|
||||
scored_links = []
|
||||
for link in state.pending_links:
|
||||
relevance = self.bm25_score(link.preview_text, state.query)
|
||||
novelty = self.calculate_novelty(link, state.knowledge_base)
|
||||
authority = self.url_authority(link.href)
|
||||
score = relevance * novelty * authority
|
||||
scored_links.append((link, score))
|
||||
return sorted(scored_links, key=lambda x: x[1], reverse=True)
|
||||
```
|
||||
|
||||
## Appendix B: Evaluation Protocol
|
||||
|
||||
1. **Dataset Creation**:
|
||||
- Select diverse domains (documentation, blogs, e-commerce)
|
||||
- Generate 100 queries per domain using LLM
|
||||
- Create query variations (5-10 per query)
|
||||
|
||||
2. **Baseline Comparisons**:
|
||||
- BFS crawler (depth-limited)
|
||||
- DFS crawler (depth-limited)
|
||||
- Random crawler
|
||||
- Oracle (knows relevant pages)
|
||||
|
||||
3. **Metrics Collection**:
|
||||
- Pages crawled vs query answerability
|
||||
- Time to sufficient confidence
|
||||
- False positive/negative rates
|
||||
|
||||
4. **Statistical Analysis**:
|
||||
- ANOVA for strategy comparison
|
||||
- Regression for parameter sensitivity
|
||||
- Bootstrap for confidence intervals
|
||||
111
README.md
111
README.md
@@ -11,19 +11,24 @@
|
||||
[](https://pypi.org/project/crawl4ai/)
|
||||
[](https://pepy.tech/project/crawl4ai)
|
||||
|
||||
<!-- [](https://crawl4ai.readthedocs.io/) -->
|
||||
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||
[](https://github.com/psf/black)
|
||||
[](https://github.com/PyCQA/bandit)
|
||||
[](code_of_conduct.md)
|
||||
|
||||
<p align="center">
|
||||
<a href="https://x.com/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
|
||||
</a>
|
||||
<a href="https://www.linkedin.com/company/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
|
||||
</a>
|
||||
<a href="https://discord.gg/jP8KfhDhyN">
|
||||
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
[✨ Check out latest update v0.6.0](#-recent-updates)
|
||||
[✨ Check out latest update v0.7.0](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.6.0 is now available!** This release candidate introduces World-aware Crawling with geolocation and locale settings, Table-to-DataFrame extraction, Browser pooling with pre-warming, Network and console traffic capture, MCP integration for AI tools, and a completely revamped Docker deployment! [Read the release notes →](https://docs.crawl4ai.com/blog)
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
@@ -269,8 +274,8 @@ The new Docker implementation includes:
|
||||
|
||||
```bash
|
||||
# Pull and run the latest release candidate
|
||||
docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
docker pull unclecode/crawl4ai:0.7.0
|
||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
|
||||
|
||||
# Visit the playground at http://localhost:11235/playground
|
||||
```
|
||||
@@ -291,12 +296,20 @@ import requests
|
||||
# Submit a crawl job
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
json={"urls": "https://example.com", "priority": 10}
|
||||
json={"urls": ["https://example.com"], "priority": 10}
|
||||
)
|
||||
task_id = response.json()["task_id"]
|
||||
|
||||
# Continue polling until the task is complete (status="completed")
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
if response.status_code == 200:
|
||||
print("Crawl job submitted successfully.")
|
||||
|
||||
if "results" in response.json():
|
||||
results = response.json()["results"]
|
||||
print("Crawl job completed. Results:")
|
||||
for result in results:
|
||||
print(result)
|
||||
else:
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Crawl job submitted. Task ID:: {task_id}")
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
```
|
||||
|
||||
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
||||
@@ -505,7 +518,72 @@ async def test_news_crawl():
|
||||
|
||||
## ✨ Recent Updates
|
||||
|
||||
### Version 0.6.0 Release Highlights
|
||||
### Version 0.7.0 Release Highlights - The Adaptive Intelligence Update
|
||||
|
||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||
max_depth=5, # Maximum crawl depth
|
||||
max_pages=20, # Maximum number of pages to crawl
|
||||
strategy="statistical"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||
state = await adaptive_crawler.digest(
|
||||
start_url="https://news.example.com",
|
||||
query="latest news content"
|
||||
)
|
||||
# Crawler learns patterns and improves extraction over time
|
||||
```
|
||||
|
||||
- **🌊 Virtual Scroll Support**: Complete content extraction from infinite scroll pages:
|
||||
```python
|
||||
scroll_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='feed']",
|
||||
scroll_count=20,
|
||||
scroll_by="container_height",
|
||||
wait_after_scroll=1.0
|
||||
)
|
||||
|
||||
result = await crawler.arun(url, config=CrawlerRunConfig(
|
||||
virtual_scroll_config=scroll_config
|
||||
))
|
||||
```
|
||||
|
||||
- **🔗 Intelligent Link Analysis**: 3-layer scoring system for smart link prioritization:
|
||||
```python
|
||||
link_config = LinkPreviewConfig(
|
||||
query="machine learning tutorials",
|
||||
score_threshold=0.3,
|
||||
concurrent_requests=10
|
||||
)
|
||||
|
||||
result = await crawler.arun(url, config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
))
|
||||
# Links ranked by relevance and quality
|
||||
```
|
||||
|
||||
- **🎣 Async URL Seeder**: Discover thousands of URLs in seconds:
|
||||
```python
|
||||
seeder = AsyncUrlSeeder(SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
pattern="*/blog/*",
|
||||
query="python tutorials",
|
||||
score_threshold=0.4
|
||||
))
|
||||
|
||||
urls = await seeder.discover("https://example.com")
|
||||
```
|
||||
|
||||
- **⚡ Performance Boost**: Up to 3x faster with optimized resource handling and memory efficiency
|
||||
|
||||
Read the full details in our [0.7.0 Release Notes](https://docs.crawl4ai.com/blog/release-v0.7.0) or check the [CHANGELOG](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
### Previous Version: 0.6.0 Release Highlights
|
||||
|
||||
- **🌎 World-aware Crawling**: Set geolocation, language, and timezone for authentic locale-specific content:
|
||||
```python
|
||||
@@ -575,7 +653,6 @@ async def test_news_crawl():
|
||||
|
||||
- **📱 Multi-stage Build System**: Optimized Dockerfile with platform-specific performance enhancements
|
||||
|
||||
Read the full details in our [0.6.0 Release Notes](https://docs.crawl4ai.com/blog/releases/0.6.0.html) or check the [CHANGELOG](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
### Previous Version: 0.5.0 Major Release Highlights
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -69,6 +69,14 @@ from .deep_crawling import (
|
||||
)
|
||||
# NEW: Import AsyncUrlSeeder
|
||||
from .async_url_seeder import AsyncUrlSeeder
|
||||
# Adaptive Crawler
|
||||
from .adaptive_crawler import (
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
CrawlState,
|
||||
CrawlStrategy,
|
||||
StatisticalStrategy
|
||||
)
|
||||
|
||||
# C4A Script Language Support
|
||||
from .script import (
|
||||
@@ -97,6 +105,12 @@ __all__ = [
|
||||
"VirtualScrollConfig",
|
||||
# NEW: Add AsyncUrlSeeder
|
||||
"AsyncUrlSeeder",
|
||||
# Adaptive Crawler
|
||||
"AdaptiveCrawler",
|
||||
"AdaptiveConfig",
|
||||
"CrawlState",
|
||||
"CrawlStrategy",
|
||||
"StatisticalStrategy",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
@@ -159,6 +173,7 @@ __all__ = [
|
||||
"CompilationResult",
|
||||
"ValidationResult",
|
||||
"ErrorDetail",
|
||||
"LinkPreviewConfig"
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.6.3"
|
||||
# crawl4ai/__version__.py
|
||||
|
||||
# This is the version that will be used for stable releases
|
||||
__version__ = "0.7.2"
|
||||
|
||||
# For nightly builds, this gets set during build process
|
||||
__nightly_version__ = None
|
||||
|
||||
|
||||
1847
crawl4ai/adaptive_crawler copy.py
Normal file
1847
crawl4ai/adaptive_crawler copy.py
Normal file
File diff suppressed because it is too large
Load Diff
1861
crawl4ai/adaptive_crawler.py
Normal file
1861
crawl4ai/adaptive_crawler.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -926,6 +926,8 @@ class CrawlerRunConfig():
|
||||
Default: False.
|
||||
scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
|
||||
Default: 0.2.
|
||||
max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan.
|
||||
If None, scrolls until the entire page is loaded. Default: None.
|
||||
process_iframes (bool): If True, attempts to process and inline iframe content.
|
||||
Default: False.
|
||||
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
|
||||
@@ -1066,6 +1068,7 @@ class CrawlerRunConfig():
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
scroll_delay: float = 0.2,
|
||||
max_scroll_steps: Optional[int] = None,
|
||||
process_iframes: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
simulate_user: bool = False,
|
||||
@@ -1170,6 +1173,7 @@ class CrawlerRunConfig():
|
||||
self.ignore_body_visibility = ignore_body_visibility
|
||||
self.scan_full_page = scan_full_page
|
||||
self.scroll_delay = scroll_delay
|
||||
self.max_scroll_steps = max_scroll_steps
|
||||
self.process_iframes = process_iframes
|
||||
self.remove_overlay_elements = remove_overlay_elements
|
||||
self.simulate_user = simulate_user
|
||||
@@ -1387,6 +1391,7 @@ class CrawlerRunConfig():
|
||||
ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
|
||||
scan_full_page=kwargs.get("scan_full_page", False),
|
||||
scroll_delay=kwargs.get("scroll_delay", 0.2),
|
||||
max_scroll_steps=kwargs.get("max_scroll_steps"),
|
||||
process_iframes=kwargs.get("process_iframes", False),
|
||||
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
|
||||
simulate_user=kwargs.get("simulate_user", False),
|
||||
@@ -1499,6 +1504,7 @@ class CrawlerRunConfig():
|
||||
"ignore_body_visibility": self.ignore_body_visibility,
|
||||
"scan_full_page": self.scan_full_page,
|
||||
"scroll_delay": self.scroll_delay,
|
||||
"max_scroll_steps": self.max_scroll_steps,
|
||||
"process_iframes": self.process_iframes,
|
||||
"remove_overlay_elements": self.remove_overlay_elements,
|
||||
"simulate_user": self.simulate_user,
|
||||
@@ -1653,22 +1659,57 @@ class SeedingConfig:
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
source: str = "sitemap+cc", # Options: "sitemap", "cc", "sitemap+cc"
|
||||
pattern: Optional[str] = "*", # URL pattern to filter discovered URLs (e.g., "*example.com/blog/*")
|
||||
live_check: bool = False, # Whether to perform HEAD requests to verify URL liveness
|
||||
extract_head: bool = False, # Whether to fetch and parse <head> section for metadata
|
||||
max_urls: int = -1, # Maximum number of URLs to discover (default: -1 for no limit)
|
||||
concurrency: int = 1000, # Maximum concurrent requests for live checks/head extraction
|
||||
hits_per_sec: int = 5, # Rate limit in requests per second
|
||||
force: bool = False, # If True, bypasses the AsyncUrlSeeder's internal .jsonl cache
|
||||
base_directory: Optional[str] = None, # Base directory for UrlSeeder's cache files (.jsonl)
|
||||
llm_config: Optional[LLMConfig] = None, # Forward LLM config for future use (e.g., relevance scoring)
|
||||
verbose: Optional[bool] = None, # Override crawler's general verbose setting
|
||||
query: Optional[str] = None, # Search query for relevance scoring
|
||||
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
|
||||
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
|
||||
filter_nonsense_urls: bool = True, # Filter out utility URLs like robots.txt, sitemap.xml, etc.
|
||||
source: str = "sitemap+cc",
|
||||
pattern: Optional[str] = "*",
|
||||
live_check: bool = False,
|
||||
extract_head: bool = False,
|
||||
max_urls: int = -1,
|
||||
concurrency: int = 1000,
|
||||
hits_per_sec: int = 5,
|
||||
force: bool = False,
|
||||
base_directory: Optional[str] = None,
|
||||
llm_config: Optional[LLMConfig] = None,
|
||||
verbose: Optional[bool] = None,
|
||||
query: Optional[str] = None,
|
||||
score_threshold: Optional[float] = None,
|
||||
scoring_method: str = "bm25",
|
||||
filter_nonsense_urls: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize URL seeding configuration.
|
||||
|
||||
Args:
|
||||
source: Discovery source(s) to use. Options: "sitemap", "cc" (Common Crawl),
|
||||
or "sitemap+cc" (both). Default: "sitemap+cc"
|
||||
pattern: URL pattern to filter discovered URLs (e.g., "*example.com/blog/*").
|
||||
Supports glob-style wildcards. Default: "*" (all URLs)
|
||||
live_check: Whether to perform HEAD requests to verify URL liveness.
|
||||
Default: False
|
||||
extract_head: Whether to fetch and parse <head> section for metadata extraction.
|
||||
Required for BM25 relevance scoring. Default: False
|
||||
max_urls: Maximum number of URLs to discover. Use -1 for no limit.
|
||||
Default: -1
|
||||
concurrency: Maximum concurrent requests for live checks/head extraction.
|
||||
Default: 1000
|
||||
hits_per_sec: Rate limit in requests per second to avoid overwhelming servers.
|
||||
Default: 5
|
||||
force: If True, bypasses the AsyncUrlSeeder's internal .jsonl cache and
|
||||
re-fetches URLs. Default: False
|
||||
base_directory: Base directory for UrlSeeder's cache files (.jsonl).
|
||||
If None, uses default ~/.crawl4ai/. Default: None
|
||||
llm_config: LLM configuration for future features (e.g., semantic scoring).
|
||||
Currently unused. Default: None
|
||||
verbose: Override crawler's general verbose setting for seeding operations.
|
||||
Default: None (inherits from crawler)
|
||||
query: Search query for BM25 relevance scoring (e.g., "python tutorials").
|
||||
Requires extract_head=True. Default: None
|
||||
score_threshold: Minimum relevance score (0.0-1.0) to include URL.
|
||||
Only applies when query is provided. Default: None
|
||||
scoring_method: Scoring algorithm to use. Currently only "bm25" is supported.
|
||||
Future: "semantic". Default: "bm25"
|
||||
filter_nonsense_urls: Filter out utility URLs like robots.txt, sitemap.xml,
|
||||
ads.txt, favicon.ico, etc. Default: True
|
||||
"""
|
||||
self.source = source
|
||||
self.pattern = pattern
|
||||
self.live_check = live_check
|
||||
|
||||
@@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
elif url.startswith("file://"):
|
||||
# initialize empty lists for console messages
|
||||
captured_console = []
|
||||
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
@@ -466,9 +469,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
console_messages=captured_console,
|
||||
)
|
||||
|
||||
elif url.startswith("raw:") or url.startswith("raw://"):
|
||||
#####
|
||||
# Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
|
||||
# Fix: Check for "raw://" first, then "raw:"
|
||||
# Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
|
||||
#####
|
||||
elif url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Process raw HTML content
|
||||
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
html = raw_html
|
||||
if config.screenshot:
|
||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||
@@ -741,18 +750,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
await self.execute_hook(
|
||||
"after_goto", page, context=context, url=url, response=response, config=config
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
status_code = response.status
|
||||
response_headers = response.headers
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
# if response is None:
|
||||
# status_code = 200
|
||||
# response_headers = {}
|
||||
# else:
|
||||
# status_code = response.status
|
||||
# response_headers = response.headers
|
||||
|
||||
else:
|
||||
status_code = 200
|
||||
@@ -784,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
except Error:
|
||||
visibility_info = await self.check_visibility(page)
|
||||
|
||||
if self.browser_config.config.verbose:
|
||||
if self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
message="Body visibility info: {info}",
|
||||
tag="DEBUG",
|
||||
@@ -896,7 +936,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Handle full page scanning
|
||||
if config.scan_full_page:
|
||||
await self._handle_full_page_scan(page, config.scroll_delay)
|
||||
# await self._handle_full_page_scan(page, config.scroll_delay)
|
||||
await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps)
|
||||
|
||||
# Handle virtual scroll if configured
|
||||
if config.virtual_scroll_config:
|
||||
@@ -1088,7 +1129,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# Close the page
|
||||
await page.close()
|
||||
|
||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||
# async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None):
|
||||
"""
|
||||
Helper method to handle full page scanning.
|
||||
|
||||
@@ -1103,6 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Args:
|
||||
page (Page): The Playwright page object
|
||||
scroll_delay (float): The delay between page scrolls
|
||||
max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end.
|
||||
|
||||
"""
|
||||
try:
|
||||
@@ -1127,9 +1170,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
dimensions = await self.get_page_dimensions(page)
|
||||
total_height = dimensions["height"]
|
||||
|
||||
scroll_step_count = 0
|
||||
while current_position < total_height:
|
||||
####
|
||||
# NEW FEATURE: Check if we've reached the maximum allowed scroll steps
|
||||
# This prevents infinite scrolling on very long pages or infinite scroll scenarios
|
||||
# If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
|
||||
####
|
||||
if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps:
|
||||
break
|
||||
current_position = min(current_position + viewport_height, total_height)
|
||||
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
|
||||
|
||||
# Increment the step counter for max_scroll_steps tracking
|
||||
scroll_step_count += 1
|
||||
|
||||
# await page.evaluate(f"window.scrollTo(0, {current_position})")
|
||||
# await asyncio.sleep(scroll_delay)
|
||||
|
||||
@@ -1616,12 +1671,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
num_segments = (page_height // viewport_height) + 1
|
||||
for i in range(num_segments):
|
||||
y_offset = i * viewport_height
|
||||
# Special handling for the last segment
|
||||
if i == num_segments - 1:
|
||||
last_part_height = page_height % viewport_height
|
||||
|
||||
# If page_height is an exact multiple of viewport_height,
|
||||
# we don't need an extra segment
|
||||
if last_part_height == 0:
|
||||
# Skip last segment if page height is exact multiple of viewport
|
||||
break
|
||||
|
||||
# Adjust viewport to exactly match the remaining content height
|
||||
await page.set_viewport_size({"width": page_width, "height": last_part_height})
|
||||
|
||||
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||
await asyncio.sleep(0.01) # wait for render
|
||||
seg_shot = await page.screenshot(full_page=False)
|
||||
|
||||
# Capture the current segment
|
||||
# Note: Using compression options (format, quality) would go here
|
||||
seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
|
||||
# seg_shot = await page.screenshot(full_page=False)
|
||||
img = Image.open(BytesIO(seg_shot)).convert("RGB")
|
||||
segments.append(img)
|
||||
|
||||
# Reset viewport to original size after capturing segments
|
||||
await page.set_viewport_size({"width": page_width, "height": viewport_height})
|
||||
|
||||
total_height = sum(img.height for img in segments)
|
||||
stitched = Image.new("RGB", (segments[0].width, total_height))
|
||||
offset = 0
|
||||
@@ -1750,12 +1825,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# then wait for the new page to load before continuing
|
||||
result = None
|
||||
try:
|
||||
# OLD VERSION:
|
||||
# result = await page.evaluate(
|
||||
# f"""
|
||||
# (async () => {{
|
||||
# try {{
|
||||
# const script_result = {script};
|
||||
# return {{ success: true, result: script_result }};
|
||||
# }} catch (err) {{
|
||||
# return {{ success: false, error: err.toString(), stack: err.stack }};
|
||||
# }}
|
||||
# }})();
|
||||
# """
|
||||
# )
|
||||
|
||||
# """ NEW VERSION:
|
||||
# When {script} contains statements (e.g., const link = …; link.click();),
|
||||
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.
|
||||
# """
|
||||
result = await page.evaluate(
|
||||
f"""
|
||||
(async () => {{
|
||||
try {{
|
||||
const script_result = {script};
|
||||
return {{ success: true, result: script_result }};
|
||||
return await (async () => {{
|
||||
{script}
|
||||
}})();
|
||||
}} catch (err) {{
|
||||
return {{ success: false, error: err.toString(), stack: err.stack }};
|
||||
}}
|
||||
|
||||
@@ -39,6 +39,7 @@ class LogColor(str, Enum):
|
||||
YELLOW = "yellow"
|
||||
MAGENTA = "magenta"
|
||||
DIM_MAGENTA = "dim magenta"
|
||||
RED = "red"
|
||||
|
||||
def __str__(self):
|
||||
"""Automatically convert rich color to string."""
|
||||
|
||||
@@ -424,10 +424,21 @@ class AsyncUrlSeeder:
|
||||
self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
|
||||
params={"domain": domain, "count": len(results)}, tag="URL_SEED")
|
||||
|
||||
# Sort by relevance score if query was provided
|
||||
# Apply BM25 scoring if query was provided
|
||||
if query and extract_head and scoring_method == "bm25":
|
||||
results.sort(key=lambda x: x.get(
|
||||
"relevance_score", 0.0), reverse=True)
|
||||
# Apply collective BM25 scoring across all documents
|
||||
results = await self._apply_bm25_scoring(results, config)
|
||||
|
||||
# Filter by score threshold if specified
|
||||
if score_threshold is not None:
|
||||
original_count = len(results)
|
||||
results = [r for r in results if r.get("relevance_score", 0) >= score_threshold]
|
||||
if original_count > len(results):
|
||||
self._log("info", "Filtered {filtered} URLs below score threshold {threshold}",
|
||||
params={"filtered": original_count - len(results), "threshold": score_threshold}, tag="URL_SEED")
|
||||
|
||||
# Sort by relevance score
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
|
||||
self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
|
||||
params={"count": len(results), "query": query}, tag="URL_SEED")
|
||||
elif query and not extract_head:
|
||||
@@ -982,28 +993,6 @@ class AsyncUrlSeeder:
|
||||
"head_data": head_data,
|
||||
}
|
||||
|
||||
# Apply BM25 scoring if query is provided and head data exists
|
||||
if query and ok and scoring_method == "bm25" and head_data:
|
||||
text_context = self._extract_text_context(head_data)
|
||||
if text_context:
|
||||
# Calculate BM25 score for this single document
|
||||
# scores = self._calculate_bm25_score(query, [text_context])
|
||||
scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
|
||||
relevance_score = scores[0] if scores else 0.0
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
else:
|
||||
# No text context, use URL-based scoring as fallback
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, entry["url"])
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
elif query:
|
||||
# Query provided but no head data - we reject this entry
|
||||
self._log("debug", "No head data for {url}, using URL-based scoring",
|
||||
params={"url": url}, tag="URL_SEED")
|
||||
return
|
||||
# relevance_score = self._calculate_url_relevance_score(query, entry["url"])
|
||||
# entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
elif live:
|
||||
self._log("debug", "Performing live check for {url}", params={
|
||||
"url": url}, tag="URL_SEED")
|
||||
@@ -1013,35 +1002,13 @@ class AsyncUrlSeeder:
|
||||
params={"status": status.upper(), "url": url}, tag="URL_SEED")
|
||||
entry = {"url": url, "status": status, "head_data": {}}
|
||||
|
||||
# Apply URL-based scoring if query is provided
|
||||
if query:
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, url)
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
else:
|
||||
entry = {"url": url, "status": "unknown", "head_data": {}}
|
||||
|
||||
# Apply URL-based scoring if query is provided
|
||||
if query:
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, url)
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
# Now decide whether to add the entry based on score threshold
|
||||
if query and "relevance_score" in entry:
|
||||
if score_threshold is None or entry["relevance_score"] >= score_threshold:
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
else:
|
||||
self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
|
||||
params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
|
||||
else:
|
||||
# No query or no scoring - add as usual
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
# Add entry to results (scoring will be done later)
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
|
||||
async def _head_ok(self, url: str, timeout: int) -> bool:
|
||||
try:
|
||||
@@ -1436,8 +1403,19 @@ class AsyncUrlSeeder:
|
||||
scores = bm25.get_scores(query_tokens)
|
||||
|
||||
# Normalize scores to 0-1 range
|
||||
max_score = max(scores) if max(scores) > 0 else 1.0
|
||||
normalized_scores = [score / max_score for score in scores]
|
||||
# BM25 can return negative scores, so we need to handle the full range
|
||||
if len(scores) == 0:
|
||||
return []
|
||||
|
||||
min_score = min(scores)
|
||||
max_score = max(scores)
|
||||
|
||||
# If all scores are the same, return 0.5 for all
|
||||
if max_score == min_score:
|
||||
return [0.5] * len(scores)
|
||||
|
||||
# Normalize to 0-1 range using min-max normalization
|
||||
normalized_scores = [(score - min_score) / (max_score - min_score) for score in scores]
|
||||
|
||||
return normalized_scores
|
||||
except Exception as e:
|
||||
|
||||
@@ -363,7 +363,7 @@ class AsyncWebCrawler:
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html=True if url.startswith("raw:") else False,
|
||||
redirected_url=async_response.redirected_url,
|
||||
redirected_url=async_response.redirected_url,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -509,7 +509,7 @@ class AsyncWebCrawler:
|
||||
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
|
||||
metadata = result.metadata
|
||||
|
||||
|
||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||
|
||||
################################
|
||||
@@ -591,11 +591,13 @@ class AsyncWebCrawler:
|
||||
# Choose content based on input_format
|
||||
content_format = config.extraction_strategy.input_format
|
||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||
self.logger.warning(
|
||||
message="Fit markdown requested but not available. Falling back to raw markdown.",
|
||||
tag="EXTRACT",
|
||||
params={"url": _url},
|
||||
)
|
||||
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - t1,
|
||||
tag="EXTRACT",
|
||||
)
|
||||
content_format = "markdown"
|
||||
|
||||
content = {
|
||||
@@ -619,11 +621,12 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Log extraction completion
|
||||
self.logger.info(
|
||||
message="Completed for {url:.50}... | Time: {timing}s",
|
||||
tag="EXTRACT",
|
||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||
)
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - t1,
|
||||
tag="EXTRACT",
|
||||
)
|
||||
|
||||
# Apply HTML formatting if requested
|
||||
if config.prettiify:
|
||||
|
||||
@@ -14,23 +14,8 @@ import hashlib
|
||||
from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from playwright_stealth import StealthConfig
|
||||
from .utils import get_chromium_path
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
chrome_csi=True,
|
||||
chrome_load_times=True,
|
||||
chrome_runtime=True,
|
||||
navigator_languages=True,
|
||||
navigator_plugins=True,
|
||||
navigator_permissions=True,
|
||||
webgl_vendor=True,
|
||||
outerdimensions=True,
|
||||
navigator_hardware_concurrency=True,
|
||||
media_codecs=True,
|
||||
)
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
"--disable-background-networking",
|
||||
|
||||
@@ -480,7 +480,7 @@ class BrowserProfiler:
|
||||
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
|
||||
exit_option = "4"
|
||||
|
||||
self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
||||
self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
||||
choice = input()
|
||||
|
||||
if choice == "1":
|
||||
@@ -637,9 +637,18 @@ class BrowserProfiler:
|
||||
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
|
||||
self.logger.info(f"Headless mode: {headless}", tag="CDP")
|
||||
|
||||
# create browser config
|
||||
browser_config = BrowserConfig(
|
||||
browser_type=browser_type,
|
||||
headless=headless,
|
||||
user_data_dir=profile_path,
|
||||
debugging_port=debugging_port,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Create managed browser instance
|
||||
managed_browser = ManagedBrowser(
|
||||
browser_type=browser_type,
|
||||
browser_config=browser_config,
|
||||
user_data_dir=profile_path,
|
||||
headless=headless,
|
||||
logger=self.logger,
|
||||
|
||||
@@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if url_base not in src_url_base:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# Handle relative URLs (which are always from the same domain)
|
||||
if not src.startswith('http') and not src.startswith('//'):
|
||||
return True # Keep relative URLs
|
||||
|
||||
# For absolute URLs, compare the base domains using the existing function
|
||||
src_base_domain = get_base_domain(src)
|
||||
url_base_domain = get_base_domain(url)
|
||||
|
||||
# If the domains don't match and both are valid, the image is external
|
||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
@@ -1140,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
|
||||
@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
# Calculate how many more URLs we can process in this batch
|
||||
remaining = self.max_pages - self._pages_crawled
|
||||
batch_size = min(BATCH_SIZE, remaining)
|
||||
if batch_size <= 0:
|
||||
# No more pages to crawl
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
batch: List[Tuple[float, int, str, Optional[str]]] = []
|
||||
# Retrieve up to BATCH_SIZE items from the priority queue.
|
||||
for _ in range(BATCH_SIZE):
|
||||
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
yield result
|
||||
|
||||
|
||||
@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
results: List[CrawlResult] = []
|
||||
|
||||
while current_level and not self._cancel_event.is_set():
|
||||
# Check if we've already reached max_pages before starting a new level
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
next_level: List[Tuple[str, Optional[str]]] = []
|
||||
urls = [url for url, _ in current_level]
|
||||
|
||||
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
results_count += 1
|
||||
yield result
|
||||
|
||||
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
# Only discover links from successful crawls
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# and only discover links from successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
await self.link_discovery(result, url, depth, visited, new_links, depths)
|
||||
|
||||
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
|
||||
# Prefix check (/foo/*)
|
||||
if self._simple_prefixes:
|
||||
path = url.split("?")[0]
|
||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
# if any(path.startswith(p) for p in self._simple_prefixes):
|
||||
# result = True
|
||||
# self._update_stats(result)
|
||||
# return not result if self._reverse else result
|
||||
####
|
||||
# Modified the prefix matching logic to ensure path boundary checking:
|
||||
# - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path
|
||||
# - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/`
|
||||
####
|
||||
for prefix in self._simple_prefixes:
|
||||
if path.startswith(prefix):
|
||||
if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
# Complex patterns
|
||||
if self._path_patterns:
|
||||
@@ -337,6 +348,15 @@ class ContentTypeFilter(URLFilter):
|
||||
"sqlite": "application/vnd.sqlite3",
|
||||
# Placeholder
|
||||
"unknown": "application/octet-stream", # Fallback for unknown file types
|
||||
# php
|
||||
"php": "application/x-httpd-php",
|
||||
"php3": "application/x-httpd-php",
|
||||
"php4": "application/x-httpd-php",
|
||||
"php5": "application/x-httpd-php",
|
||||
"php7": "application/x-httpd-php",
|
||||
"phtml": "application/x-httpd-php",
|
||||
"phps": "application/x-httpd-php-source",
|
||||
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
|
||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
||||
"""Prepare request data from configs."""
|
||||
if self._token:
|
||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||
return {
|
||||
"urls": urls,
|
||||
"browser_config": browser_config.dump() if browser_config else {},
|
||||
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
|
||||
crawler_config: Optional[CrawlerRunConfig] = None
|
||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
"""Execute a crawl operation."""
|
||||
if not self._token:
|
||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
||||
await self._check_server()
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
|
||||
|
||||
async def get_schema(self) -> Dict[str, Any]:
|
||||
"""Retrieve configuration schemas."""
|
||||
if not self._token:
|
||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
||||
response = await self._request("GET", "/schema")
|
||||
return response.json()
|
||||
|
||||
@@ -167,4 +165,4 @@ async def main():
|
||||
print(schema)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.total_usage.total_tokens += usage.total_tokens
|
||||
|
||||
try:
|
||||
response = response.choices[0].message.content
|
||||
content = response.choices[0].message.content
|
||||
blocks = None
|
||||
|
||||
if self.force_json_response:
|
||||
blocks = json.loads(response)
|
||||
blocks = json.loads(content)
|
||||
if isinstance(blocks, dict):
|
||||
# If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
|
||||
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
|
||||
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
blocks = blocks
|
||||
else:
|
||||
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
|
||||
blocks = extract_xml_data(["blocks"], response)["blocks"]
|
||||
blocks = extract_xml_data(["blocks"], content)["blocks"]
|
||||
blocks = json.loads(blocks)
|
||||
|
||||
for block in blocks:
|
||||
|
||||
@@ -32,7 +32,6 @@ import hashlib
|
||||
|
||||
from urllib.robotparser import RobotFileParser
|
||||
import aiohttp
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from functools import lru_cache
|
||||
|
||||
from packaging import version
|
||||
@@ -43,6 +42,37 @@ from itertools import chain
|
||||
from collections import deque
|
||||
from typing import Generator, Iterable
|
||||
|
||||
import numpy as np
|
||||
|
||||
from urllib.parse import (
|
||||
urljoin, urlparse, urlunparse,
|
||||
parse_qsl, urlencode, quote, unquote
|
||||
)
|
||||
|
||||
|
||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||
from urllib.robotparser import RuleLine
|
||||
import re
|
||||
|
||||
original_applies_to = RuleLine.applies_to
|
||||
|
||||
def patched_applies_to(self, filename):
|
||||
# Handle wildcards in paths
|
||||
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
|
||||
pattern = self.path.replace('%2A', '*')
|
||||
pattern = re.escape(pattern).replace('\\*', '.*')
|
||||
pattern = '^' + pattern
|
||||
if pattern.endswith('\\$'):
|
||||
pattern = pattern[:-2] + '$'
|
||||
try:
|
||||
return bool(re.match(pattern, filename))
|
||||
except re.error:
|
||||
return original_applies_to(self, filename)
|
||||
return original_applies_to(self, filename)
|
||||
|
||||
RuleLine.applies_to = patched_applies_to
|
||||
# Monkey patch ends
|
||||
|
||||
def chunk_documents(
|
||||
documents: Iterable[str],
|
||||
chunk_token_threshold: int,
|
||||
@@ -311,7 +341,7 @@ class RobotsParser:
|
||||
robots_url = f"{scheme}://{domain}/robots.txt"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(robots_url, timeout=2) as response:
|
||||
async with session.get(robots_url, timeout=2, ssl=False) as response:
|
||||
if response.status == 200:
|
||||
rules = await response.text()
|
||||
self._cache_rules(domain, rules)
|
||||
@@ -1517,6 +1547,14 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
# Article metadata
|
||||
article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
|
||||
for tag in article_tags:
|
||||
property_name = tag.get("property", "").strip()
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
return metadata
|
||||
|
||||
@@ -1592,7 +1630,15 @@ def extract_metadata(html, soup=None):
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
|
||||
# Article metadata
|
||||
article_tags = head.find_all("meta", attrs={"property": re.compile(r"^article:")})
|
||||
for tag in article_tags:
|
||||
property_name = tag.get("property", "").strip()
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
@@ -2061,13 +2107,101 @@ def normalize_url(href, base_url):
|
||||
parsed_base = urlparse(base_url)
|
||||
if not parsed_base.scheme or not parsed_base.netloc:
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
|
||||
# Ensure base_url ends with a trailing slash if it's a directory path
|
||||
if not base_url.endswith('/'):
|
||||
base_url = base_url + '/'
|
||||
|
||||
if parsed_base.scheme.lower() not in ["http", "https"]:
|
||||
# Handle special protocols
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
cleaned_href = href.strip()
|
||||
|
||||
# Use urljoin to handle all cases
|
||||
normalized = urljoin(base_url, href.strip())
|
||||
return urljoin(base_url, cleaned_href)
|
||||
|
||||
|
||||
|
||||
|
||||
def normalize_url(
|
||||
href: str,
|
||||
base_url: str,
|
||||
*,
|
||||
drop_query_tracking=True,
|
||||
sort_query=True,
|
||||
keep_fragment=False,
|
||||
extra_drop_params=None
|
||||
):
|
||||
"""
|
||||
Extended URL normalizer
|
||||
|
||||
Parameters
|
||||
----------
|
||||
href : str
|
||||
The raw link extracted from a page.
|
||||
base_url : str
|
||||
The page’s canonical URL (used to resolve relative links).
|
||||
drop_query_tracking : bool (default True)
|
||||
Remove common tracking query parameters.
|
||||
sort_query : bool (default True)
|
||||
Alphabetically sort query keys for deterministic output.
|
||||
keep_fragment : bool (default False)
|
||||
Preserve the hash fragment (#section) if you need in-page links.
|
||||
extra_drop_params : Iterable[str] | None
|
||||
Additional query keys to strip (case-insensitive).
|
||||
|
||||
Returns
|
||||
-------
|
||||
str | None
|
||||
A clean, canonical URL or None if href is empty/None.
|
||||
"""
|
||||
if not href:
|
||||
return None
|
||||
|
||||
# Resolve relative paths first
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Parse once, edit parts, then rebuild
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
# ── netloc ──
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
# ── path ──
|
||||
# Strip duplicate slashes and trailing “/” (except root)
|
||||
path = quote(unquote(parsed.path))
|
||||
if path.endswith('/') and path != '/':
|
||||
path = path.rstrip('/')
|
||||
|
||||
# ── query ──
|
||||
query = parsed.query
|
||||
if query:
|
||||
# explode, mutate, then rebuild
|
||||
params = [(k.lower(), v) for k, v in parse_qsl(query, keep_blank_values=True)]
|
||||
|
||||
if drop_query_tracking:
|
||||
default_tracking = {
|
||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term',
|
||||
'utm_content', 'gclid', 'fbclid', 'ref', 'ref_src'
|
||||
}
|
||||
if extra_drop_params:
|
||||
default_tracking |= {p.lower() for p in extra_drop_params}
|
||||
params = [(k, v) for k, v in params if k not in default_tracking]
|
||||
|
||||
if sort_query:
|
||||
params.sort(key=lambda kv: kv[0])
|
||||
|
||||
query = urlencode(params, doseq=True) if params else ''
|
||||
|
||||
# ── fragment ──
|
||||
fragment = parsed.fragment if keep_fragment else ''
|
||||
|
||||
# Re-assemble
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
path,
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
@@ -3148,3 +3282,114 @@ def calculate_total_score(
|
||||
|
||||
return max(0.0, min(total, 10.0))
|
||||
|
||||
|
||||
# Embedding utilities
|
||||
async def get_text_embeddings(
|
||||
texts: List[str],
|
||||
llm_config: Optional[Dict] = None,
|
||||
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
||||
batch_size: int = 32
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Compute embeddings for a list of texts using specified model.
|
||||
|
||||
Args:
|
||||
texts: List of texts to embed
|
||||
llm_config: Optional LLM configuration for API-based embeddings
|
||||
model_name: Model name (used when llm_config is None)
|
||||
batch_size: Batch size for processing
|
||||
|
||||
Returns:
|
||||
numpy array of embeddings
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
if not texts:
|
||||
return np.array([])
|
||||
|
||||
# If LLMConfig provided, use litellm for embeddings
|
||||
if llm_config is not None:
|
||||
from litellm import aembedding
|
||||
|
||||
# Get embedding model from config or use default
|
||||
embedding_model = llm_config.get('provider', 'text-embedding-3-small')
|
||||
api_base = llm_config.get('base_url', llm_config.get('api_base'))
|
||||
|
||||
# Prepare kwargs
|
||||
kwargs = {
|
||||
'model': embedding_model,
|
||||
'input': texts,
|
||||
'api_key': llm_config.get('api_token', llm_config.get('api_key'))
|
||||
}
|
||||
|
||||
if api_base:
|
||||
kwargs['api_base'] = api_base
|
||||
|
||||
# Handle OpenAI-compatible endpoints
|
||||
if api_base and 'openai/' not in embedding_model:
|
||||
kwargs['model'] = f"openai/{embedding_model}"
|
||||
|
||||
# Get embeddings
|
||||
response = await aembedding(**kwargs)
|
||||
|
||||
# Extract embeddings from response
|
||||
embeddings = []
|
||||
for item in response.data:
|
||||
embeddings.append(item['embedding'])
|
||||
|
||||
return np.array(embeddings)
|
||||
|
||||
# Default: use sentence-transformers
|
||||
else:
|
||||
# Lazy load to avoid importing heavy libraries unless needed
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for local embeddings. "
|
||||
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||
)
|
||||
|
||||
# Cache the model in function attribute to avoid reloading
|
||||
if not hasattr(get_text_embeddings, '_models'):
|
||||
get_text_embeddings._models = {}
|
||||
|
||||
if model_name not in get_text_embeddings._models:
|
||||
get_text_embeddings._models[model_name] = SentenceTransformer(model_name)
|
||||
|
||||
encoder = get_text_embeddings._models[model_name]
|
||||
|
||||
# Batch encode for efficiency
|
||||
embeddings = encoder.encode(
|
||||
texts,
|
||||
batch_size=batch_size,
|
||||
show_progress_bar=False,
|
||||
convert_to_numpy=True
|
||||
)
|
||||
|
||||
return embeddings
|
||||
|
||||
|
||||
def get_text_embeddings_sync(
|
||||
texts: List[str],
|
||||
llm_config: Optional[Dict] = None,
|
||||
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
||||
batch_size: int = 32
|
||||
) -> np.ndarray:
|
||||
"""Synchronous wrapper for get_text_embeddings"""
|
||||
import numpy as np
|
||||
return asyncio.run(get_text_embeddings(texts, llm_config, model_name, batch_size))
|
||||
|
||||
|
||||
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
||||
"""Calculate cosine similarity between two vectors"""
|
||||
import numpy as np
|
||||
dot_product = np.dot(vec1, vec2)
|
||||
norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
|
||||
return float(dot_product / norm_product) if norm_product != 0 else 0.0
|
||||
|
||||
|
||||
def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
||||
"""Calculate cosine distance (1 - similarity) between two vectors"""
|
||||
return 1 - cosine_similarity(vec1, vec2)
|
||||
|
||||
|
||||
@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
Our latest release candidate is `0.6.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||
|
||||
```bash
|
||||
# Pull the release candidate (recommended for latest features)
|
||||
docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
# Pull the release candidate (for testing new features)
|
||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||
|
||||
# Or pull the latest stable version
|
||||
# Or pull the current stable version (0.6.0)
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
@@ -99,7 +101,7 @@ EOL
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
```
|
||||
|
||||
* **With LLM support:**
|
||||
@@ -110,7 +112,7 @@ EOL
|
||||
--name crawl4ai \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
||||
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r1`)
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
|
||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||
* `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
|
||||
* **`latest` Tag:** Points to the most recent stable version
|
||||
@@ -160,7 +162,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
||||
```bash
|
||||
# Pulls and runs the release candidate from Docker Hub
|
||||
# Automatically selects the correct architecture
|
||||
IMAGE=unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number docker compose up -d
|
||||
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
||||
```
|
||||
|
||||
* **Build and Run Locally:**
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
server_memory_mb = _get_memory_mb()
|
||||
result_dict = result.model_dump()
|
||||
result_dict['server_memory_mb'] = server_memory_mb
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||
yield data.encode('utf-8')
|
||||
@@ -443,10 +447,19 @@ async def handle_crawl_request(
|
||||
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
|
||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||
|
||||
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results],
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
@@ -459,7 +472,7 @@ async def handle_crawl_request(
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||
logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||
logger.error(f"Error closing crawler during exception handling: {str(e)}")
|
||||
|
||||
# Measure memory even on error if possible
|
||||
end_mem_mb_error = _get_memory_mb()
|
||||
@@ -518,7 +531,7 @@ async def handle_stream_crawl_request(
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
|
||||
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
|
||||
# Raising HTTPException here will prevent streaming response
|
||||
raise HTTPException(
|
||||
|
||||
@@ -332,7 +332,7 @@ The `clone()` method:
|
||||
### Key fields to note
|
||||
|
||||
1. **`provider`**:
|
||||
- Which LLM provoder to use.
|
||||
- Which LLM provider to use.
|
||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||
|
||||
2. **`api_token`**:
|
||||
@@ -403,7 +403,7 @@ async def main():
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
options={"ignore_links": True})
|
||||
|
||||
# 4) Crawler run config: skip cache, use extraction
|
||||
run_conf = CrawlerRunConfig(
|
||||
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_web():
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/apple",
|
||||
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_local_file():
|
||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||
file_url = f"file://{local_file_path}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url, config=config)
|
||||
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_raw_html():
|
||||
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
|
||||
raw_html_url = f"raw:{raw_html}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=raw_html_url, config=config)
|
||||
@@ -3845,7 +3845,7 @@ import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
@@ -3856,7 +3856,7 @@ async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Step 1: Crawl the Web URL
|
||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
||||
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||
|
||||
if not result.success:
|
||||
@@ -3871,7 +3871,7 @@ async def main():
|
||||
# Step 2: Crawl from the Local HTML File
|
||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||
file_url = f"file://{html_file_path.resolve()}"
|
||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
||||
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||
|
||||
if not local_result.success:
|
||||
@@ -3887,7 +3887,7 @@ async def main():
|
||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||
raw_html_content = f.read()
|
||||
raw_html_url = f"raw:{raw_html_content}"
|
||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
||||
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||
|
||||
if not raw_result.success:
|
||||
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -4175,8 +4175,13 @@ async def main():
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
||||
```python
|
||||
import os, asyncio
|
||||
from base64 import b64decode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
pdf=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Save screenshot
|
||||
print(f"Screenshot data present: {result.screenshot is not None}")
|
||||
print(f"PDF data present: {result.pdf is not None}")
|
||||
|
||||
if result.screenshot:
|
||||
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
|
||||
with open("wikipedia_screenshot.png", "wb") as f:
|
||||
f.write(b64decode(result.screenshot))
|
||||
|
||||
# Save PDF
|
||||
else:
|
||||
print("[WARN] Screenshot data is None.")
|
||||
|
||||
if result.pdf:
|
||||
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
|
||||
with open("wikipedia_page.pdf", "wb") as f:
|
||||
f.write(result.pdf)
|
||||
|
||||
print("[OK] PDF & screenshot captured.")
|
||||
else:
|
||||
print("[WARN] PDF data is None.")
|
||||
|
||||
else:
|
||||
print("[ERROR]", result.error_message)
|
||||
|
||||
|
||||
@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
|
||||
class MarkdownRequest(BaseModel):
|
||||
"""Request body for the /md endpoint."""
|
||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||
f: FilterType = Field(FilterType.FIT,
|
||||
description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
|
||||
|
||||
@@ -671,6 +671,16 @@
|
||||
method: 'GET',
|
||||
headers: { 'Accept': 'application/json' }
|
||||
});
|
||||
responseData = await response.json();
|
||||
const time = Math.round(performance.now() - startTime);
|
||||
if (!response.ok) {
|
||||
updateStatus('error', time);
|
||||
throw new Error(responseData.error || 'Request failed');
|
||||
}
|
||||
updateStatus('success', time);
|
||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||
document.querySelector('#response-content code').className = 'json hljs';
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
} else if (endpoint === 'crawl_stream') {
|
||||
// Stream processing
|
||||
response = await fetch(api, {
|
||||
|
||||
343
docs/blog/release-v0.7.0.md
Normal file
343
docs/blog/release-v0.7.0.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# 🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update
|
||||
|
||||
*January 28, 2025 • 10 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This release introduces fundamental improvements in how Crawl4AI handles modern web complexity through adaptive learning, intelligent content discovery, and advanced extraction capabilities.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
|
||||
**The Problem:** Websites change. Class names shift. IDs disappear. Your carefully crafted selectors break at 3 AM, and you wake up to empty datasets and angry stakeholders.
|
||||
|
||||
**My Solution:** I implemented an adaptive learning system that observes patterns, builds confidence scores, and adjusts extraction strategies on the fly. It's like having a junior developer who gets better at their job with every page they scrape.
|
||||
|
||||
### Technical Deep-Dive
|
||||
|
||||
The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Pattern success rates
|
||||
- Selector stability over time
|
||||
- Content structure variations
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **News Aggregation**: Maintain 95%+ extraction accuracy even as news sites update their templates
|
||||
- **E-commerce Monitoring**: Track product changes across hundreds of stores without constant maintenance
|
||||
- **Research Data Collection**: Build robust academic datasets that survive website redesigns
|
||||
- **Reduced Maintenance**: Cut selector update time by 80% for frequently-changing sites
|
||||
|
||||
## 🌊 Virtual Scroll: Complete Content Capture
|
||||
|
||||
**The Problem:** Modern web apps only render what's visible. Scroll down, new content appears, old content vanishes into the void. Traditional crawlers capture that first viewport and miss 90% of the content. It's like reading only the first page of every book.
|
||||
|
||||
**My Solution:** I built Virtual Scroll support that mimics human browsing behavior, capturing content as it loads and preserving it before the browser's garbage collector strikes.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
```python
|
||||
from crawl4ai import VirtualScrollConfig
|
||||
|
||||
# For social media feeds (Twitter/X style)
|
||||
twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://twitter.com/trending",
|
||||
config=CrawlerRunConfig(
|
||||
virtual_scroll_config=twitter_config,
|
||||
# Combine with other features
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"tweets": {
|
||||
"selector": "[data-testid='tweet']",
|
||||
"fields": {
|
||||
"text": {"selector": "[data-testid='tweetText']", "type": "text"},
|
||||
"likes": {"selector": "[data-testid='like']", "type": "text"}
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Captured {len(result.extracted_content['tweets'])} tweets")
|
||||
```
|
||||
|
||||
**Key Capabilities:**
|
||||
- **DOM Recycling Awareness**: Detects and handles virtual DOM element recycling
|
||||
- **Smart Scroll Physics**: Three modes - container height, page height, or fixed pixels
|
||||
- **Content Preservation**: Captures content before it's destroyed
|
||||
- **Intelligent Stopping**: Stops when no new content appears
|
||||
- **Memory Efficient**: Streams content instead of holding everything in memory
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Social Media Analysis**: Capture entire Twitter threads with hundreds of replies, not just top 10
|
||||
- **E-commerce Scraping**: Extract 500+ products from infinite scroll catalogs vs. 20-50 with traditional methods
|
||||
- **News Aggregation**: Get all articles from modern news sites, not just above-the-fold content
|
||||
- **Research Applications**: Complete data extraction from academic databases using virtual pagination
|
||||
|
||||
## 🔗 Link Preview: Intelligent Link Analysis and Scoring
|
||||
|
||||
**The Problem:** You crawl a page and get 200 links. Which ones matter? Which lead to the content you actually want? Traditional crawlers force you to follow everything or build complex filters.
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
- **Competitive Analysis**: Automatically identify important pages on competitor sites
|
||||
- **Content Discovery**: Build topic-focused crawlers that stay on track
|
||||
- **SEO Audits**: Identify and prioritize high-value internal linking opportunities
|
||||
|
||||
## 🎣 Async URL Seeder: Automated URL Discovery at Scale
|
||||
|
||||
**The Problem:** You want to crawl an entire domain but only have the homepage. Or worse, you want specific content types across thousands of pages. Manual URL discovery? That's a job for machines, not humans.
|
||||
|
||||
**My Solution:** I built Async URL Seeder—a turbocharged URL discovery engine that combines multiple sources with intelligent filtering and relevance scoring.
|
||||
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
- **Sitemap Mining**: Parses robots.txt and all linked sitemaps
|
||||
- **Common Crawl**: Queries the Common Crawl index for historical URLs
|
||||
- **Intelligent Crawling**: Follows links with smart depth control
|
||||
- **Pattern Analysis**: Learns URL structures and generates variations
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Migration Projects**: Discover 10,000+ URLs from legacy sites in under 60 seconds
|
||||
- **Market Research**: Map entire competitor ecosystems automatically
|
||||
- **Academic Research**: Build comprehensive datasets without manual URL collection
|
||||
- **SEO Audits**: Find every indexable page with content scoring
|
||||
- **Content Archival**: Ensure no content is left behind during site migrations
|
||||
|
||||
## ⚡ Performance Optimizations
|
||||
|
||||
This release includes significant performance improvements through optimized resource handling, better concurrency management, and reduced memory footprint.
|
||||
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
- **Startup Time**: 70% faster browser initialization
|
||||
- **Page Loading**: 40% reduction with smart resource blocking
|
||||
- **Extraction**: 3x faster with compiled CSS selectors
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
### Breaking Changes
|
||||
- `link_extractor` renamed to `link_preview` (better reflects functionality)
|
||||
- Minimum Python version now 3.9
|
||||
- `CrawlerConfig` split into `CrawlerRunConfig` and `BrowserConfig`
|
||||
|
||||
### Migration Guide
|
||||
```python
|
||||
# Old (v0.6.x)
|
||||
from crawl4ai import CrawlerConfig
|
||||
config = CrawlerConfig(timeout=30000)
|
||||
|
||||
# New (v0.7.0)
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig
|
||||
browser_config = BrowserConfig(timeout=30000)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
```
|
||||
|
||||
## 🤖 Coming Soon: Intelligent Web Automation
|
||||
|
||||
I'm currently working on bringing advanced automation capabilities to Crawl4AI. This includes:
|
||||
|
||||
- **Crawl Agents**: Autonomous crawlers that understand your goals and adapt their strategies
|
||||
- **Auto JS Generation**: Automatic JavaScript code generation for complex interactions
|
||||
- **Smart Form Handling**: Intelligent form detection and filling
|
||||
- **Context-Aware Actions**: Crawlers that understand page context and make decisions
|
||||
|
||||
These features are under active development and will revolutionize how we approach web automation. Stay tuned!
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.0
|
||||
```
|
||||
|
||||
Check out the [updated documentation](https://docs.crawl4ai.com).
|
||||
|
||||
Questions? Issues? I'm always listening:
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
|
||||
---
|
||||
|
||||
*P.S. If you're using Crawl4AI in production, I'd love to hear about it. Your use cases inspire the next features.*
|
||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
85
docs/examples/adaptive_crawling/README.md
Normal file
85
docs/examples/adaptive_crawling/README.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Adaptive Crawling Examples
|
||||
|
||||
This directory contains examples demonstrating various aspects of Crawl4AI's Adaptive Crawling feature.
|
||||
|
||||
## Examples Overview
|
||||
|
||||
### 1. `basic_usage.py`
|
||||
- Simple introduction to adaptive crawling
|
||||
- Uses default statistical strategy
|
||||
- Shows how to get crawl statistics and relevant content
|
||||
|
||||
### 2. `embedding_strategy.py` ⭐ NEW
|
||||
- Demonstrates the embedding-based strategy for semantic understanding
|
||||
- Shows query expansion and irrelevance detection
|
||||
- Includes configuration for both local and API-based embeddings
|
||||
|
||||
### 3. `embedding_vs_statistical.py` ⭐ NEW
|
||||
- Direct comparison between statistical and embedding strategies
|
||||
- Helps you choose the right strategy for your use case
|
||||
- Shows performance and accuracy trade-offs
|
||||
|
||||
### 4. `embedding_configuration.py` ⭐ NEW
|
||||
- Advanced configuration options for embedding strategy
|
||||
- Parameter tuning guide for different scenarios
|
||||
- Examples for research, exploration, and quality-focused crawling
|
||||
|
||||
### 5. `advanced_configuration.py`
|
||||
- Shows various configuration options for both strategies
|
||||
- Demonstrates threshold tuning and performance optimization
|
||||
|
||||
### 6. `custom_strategies.py`
|
||||
- How to implement your own crawling strategy
|
||||
- Extends the base CrawlStrategy class
|
||||
- Advanced use case for specialized requirements
|
||||
|
||||
### 7. `export_import_kb.py`
|
||||
- Export crawled knowledge base to JSONL
|
||||
- Import and continue crawling from saved state
|
||||
- Useful for building persistent knowledge bases
|
||||
|
||||
## Quick Start
|
||||
|
||||
For your first adaptive crawling experience, run:
|
||||
|
||||
```bash
|
||||
python basic_usage.py
|
||||
```
|
||||
|
||||
To try the new embedding strategy with semantic understanding:
|
||||
|
||||
```bash
|
||||
python embedding_strategy.py
|
||||
```
|
||||
|
||||
To compare strategies and see which works best for your use case:
|
||||
|
||||
```bash
|
||||
python embedding_vs_statistical.py
|
||||
```
|
||||
|
||||
## Strategy Selection Guide
|
||||
|
||||
### Use Statistical Strategy (Default) When:
|
||||
- Working with technical documentation
|
||||
- Queries contain specific terms or code
|
||||
- Speed is critical
|
||||
- No API access available
|
||||
|
||||
### Use Embedding Strategy When:
|
||||
- Queries are conceptual or ambiguous
|
||||
- Need semantic understanding beyond exact matches
|
||||
- Want to detect irrelevant content
|
||||
- Working with diverse content sources
|
||||
|
||||
## Requirements
|
||||
|
||||
- Crawl4AI installed
|
||||
- For embedding strategy with local models: `sentence-transformers`
|
||||
- For embedding strategy with OpenAI: Set `OPENAI_API_KEY` environment variable
|
||||
|
||||
## Learn More
|
||||
|
||||
- [Adaptive Crawling Documentation](https://docs.crawl4ai.com/core/adaptive-crawling/)
|
||||
- [Mathematical Framework](https://github.com/unclecode/crawl4ai/blob/main/PROGRESSIVE_CRAWLING.md)
|
||||
- [Blog: The Adaptive Crawling Revolution](https://docs.crawl4ai.com/blog/adaptive-crawling-revolution/)
|
||||
207
docs/examples/adaptive_crawling/advanced_configuration.py
Normal file
207
docs/examples/adaptive_crawling/advanced_configuration.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Advanced Adaptive Crawling Configuration
|
||||
|
||||
This example demonstrates all configuration options available for adaptive crawling,
|
||||
including threshold tuning, persistence, and custom parameters.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
|
||||
async def main():
|
||||
"""Demonstrate advanced configuration options"""
|
||||
|
||||
# Example 1: Custom thresholds for different use cases
|
||||
print("="*60)
|
||||
print("EXAMPLE 1: Custom Confidence Thresholds")
|
||||
print("="*60)
|
||||
|
||||
# High-precision configuration (exhaustive crawling)
|
||||
high_precision_config = AdaptiveConfig(
|
||||
confidence_threshold=0.9, # Very high confidence required
|
||||
max_pages=50, # Allow more pages
|
||||
top_k_links=5, # Follow more links per page
|
||||
min_gain_threshold=0.02 # Lower threshold to continue
|
||||
)
|
||||
|
||||
# Balanced configuration (default use case)
|
||||
balanced_config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Moderate confidence
|
||||
max_pages=20, # Reasonable limit
|
||||
top_k_links=3, # Moderate branching
|
||||
min_gain_threshold=0.05 # Standard gain threshold
|
||||
)
|
||||
|
||||
# Quick exploration configuration
|
||||
quick_config = AdaptiveConfig(
|
||||
confidence_threshold=0.5, # Lower confidence acceptable
|
||||
max_pages=10, # Strict limit
|
||||
top_k_links=2, # Minimal branching
|
||||
min_gain_threshold=0.1 # High gain required
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
# Test different configurations
|
||||
for config_name, config in [
|
||||
("High Precision", high_precision_config),
|
||||
("Balanced", balanced_config),
|
||||
("Quick Exploration", quick_config)
|
||||
]:
|
||||
print(f"\nTesting {config_name} configuration...")
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
|
||||
result = await adaptive.digest(
|
||||
start_url="https://httpbin.org",
|
||||
query="http headers authentication"
|
||||
)
|
||||
|
||||
print(f" - Pages crawled: {len(result.crawled_urls)}")
|
||||
print(f" - Confidence achieved: {adaptive.confidence:.2%}")
|
||||
print(f" - Coverage score: {adaptive.coverage_stats['coverage']:.2f}")
|
||||
|
||||
# Example 2: Persistence and state management
|
||||
print("\n" + "="*60)
|
||||
print("EXAMPLE 2: State Persistence")
|
||||
print("="*60)
|
||||
|
||||
state_file = "crawl_state_demo.json"
|
||||
|
||||
# Configuration with persistence
|
||||
persistent_config = AdaptiveConfig(
|
||||
confidence_threshold=0.8,
|
||||
max_pages=30,
|
||||
save_state=True, # Enable auto-save
|
||||
state_path=state_file # Specify save location
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
# First crawl - will be interrupted
|
||||
print("\nStarting initial crawl (will interrupt after 5 pages)...")
|
||||
|
||||
interrupt_config = AdaptiveConfig(
|
||||
confidence_threshold=0.8,
|
||||
max_pages=5, # Artificially low to simulate interruption
|
||||
save_state=True,
|
||||
state_path=state_file
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=interrupt_config)
|
||||
result1 = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="exception handling try except finally"
|
||||
)
|
||||
|
||||
print(f"First crawl completed: {len(result1.crawled_urls)} pages")
|
||||
print(f"Confidence reached: {adaptive.confidence:.2%}")
|
||||
|
||||
# Resume crawl with higher page limit
|
||||
print("\nResuming crawl from saved state...")
|
||||
|
||||
resume_config = AdaptiveConfig(
|
||||
confidence_threshold=0.8,
|
||||
max_pages=20, # Increase limit
|
||||
save_state=True,
|
||||
state_path=state_file
|
||||
)
|
||||
|
||||
adaptive2 = AdaptiveCrawler(crawler, config=resume_config)
|
||||
result2 = await adaptive2.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="exception handling try except finally",
|
||||
resume_from=state_file
|
||||
)
|
||||
|
||||
print(f"Resumed crawl completed: {len(result2.crawled_urls)} total pages")
|
||||
print(f"Final confidence: {adaptive2.confidence:.2%}")
|
||||
|
||||
# Clean up
|
||||
Path(state_file).unlink(missing_ok=True)
|
||||
|
||||
# Example 3: Link selection strategies
|
||||
print("\n" + "="*60)
|
||||
print("EXAMPLE 3: Link Selection Strategies")
|
||||
print("="*60)
|
||||
|
||||
# Conservative link following
|
||||
conservative_config = AdaptiveConfig(
|
||||
confidence_threshold=0.7,
|
||||
max_pages=15,
|
||||
top_k_links=1, # Only follow best link
|
||||
min_gain_threshold=0.15 # High threshold
|
||||
)
|
||||
|
||||
# Aggressive link following
|
||||
aggressive_config = AdaptiveConfig(
|
||||
confidence_threshold=0.7,
|
||||
max_pages=15,
|
||||
top_k_links=10, # Follow many links
|
||||
min_gain_threshold=0.01 # Very low threshold
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
for strategy_name, config in [
|
||||
("Conservative", conservative_config),
|
||||
("Aggressive", aggressive_config)
|
||||
]:
|
||||
print(f"\n{strategy_name} link selection:")
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
|
||||
result = await adaptive.digest(
|
||||
start_url="https://httpbin.org",
|
||||
query="api endpoints"
|
||||
)
|
||||
|
||||
# Analyze crawl pattern
|
||||
print(f" - Total pages: {len(result.crawled_urls)}")
|
||||
print(f" - Unique domains: {len(set(url.split('/')[2] for url in result.crawled_urls))}")
|
||||
print(f" - Max depth reached: {max(url.count('/') for url in result.crawled_urls) - 2}")
|
||||
|
||||
# Show saturation trend
|
||||
if hasattr(result, 'new_terms_history') and result.new_terms_history:
|
||||
print(f" - New terms discovered: {result.new_terms_history[:5]}...")
|
||||
print(f" - Saturation trend: {'decreasing' if result.new_terms_history[-1] < result.new_terms_history[0] else 'increasing'}")
|
||||
|
||||
# Example 4: Monitoring crawl progress
|
||||
print("\n" + "="*60)
|
||||
print("EXAMPLE 4: Progress Monitoring")
|
||||
print("="*60)
|
||||
|
||||
# Configuration with detailed monitoring
|
||||
monitor_config = AdaptiveConfig(
|
||||
confidence_threshold=0.75,
|
||||
max_pages=10,
|
||||
top_k_links=3
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config=monitor_config)
|
||||
|
||||
# Start crawl
|
||||
print("\nMonitoring crawl progress...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://httpbin.org",
|
||||
query="http methods headers"
|
||||
)
|
||||
|
||||
# Detailed statistics
|
||||
print("\nDetailed crawl analysis:")
|
||||
adaptive.print_stats(detailed=True)
|
||||
|
||||
# Export for analysis
|
||||
print("\nExporting knowledge base for external analysis...")
|
||||
adaptive.export_knowledge_base("knowledge_export_demo.jsonl")
|
||||
print("Knowledge base exported to: knowledge_export_demo.jsonl")
|
||||
|
||||
# Show sample of exported data
|
||||
with open("knowledge_export_demo.jsonl", 'r') as f:
|
||||
first_line = f.readline()
|
||||
print(f"Sample export: {first_line[:100]}...")
|
||||
|
||||
# Clean up
|
||||
Path("knowledge_export_demo.jsonl").unlink(missing_ok=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
76
docs/examples/adaptive_crawling/basic_usage.py
Normal file
76
docs/examples/adaptive_crawling/basic_usage.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""
|
||||
Basic Adaptive Crawling Example
|
||||
|
||||
This example demonstrates the simplest use case of adaptive crawling:
|
||||
finding information about a specific topic and knowing when to stop.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
|
||||
async def main():
|
||||
"""Basic adaptive crawling example"""
|
||||
|
||||
# Initialize the crawler
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Create an adaptive crawler with default settings (statistical strategy)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Note: You can also use embedding strategy for semantic understanding:
|
||||
# from crawl4ai import AdaptiveConfig
|
||||
# config = AdaptiveConfig(strategy="embedding")
|
||||
# adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Start adaptive crawling
|
||||
print("Starting adaptive crawl for Python async programming information...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/library/asyncio.html",
|
||||
query="async await context managers coroutines"
|
||||
)
|
||||
|
||||
# Display crawl statistics
|
||||
print("\n" + "="*50)
|
||||
print("CRAWL STATISTICS")
|
||||
print("="*50)
|
||||
adaptive.print_stats(detailed=False)
|
||||
|
||||
# Get the most relevant content found
|
||||
print("\n" + "="*50)
|
||||
print("MOST RELEVANT PAGES")
|
||||
print("="*50)
|
||||
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for i, page in enumerate(relevant_pages, 1):
|
||||
print(f"\n{i}. {page['url']}")
|
||||
print(f" Relevance Score: {page['score']:.2%}")
|
||||
|
||||
# Show a snippet of the content
|
||||
content = page['content'] or ""
|
||||
if content:
|
||||
snippet = content[:200].replace('\n', ' ')
|
||||
if len(content) > 200:
|
||||
snippet += "..."
|
||||
print(f" Preview: {snippet}")
|
||||
|
||||
# Show final confidence
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
||||
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Example: Check if we can answer specific questions
|
||||
print(f"\n{'='*50}")
|
||||
print("INFORMATION SUFFICIENCY CHECK")
|
||||
print(f"{'='*50}")
|
||||
|
||||
if adaptive.confidence >= 0.8:
|
||||
print("✓ High confidence - can answer detailed questions about async Python")
|
||||
elif adaptive.confidence >= 0.6:
|
||||
print("~ Moderate confidence - can answer basic questions")
|
||||
else:
|
||||
print("✗ Low confidence - need more information")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
373
docs/examples/adaptive_crawling/custom_strategies.py
Normal file
373
docs/examples/adaptive_crawling/custom_strategies.py
Normal file
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Custom Adaptive Crawling Strategies
|
||||
|
||||
This example demonstrates how to implement custom scoring strategies
|
||||
for domain-specific crawling needs.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
from typing import List, Dict, Set
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import CrawlState, Link
|
||||
import math
|
||||
|
||||
|
||||
class APIDocumentationStrategy:
    """
    Custom strategy optimized for API documentation crawling.
    Prioritizes endpoint references, code examples, and parameter descriptions.
    """

    def __init__(self):
        # Vocabulary typically found in high-value API reference pages.
        self.api_keywords = {
            'endpoint', 'request', 'response', 'parameter', 'authentication',
            'header', 'body', 'query', 'path', 'method', 'get', 'post', 'put',
            'delete', 'patch', 'status', 'code', 'example', 'curl', 'python'
        }

        # URL fragments that usually point at reference material.
        self.valuable_patterns = [
            r'/api/',
            r'/reference/',
            r'/endpoints?/',
            r'/methods?/',
            r'/resources?/',
        ]

        # URL fragments for pages we want to de-prioritize.
        self.avoid_patterns = [
            r'/blog/',
            r'/news/',
            r'/about/',
            r'/contact/',
            r'/legal/',
        ]

    def score_link(self, link: Link, query: str, state: CrawlState) -> float:
        """Score a candidate link; higher means more likely API reference."""
        target = link.href.lower()
        weight = 1.0

        # One match from either list is enough (mirrors the original
        # break-on-first-hit behaviour).
        if any(re.search(pattern, target) for pattern in self.valuable_patterns):
            weight *= 2.0
        if any(re.search(pattern, target) for pattern in self.avoid_patterns):
            weight *= 0.1

        # Reward anchor/preview text that mentions API vocabulary.
        if link.text:
            preview = link.text.lower()
            hits = sum(kw in preview for kw in self.api_keywords)
            weight *= 1 + hits * 0.2

        # Shallow paths tend to be overview/reference hub pages.
        slash_depth = target.count('/') - 2  # ignore the '//' after the scheme
        if slash_depth <= 3:
            weight *= 1.5
        elif slash_depth > 6:
            weight *= 0.5

        return weight

    def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
        """Calculate specialized coverage metrics for API documentation.

        Returns the fraction of knowledge-base documents that contain
        endpoint definitions, code examples, and parameter descriptions.
        """
        doc_total = len(state.knowledge_base)
        if doc_total == 0:
            return {
                'endpoint_coverage': 0.0,
                'example_coverage': 0.0,
                'parameter_coverage': 0.0
            }

        # Pattern groups keyed by the metric they feed.
        checks = {
            'endpoint_coverage': [r'GET\s+/', r'POST\s+/', r'PUT\s+/', r'DELETE\s+/'],
            'example_coverage': [r'```\w+', r'curl\s+-', r'import\s+requests'],
            'parameter_coverage': [r'param(?:eter)?s?\s*:', r'required\s*:', r'optional\s*:'],
        }
        tallies = {name: 0 for name in checks}

        for doc in state.knowledge_base:
            # Fall back to str(doc) for documents without a markdown payload.
            text = doc.markdown.raw_markdown if hasattr(doc, 'markdown') else str(doc)
            for name, patterns in checks.items():
                if any(re.search(p, text, re.IGNORECASE) for p in patterns):
                    tallies[name] += 1

        return {name: count / doc_total for name, count in tallies.items()}
|
||||
|
||||
|
||||
class ResearchPaperStrategy:
    """
    Strategy optimized for crawling research papers and academic content.
    Prioritizes citations, abstracts, and methodology sections.
    """

    def __init__(self):
        # Vocabulary characteristic of scholarly writing.
        self.academic_keywords = {
            'abstract', 'introduction', 'methodology', 'results', 'conclusion',
            'references', 'citation', 'paper', 'study', 'research', 'analysis',
            'hypothesis', 'experiment', 'findings', 'doi'
        }

        # Regexes covering the common citation formats.
        self.citation_patterns = [
            r'\[\d+\]',          # numeric style, e.g. [1]
            r'\(\w+\s+\d{4}\)',  # author-year style, e.g. (Author 2024)
            r'doi:\s*\S+',       # DOI references
        ]

    def calculate_academic_relevance(self, content: str, query: str) -> float:
        """Score *content* for academic relevance to *query*, capped at 2.0."""
        lowered = content.lower()

        # 0.1 per academic keyword present anywhere in the text.
        relevance = 0.1 * sum(kw in lowered for kw in self.academic_keywords)

        # Up to 1.0 extra from citation density.
        citations = sum(len(re.findall(p, content)) for p in self.citation_patterns)
        relevance += min(citations * 0.05, 1.0)

        # Bonus when a query term shows up shortly after a key section heading.
        for term in query.lower().split():
            for heading in ('abstract', 'conclusion', 'results'):
                if heading in lowered:
                    start = lowered.find(heading)
                    if term in lowered[start:start + 500]:
                        relevance += 0.2

        return min(relevance, 2.0)
|
||||
|
||||
|
||||
async def demo_custom_strategies():
    """Demonstrate custom strategy usage.

    Example 1 plugs an ``APIDocumentationStrategy`` into the adaptive
    crawler's link ranking; Example 2 blends API and academic scoring in a
    weighted ``HybridStrategy``.  Both examples crawl live sites and report
    results on stdout; nothing is returned.
    """

    # Example 1: API Documentation Strategy
    print("="*60)
    print("EXAMPLE 1: Custom API Documentation Strategy")
    print("="*60)

    api_strategy = APIDocumentationStrategy()

    async with AsyncWebCrawler() as crawler:
        # Standard adaptive crawler
        config = AdaptiveConfig(
            confidence_threshold=0.8,
            max_pages=15
        )

        adaptive = AdaptiveCrawler(crawler, config)

        # Override link scoring with the custom strategy.
        # NOTE(review): _rank_links is a private hook of AdaptiveCrawler;
        # confirm it is a supported extension point before relying on it.
        def custom_rank_links(links, query, state):
            # Score every candidate link with the API-aware heuristic.
            scored_links = [
                (link, api_strategy.score_link(link, query, state))
                for link in links
            ]
            # Keep only the top-k links, best score first.
            scored_links.sort(key=lambda x: x[1], reverse=True)
            return [link for link, _ in scored_links[:config.top_k_links]]

        adaptive._rank_links = custom_rank_links

        # Crawl API documentation
        print("\nCrawling API documentation with custom strategy...")
        state = await adaptive.digest(
            start_url="https://httpbin.org",
            query="api endpoints authentication headers"
        )

        # Calculate custom metrics
        api_metrics = api_strategy.calculate_api_coverage(state, "api endpoints")

        print("\nResults:")
        print(f"Pages crawled: {len(state.crawled_urls)}")
        print(f"Confidence: {adaptive.confidence:.2%}")
        print("\nAPI-Specific Metrics:")
        print(f" - Endpoint coverage: {api_metrics['endpoint_coverage']:.2%}")
        print(f" - Example coverage: {api_metrics['example_coverage']:.2%}")
        print(f" - Parameter coverage: {api_metrics['parameter_coverage']:.2%}")

    # Example 2: Combined Strategy
    print("\n" + "="*60)
    print("EXAMPLE 2: Hybrid Strategy Combining Multiple Approaches")
    print("="*60)

    class HybridStrategy:
        """Combines multiple strategies with weights."""

        def __init__(self):
            self.api_strategy = APIDocumentationStrategy()
            self.research_strategy = ResearchPaperStrategy()
            # Weighted blend: API signals dominate for technical docs.
            self.weights = {
                'api': 0.7,
                'research': 0.3
            }

        def score_content(self, content: str, query: str) -> float:
            """Weighted combination of API and academic relevance scores."""
            api_score = self._calculate_api_score(content, query)
            research_score = self.research_strategy.calculate_academic_relevance(content, query)

            return (
                api_score * self.weights['api'] +
                research_score * self.weights['research']
            )

        def _calculate_api_score(self, content: str, query: str) -> float:
            """Simplified API scoring from keyword presence, capped at 2.0."""
            content_lower = content.lower()
            api_keywords = self.api_strategy.api_keywords

            keyword_count = sum(1 for kw in api_keywords if kw in content_lower)
            return min(keyword_count * 0.1, 2.0)

    hybrid_strategy = HybridStrategy()

    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler)

        # Crawl with hybrid scoring
        print("\nTesting hybrid strategy on technical documentation...")
        state = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await coroutines api"
        )

        # Analyze results with hybrid strategy
        print("\nHybrid Strategy Analysis:")
        relevant_docs = adaptive.get_relevant_content(top_k=5)
        total_score = 0
        for doc in relevant_docs:
            content = doc['content'] or ""
            score = hybrid_strategy.score_content(content, "async await api")
            total_score += score
            print(f" - {doc['url'][:50]}... Score: {score:.2f}")

        # BUGFIX: the original divided by a hard-coded 5 even when
        # get_relevant_content returned fewer documents; average over the
        # real count and skip the average entirely when there are none.
        if relevant_docs:
            print(f"\nAverage hybrid score: {total_score/len(relevant_docs):.2f}")
        else:
            print("\nAverage hybrid score: n/a (no relevant content)")
|
||||
|
||||
|
||||
async def demo_performance_optimization():
    """Demonstrate performance optimization with custom strategies.

    Defines a ``PerformanceOptimizedStrategy`` (per-domain page caps plus a
    moving-average domain score) and times a small bounded crawl, printing
    throughput and efficiency figures.  Network-bound; returns nothing.
    """

    print("\n" + "="*60)
    print("EXAMPLE 3: Performance-Optimized Strategy")
    print("="*60)

    class PerformanceOptimizedStrategy:
        """Strategy that balances thoroughness with speed."""

        def __init__(self):
            # URLs already visited; used for the per-domain page cap.
            self.url_cache: Set[str] = set()
            # domain -> rolling relevance score.
            self.domain_scores: Dict[str, float] = {}

        def should_crawl_domain(self, url: str) -> bool:
            """Implement domain-level filtering."""
            domain = url.split('/')[2] if url.startswith('http') else url

            # Skip if we've already crawled many pages from this domain.
            domain_count = sum(1 for cached in self.url_cache if domain in cached)
            if domain_count > 5:
                return False

            # Skip low-scoring domains.
            if domain in self.domain_scores and self.domain_scores[domain] < 0.3:
                return False

            return True

        def update_domain_score(self, url: str, relevance: float):
            """Track domain-level performance with a moving average."""
            domain = url.split('/')[2] if url.startswith('http') else url

            if domain not in self.domain_scores:
                self.domain_scores[domain] = relevance
            else:
                # Moving average: 70% history, 30% new observation.
                self.domain_scores[domain] = (
                    0.7 * self.domain_scores[domain] + 0.3 * relevance
                )

    # Instantiated for illustration; wire should_crawl_domain /
    # update_domain_score into the crawler's hooks to activate the filtering.
    perf_strategy = PerformanceOptimizedStrategy()

    async with AsyncWebCrawler() as crawler:
        config = AdaptiveConfig(
            confidence_threshold=0.7,
            max_pages=10,
            top_k_links=2  # Fewer links for speed
        )

        adaptive = AdaptiveCrawler(crawler, config)

        # Track performance
        import time
        start_time = time.time()

        state = await adaptive.digest(
            start_url="https://httpbin.org",
            query="http methods headers"
        )

        elapsed = time.time() - start_time
        pages = len(state.crawled_urls)

        print("\nPerformance Results:")
        print(f" - Time elapsed: {elapsed:.2f} seconds")
        print(f" - Pages crawled: {pages}")
        # BUGFIX: guard the ratios — the original raised ZeroDivisionError
        # when the crawl returned no pages (or finished in ~0 time).
        if pages and elapsed > 0:
            print(f" - Pages per second: {pages/elapsed:.2f}")
        print(f" - Final confidence: {adaptive.confidence:.2%}")
        if pages:
            print(f" - Efficiency: {adaptive.confidence/pages:.2%} confidence per page")
|
||||
|
||||
|
||||
async def main():
    """Entry point: run every custom-strategy demonstration in sequence."""
    try:
        for demo in (demo_custom_strategies, demo_performance_optimization):
            await demo()

        banner = "="*60
        print("\n" + banner)
        print("All custom strategy examples completed!")
        print(banner)
    except Exception as e:
        # Surface the failure but keep the full traceback for debugging.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
206
docs/examples/adaptive_crawling/embedding_configuration.py
Normal file
206
docs/examples/adaptive_crawling/embedding_configuration.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
Advanced Embedding Configuration Example
|
||||
|
||||
This example demonstrates all configuration options available for the
|
||||
embedding strategy, including fine-tuning parameters for different use cases.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
|
||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
    """Run one crawl with *config* against *url*/*query* and print a summary.

    Returns the crawl state so callers can inspect it further.
    """
    print(f"\n{'='*60}")
    print(f"Configuration: {name}")
    print(f"{'='*60}")

    async with AsyncWebCrawler(verbose=False) as crawler:
        session = AdaptiveCrawler(crawler, config)
        outcome = await session.digest(start_url=url, query=query)

        print(f"Pages crawled: {len(outcome.crawled_urls)}")
        print(f"Final confidence: {session.confidence:.1%}")
        print(f"Stopped reason: {outcome.metrics.get('stopped_reason', 'max_pages')}")

        # Flag queries the strategy judged to be off-topic.
        if outcome.metrics.get('is_irrelevant', False):
            print("⚠️ Query detected as irrelevant!")

        return outcome
|
||||
|
||||
|
||||
async def main():
    """Demonstrate various embedding configurations.

    Walks through six AdaptiveConfig presets (default, strict coverage,
    fast exploration, irrelevance detection, quality-focused, and an
    optional OpenAI-backed one), then prints a parameter tuning guide.
    Network-bound: every test crawls live documentation pages.
    """

    print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
    print("=" * 60)

    # Base URL and query for testing
    test_url = "https://docs.python.org/3/library/asyncio.html"

    # 1. Default Configuration
    config_default = AdaptiveConfig(
        strategy="embedding",
        max_pages=10
    )

    await test_configuration(
        "Default Settings",
        config_default,
        test_url,
        "async programming patterns"
    )

    # 2. Strict Coverage Requirements
    config_strict = AdaptiveConfig(
        strategy="embedding",
        max_pages=20,

        # Stricter similarity requirements
        embedding_k_exp=5.0,  # Default is 3.0, higher = stricter
        embedding_coverage_radius=0.15,  # Default is 0.2, lower = stricter

        # Higher validation threshold
        embedding_validation_min_score=0.6,  # Default is 0.3

        # More query variations for better coverage
        n_query_variations=15  # Default is 10
    )

    await test_configuration(
        "Strict Coverage (Research/Academic)",
        config_strict,
        test_url,
        "comprehensive guide async await"
    )

    # 3. Fast Exploration
    config_fast = AdaptiveConfig(
        strategy="embedding",
        max_pages=10,
        top_k_links=5,  # Follow more links per page

        # Relaxed requirements for faster convergence
        embedding_k_exp=1.0,  # Lower = more lenient
        embedding_min_relative_improvement=0.05,  # Stop earlier

        # Lower quality thresholds
        embedding_quality_min_confidence=0.5,  # Display lower confidence
        embedding_quality_max_confidence=0.85,

        # Fewer query variations for speed
        n_query_variations=5
    )

    await test_configuration(
        "Fast Exploration (Quick Overview)",
        config_fast,
        test_url,
        "async basics"
    )

    # 4. Irrelevance Detection Focus
    config_irrelevance = AdaptiveConfig(
        strategy="embedding",
        max_pages=5,

        # Aggressive irrelevance detection
        embedding_min_confidence_threshold=0.2,  # Higher threshold (default 0.1)
        embedding_k_exp=5.0,  # Strict similarity

        # Quick stopping for irrelevant content
        embedding_min_relative_improvement=0.15
    )

    await test_configuration(
        "Irrelevance Detection",
        config_irrelevance,
        test_url,
        "recipe for chocolate cake"  # Irrelevant query
    )

    # 5. High-Quality Knowledge Base
    config_quality = AdaptiveConfig(
        strategy="embedding",
        max_pages=30,

        # Deduplication settings
        embedding_overlap_threshold=0.75,  # More aggressive deduplication

        # Quality focus
        embedding_validation_min_score=0.5,
        embedding_quality_scale_factor=1.0,  # Linear quality mapping

        # Balanced parameters
        embedding_k_exp=3.0,
        embedding_nearest_weight=0.8,  # Focus on best matches
        embedding_top_k_weight=0.2
    )

    await test_configuration(
        "High-Quality Knowledge Base",
        config_quality,
        test_url,
        "asyncio advanced patterns best practices"
    )

    # 6. Custom Embedding Provider — only runs when an API key is present.
    if os.getenv('OPENAI_API_KEY'):
        config_openai = AdaptiveConfig(
            strategy="embedding",
            max_pages=10,

            # Use OpenAI embeddings
            embedding_llm_config={
                'provider': 'openai/text-embedding-3-small',
                'api_token': os.getenv('OPENAI_API_KEY')
            },

            # OpenAI embeddings are high quality, can be stricter
            embedding_k_exp=4.0,
            n_query_variations=12
        )

        await test_configuration(
            "OpenAI Embeddings",
            config_openai,
            test_url,
            "event-driven architecture patterns"
        )

    # Parameter Guide — static reference text, no crawling below this point.
    print("\n" + "="*60)
    print("PARAMETER TUNING GUIDE")
    print("="*60)

    print("\n📊 Key Parameters and Their Effects:")
    print("\n1. embedding_k_exp (default: 3.0)")
    print(" - Lower (1-2): More lenient, faster convergence")
    print(" - Higher (4-5): Stricter, better precision")

    print("\n2. embedding_coverage_radius (default: 0.2)")
    print(" - Lower (0.1-0.15): Requires closer matches")
    print(" - Higher (0.25-0.3): Accepts broader matches")

    print("\n3. n_query_variations (default: 10)")
    print(" - Lower (5-7): Faster, less comprehensive")
    print(" - Higher (15-20): Better coverage, slower")

    print("\n4. embedding_min_confidence_threshold (default: 0.1)")
    print(" - Set to 0.15-0.2 for aggressive irrelevance detection")
    print(" - Set to 0.05 to crawl even barely relevant content")

    print("\n5. embedding_validation_min_score (default: 0.3)")
    print(" - Higher (0.5-0.6): Requires strong validation")
    print(" - Lower (0.2): More permissive stopping")

    print("\n💡 Tips:")
    print("- For research: High k_exp, more variations, strict validation")
    print("- For exploration: Low k_exp, fewer variations, relaxed thresholds")
    print("- For quality: Focus on overlap_threshold and validation scores")
    print("- For speed: Reduce variations, increase min_relative_improvement")


# Script entry point: run all configuration demos under one event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
109
docs/examples/adaptive_crawling/embedding_strategy.py
Normal file
109
docs/examples/adaptive_crawling/embedding_strategy.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
Embedding Strategy Example for Adaptive Crawling
|
||||
|
||||
This example demonstrates how to use the embedding-based strategy
|
||||
for semantic understanding and intelligent crawling.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
|
||||
async def main():
    """Demonstrate embedding strategy for adaptive crawling.

    Runs three tests against live sites: (1) semantic query expansion on a
    relevant query, (2) detection of an irrelevant query, and (3) semantic
    gap analysis.  Prints results; returns nothing.
    """

    # Configure embedding strategy
    config = AdaptiveConfig(
        strategy="embedding",  # Use embedding strategy
        embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # Default model
        n_query_variations=10,  # Generate 10 semantic variations
        max_pages=15,
        top_k_links=3,
        min_gain_threshold=0.05,

        # Embedding-specific parameters
        embedding_k_exp=3.0,  # Higher = stricter similarity requirements
        embedding_min_confidence_threshold=0.1,  # Stop if <10% relevant
        embedding_validation_min_score=0.4  # Validation threshold
    )

    # Optional: Use OpenAI embeddings instead of the local model.
    if os.getenv('OPENAI_API_KEY'):
        config.embedding_llm_config = {
            'provider': 'openai/text-embedding-3-small',
            'api_token': os.getenv('OPENAI_API_KEY')
        }
        print("Using OpenAI embeddings")
    else:
        print("Using sentence-transformers (local embeddings)")

    async with AsyncWebCrawler(verbose=True) as crawler:
        adaptive = AdaptiveCrawler(crawler, config)

        # Test 1: Relevant query with semantic understanding
        print("\n" + "="*50)
        print("TEST 1: Semantic Query Understanding")
        print("="*50)

        result = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="concurrent programming event-driven architecture"
        )

        # Show the first few machine-generated query variations.
        print("\nQuery Expansion:")
        print(f"Original query expanded to {len(result.expanded_queries)} variations")
        for i, q in enumerate(result.expanded_queries[:3], 1):
            print(f" {i}. {q}")
        print(" ...")

        print("\nResults:")
        adaptive.print_stats(detailed=False)

        # Test 2: Detecting irrelevant queries
        print("\n" + "="*50)
        print("TEST 2: Irrelevant Query Detection")
        print("="*50)

        # Reset crawler for new query (fresh state, same underlying browser).
        adaptive = AdaptiveCrawler(crawler, config)

        result = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="how to bake chocolate chip cookies"
        )

        if result.metrics.get('is_irrelevant', False):
            print("\n✅ Successfully detected irrelevant query!")
            print(f"Stopped after just {len(result.crawled_urls)} pages")
            print(f"Reason: {result.metrics.get('stopped_reason', 'unknown')}")
        else:
            print("\n❌ Failed to detect irrelevance")

        print(f"Final confidence: {adaptive.confidence:.1%}")

        # Test 3: Semantic gap analysis
        print("\n" + "="*50)
        print("TEST 3: Semantic Gap Analysis")
        print("="*50)

        # Show how embedding strategy identifies gaps
        adaptive = AdaptiveCrawler(crawler, config)

        result = await adaptive.digest(
            start_url="https://realpython.com",
            query="python decorators advanced patterns"
        )

        print(f"\nSemantic gaps identified: {len(result.semantic_gaps)}")
        print(f"Knowledge base embeddings shape: {result.kb_embeddings.shape if result.kb_embeddings is not None else 'None'}")

        # Show coverage metrics specific to embedding strategy
        print("\nEmbedding-specific metrics:")
        print(f" Average best similarity: {result.metrics.get('avg_best_similarity', 0):.3f}")
        print(f" Coverage score: {result.metrics.get('coverage_score', 0):.3f}")
        print(f" Validation confidence: {result.metrics.get('validation_confidence', 0):.2%}")


# Script entry point: run the embedding-strategy demos.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
167
docs/examples/adaptive_crawling/embedding_vs_statistical.py
Normal file
167
docs/examples/adaptive_crawling/embedding_vs_statistical.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
Comparison: Embedding vs Statistical Strategy
|
||||
|
||||
This example demonstrates the differences between statistical and embedding
|
||||
strategies for adaptive crawling, showing when to use each approach.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
|
||||
async def crawl_with_strategy(url: str, query: str, strategy: str, **kwargs):
    """Crawl *url* for *query* with the named strategy and return run stats.

    Extra keyword arguments are forwarded verbatim to AdaptiveConfig.
    The returned dict carries the raw result, the crawler instance, the
    elapsed wall-clock time, the page count, and the final confidence.
    """
    run_config = AdaptiveConfig(
        strategy=strategy,
        max_pages=20,
        top_k_links=3,
        min_gain_threshold=0.05,
        **kwargs
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        session = AdaptiveCrawler(crawler, run_config)

        began = time.time()
        outcome = await session.digest(start_url=url, query=query)
        duration = time.time() - began

        stats = {
            'result': outcome,
            'crawler': session,
            'elapsed': duration,
            'pages': len(outcome.crawled_urls),
            'confidence': session.confidence
        }
        return stats
|
||||
|
||||
|
||||
async def main():
    """Compare embedding and statistical strategies.

    Runs both strategies on three query scenarios, prints per-run stats,
    a head-to-head comparison, a heuristic recommendation, and finally a
    static strategy-selection guide.  Network-bound; returns nothing.
    """

    # Test scenarios
    test_cases = [
        {
            'name': 'Technical Documentation (Specific Terms)',
            'url': 'https://docs.python.org/3/library/asyncio.html',
            'query': 'asyncio.create_task event_loop.run_until_complete'
        },
        {
            'name': 'Conceptual Query (Semantic Understanding)',
            'url': 'https://docs.python.org/3/library/asyncio.html',
            'query': 'concurrent programming patterns'
        },
        {
            'name': 'Ambiguous Query',
            'url': 'https://realpython.com',
            'query': 'python performance optimization'
        }
    ]

    # Configure embedding strategy (OpenAI embeddings only if a key is set).
    embedding_config = {}
    if os.getenv('OPENAI_API_KEY'):
        embedding_config['embedding_llm_config'] = {
            'provider': 'openai/text-embedding-3-small',
            'api_token': os.getenv('OPENAI_API_KEY')
        }

    for test in test_cases:
        print("\n" + "="*70)
        print(f"TEST: {test['name']}")
        print(f"URL: {test['url']}")
        print(f"Query: '{test['query']}'")
        print("="*70)

        # Run statistical strategy
        print("\n📊 Statistical Strategy:")
        stat_result = await crawl_with_strategy(
            test['url'],
            test['query'],
            'statistical'
        )

        print(f" Pages crawled: {stat_result['pages']}")
        print(f" Time taken: {stat_result['elapsed']:.2f}s")
        print(f" Confidence: {stat_result['confidence']:.1%}")
        print(f" Sufficient: {'Yes' if stat_result['crawler'].is_sufficient else 'No'}")

        # Show term coverage (only the statistical result exposes term_frequencies)
        if hasattr(stat_result['result'], 'term_frequencies'):
            query_terms = test['query'].lower().split()
            covered = sum(1 for term in query_terms
                          if term in stat_result['result'].term_frequencies)
            print(f" Term coverage: {covered}/{len(query_terms)} query terms found")

        # Run embedding strategy
        print("\n🧠 Embedding Strategy:")
        emb_result = await crawl_with_strategy(
            test['url'],
            test['query'],
            'embedding',
            **embedding_config
        )

        print(f" Pages crawled: {emb_result['pages']}")
        print(f" Time taken: {emb_result['elapsed']:.2f}s")
        print(f" Confidence: {emb_result['confidence']:.1%}")
        print(f" Sufficient: {'Yes' if emb_result['crawler'].is_sufficient else 'No'}")

        # Show semantic understanding
        if emb_result['result'].expanded_queries:
            print(f" Query variations: {len(emb_result['result'].expanded_queries)}")
            print(f" Semantic gaps: {len(emb_result['result'].semantic_gaps)}")

        # Compare results (guard against a zero-page statistical run).
        print("\n📈 Comparison:")
        efficiency_diff = ((stat_result['pages'] - emb_result['pages']) /
                           stat_result['pages'] * 100) if stat_result['pages'] > 0 else 0

        print(f" Efficiency: ", end="")
        if efficiency_diff > 0:
            print(f"Embedding used {efficiency_diff:.0f}% fewer pages")
        else:
            print(f"Statistical used {-efficiency_diff:.0f}% fewer pages")

        print(f" Speed: ", end="")
        if stat_result['elapsed'] < emb_result['elapsed']:
            print(f"Statistical was {emb_result['elapsed']/stat_result['elapsed']:.1f}x faster")
        else:
            print(f"Embedding was {stat_result['elapsed']/emb_result['elapsed']:.1f}x faster")

        print(f" Confidence difference: {abs(stat_result['confidence'] - emb_result['confidence'])*100:.0f} percentage points")

        # Recommendation — heuristic only, based on the test's name and
        # the relative confidence/speed of the two runs.
        print("\n💡 Recommendation:")
        if 'specific' in test['name'].lower() or all(len(term) > 5 for term in test['query'].split()):
            print(" → Statistical strategy is likely better for this use case (specific terms)")
        elif 'conceptual' in test['name'].lower() or 'semantic' in test['name'].lower():
            print(" → Embedding strategy is likely better for this use case (semantic understanding)")
        else:
            if emb_result['confidence'] > stat_result['confidence'] + 0.1:
                print(" → Embedding strategy achieved significantly better understanding")
            elif stat_result['elapsed'] < emb_result['elapsed'] / 2:
                print(" → Statistical strategy is much faster with similar results")
            else:
                print(" → Both strategies performed similarly; choose based on your priorities")

    # Summary recommendations
    print("\n" + "="*70)
    print("STRATEGY SELECTION GUIDE")
    print("="*70)
    print("\n✅ Use STATISTICAL strategy when:")
    print(" - Queries contain specific technical terms")
    print(" - Speed is critical")
    print(" - No API access available")
    print(" - Working with well-structured documentation")

    print("\n✅ Use EMBEDDING strategy when:")
    print(" - Queries are conceptual or ambiguous")
    print(" - Semantic understanding is important")
    print(" - Need to detect irrelevant content")
    print(" - Working with diverse content sources")


# Script entry point: run the full strategy comparison.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
232
docs/examples/adaptive_crawling/export_import_kb.py
Normal file
232
docs/examples/adaptive_crawling/export_import_kb.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
Knowledge Base Export and Import
|
||||
|
||||
This example demonstrates how to export crawled knowledge bases and
|
||||
import them for reuse, sharing, or analysis.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
|
||||
async def build_knowledge_base():
    """Build a knowledge base about web technologies.

    Crawls two HTTP-related queries into one AdaptiveCrawler, exports the
    accumulated knowledge base as JSONL, and returns the export path.
    """
    print("="*60)
    print("PHASE 1: Building Knowledge Base")
    print("="*60)

    async with AsyncWebCrawler(verbose=False) as crawler:
        session = AdaptiveCrawler(crawler)

        # First pass: HTTP protocol fundamentals.
        print("\n1. Gathering HTTP protocol information...")
        await session.digest(
            start_url="https://httpbin.org",
            query="http methods headers status codes"
        )
        print(f" - Pages crawled: {len(session.state.crawled_urls)}")
        print(f" - Confidence: {session.confidence:.2%}")

        # Second pass: layer API knowledge on top of the same state.
        print("\n2. Adding API documentation knowledge...")
        await session.digest(
            start_url="https://httpbin.org/anything",
            query="rest api json response request"
        )
        print(f" - Total pages: {len(session.state.crawled_urls)}")
        print(f" - Confidence: {session.confidence:.2%}")

        # Persist the combined knowledge base as JSONL.
        export_path = "web_tech_knowledge.jsonl"
        print(f"\n3. Exporting knowledge base to {export_path}")
        session.export_knowledge_base(export_path)

        # Report export statistics (size in KB, one document per line).
        export_size = Path(export_path).stat().st_size / 1024
        with open(export_path, 'r') as f:
            line_count = sum(1 for _ in f)

        print(f" - Exported {line_count} documents")
        print(f" - File size: {export_size:.1f} KB")

        return export_path
|
||||
|
||||
|
||||
async def analyze_knowledge_base(kb_path):
    """Analyze an exported JSONL knowledge base and print summary stats.

    Args:
        kb_path: Path to a JSONL file with one document object per line.

    Returns:
        dict with 'documents' (count), 'total_content_length' (chars) and
        'urls_by_domain' (domain -> page count).  An empty file yields
        zeroed stats instead of crashing.
    """
    print("\n" + "="*60)
    print("PHASE 2: Analyzing Exported Knowledge Base")
    print("="*60)

    # Read and analyze JSONL; tolerate blank lines in the export.
    documents = []
    with open(kb_path, 'r') as f:
        for line in f:
            if line.strip():
                documents.append(json.loads(line))

    print(f"\nKnowledge base contains {len(documents)} documents:")

    # Analyze document properties
    total_content_length = 0
    urls_by_domain = {}

    # BUGFIX: enumerate() replaces documents.index(doc), which was O(n)
    # per iteration and re-triggered the sample printout whenever the
    # first document appeared more than once.
    for i, doc in enumerate(documents):
        # Content analysis ('content' may be missing or explicitly None).
        content_length = len(doc.get('content') or '')
        total_content_length += content_length

        # URL analysis
        url = doc.get('url', '')
        domain = url.split('/')[2] if url.startswith('http') else 'unknown'
        urls_by_domain[domain] = urls_by_domain.get(domain, 0) + 1

        # Show sample document (first one only)
        if i == 0:
            print(f"\nSample document structure:")
            print(f" - URL: {url}")
            print(f" - Content length: {content_length} chars")
            print(f" - Has metadata: {'metadata' in doc}")
            print(f" - Has links: {len(doc.get('links', []))} links")
            print(f" - Query: {doc.get('query', 'N/A')}")

    print(f"\nContent statistics:")
    print(f" - Total content: {total_content_length:,} characters")
    # BUGFIX: guard the average — the original raised ZeroDivisionError
    # on an empty knowledge base.
    if documents:
        print(f" - Average per document: {total_content_length/len(documents):,.0f} chars")

    print(f"\nDomain distribution:")
    for domain, count in urls_by_domain.items():
        print(f" - {domain}: {count} pages")

    # Return the computed stats so callers can assert on / reuse them
    # (the original returned None; ignoring the return stays valid).
    return {
        'documents': len(documents),
        'total_content_length': total_content_length,
        'urls_by_domain': urls_by_domain,
    }
|
||||
|
||||
|
||||
async def import_and_continue():
    """Import a knowledge base and continue crawling"""
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 3: Importing and Extending Knowledge Base")
    print(banner)

    kb_path = "web_tech_knowledge.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Fresh adaptive crawler, seeded below from the earlier export.
        kb_crawler = AdaptiveCrawler(crawler)

        # Step 1: load the previously exported knowledge base.
        print(f"\n1. Importing knowledge base from {kb_path}")
        kb_crawler.import_knowledge_base(kb_path)

        print(f" - Imported {len(kb_crawler.state.knowledge_base)} documents")
        print(f" - Existing URLs: {len(kb_crawler.state.crawled_urls)}")

        # Step 2: confirm the imported state looks sane.
        print("\n2. Checking imported knowledge state:")
        kb_crawler.print_stats(detailed=False)

        # Step 3: crawl further with a brand-new query on top of the
        # imported state.
        print("\n3. Extending knowledge with new query...")
        await kb_crawler.digest(
            start_url="https://httpbin.org/status/200",
            query="error handling retry timeout"
        )

        # Step 4: show how the stats changed after the extra crawl.
        print("\n4. Final knowledge base state:")
        kb_crawler.print_stats(detailed=False)

        # Step 5: persist the extended knowledge base alongside the original.
        output_path = "web_tech_knowledge_extended.jsonl"
        kb_crawler.export_knowledge_base(output_path)
        print(f"\n5. Extended knowledge base exported to {output_path}")
|
||||
|
||||
|
||||
async def share_knowledge_bases():
    """Demonstrate sharing knowledge bases between projects"""
    banner = "=" * 60
    print("\n" + banner)
    print("PHASE 4: Sharing Knowledge Between Projects")
    print(banner)

    # Two simulated projects, each with its own export file.
    project_a_kb = "project_a_knowledge.jsonl"
    project_b_kb = "project_b_knowledge.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Project A builds a security-focused knowledge base.
        print("\n1. Project A: Building security knowledge...")
        security_kb = AdaptiveCrawler(crawler)
        await security_kb.digest(
            start_url="https://httpbin.org/basic-auth/user/pass",
            query="authentication security headers"
        )
        security_kb.export_knowledge_base(project_a_kb)
        print(f" - Exported {len(security_kb.state.knowledge_base)} documents")

        # Project B builds an API-testing knowledge base.
        print("\n2. Project B: Building testing knowledge...")
        testing_kb = AdaptiveCrawler(crawler)
        await testing_kb.digest(
            start_url="https://httpbin.org/anything",
            query="testing endpoints mocking"
        )
        testing_kb.export_knowledge_base(project_b_kb)
        print(f" - Exported {len(testing_kb.state.knowledge_base)} documents")

        # A third crawler imports both exports, deduplicating as it merges.
        print("\n3. Merging knowledge bases...")
        combined = AdaptiveCrawler(crawler)

        combined.import_knowledge_base(project_a_kb)
        docs_from_a = len(combined.state.knowledge_base)

        combined.import_knowledge_base(project_b_kb)
        total_docs = len(combined.state.knowledge_base)

        print(f" - Project A documents: {docs_from_a}")
        print(f" - Additional from Project B: {total_docs - docs_from_a}")
        print(f" - Total merged documents: {total_docs}")

        # Persist the merged result so other projects can import it too.
        merged_kb = "merged_knowledge.jsonl"
        combined.export_knowledge_base(merged_kb)
        print(f"\n4. Merged knowledge base exported to {merged_kb}")

        # Show what the combined state covers.
        print("\n5. Combined knowledge coverage:")
        combined.print_stats(detailed=False)
|
||||
|
||||
|
||||
async def main():
    """Run all examples"""
    # Every artifact the demo phases write to disk; removed in the
    # finally block so reruns start clean even after a failure.
    generated_files = (
        "web_tech_knowledge.jsonl",
        "web_tech_knowledge_extended.jsonl",
        "project_a_knowledge.jsonl",
        "project_b_knowledge.jsonl",
        "merged_knowledge.jsonl",
    )

    try:
        # Phase 1: build the initial knowledge base and export it.
        kb_path = await build_knowledge_base()

        # Phase 2: inspect the exported JSONL.
        await analyze_knowledge_base(kb_path)

        # Phase 3: re-import the export and keep crawling.
        await import_and_continue()

        # Phase 4: merge exports from two simulated projects.
        await share_knowledge_bases()

        print("\n" + "="*60)
        print("All examples completed successfully!")
        print("="*60)

    finally:
        # Clean up generated files regardless of success or failure.
        print("\nCleaning up generated files...")
        for artifact in generated_files:
            Path(artifact).unlink(missing_ok=True)
        print("Cleanup complete.")
|
||||
|
||||
|
||||
# Script entry point: run all demo phases under a single event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -18,7 +18,7 @@ Usage:
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
|
||||
|
||||
@@ -1,43 +1,55 @@
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
url = "https://openai.com/api/pricing/"
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from typing import Dict
|
||||
import os
|
||||
|
||||
|
||||
class OpenAIModelFee(BaseModel):
    """Schema for one OpenAI model's pricing entry as extracted by the LLM.

    NOTE: the source region contained an unmarked merge of two versions of
    this class, leaving a dangling duplicate `output_fee` Field; this is the
    reconstructed, coherent definition with exactly one field per attribute.
    """

    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
|
||||
|
||||
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Crawl the OpenAI pricing page and extract model fees via an LLM.

    The source region interleaved an old inline-kwargs version with the new
    config-object version of this example (unmarked diff); this is the
    reconstructed post-diff version using BrowserConfig / CrawlerRunConfig.

    Args:
        provider: LiteLLM-style provider string, e.g. "openai/gpt-4o".
        api_token: API key for the provider; required for everything except
            local "ollama" models.
        extra_headers: Optional extra HTTP headers forwarded to the LLM call.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Ollama runs locally and needs no token; every other provider does.
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    browser_config = BrowserConfig(headless=True)

    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
            Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            config=crawler_config
        )
        print("Success:", result.success)
        # extracted_content is a JSON string of OpenAIModelFee records.
        model_fees = json.loads(result.extracted_content)
        print(len(model_fees))

        with open(".data/data.json", "w", encoding="utf-8") as f:
            f.write(result.extracted_content)
        print(result.extracted_content)


if __name__ == "__main__":
    asyncio.run(
        extract_structured_data_using_llm(
            provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
        )
    )
|
||||
|
||||
432
docs/md_v2/advanced/adaptive-strategies.md
Normal file
432
docs/md_v2/advanced/adaptive-strategies.md
Normal file
@@ -0,0 +1,432 @@
|
||||
# Advanced Adaptive Strategies
|
||||
|
||||
## Overview
|
||||
|
||||
While the default adaptive crawling configuration works well for most use cases, understanding the underlying strategies and scoring mechanisms allows you to fine-tune the crawler for specific domains and requirements.
|
||||
|
||||
## The Three-Layer Scoring System
|
||||
|
||||
### 1. Coverage Score
|
||||
|
||||
Coverage measures how comprehensively your knowledge base covers the query terms and related concepts.
|
||||
|
||||
#### Mathematical Foundation
|
||||
|
||||
```python
|
||||
Coverage(K, Q) = Σ(t ∈ Q) score(t, K) / |Q|
|
||||
|
||||
where score(t, K) = doc_coverage(t) × (1 + freq_boost(t))
|
||||
```
|
||||
|
||||
#### Components
|
||||
|
||||
- **Document Coverage**: Percentage of documents containing the term
|
||||
- **Frequency Boost**: Logarithmic bonus for term frequency
|
||||
- **Query Decomposition**: Handles multi-word queries intelligently
|
||||
|
||||
#### Tuning Coverage
|
||||
|
||||
```python
|
||||
# For technical documentation with specific terminology
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.85, # Require high coverage
|
||||
top_k_links=5 # Cast wider net
|
||||
)
|
||||
|
||||
# For general topics with synonyms
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.6, # Lower threshold
|
||||
top_k_links=2 # More focused
|
||||
)
|
||||
```
|
||||
|
||||
### 2. Consistency Score
|
||||
|
||||
Consistency evaluates whether the information across pages is coherent and non-contradictory.
|
||||
|
||||
#### How It Works
|
||||
|
||||
1. Extracts key statements from each document
|
||||
2. Compares statements across documents
|
||||
3. Measures agreement vs. contradiction
|
||||
4. Returns normalized score (0-1)
|
||||
|
||||
#### Practical Impact
|
||||
|
||||
- **High consistency (>0.8)**: Information is reliable and coherent
|
||||
- **Medium consistency (0.5-0.8)**: Some variation, but generally aligned
|
||||
- **Low consistency (<0.5)**: Conflicting information, need more sources
|
||||
|
||||
### 3. Saturation Score
|
||||
|
||||
Saturation detects when new pages stop providing novel information.
|
||||
|
||||
#### Detection Algorithm
|
||||
|
||||
```python
|
||||
# Tracks new unique terms per page
|
||||
new_terms_page_1 = 50
|
||||
new_terms_page_2 = 30 # 60% of first
|
||||
new_terms_page_3 = 15 # 50% of second
|
||||
new_terms_page_4 = 5 # 33% of third
|
||||
# Saturation detected: rapidly diminishing returns
|
||||
```
|
||||
|
||||
#### Configuration
|
||||
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
min_gain_threshold=0.1 # Stop if <10% new information
|
||||
)
|
||||
```
|
||||
|
||||
## Link Ranking Algorithm
|
||||
|
||||
### Expected Information Gain
|
||||
|
||||
Each uncrawled link is scored based on:
|
||||
|
||||
```python
|
||||
ExpectedGain(link) = Relevance × Novelty × Authority
|
||||
```
|
||||
|
||||
#### 1. Relevance Scoring
|
||||
|
||||
Uses BM25 algorithm on link preview text:
|
||||
|
||||
```python
|
||||
relevance = BM25(link.preview_text, query)
|
||||
```
|
||||
|
||||
Factors:
|
||||
- Term frequency in preview
|
||||
- Inverse document frequency
|
||||
- Preview length normalization
|
||||
|
||||
#### 2. Novelty Estimation
|
||||
|
||||
Measures how different the link appears from already-crawled content:
|
||||
|
||||
```python
|
||||
novelty = 1 - max_similarity(preview, knowledge_base)
|
||||
```
|
||||
|
||||
Prevents crawling duplicate or highly similar pages.
|
||||
|
||||
#### 3. Authority Calculation
|
||||
|
||||
URL structure and domain analysis:
|
||||
|
||||
```python
|
||||
authority = f(domain_rank, url_depth, url_structure)
|
||||
```
|
||||
|
||||
Factors:
|
||||
- Domain reputation
|
||||
- URL depth (fewer slashes = higher authority)
|
||||
- Clean URL structure
|
||||
|
||||
### Custom Link Scoring
|
||||
|
||||
```python
|
||||
class CustomLinkScorer:
|
||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
# Prioritize specific URL patterns
|
||||
if "/api/reference/" in link.href:
|
||||
return 2.0 # Double the score
|
||||
|
||||
# Deprioritize certain sections
|
||||
if "/archive/" in link.href:
|
||||
return 0.1 # Reduce score by 90%
|
||||
|
||||
# Default scoring
|
||||
return 1.0
|
||||
|
||||
# Use with adaptive crawler
|
||||
adaptive = AdaptiveCrawler(
|
||||
crawler,
|
||||
config=config,
|
||||
link_scorer=CustomLinkScorer()
|
||||
)
|
||||
```
|
||||
|
||||
## Domain-Specific Configurations
|
||||
|
||||
### Technical Documentation
|
||||
|
||||
```python
|
||||
tech_doc_config = AdaptiveConfig(
|
||||
confidence_threshold=0.85,
|
||||
max_pages=30,
|
||||
top_k_links=3,
|
||||
min_gain_threshold=0.05 # Keep crawling for small gains
|
||||
)
|
||||
```
|
||||
|
||||
Rationale:
|
||||
- High threshold ensures comprehensive coverage
|
||||
- Lower gain threshold captures edge cases
|
||||
- Moderate link following for depth
|
||||
|
||||
### News & Articles
|
||||
|
||||
```python
|
||||
news_config = AdaptiveConfig(
|
||||
confidence_threshold=0.6,
|
||||
max_pages=10,
|
||||
top_k_links=5,
|
||||
min_gain_threshold=0.15 # Stop quickly on repetition
|
||||
)
|
||||
```
|
||||
|
||||
Rationale:
|
||||
- Lower threshold (articles often repeat information)
|
||||
- Higher gain threshold (avoid duplicate stories)
|
||||
- More links per page (explore different perspectives)
|
||||
|
||||
### E-commerce
|
||||
|
||||
```python
|
||||
ecommerce_config = AdaptiveConfig(
|
||||
confidence_threshold=0.7,
|
||||
max_pages=20,
|
||||
top_k_links=2,
|
||||
min_gain_threshold=0.1
|
||||
)
|
||||
```
|
||||
|
||||
Rationale:
|
||||
- Balanced threshold for product variations
|
||||
- Focused link following (avoid infinite products)
|
||||
- Standard gain threshold
|
||||
|
||||
### Research & Academic
|
||||
|
||||
```python
|
||||
research_config = AdaptiveConfig(
|
||||
confidence_threshold=0.9,
|
||||
max_pages=50,
|
||||
top_k_links=4,
|
||||
min_gain_threshold=0.02 # Very low - capture citations
|
||||
)
|
||||
```
|
||||
|
||||
Rationale:
|
||||
- Very high threshold for completeness
|
||||
- Many pages allowed for thorough research
|
||||
- Very low gain threshold to capture references
|
||||
|
||||
## Performance Optimization
|
||||
|
||||
### Memory Management
|
||||
|
||||
```python
|
||||
# For large crawls, use streaming
|
||||
config = AdaptiveConfig(
|
||||
max_pages=100,
|
||||
save_state=True,
|
||||
state_path="large_crawl.json"
|
||||
)
|
||||
|
||||
# Periodically clean state
|
||||
if len(state.knowledge_base) > 1000:
|
||||
# Keep only most relevant
|
||||
state.knowledge_base = get_top_relevant(state.knowledge_base, 500)
|
||||
```
|
||||
|
||||
### Parallel Processing
|
||||
|
||||
```python
|
||||
# Use multiple start points
|
||||
start_urls = [
|
||||
"https://docs.example.com/intro",
|
||||
"https://docs.example.com/api",
|
||||
"https://docs.example.com/guides"
|
||||
]
|
||||
|
||||
# Crawl in parallel
|
||||
tasks = [
|
||||
adaptive.digest(url, query)
|
||||
for url in start_urls
|
||||
]
|
||||
results = await asyncio.gather(*tasks)
|
||||
```
|
||||
|
||||
### Caching Strategy
|
||||
|
||||
```python
|
||||
# Enable caching for repeated crawls
|
||||
async with AsyncWebCrawler(
|
||||
config=BrowserConfig(
|
||||
cache_mode=CacheMode.ENABLED
|
||||
)
|
||||
) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
```
|
||||
|
||||
## Debugging & Analysis
|
||||
|
||||
### Enable Verbose Logging
|
||||
|
||||
```python
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
adaptive = AdaptiveCrawler(crawler, config, verbose=True)
|
||||
```
|
||||
|
||||
### Analyze Crawl Patterns
|
||||
|
||||
```python
|
||||
# After crawling
|
||||
state = await adaptive.digest(start_url, query)
|
||||
|
||||
# Analyze link selection
|
||||
print("Link selection order:")
|
||||
for i, url in enumerate(state.crawl_order):
|
||||
print(f"{i+1}. {url}")
|
||||
|
||||
# Analyze term discovery
|
||||
print("\nTerm discovery rate:")
|
||||
for i, new_terms in enumerate(state.new_terms_history):
|
||||
print(f"Page {i+1}: {new_terms} new terms")
|
||||
|
||||
# Analyze score progression
|
||||
print("\nScore progression:")
|
||||
print(f"Coverage: {state.metrics['coverage_history']}")
|
||||
print(f"Saturation: {state.metrics['saturation_history']}")
|
||||
```
|
||||
|
||||
### Export for Analysis
|
||||
|
||||
```python
|
||||
# Export detailed metrics
|
||||
import json
|
||||
|
||||
metrics = {
|
||||
"query": query,
|
||||
"total_pages": len(state.crawled_urls),
|
||||
"confidence": adaptive.confidence,
|
||||
"coverage_stats": adaptive.coverage_stats,
|
||||
"crawl_order": state.crawl_order,
|
||||
"term_frequencies": dict(state.term_frequencies),
|
||||
"new_terms_history": state.new_terms_history
|
||||
}
|
||||
|
||||
with open("crawl_analysis.json", "w") as f:
|
||||
json.dump(metrics, f, indent=2)
|
||||
```
|
||||
|
||||
## Custom Strategies
|
||||
|
||||
### Implementing a Custom Strategy
|
||||
|
||||
```python
|
||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
||||
|
||||
class DomainSpecificStrategy(BaseStrategy):
|
||||
def calculate_coverage(self, state: CrawlState) -> float:
|
||||
# Custom coverage calculation
|
||||
# e.g., weight certain terms more heavily
|
||||
pass
|
||||
|
||||
def calculate_consistency(self, state: CrawlState) -> float:
|
||||
# Custom consistency logic
|
||||
# e.g., domain-specific validation
|
||||
pass
|
||||
|
||||
def rank_links(self, links: List[Link], state: CrawlState) -> List[Link]:
|
||||
# Custom link ranking
|
||||
# e.g., prioritize specific URL patterns
|
||||
pass
|
||||
|
||||
# Use custom strategy
|
||||
adaptive = AdaptiveCrawler(
|
||||
crawler,
|
||||
config=config,
|
||||
strategy=DomainSpecificStrategy()
|
||||
)
|
||||
```
|
||||
|
||||
### Combining Strategies
|
||||
|
||||
```python
|
||||
class HybridStrategy(BaseStrategy):
|
||||
def __init__(self):
|
||||
self.strategies = [
|
||||
TechnicalDocStrategy(),
|
||||
SemanticSimilarityStrategy(),
|
||||
URLPatternStrategy()
|
||||
]
|
||||
|
||||
def calculate_confidence(self, state: CrawlState) -> float:
|
||||
# Weighted combination of strategies
|
||||
scores = [s.calculate_confidence(state) for s in self.strategies]
|
||||
weights = [0.5, 0.3, 0.2]
|
||||
return sum(s * w for s, w in zip(scores, weights))
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Start Conservative
|
||||
|
||||
Begin with default settings and adjust based on results:
|
||||
|
||||
```python
|
||||
# Start with defaults
|
||||
result = await adaptive.digest(url, query)
|
||||
|
||||
# Analyze and adjust
|
||||
if adaptive.confidence < 0.7:
|
||||
config.max_pages += 10
|
||||
config.confidence_threshold -= 0.1
|
||||
```
|
||||
|
||||
### 2. Monitor Resource Usage
|
||||
|
||||
```python
|
||||
import psutil
|
||||
|
||||
# Check memory before large crawls
|
||||
memory_percent = psutil.virtual_memory().percent
|
||||
if memory_percent > 80:
|
||||
config.max_pages = min(config.max_pages, 20)
|
||||
```
|
||||
|
||||
### 3. Use Domain Knowledge
|
||||
|
||||
```python
|
||||
# For API documentation
|
||||
if "api" in start_url:
|
||||
config.top_k_links = 2 # APIs have clear structure
|
||||
|
||||
# For blogs
|
||||
if "blog" in start_url:
|
||||
config.min_gain_threshold = 0.2 # Avoid similar posts
|
||||
```
|
||||
|
||||
### 4. Validate Results
|
||||
|
||||
```python
|
||||
# Always validate the knowledge base
|
||||
relevant_content = adaptive.get_relevant_content(top_k=10)
|
||||
|
||||
# Check coverage
|
||||
query_terms = set(query.lower().split())
|
||||
covered_terms = set()
|
||||
|
||||
for doc in relevant_content:
|
||||
content_lower = doc['content'].lower()
|
||||
for term in query_terms:
|
||||
if term in content_lower:
|
||||
covered_terms.add(term)
|
||||
|
||||
coverage_ratio = len(covered_terms) / len(query_terms)
|
||||
print(f"Query term coverage: {coverage_ratio:.0%}")
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Explore [Custom Strategy Implementation](../tutorials/custom-adaptive-strategies.md)
|
||||
- Learn about [Knowledge Base Management](../tutorials/knowledge-base-management.md)
|
||||
- See [Performance Benchmarks](../benchmarks/adaptive-performance.md)
|
||||
@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
||||
```python
|
||||
import os, asyncio
|
||||
from base64 import b64decode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
    """Capture both a screenshot and a PDF of a Wikipedia page.

    The source region interleaved the old loose-kwargs call
    (`pdf=True, screenshot=True` passed to arun) with the new
    CrawlerRunConfig-based call (unmarked diff); this is the reconstructed
    post-diff version where all capture options live on the config object.
    """
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        screenshot=True,
        pdf=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
            config=run_config
        )

        if result.success:
            print(f"Screenshot data present: {result.screenshot is not None}")
            print(f"PDF data present: {result.pdf is not None}")

            # Screenshot comes back base64-encoded; decode before writing.
            if result.screenshot:
                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                with open("wikipedia_screenshot.png", "wb") as f:
                    f.write(b64decode(result.screenshot))
            else:
                print("[WARN] Screenshot data is None.")

            # PDF bytes appear to be raw (written without decoding) — TODO
            # confirm against the crawl4ai result contract.
            if result.pdf:
                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                with open("wikipedia_page.pdf", "wb") as f:
                    f.write(result.pdf)
            else:
                print("[WARN] PDF data is None.")

        else:
            print("[ERROR]", result.error_message)
|
||||
|
||||
|
||||
201
docs/md_v2/advanced/pdf-parsing.md
Normal file
201
docs/md_v2/advanced/pdf-parsing.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# PDF Processing Strategies
|
||||
|
||||
Crawl4AI provides specialized strategies for handling and extracting content from PDF files. These strategies allow you to seamlessly integrate PDF processing into your crawling workflows, whether the PDFs are hosted online or stored locally.
|
||||
|
||||
## `PDFCrawlerStrategy`
|
||||
|
||||
### Overview
|
||||
`PDFCrawlerStrategy` is an implementation of `AsyncCrawlerStrategy` designed specifically for PDF documents. Instead of interpreting the input URL as an HTML webpage, this strategy treats it as a pointer to a PDF file. It doesn't perform deep crawling or HTML parsing itself but rather prepares the PDF source for a dedicated PDF scraping strategy. Its primary role is to identify the PDF source (web URL or local file) and pass it along the processing pipeline in a way that `AsyncWebCrawler` can handle.
|
||||
|
||||
### When to Use
|
||||
Use `PDFCrawlerStrategy` when you need to:
|
||||
- Process PDF files using the `AsyncWebCrawler`.
|
||||
- Handle PDFs from both web URLs (e.g., `https://example.com/document.pdf`) and local file paths (e.g., `file:///path/to/your/document.pdf`).
|
||||
- Integrate PDF content extraction into a unified `CrawlResult` object, allowing consistent handling of PDF data alongside web page data.
|
||||
|
||||
### Key Methods and Their Behavior
|
||||
- **`__init__(self, logger: AsyncLogger = None)`**:
|
||||
- Initializes the strategy.
|
||||
- `logger`: An optional `AsyncLogger` instance (from `crawl4ai.async_logger`) for logging purposes.
|
||||
- **`async crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`**:
|
||||
- This method is called by the `AsyncWebCrawler` during the `arun` process.
|
||||
- It takes the `url` (which should point to a PDF) and creates a minimal `AsyncCrawlResponse`.
|
||||
- The `html` attribute of this response is typically empty or a placeholder, as the actual PDF content processing is deferred to the `PDFContentScrapingStrategy` (or a similar PDF-aware scraping strategy).
|
||||
- It sets `response_headers` to indicate "application/pdf" and `status_code` to 200.
|
||||
- **`async close(self)`**:
|
||||
- A method for cleaning up any resources used by the strategy. For `PDFCrawlerStrategy`, this is usually minimal.
|
||||
- **`async __aenter__(self)` / `async __aexit__(self, exc_type, exc_val, exc_tb)`**:
|
||||
- Enables asynchronous context management for the strategy, allowing it to be used with `async with`.
|
||||
|
||||
### Example Usage
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
|
||||
async def main():
|
||||
# Initialize the PDF crawler strategy
|
||||
pdf_crawler_strategy = PDFCrawlerStrategy()
|
||||
|
||||
# PDFCrawlerStrategy is typically used in conjunction with PDFContentScrapingStrategy
|
||||
# The scraping strategy handles the actual PDF content extraction
|
||||
pdf_scraping_strategy = PDFContentScrapingStrategy()
|
||||
run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
|
||||
# Example with a remote PDF URL
|
||||
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" # A public PDF from arXiv
|
||||
|
||||
print(f"Attempting to process PDF: {pdf_url}")
|
||||
result = await crawler.arun(url=pdf_url, config=run_config)
|
||||
|
||||
if result.success:
|
||||
print(f"Successfully processed PDF: {result.url}")
|
||||
print(f"Metadata Title: {result.metadata.get('title', 'N/A')}")
|
||||
# Further processing of result.markdown, result.media, etc.
|
||||
# would be done here, based on what PDFContentScrapingStrategy extracts.
|
||||
if result.markdown and hasattr(result.markdown, 'raw_markdown'):
|
||||
print(f"Extracted text (first 200 chars): {result.markdown.raw_markdown[:200]}...")
|
||||
else:
|
||||
print("No markdown (text) content extracted.")
|
||||
else:
|
||||
print(f"Failed to process PDF: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Pros and Cons
|
||||
**Pros:**
|
||||
- Enables `AsyncWebCrawler` to handle PDF sources directly using familiar `arun` calls.
|
||||
- Provides a consistent interface for specifying PDF sources (URLs or local paths).
|
||||
- Abstracts the source handling, allowing a separate scraping strategy to focus on PDF content parsing.
|
||||
|
||||
**Cons:**
|
||||
- Does not perform any PDF data extraction itself; it strictly relies on a compatible scraping strategy (like `PDFContentScrapingStrategy`) to process the PDF.
|
||||
- Has limited utility on its own; most of its value comes from being paired with a PDF-specific content scraping strategy.
|
||||
|
||||
---
|
||||
|
||||
## `PDFContentScrapingStrategy`
|
||||
|
||||
### Overview
|
||||
`PDFContentScrapingStrategy` is an implementation of `ContentScrapingStrategy` designed to extract text, metadata, and optionally images from PDF documents. It is intended to be used in conjunction with a crawler strategy that can provide it with a PDF source, such as `PDFCrawlerStrategy`. This strategy uses the `NaivePDFProcessorStrategy` internally to perform the low-level PDF parsing.
|
||||
|
||||
### When to Use
|
||||
Use `PDFContentScrapingStrategy` when your `AsyncWebCrawler` (often configured with `PDFCrawlerStrategy`) needs to:
|
||||
- Extract textual content page by page from a PDF document.
|
||||
- Retrieve standard metadata embedded within the PDF (e.g., title, author, subject, creation date, page count).
|
||||
- Optionally, extract images contained within the PDF pages. These images can be saved to a local directory or made available for further processing.
|
||||
- Produce a `ScrapingResult` that can be converted into a `CrawlResult`, making PDF content accessible in a manner similar to HTML web content (e.g., text in `result.markdown`, metadata in `result.metadata`).
|
||||
|
||||
### Key Configuration Attributes
|
||||
When initializing `PDFContentScrapingStrategy`, you can configure its behavior using the following attributes:
|
||||
- **`extract_images: bool = False`**: If `True`, the strategy will attempt to extract images from the PDF.
|
||||
- **`save_images_locally: bool = False`**: If `True` (and `extract_images` is also `True`), extracted images will be saved to disk in the `image_save_dir`. If `False`, image data might be available in another form (e.g., base64, depending on the underlying processor) but not saved as separate files by this strategy.
|
||||
- **`image_save_dir: str = None`**: Specifies the directory where extracted images should be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
|
||||
- **`batch_size: int = 4`**: Defines how many PDF pages are processed in a single batch. This can be useful for managing memory when dealing with very large PDF documents.
|
||||
- **`logger: AsyncLogger = None`**: An optional `AsyncLogger` instance for logging.
|
||||
|
||||
### Key Methods and Their Behavior
|
||||
- **`__init__(self, save_images_locally: bool = False, extract_images: bool = False, image_save_dir: str = None, batch_size: int = 4, logger: AsyncLogger = None)`**:
|
||||
- Initializes the strategy with configurations for image handling, batch processing, and logging. It sets up an internal `NaivePDFProcessorStrategy` instance which performs the actual PDF parsing.
|
||||
- **`scrap(self, url: str, html: str, **params) -> ScrapingResult`**:
|
||||
- This is the primary synchronous method called by the crawler (via `ascrap`) to process the PDF.
|
||||
- `url`: The path or URL to the PDF file (provided by `PDFCrawlerStrategy` or similar).
|
||||
- `html`: Typically an empty string when used with `PDFCrawlerStrategy`, as the content is a PDF, not HTML.
|
||||
- It first ensures the PDF is accessible locally (downloads it to a temporary file if `url` is remote).
|
||||
- It then uses its internal PDF processor to extract text, metadata, and images (if configured).
|
||||
- The extracted information is compiled into a `ScrapingResult` object:
|
||||
- `cleaned_html`: Contains an HTML-like representation of the PDF, where each page's content is often wrapped in a `<div>` with page number information.
|
||||
- `media`: A dictionary where `media["images"]` will contain information about extracted images if `extract_images` was `True`.
|
||||
- `links`: A dictionary where `links["urls"]` can contain URLs found within the PDF content.
|
||||
- `metadata`: A dictionary holding PDF metadata (e.g., title, author, num_pages).
|
||||
- **`async ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult`**:
|
||||
- The asynchronous version of `scrap`. Under the hood, it typically runs the synchronous `scrap` method in a separate thread using `asyncio.to_thread` to avoid blocking the event loop.
|
||||
- **`_get_pdf_path(self, url: str) -> str`**:
|
||||
- A private helper method to manage PDF file access. If the `url` is remote (http/https), it downloads the PDF to a temporary local file and returns its path. If `url` indicates a local file (`file://` or a direct path), it resolves and returns the local path.
|
||||
|
||||
### Example Usage
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
import os # For creating image directory
|
||||
|
||||
async def main():
|
||||
# Define the directory for saving extracted images
|
||||
image_output_dir = "./my_pdf_images"
|
||||
os.makedirs(image_output_dir, exist_ok=True)
|
||||
|
||||
# Configure the PDF content scraping strategy
|
||||
# Enable image extraction and specify where to save them
|
||||
pdf_scraping_cfg = PDFContentScrapingStrategy(
|
||||
extract_images=True,
|
||||
save_images_locally=True,
|
||||
image_save_dir=image_output_dir,
|
||||
batch_size=2 # Process 2 pages at a time for demonstration
|
||||
)
|
||||
|
||||
# The PDFCrawlerStrategy is needed to tell AsyncWebCrawler how to "crawl" a PDF
|
||||
pdf_crawler_cfg = PDFCrawlerStrategy()
|
||||
|
||||
# Configure the overall crawl run
|
||||
run_cfg = CrawlerRunConfig(
|
||||
scraping_strategy=pdf_scraping_cfg # Use our PDF scraping strategy
|
||||
)
|
||||
|
||||
# Initialize the crawler with the PDF-specific crawler strategy
|
||||
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_cfg) as crawler:
|
||||
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" # Example PDF
|
||||
|
||||
print(f"Starting PDF processing for: {pdf_url}")
|
||||
result = await crawler.arun(url=pdf_url, config=run_cfg)
|
||||
|
||||
if result.success:
|
||||
print("\n--- PDF Processing Successful ---")
|
||||
print(f"Processed URL: {result.url}")
|
||||
|
||||
print("\n--- Metadata ---")
|
||||
for key, value in result.metadata.items():
|
||||
print(f" {key.replace('_', ' ').title()}: {value}")
|
||||
|
||||
if result.markdown and hasattr(result.markdown, 'raw_markdown'):
|
||||
print(f"\n--- Extracted Text (Markdown Snippet) ---")
|
||||
print(result.markdown.raw_markdown[:500].strip() + "...")
|
||||
else:
|
||||
print("\nNo text (markdown) content extracted.")
|
||||
|
||||
if result.media and result.media.get("images"):
|
||||
print(f"\n--- Image Extraction ---")
|
||||
print(f"Extracted {len(result.media['images'])} image(s).")
|
||||
for i, img_info in enumerate(result.media["images"][:2]): # Show info for first 2 images
|
||||
print(f" Image {i+1}:")
|
||||
print(f" Page: {img_info.get('page')}")
|
||||
print(f" Format: {img_info.get('format', 'N/A')}")
|
||||
if img_info.get('path'):
|
||||
print(f" Saved at: {img_info.get('path')}")
|
||||
else:
|
||||
print("\nNo images were extracted (or extract_images was False).")
|
||||
else:
|
||||
print(f"\n--- PDF Processing Failed ---")
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Pros and Cons
|
||||
|
||||
**Pros:**
|
||||
- Provides a comprehensive way to extract text, metadata, and (optionally) images from PDF documents.
|
||||
- Handles both remote PDFs (via URL) and local PDF files.
|
||||
- Configurable image extraction allows saving images to disk or accessing their data.
|
||||
- Integrates smoothly with the `CrawlResult` object structure, making PDF-derived data accessible in a way consistent with web-scraped data.
|
||||
- The `batch_size` parameter can help in managing memory consumption when processing large or numerous PDF pages.
|
||||
|
||||
**Cons:**
|
||||
- Extraction quality and performance can vary significantly depending on the PDF's complexity, encoding, and whether it's image-based (scanned) or text-based.
|
||||
- Image extraction can be resource-intensive (both CPU and disk space if `save_images_locally` is true).
|
||||
- Relies on `NaivePDFProcessorStrategy` internally, which might have limitations with very complex layouts, encrypted PDFs, or forms compared to more sophisticated PDF parsing libraries. Scanned PDFs will not yield text unless an OCR step is performed (which is not part of this strategy by default).
|
||||
- Link extraction from PDFs can be basic and depends on how hyperlinks are embedded in the document.
|
||||
@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
proxy_config = {
|
||||
"server": "http://proxy.example.com:8080",
|
||||
"username": "user",
|
||||
"password": "pass"
|
||||
}
|
||||
|
||||
browser_config = BrowserConfig(proxy_config=proxy_config)
|
||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
Here's the corrected documentation:
|
||||
|
||||
## Rotating Proxies
|
||||
|
||||
Example using a proxy rotation service dynamically:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def get_next_proxy():
|
||||
# Your proxy rotation logic here
|
||||
return {"server": "http://next.proxy.com:8080"}
|
||||
|
||||
import re
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
RoundRobinProxyStrategy,
|
||||
)
|
||||
import asyncio
|
||||
from crawl4ai import ProxyConfig
|
||||
async def main():
|
||||
browser_config = BrowserConfig()
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# For each URL, create a new run config with different proxy
|
||||
for url in urls:
|
||||
proxy = await get_next_proxy()
|
||||
# Clone the config and update proxy - this creates a new browser context
|
||||
current_config = run_config.clone(proxy_config=proxy)
|
||||
result = await crawler.arun(url=url, config=current_config)
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = ProxyConfig.from_env()
|
||||
        # e.g.: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
|
||||
if not proxies:
|
||||
print("No proxies found in environment. Set PROXIES env variable!")
|
||||
return
|
||||
|
||||
proxy_strategy = RoundRobinProxyStrategy(proxies)
|
||||
|
||||
# Create configs
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
proxy_rotation_strategy=proxy_strategy
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
|
||||
|
||||
print("\n📈 Initializing crawler with proxy rotation...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
print("\n🚀 Starting batch crawl with proxy rotation...")
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=run_config
|
||||
)
|
||||
for result in results:
|
||||
if result.success:
|
||||
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||
current_proxy = run_config.proxy_config if run_config.proxy_config else None
|
||||
|
||||
if current_proxy and ip_match:
|
||||
print(f"URL {result.url}")
|
||||
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
|
||||
verified = ip_match.group(0) == current_proxy.ip
|
||||
if verified:
|
||||
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
|
||||
else:
|
||||
print("❌ Proxy failed or IP mismatch!")
|
||||
print("---")
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
|
||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
session_id = "github_commits_session"
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
all_commits = []
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
session_id = "wait_for_session"
|
||||
all_commits = []
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
||||
"fields": [{
|
||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
||||
}],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
js_next_page = """
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length > 0) {
|
||||
window.lastCommit = commits[0].textContent.trim();
|
||||
}
|
||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||
if (button) {button.click(); console.log('button clicked') }
|
||||
"""
|
||||
|
||||
# JavaScript and wait configurations
|
||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
||||
|
||||
# Crawl multiple pages
|
||||
wait_for = """() => {
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length === 0) return false;
|
||||
const firstCommit = commits[0].textContent.trim();
|
||||
return firstCommit !== window.lastCommit;
|
||||
}"""
|
||||
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li[data-testid='commit-row-item']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h4 a",
|
||||
"type": "text",
|
||||
"transform": "strip",
|
||||
},
|
||||
],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
for page in range(3):
|
||||
config = CrawlerRunConfig(
|
||||
url=url,
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
css_selector="li[data-testid='commit-row-item']",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
wait_for=wait_for if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
capture_console_messages=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(config=config)
|
||||
if result.success:
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.console_messages:
|
||||
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||
|
||||
if result.extracted_content:
|
||||
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||
commits = json.loads(result.extracted_content)
|
||||
all_commits.extend(commits)
|
||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||
else:
|
||||
print(f"Page {page + 1}: No content extracted")
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
return all_commits
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
||||
wait_after_scroll=1.0 # Twitter needs time to load
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config,
|
||||
# Optional: Set headless=False to watch it work
|
||||
# browser_config=BrowserConfig(headless=False)
|
||||
virtual_scroll_config=virtual_config
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://twitter.com/search?q=AI",
|
||||
config=config
|
||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
||||
Virtual Scroll works seamlessly with extraction strategies:
|
||||
|
||||
```python
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
||||
scroll_count=20
|
||||
),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||
schema=schema
|
||||
)
|
||||
)
|
||||
|
||||
244
docs/md_v2/api/adaptive-crawler.md
Normal file
244
docs/md_v2/api/adaptive-crawler.md
Normal file
@@ -0,0 +1,244 @@
|
||||
# AdaptiveCrawler
|
||||
|
||||
The `AdaptiveCrawler` class implements intelligent web crawling that automatically determines when sufficient information has been gathered to answer a query. It uses a three-layer scoring system to evaluate coverage, consistency, and saturation.
|
||||
|
||||
## Constructor
|
||||
|
||||
```python
|
||||
AdaptiveCrawler(
|
||||
crawler: AsyncWebCrawler,
|
||||
config: Optional[AdaptiveConfig] = None
|
||||
)
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
- **crawler** (`AsyncWebCrawler`): The underlying web crawler instance to use for fetching pages
|
||||
- **config** (`Optional[AdaptiveConfig]`): Configuration settings for adaptive crawling behavior. If not provided, uses default settings.
|
||||
|
||||
## Primary Method
|
||||
|
||||
### digest()
|
||||
|
||||
The main method that performs adaptive crawling starting from a URL with a specific query.
|
||||
|
||||
```python
|
||||
async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
- **start_url** (`str`): The starting URL for crawling
|
||||
- **query** (`str`): The search query that guides the crawling process
|
||||
- **resume_from** (`Optional[Union[str, Path]]`): Path to a saved state file to resume from
|
||||
|
||||
#### Returns
|
||||
|
||||
- **CrawlState**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
|
||||
#### Example
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
state = await adaptive.digest(
|
||||
start_url="https://docs.python.org",
|
||||
query="async context managers"
|
||||
)
|
||||
```
|
||||
|
||||
## Properties
|
||||
|
||||
### confidence
|
||||
|
||||
Current confidence score (0-1) indicating information sufficiency.
|
||||
|
||||
```python
|
||||
@property
|
||||
def confidence(self) -> float
|
||||
```
|
||||
|
||||
### coverage_stats
|
||||
|
||||
Dictionary containing detailed coverage statistics.
|
||||
|
||||
```python
|
||||
@property
|
||||
def coverage_stats(self) -> Dict[str, float]
|
||||
```
|
||||
|
||||
Returns:
|
||||
- **coverage**: Query term coverage score
|
||||
- **consistency**: Information consistency score
|
||||
- **saturation**: Content saturation score
|
||||
- **confidence**: Overall confidence score
|
||||
|
||||
### is_sufficient
|
||||
|
||||
Boolean indicating whether sufficient information has been gathered.
|
||||
|
||||
```python
|
||||
@property
|
||||
def is_sufficient(self) -> bool
|
||||
```
|
||||
|
||||
### state
|
||||
|
||||
Access to the current crawl state.
|
||||
|
||||
```python
|
||||
@property
|
||||
def state(self) -> CrawlState
|
||||
```
|
||||
|
||||
## Methods
|
||||
|
||||
### get_relevant_content()
|
||||
|
||||
Retrieve the most relevant content from the knowledge base.
|
||||
|
||||
```python
|
||||
def get_relevant_content(
|
||||
self,
|
||||
top_k: int = 5
|
||||
) -> List[Dict[str, Any]]
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
- **top_k** (`int`): Number of top relevant documents to return (default: 5)
|
||||
|
||||
#### Returns
|
||||
|
||||
List of dictionaries containing:
|
||||
- **url**: The URL of the page
|
||||
- **content**: The page content
|
||||
- **score**: Relevance score
|
||||
- **metadata**: Additional page metadata
|
||||
|
||||
### print_stats()
|
||||
|
||||
Display crawl statistics in formatted output.
|
||||
|
||||
```python
|
||||
def print_stats(
|
||||
self,
|
||||
detailed: bool = False
|
||||
) -> None
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
- **detailed** (`bool`): If True, shows detailed metrics with colors. If False, shows summary table.
|
||||
|
||||
### export_knowledge_base()
|
||||
|
||||
Export the collected knowledge base to a JSONL file.
|
||||
|
||||
```python
|
||||
def export_knowledge_base(
|
||||
self,
|
||||
path: Union[str, Path]
|
||||
) -> None
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
- **path** (`Union[str, Path]`): Output file path for JSONL export
|
||||
|
||||
#### Example
|
||||
|
||||
```python
|
||||
adaptive.export_knowledge_base("my_knowledge.jsonl")
|
||||
```
|
||||
|
||||
### import_knowledge_base()
|
||||
|
||||
Import a previously exported knowledge base.
|
||||
|
||||
```python
|
||||
def import_knowledge_base(
|
||||
self,
|
||||
path: Union[str, Path]
|
||||
) -> None
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
|
||||
- **path** (`Union[str, Path]`): Path to JSONL file to import
|
||||
|
||||
## Configuration
|
||||
|
||||
The `AdaptiveConfig` class controls the behavior of adaptive crawling:
|
||||
|
||||
```python
|
||||
@dataclass
|
||||
class AdaptiveConfig:
|
||||
confidence_threshold: float = 0.8 # Stop when confidence reaches this
|
||||
max_pages: int = 50 # Maximum pages to crawl
|
||||
top_k_links: int = 5 # Links to follow per page
|
||||
min_gain_threshold: float = 0.1 # Minimum expected gain to continue
|
||||
save_state: bool = False # Auto-save crawl state
|
||||
state_path: Optional[str] = None # Path for state persistence
|
||||
```
|
||||
|
||||
### Example with Custom Config
|
||||
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7,
|
||||
max_pages=20,
|
||||
top_k_links=3
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
async def main():
|
||||
# Configure adaptive crawling
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.75,
|
||||
max_pages=15,
|
||||
save_state=True,
|
||||
state_path="my_crawl.json"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Start crawling
|
||||
state = await adaptive.digest(
|
||||
start_url="https://example.com/docs",
|
||||
query="authentication oauth2 jwt"
|
||||
)
|
||||
|
||||
# Check results
|
||||
print(f"Confidence achieved: {adaptive.confidence:.0%}")
|
||||
adaptive.print_stats()
|
||||
|
||||
# Get most relevant pages
|
||||
for page in adaptive.get_relevant_content(top_k=3):
|
||||
print(f"- {page['url']} (score: {page['score']:.2f})")
|
||||
|
||||
# Export for later use
|
||||
adaptive.export_knowledge_base("auth_knowledge.jsonl")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## See Also
|
||||
|
||||
- [digest() Method Reference](digest.md)
|
||||
- [Adaptive Crawling Guide](../core/adaptive-crawling.md)
|
||||
- [Advanced Adaptive Strategies](../advanced/adaptive-strategies.md)
|
||||
181
docs/md_v2/api/digest.md
Normal file
181
docs/md_v2/api/digest.md
Normal file
@@ -0,0 +1,181 @@
|
||||
# digest()
|
||||
|
||||
The `digest()` method is the primary interface for adaptive web crawling. It intelligently crawls websites starting from a given URL, guided by a query, and automatically determines when sufficient information has been gathered.
|
||||
|
||||
## Method Signature
|
||||
|
||||
```python
|
||||
async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
### start_url
|
||||
- **Type**: `str`
|
||||
- **Required**: Yes
|
||||
- **Description**: The starting URL for the crawl. This should be a valid HTTP/HTTPS URL that serves as the entry point for information gathering.
|
||||
|
||||
### query
|
||||
- **Type**: `str`
|
||||
- **Required**: Yes
|
||||
- **Description**: The search query that guides the crawling process. This should contain key terms related to the information you're seeking. The crawler uses this to evaluate relevance and determine which links to follow.
|
||||
|
||||
### resume_from
|
||||
- **Type**: `Optional[Union[str, Path]]`
|
||||
- **Default**: `None`
|
||||
- **Description**: Path to a previously saved crawl state file. When provided, the crawler resumes from the saved state instead of starting fresh.
|
||||
|
||||
## Return Value
|
||||
|
||||
Returns a `CrawlState` object containing:
|
||||
|
||||
- **crawled_urls** (`Set[str]`): All URLs that have been crawled
|
||||
- **knowledge_base** (`List[CrawlResult]`): Collection of crawled pages with content
|
||||
- **pending_links** (`List[Link]`): Links discovered but not yet crawled
|
||||
- **metrics** (`Dict[str, float]`): Performance and quality metrics
|
||||
- **query** (`str`): The original query
|
||||
- Additional statistical information for scoring
|
||||
|
||||
## How It Works
|
||||
|
||||
The `digest()` method implements an intelligent crawling algorithm:
|
||||
|
||||
1. **Initial Crawl**: Starts from the provided URL
|
||||
2. **Link Analysis**: Evaluates all discovered links for relevance
|
||||
3. **Scoring**: Uses three metrics to assess information sufficiency:
|
||||
- **Coverage**: How well the query terms are covered
|
||||
- **Consistency**: Information coherence across pages
|
||||
- **Saturation**: Diminishing returns detection
|
||||
4. **Adaptive Selection**: Chooses the most promising links to follow
|
||||
5. **Stopping Decision**: Automatically stops when confidence threshold is reached
|
||||
|
||||
## Examples
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
state = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="async await context managers"
|
||||
)
|
||||
|
||||
print(f"Crawled {len(state.crawled_urls)} pages")
|
||||
print(f"Confidence: {adaptive.confidence:.0%}")
|
||||
```
|
||||
|
||||
### With Configuration
|
||||
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.9, # Require high confidence
|
||||
max_pages=30, # Allow more pages
|
||||
top_k_links=3 # Follow top 3 links per page
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
|
||||
state = await adaptive.digest(
|
||||
start_url="https://api.example.com/docs",
|
||||
query="authentication endpoints rate limits"
|
||||
)
|
||||
```
|
||||
|
||||
### Resuming a Previous Crawl
|
||||
|
||||
```python
|
||||
# First crawl - may be interrupted
|
||||
state1 = await adaptive.digest(
|
||||
start_url="https://example.com",
|
||||
query="machine learning algorithms"
|
||||
)
|
||||
|
||||
# Save state (if not auto-saved)
|
||||
state1.save("ml_crawl_state.json")
|
||||
|
||||
# Later, resume from saved state
|
||||
state2 = await adaptive.digest(
|
||||
start_url="https://example.com",
|
||||
query="machine learning algorithms",
|
||||
resume_from="ml_crawl_state.json"
|
||||
)
|
||||
```
|
||||
|
||||
### With Progress Monitoring
|
||||
|
||||
```python
|
||||
state = await adaptive.digest(
|
||||
start_url="https://docs.example.com",
|
||||
query="api reference"
|
||||
)
|
||||
|
||||
# Monitor progress
|
||||
print(f"Pages crawled: {len(state.crawled_urls)}")
|
||||
print(f"New terms discovered: {state.new_terms_history}")
|
||||
print(f"Final confidence: {adaptive.confidence:.2%}")
|
||||
|
||||
# View detailed statistics
|
||||
adaptive.print_stats(detailed=True)
|
||||
```
|
||||
|
||||
## Query Best Practices
|
||||
|
||||
1. **Be Specific**: Use descriptive terms that appear in target content
|
||||
```python
|
||||
# Good
|
||||
query = "python async context managers implementation"
|
||||
|
||||
# Too broad
|
||||
query = "python programming"
|
||||
```
|
||||
|
||||
2. **Include Key Terms**: Add technical terms you expect to find
|
||||
```python
|
||||
query = "oauth2 jwt refresh tokens authorization"
|
||||
```
|
||||
|
||||
3. **Multiple Concepts**: Combine related concepts for comprehensive coverage
|
||||
```python
|
||||
query = "rest api pagination sorting filtering"
|
||||
```
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- **Initial URL**: Choose a page with good navigation (e.g., documentation index)
|
||||
- **Query Length**: 3-8 terms typically work best
|
||||
- **Link Density**: Sites with clear navigation crawl more efficiently
|
||||
- **Caching**: Enable caching for repeated crawls of the same domain
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python
|
||||
try:
|
||||
state = await adaptive.digest(
|
||||
start_url="https://example.com",
|
||||
query="search terms"
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Crawl failed: {e}")
|
||||
# State is auto-saved if save_state=True in config
|
||||
```
|
||||
|
||||
## Stopping Conditions
|
||||
|
||||
The crawl stops when any of these conditions are met:
|
||||
|
||||
1. **Confidence Threshold**: Reached the configured confidence level
|
||||
2. **Page Limit**: Crawled the maximum number of pages
|
||||
3. **Diminishing Returns**: Expected information gain below threshold
|
||||
4. **No Relevant Links**: No promising links remain to follow
|
||||
|
||||
## See Also
|
||||
|
||||
- [AdaptiveCrawler Class](adaptive-crawler.md)
|
||||
- [Adaptive Crawling Guide](../core/adaptive-crawling.md)
|
||||
- [Configuration Options](../core/adaptive-crawling.md#configuration-options)
|
||||
@@ -298,7 +298,7 @@ LLMConfig is useful to pass LLM provider config to strategies and functions that
|
||||
## 3.1 Parameters
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
|
||||
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
|
||||
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
|
||||
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
|
||||
|
||||
|
||||
369
docs/md_v2/blog/articles/adaptive-crawling-revolution.md
Normal file
369
docs/md_v2/blog/articles/adaptive-crawling-revolution.md
Normal file
@@ -0,0 +1,369 @@
|
||||
# Adaptive Crawling: Building Dynamic Knowledge That Grows on Demand
|
||||
|
||||
*Published on January 29, 2025 • 8 min read*
|
||||
|
||||
*By [unclecode](https://x.com/unclecode) • Follow me on [X/Twitter](https://x.com/unclecode) for more web scraping insights*
|
||||
|
||||
---
|
||||
|
||||
## The Knowledge Capacitor
|
||||
|
||||
Imagine a capacitor that stores energy, releasing it precisely when needed. Now imagine that for information. That's Adaptive Crawling—a term I coined to describe a fundamentally different approach to web crawling. Instead of the brute force of traditional deep crawling, we build knowledge dynamically, growing it based on queries and circumstances, like a living organism responding to its environment.
|
||||
|
||||
This isn't just another crawling optimization. It's a paradigm shift from "crawl everything, hope for the best" to "crawl intelligently, know when to stop."
|
||||
|
||||
## Why I Built This
|
||||
|
||||
I've watched too many startups burn through resources with a dangerous misconception: that LLMs make everything efficient. They don't. They make things *possible*, not necessarily *smart*. When you combine brute-force crawling with LLM processing, you're not just wasting time—you're hemorrhaging money on tokens, compute, and opportunity cost.
|
||||
|
||||
Consider this reality:
|
||||
- **Traditional deep crawling**: 500 pages → 50 useful → $15 in LLM tokens → 2 hours wasted
|
||||
- **Adaptive crawling**: 15 pages → 14 useful → $2 in tokens → 10 minutes → **7.5x cost reduction**
|
||||
|
||||
But it's not about crawling less. It's about crawling *right*.
|
||||
|
||||
## The Information Theory Foundation
|
||||
|
||||
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||
|
||||
### 🧮 **Pure Statistics, No Magic**
|
||||
|
||||
My first principle was crucial: start with classic statistical approaches. No embeddings. No LLMs. Just pure information theory:
|
||||
|
||||
```python
|
||||
# Information gain calculation - the heart of adaptive crawling
|
||||
def calculate_information_gain(new_page, knowledge_base):
|
||||
new_terms = extract_terms(new_page) - existing_terms(knowledge_base)
|
||||
overlap = calculate_overlap(new_page, knowledge_base)
|
||||
|
||||
# High gain = many new terms + low overlap
|
||||
gain = len(new_terms) / (1 + overlap)
|
||||
return gain
|
||||
```
|
||||
|
||||
This isn't regression to older methods—it's recognition that we've forgotten powerful, efficient solutions in our rush to apply LLMs everywhere.
|
||||
|
||||
</div>
|
||||
|
||||
## The A* of Web Crawling
|
||||
|
||||
Adaptive crawling implements what I call "information scenting"—like A* pathfinding but for knowledge acquisition. Each link is evaluated not randomly, but by its probability of contributing meaningful information toward answering current and future queries.
|
||||
|
||||
<div style="display: flex; align-items: center; background-color: #3f3f44; padding: 20px; margin: 20px 0; border-left: 4px solid #09b5a5;">
|
||||
<div style="font-size: 48px; margin-right: 20px;">🎯</div>
|
||||
<div>
|
||||
<strong>The Scenting Algorithm:</strong><br>
|
||||
From available links, we select those with highest information gain. It's not about following every path—it's about following the <em>right</em> paths. Like a bloodhound following the strongest scent to its target.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## The Three Pillars of Intelligence
|
||||
|
||||
### 1. Coverage: The Breadth Sensor
|
||||
Measures how well your knowledge spans the query space. Not just "do we have pages?" but "do we have the RIGHT pages?"
|
||||
|
||||
### 2. Consistency: The Coherence Detector
|
||||
Information from multiple sources should align. When pages agree, confidence rises. When they conflict, we need more data.
|
||||
|
||||
### 3. Saturation: The Efficiency Guardian
|
||||
The most crucial metric. When new pages stop adding information, we stop crawling. Simple. Powerful. Ignored by everyone else.
|
||||
|
||||
## Real Impact: Time, Money, and Sanity
|
||||
|
||||
Let me show you what this means for your bottom line:
|
||||
|
||||
### Building a Customer Support Knowledge Base
|
||||
|
||||
**Traditional Approach:**
|
||||
```python
|
||||
# Crawl entire documentation site
|
||||
results = await crawler.crawl_bfs("https://docs.company.com", max_depth=5)
|
||||
# Result: 1,200 pages, 18 hours, $150 in API costs
|
||||
# Useful content: ~100 pages scattered throughout
|
||||
```
|
||||
|
||||
**Adaptive Approach:**
|
||||
```python
|
||||
# Grow knowledge based on actual support queries
|
||||
knowledge = await adaptive.digest(
|
||||
start_url="https://docs.company.com",
|
||||
query="payment processing errors refund policies"
|
||||
)
|
||||
# Result: 45 pages, 12 minutes, $8 in API costs
|
||||
# Useful content: 42 pages, all relevant
|
||||
```
|
||||
|
||||
**Savings: 93% time reduction, 95% cost reduction, 100% more sanity**
|
||||
|
||||
## The Dynamic Growth Pattern
|
||||
|
||||
<div style="text-align: center; padding: 40px; background-color: #1a1a1c; border: 1px dashed #3f3f44; margin: 30px 0;">
|
||||
<div style="font-size: 24px; color: #09b5a5; margin-bottom: 10px;">
|
||||
Knowledge grows like crystals in a supersaturated solution
|
||||
</div>
|
||||
<div style="color: #a3abba;">
|
||||
Add a query (seed), and relevant information crystallizes around it.<br>
|
||||
Change the query, and the knowledge structure adapts.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
This is the beauty of adaptive crawling: your knowledge base becomes a living entity that grows based on actual needs, not hypothetical completeness.
|
||||
|
||||
## Why "Adaptive"?
|
||||
|
||||
I specifically chose "Adaptive" because it captures the essence: the system adapts to what it finds. Dense technical documentation might need 20 pages for confidence. A simple FAQ might need just 5. The crawler doesn't follow a recipe—it reads the room and adjusts.
|
||||
|
||||
This is my term, my concept, and I have extensive plans for its evolution.
|
||||
|
||||
## The Progressive Roadmap
|
||||
|
||||
This is just the beginning. My roadmap for Adaptive Crawling:
|
||||
|
||||
### Phase 1 (Current): Statistical Foundation
|
||||
- Pure information theory approach
|
||||
- No dependencies on expensive models
|
||||
- Proven efficiency gains
|
||||
|
||||
### Phase 2 (Now Available): Embedding Enhancement
|
||||
- Semantic understanding layered onto statistical base
|
||||
- Still efficient, now even smarter
|
||||
- Optional, not required
|
||||
|
||||
### Phase 3 (Future): LLM Integration
|
||||
- LLMs for complex reasoning tasks only
|
||||
- Used surgically, not wastefully
|
||||
- Always with statistical foundation underneath
|
||||
|
||||
## The Efficiency Revolution
|
||||
|
||||
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||
|
||||
### 💰 **The Economics of Intelligence**
|
||||
|
||||
For a typical SaaS documentation crawl:
|
||||
|
||||
**Traditional Deep Crawling:**
|
||||
- Pages crawled: 1,000
|
||||
- Useful pages: 80
|
||||
- Time spent: 3 hours
|
||||
- LLM tokens used: 2.5M
|
||||
- Cost: $75
|
||||
- Efficiency: 8%
|
||||
|
||||
**Adaptive Crawling:**
|
||||
- Pages crawled: 95
|
||||
- Useful pages: 88
|
||||
- Time spent: 15 minutes
|
||||
- LLM tokens used: 200K
|
||||
- Cost: $6
|
||||
- Efficiency: 93%
|
||||
|
||||
**That's not optimization. That's transformation.**
|
||||
|
||||
</div>
|
||||
|
||||
## Missing the Forest for the Trees
|
||||
|
||||
The startup world has a dangerous blind spot. We're so enamored with LLMs that we forget: just because you CAN process everything with an LLM doesn't mean you SHOULD.
|
||||
|
||||
Classic NLP and statistical methods can:
|
||||
- Filter irrelevant content before it reaches LLMs
|
||||
- Identify patterns without expensive inference
|
||||
- Make intelligent decisions in microseconds
|
||||
- Scale without breaking the bank
|
||||
|
||||
Adaptive crawling proves this. It uses battle-tested information theory to make smart decisions BEFORE expensive processing.
|
||||
|
||||
## Your Knowledge, On Demand
|
||||
|
||||
```python
|
||||
# Monday: Customer asks about authentication
|
||||
auth_knowledge = await adaptive.digest(
|
||||
"https://docs.api.com",
|
||||
"oauth jwt authentication"
|
||||
)
|
||||
|
||||
# Tuesday: They ask about rate limiting
|
||||
# The crawler adapts, builds on existing knowledge
|
||||
rate_limit_knowledge = await adaptive.digest(
|
||||
"https://docs.api.com",
|
||||
"rate limiting throttling quotas"
|
||||
)
|
||||
|
||||
# Your knowledge base grows intelligently, not indiscriminately
|
||||
```
|
||||
|
||||
## The Competitive Edge
|
||||
|
||||
Companies using adaptive crawling will have:
|
||||
- **90% lower crawling costs**
|
||||
- **Knowledge bases that actually answer questions**
|
||||
- **Update cycles in minutes, not days**
|
||||
- **Happy customers who find answers fast**
|
||||
- **Engineers who sleep at night**
|
||||
|
||||
Those still using brute force? They'll wonder why their infrastructure costs keep rising while their customers keep complaining.
|
||||
|
||||
## The Embedding Evolution (Now Available!)
|
||||
|
||||
<div style="background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px; margin: 20px 0;">
|
||||
|
||||
### 🧠 **Semantic Understanding Without the Cost**
|
||||
|
||||
The embedding strategy brings semantic intelligence while maintaining efficiency:
|
||||
|
||||
```python
|
||||
# Statistical strategy - great for exact terms
|
||||
config_statistical = AdaptiveConfig(
|
||||
strategy="statistical" # Default
|
||||
)
|
||||
|
||||
# Embedding strategy - understands concepts
|
||||
config_embedding = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
|
||||
n_query_variations=10
|
||||
)
|
||||
```
|
||||
|
||||
**The magic**: It automatically expands your query into semantic variations, maps the coverage space, and identifies gaps to fill intelligently.
|
||||
|
||||
</div>
|
||||
|
||||
### Real-World Comparison
|
||||
|
||||
<div style="display: flex; gap: 20px; margin: 20px 0;">
|
||||
<div style="flex: 1; background-color: #1a1a1c; border: 1px solid #3f3f44; padding: 20px;">
|
||||
|
||||
**Query**: "authentication oauth"
|
||||
|
||||
**Statistical Strategy**:
|
||||
- Searches for exact terms
|
||||
- 12 pages crawled
|
||||
- 78% confidence
|
||||
- Fast but literal
|
||||
|
||||
</div>
|
||||
<div style="flex: 1; background-color: #1a1a1c; border: 1px solid #09b5a5; padding: 20px;">
|
||||
|
||||
**Embedding Strategy**:
|
||||
- Understands "auth", "login", "SSO"
|
||||
- 8 pages crawled
|
||||
- 92% confidence
|
||||
- Semantic comprehension
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
### Detecting Irrelevance
|
||||
|
||||
One killer feature: the embedding strategy knows when to give up:
|
||||
|
||||
```python
|
||||
# Crawling Python docs with a cooking query
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="how to make spaghetti carbonara"
|
||||
)
|
||||
|
||||
# System detects irrelevance and stops
|
||||
# Confidence: 5% (below threshold)
|
||||
# Pages crawled: 2
|
||||
# Stopped reason: "below_minimum_relevance_threshold"
|
||||
```
|
||||
|
||||
No more crawling hundreds of pages hoping to find something that doesn't exist!
|
||||
|
||||
## Try It Yourself
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Choose your strategy
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding", # or "statistical"
|
||||
embedding_min_confidence_threshold=0.1 # Stop if irrelevant
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Watch intelligence at work
|
||||
result = await adaptive.digest(
|
||||
start_url="https://your-docs.com",
|
||||
query="your users' actual questions"
|
||||
)
|
||||
|
||||
# See the efficiency
|
||||
adaptive.print_stats()
|
||||
print(f"Found {adaptive.confidence:.0%} of needed information")
|
||||
print(f"In just {len(result.crawled_urls)} pages")
|
||||
print(f"Saving you {1000 - len(result.crawled_urls)} unnecessary crawls")
|
||||
```
|
||||
|
||||
## A Personal Note
|
||||
|
||||
I created Adaptive Crawling because I was tired of watching smart people make inefficient choices. We have incredibly powerful statistical tools that we've forgotten in our rush toward LLMs. This is my attempt to bring balance back to the Force.
|
||||
|
||||
This is not just a feature. It's a philosophy: **Grow knowledge on demand. Stop when you have enough. Save time, money, and computational resources for what really matters.**
|
||||
|
||||
## The Future is Adaptive
|
||||
|
||||
<div style="text-align: center; padding: 40px; background-color: #1a1a1c; border: 1px dashed #3f3f44; margin: 30px 0;">
|
||||
<div style="font-size: 24px; color: #09b5a5; margin-bottom: 10px;">
|
||||
Traditional Crawling: Drinking from a firehose<br>
|
||||
Adaptive Crawling: Sipping exactly what you need
|
||||
</div>
|
||||
<div style="color: #a3abba;">
|
||||
The future of web crawling isn't about processing more data.<br>
|
||||
It's about processing the <em>right</em> data.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Join me in making web crawling intelligent, efficient, and actually useful. Because in the age of information overload, the winners won't be those who collect the most data—they'll be those who collect the *right* data.
|
||||
|
||||
---
|
||||
|
||||
*Adaptive Crawling is now part of Crawl4AI. [Get started with the documentation](/core/adaptive-crawling/) or [dive into the mathematical framework](https://github.com/unclecode/crawl4ai/blob/main/PROGRESSIVE_CRAWLING.md). For updates on my work in information theory and efficient AI, follow me on [X/Twitter](https://x.com/unclecode).*
|
||||
|
||||
<style>
|
||||
/* Custom styles for this article */
|
||||
.markdown-body pre {
|
||||
background-color: #1e1e1e !important;
|
||||
border: 1px solid #3f3f44;
|
||||
}
|
||||
|
||||
.markdown-body code {
|
||||
background-color: #3f3f44;
|
||||
color: #50ffff;
|
||||
padding: 2px 6px;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.markdown-body pre code {
|
||||
background-color: transparent;
|
||||
color: #e8e9ed;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
.markdown-body blockquote {
|
||||
border-left: 4px solid #09b5a5;
|
||||
background-color: #1a1a1c;
|
||||
padding: 15px 20px;
|
||||
margin: 20px 0;
|
||||
}
|
||||
|
||||
.markdown-body h2 {
|
||||
color: #50ffff;
|
||||
border-bottom: 1px dashed #3f3f44;
|
||||
padding-bottom: 10px;
|
||||
}
|
||||
|
||||
.markdown-body h3 {
|
||||
color: #09b5a5;
|
||||
}
|
||||
|
||||
.markdown-body strong {
|
||||
color: #50ffff;
|
||||
}
|
||||
</style>
|
||||
@@ -2,16 +2,46 @@
|
||||
|
||||
Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place.
|
||||
|
||||
## Featured Articles
|
||||
|
||||
### [When to Stop Crawling: The Art of Knowing "Enough"](articles/adaptive-crawling-revolution.md)
|
||||
*January 29, 2025*
|
||||
|
||||
Traditional crawlers are like tourists with unlimited time—they'll visit every street, every alley, every dead end. But what if your crawler could think like a researcher with a deadline? Discover how Adaptive Crawling revolutionizes web scraping by knowing when to stop. Learn about the three-layer intelligence system that evaluates coverage, consistency, and saturation to build focused knowledge bases instead of endless page collections.
|
||||
|
||||
[Read the full article →](articles/adaptive-crawling-revolution.md)
|
||||
|
||||
### [The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples](articles/llm-context-revolution.md)
|
||||
*January 24, 2025*
|
||||
|
||||
Ever wondered why your AI coding assistant struggles with your library despite comprehensive documentation? This article introduces the three-dimensional context protocol that transforms how AI understands code. Learn why memory, reasoning, and examples together create wisdom—not just information.
|
||||
|
||||
[Read the full article →](articles/llm-context-revolution.md)
|
||||
|
||||
## Latest Release
|
||||
|
||||
|
||||
### [Crawl4AI v0.7.0 – The Adaptive Intelligence Update](releases/0.7.0.md)
|
||||
*January 28, 2025*
|
||||
|
||||
Crawl4AI v0.7.0 introduces groundbreaking intelligence features that transform how crawlers understand and adapt to websites. This release brings Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, and the powerful Async URL Seeder for massive URL discovery.
|
||||
|
||||
Key highlights:
|
||||
- **Adaptive Crawling**: Crawlers that learn and adapt to website structures automatically
|
||||
- **Virtual Scroll Support**: Complete content extraction from modern infinite scroll pages
|
||||
- **Link Preview**: 3-layer scoring system for intelligent link prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with smart filtering
|
||||
- **Performance Boost**: Up to 3x faster with optimized resource handling
|
||||
|
||||
[Read full release notes →](releases/0.7.0.md)
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
## Previous Releases
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
|
||||
Crawl4AI v0.6.0 brought major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
@@ -29,8 +59,6 @@ Other key changes:
|
||||
|
||||
---
|
||||
|
||||
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
@@ -124,5 +152,4 @@ Curious about how Crawl4AI has evolved? Check out our [complete changelog](https
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
- Join our community discussions on GitHub
|
||||
144
docs/md_v2/blog/index.md.bak
Normal file
144
docs/md_v2/blog/index.md.bak
Normal file
@@ -0,0 +1,144 @@
|
||||
# Crawl4AI Blog
|
||||
|
||||
Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place.
|
||||
|
||||
## Featured Articles
|
||||
|
||||
### [When to Stop Crawling: The Art of Knowing "Enough"](articles/adaptive-crawling-revolution.md)
|
||||
*January 29, 2025*
|
||||
|
||||
Traditional crawlers are like tourists with unlimited time—they'll visit every street, every alley, every dead end. But what if your crawler could think like a researcher with a deadline? Discover how Adaptive Crawling revolutionizes web scraping by knowing when to stop. Learn about the three-layer intelligence system that evaluates coverage, consistency, and saturation to build focused knowledge bases instead of endless page collections.
|
||||
|
||||
[Read the full article →](articles/adaptive-crawling-revolution.md)
|
||||
|
||||
### [The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples](articles/llm-context-revolution.md)
|
||||
*January 24, 2025*
|
||||
|
||||
Ever wondered why your AI coding assistant struggles with your library despite comprehensive documentation? This article introduces the three-dimensional context protocol that transforms how AI understands code. Learn why memory, reasoning, and examples together create wisdom—not just information.
|
||||
|
||||
[Read the full article →](articles/llm-context-revolution.md)
|
||||
|
||||
## Latest Release
|
||||
|
||||
Here’s the blog index entry for **v0.6.0**, written to match the exact tone and structure of your previous entries:
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
Other key changes:
|
||||
|
||||
* Native support for `result.media["tables"]` to export DataFrames
|
||||
* Full network + console logs and MHTML snapshot per crawl
|
||||
* Browser pooling and pre-warming for faster cold starts
|
||||
* New streaming endpoints via MCP API and Playground
|
||||
* Robots.txt support, proxy rotation, and improved session handling
|
||||
* Deprecated old markdown names, legacy modules cleaned up
|
||||
* Massive repo cleanup: ~36K insertions, ~5K deletions across 121 files
|
||||
|
||||
[Read full release notes →](releases/0.6.0.md)
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to auto-update the actual file or just paste this into the markdown.
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
|
||||
**Major New Features:**
|
||||
|
||||
* **Deep Crawling:** Explore entire websites with configurable strategies (BFS, DFS, Best-First). Define custom filters and URL scoring for targeted crawls.
|
||||
* **Memory-Adaptive Dispatcher:** Handle large-scale crawls with ease! Our new dispatcher dynamically adjusts concurrency based on available memory and includes built-in rate limiting.
|
||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
|
||||
**Minor Updates & Improvements:**
|
||||
|
||||
* **LXML Scraping Mode:** Faster HTML parsing with `LXMLWebScrapingStrategy`.
|
||||
* **Proxy Rotation:** Added `ProxyRotationStrategy` with a `RoundRobinProxyStrategy` implementation.
|
||||
* **PDF Processing:** Extract text, images, and metadata from PDF files.
|
||||
* **URL Redirection Tracking:** Automatically follows and records redirects.
|
||||
* **Robots.txt Compliance:** Optionally respect website crawling rules.
|
||||
* **LLM-Powered Schema Generation:** Automatically create extraction schemas using an LLM.
|
||||
* **`LLMContentFilter`:** Generate high-quality, focused markdown using an LLM.
|
||||
* **Improved Error Handling & Stability:** Numerous bug fixes and performance enhancements.
|
||||
* **Enhanced Documentation:** Updated guides and examples.
|
||||
|
||||
**Breaking Changes & Migration:**
|
||||
|
||||
This release includes several breaking changes to improve the library's structure and consistency. Here's what you need to know:
|
||||
|
||||
* **`arun_many()` Behavior:** Now uses the `MemoryAdaptiveDispatcher` by default. The return type depends on the `stream` parameter in `CrawlerRunConfig`. Adjust code that relied on unbounded concurrency.
|
||||
* **`max_depth` Location:** Moved to `CrawlerRunConfig` and now controls *crawl depth*.
|
||||
* **Deep Crawling Imports:** Import `DeepCrawlStrategy` and related classes from `crawl4ai.deep_crawling`.
|
||||
* **`BrowserContext` API:** Updated; the old `get_context` method is deprecated.
|
||||
* **Optional Model Fields:** Many data model fields are now optional. Handle potential `None` values.
|
||||
* **`ScrapingMode` Enum:** Replaced with strategy pattern (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
|
||||
* **`content_filter` Parameter:** Removed from `CrawlerRunConfig`. Use extraction strategies or markdown generators with filters.
|
||||
* **Removed Functionality:** The synchronous `WebCrawler`, the old CLI, and docs management tools have been removed.
|
||||
* **Docker:** Significant changes to deployment. See the [Docker documentation](../deploy/docker/README.md).
|
||||
* **`ssl_certificate.json`:** This file has been removed.
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
|
||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||
|
||||
## License Change
|
||||
|
||||
Crawl4AI v0.5.0 updates the license to Apache 2.0 *with a required attribution clause*. This means you are free to use, modify, and distribute Crawl4AI (even commercially), but you *must* clearly attribute the project in any public use or distribution. See the updated `LICENSE` file for the full legal text and specific requirements.
|
||||
|
||||
**Get Started:**
|
||||
|
||||
* **Installation:** `pip install "crawl4ai[all]"` (or use the Docker image)
|
||||
* **Documentation:** [https://docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||
* **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
|
||||
I'm very excited to see what you build with Crawl4AI v0.5.0!
|
||||
|
||||
---
|
||||
|
||||
### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md)
|
||||
*December 12, 2024*
|
||||
|
||||
The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. Check out all the new features to make your crawling experience even smoother.
|
||||
|
||||
[Read full release notes →](releases/0.4.2.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md)
|
||||
*December 8, 2024*
|
||||
|
||||
This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered.
|
||||
|
||||
[Read full release notes →](releases/0.4.1.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md)
|
||||
*December 1, 2024*
|
||||
|
||||
Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage.
|
||||
|
||||
[Read full release notes →](releases/0.4.0.md)
|
||||
|
||||
## Project History
|
||||
|
||||
Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates.
|
||||
|
||||
## Stay Updated
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# 🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update
|
||||
|
||||
*January 28, 2025 • 10 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This release introduces fundamental improvements in how Crawl4AI handles modern web complexity through adaptive learning, intelligent content discovery, and advanced extraction capabilities.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
|
||||
**The Problem:** Websites change. Class names shift. IDs disappear. Your carefully crafted selectors break at 3 AM, and you wake up to empty datasets and angry stakeholders.
|
||||
|
||||
**My Solution:** I implemented an adaptive learning system that observes patterns, builds confidence scores, and adjusts extraction strategies on the fly. It's like having a junior developer who gets better at their job with every page they scrape.
|
||||
|
||||
### Technical Deep-Dive
|
||||
|
||||
The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Pattern success rates
|
||||
- Selector stability over time
|
||||
- Content structure variations
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **News Aggregation**: Maintain 95%+ extraction accuracy even as news sites update their templates
|
||||
- **E-commerce Monitoring**: Track product changes across hundreds of stores without constant maintenance
|
||||
- **Research Data Collection**: Build robust academic datasets that survive website redesigns
|
||||
- **Reduced Maintenance**: Cut selector update time by 80% for frequently-changing sites
|
||||
|
||||
## 🌊 Virtual Scroll: Complete Content Capture
|
||||
|
||||
**The Problem:** Modern web apps only render what's visible. Scroll down, new content appears, old content vanishes into the void. Traditional crawlers capture that first viewport and miss 90% of the content. It's like reading only the first page of every book.
|
||||
|
||||
**My Solution:** I built Virtual Scroll support that mimics human browsing behavior, capturing content as it loads and preserving it before the browser's garbage collector strikes.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
```python
|
||||
from crawl4ai import VirtualScrollConfig
|
||||
|
||||
# For social media feeds (Twitter/X style)
|
||||
twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://twitter.com/trending",
|
||||
config=CrawlerRunConfig(
|
||||
virtual_scroll_config=twitter_config,
|
||||
# Combine with other features
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"tweets": {
|
||||
"selector": "[data-testid='tweet']",
|
||||
"fields": {
|
||||
"text": {"selector": "[data-testid='tweetText']", "type": "text"},
|
||||
"likes": {"selector": "[data-testid='like']", "type": "text"}
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Captured {len(result.extracted_content['tweets'])} tweets")
|
||||
```
|
||||
|
||||
**Key Capabilities:**
|
||||
- **DOM Recycling Awareness**: Detects and handles virtual DOM element recycling
|
||||
- **Smart Scroll Physics**: Three modes - container height, page height, or fixed pixels
|
||||
- **Content Preservation**: Captures content before it's destroyed
|
||||
- **Intelligent Stopping**: Stops when no new content appears
|
||||
- **Memory Efficient**: Streams content instead of holding everything in memory
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Social Media Analysis**: Capture entire Twitter threads with hundreds of replies, not just top 10
|
||||
- **E-commerce Scraping**: Extract 500+ products from infinite scroll catalogs vs. 20-50 with traditional methods
|
||||
- **News Aggregation**: Get all articles from modern news sites, not just above-the-fold content
|
||||
- **Research Applications**: Complete data extraction from academic databases using virtual pagination
|
||||
|
||||
## 🔗 Link Preview: Intelligent Link Analysis and Scoring
|
||||
|
||||
**The Problem:** You crawl a page and get 200 links. Which ones matter? Which lead to the content you actually want? Traditional crawlers force you to follow everything or build complex filters.
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
- **Competitive Analysis**: Automatically identify important pages on competitor sites
|
||||
- **Content Discovery**: Build topic-focused crawlers that stay on track
|
||||
- **SEO Audits**: Identify and prioritize high-value internal linking opportunities
|
||||
|
||||
## 🎣 Async URL Seeder: Automated URL Discovery at Scale
|
||||
|
||||
**The Problem:** You want to crawl an entire domain but only have the homepage. Or worse, you want specific content types across thousands of pages. Manual URL discovery? That's a job for machines, not humans.
|
||||
|
||||
**My Solution:** I built Async URL Seeder—a turbocharged URL discovery engine that combines multiple sources with intelligent filtering and relevance scoring.
|
||||
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
    print("Discovering Python tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
- **Sitemap Mining**: Parses robots.txt and all linked sitemaps
|
||||
- **Common Crawl**: Queries the Common Crawl index for historical URLs
|
||||
- **Intelligent Crawling**: Follows links with smart depth control
|
||||
- **Pattern Analysis**: Learns URL structures and generates variations
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Migration Projects**: Discover 10,000+ URLs from legacy sites in under 60 seconds
|
||||
- **Market Research**: Map entire competitor ecosystems automatically
|
||||
- **Academic Research**: Build comprehensive datasets without manual URL collection
|
||||
- **SEO Audits**: Find every indexable page with content scoring
|
||||
- **Content Archival**: Ensure no content is left behind during site migrations
|
||||
|
||||
## ⚡ Performance Optimizations
|
||||
|
||||
This release includes significant performance improvements through optimized resource handling, better concurrency management, and reduced memory footprint.
|
||||
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
- **Startup Time**: 70% faster browser initialization
|
||||
- **Page Loading**: 40% reduction with smart resource blocking
|
||||
- **Extraction**: 3x faster with compiled CSS selectors
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
### Breaking Changes
|
||||
- `link_extractor` renamed to `link_preview` (better reflects functionality)
|
||||
- Minimum Python version now 3.9
|
||||
- `CrawlerConfig` split into `CrawlerRunConfig` and `BrowserConfig`
|
||||
|
||||
### Migration Guide
|
||||
```python
|
||||
# Old (v0.6.x)
|
||||
from crawl4ai import CrawlerConfig
|
||||
config = CrawlerConfig(timeout=30000)
|
||||
|
||||
# New (v0.7.0)
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig
|
||||
browser_config = BrowserConfig(timeout=30000)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
```
|
||||
|
||||
## 🤖 Coming Soon: Intelligent Web Automation
|
||||
|
||||
I'm currently working on bringing advanced automation capabilities to Crawl4AI. This includes:
|
||||
|
||||
- **Crawl Agents**: Autonomous crawlers that understand your goals and adapt their strategies
|
||||
- **Auto JS Generation**: Automatic JavaScript code generation for complex interactions
|
||||
- **Smart Form Handling**: Intelligent form detection and filling
|
||||
- **Context-Aware Actions**: Crawlers that understand page context and make decisions
|
||||
|
||||
These features are under active development and will revolutionize how we approach web automation. Stay tuned!
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.0
|
||||
```
|
||||
|
||||
Check out the [updated documentation](https://docs.crawl4ai.com).
|
||||
|
||||
Questions? Issues? I'm always listening:
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
|
||||
---
|
||||
|
||||
*P.S. If you're using Crawl4AI in production, I'd love to hear about it. Your use cases inspire the next features.*
|
||||
347
docs/md_v2/core/adaptive-crawling.md
Normal file
347
docs/md_v2/core/adaptive-crawling.md
Normal file
@@ -0,0 +1,347 @@
|
||||
# Adaptive Web Crawling
|
||||
|
||||
## Introduction
|
||||
|
||||
Traditional web crawlers follow predetermined patterns, crawling pages blindly without knowing when they've gathered enough information. **Adaptive Crawling** changes this paradigm by introducing intelligence into the crawling process.
|
||||
|
||||
Think of it like research: when you're looking for information, you don't read every book in the library. You stop when you've found sufficient information to answer your question. That's exactly what Adaptive Crawling does for web scraping.
|
||||
|
||||
## Key Concepts
|
||||
|
||||
### The Problem It Solves
|
||||
|
||||
When crawling websites for specific information, you face two challenges:
|
||||
1. **Under-crawling**: Stopping too early and missing crucial information
|
||||
2. **Over-crawling**: Wasting resources by crawling irrelevant pages
|
||||
|
||||
Adaptive Crawling solves both by using a three-layer scoring system that determines when you have "enough" information.
|
||||
|
||||
### How It Works
|
||||
|
||||
The AdaptiveCrawler uses three metrics to measure information sufficiency:
|
||||
|
||||
- **Coverage**: How well your collected pages cover the query terms
|
||||
- **Consistency**: Whether the information is coherent across pages
|
||||
- **Saturation**: Detecting when new pages aren't adding new information
|
||||
|
||||
When these metrics indicate sufficient information has been gathered, crawling stops automatically.
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Create an adaptive crawler (config is optional)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start crawling with a query
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="async context managers"
|
||||
)
|
||||
|
||||
# View statistics
|
||||
adaptive.print_stats()
|
||||
|
||||
# Get the most relevant content
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for page in relevant_pages:
|
||||
print(f"- {page['url']} (score: {page['score']:.2f})")
|
||||
```
|
||||
|
||||
### Configuration Options
|
||||
|
||||
```python
|
||||
from crawl4ai import AdaptiveConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||
top_k_links=5, # Links to follow per page (default: 3)
|
||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
```
|
||||
|
||||
## Crawling Strategies
|
||||
|
||||
Adaptive Crawling supports two distinct strategies for determining information sufficiency:
|
||||
|
||||
### Statistical Strategy (Default)
|
||||
|
||||
The statistical strategy uses pure information theory and term-based analysis:
|
||||
|
||||
- **Fast and efficient** - No API calls or model loading
|
||||
- **Term-based coverage** - Analyzes query term presence and distribution
|
||||
- **No external dependencies** - Works offline
|
||||
- **Best for**: Well-defined queries with specific terminology
|
||||
|
||||
```python
|
||||
# Default configuration uses statistical strategy
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # This is the default
|
||||
confidence_threshold=0.8
|
||||
)
|
||||
```
|
||||
|
||||
### Embedding Strategy
|
||||
|
||||
The embedding strategy uses semantic embeddings for deeper understanding:
|
||||
|
||||
- **Semantic understanding** - Captures meaning beyond exact term matches
|
||||
- **Query expansion** - Automatically generates query variations
|
||||
- **Gap-driven selection** - Identifies semantic gaps in knowledge
|
||||
- **Validation-based stopping** - Uses held-out queries to validate coverage
|
||||
- **Best for**: Complex queries, ambiguous topics, conceptual understanding
|
||||
|
||||
```python
|
||||
# Configure embedding strategy
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_model="sentence-transformers/all-MiniLM-L6-v2", # Default
|
||||
n_query_variations=10, # Generate 10 query variations
|
||||
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
||||
)
|
||||
|
||||
# With custom embedding provider (e.g., OpenAI)
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_llm_config={
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': 'your-api-key'
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Strategy Comparison
|
||||
|
||||
| Feature | Statistical | Embedding |
|
||||
|---------|------------|-----------|
|
||||
| **Speed** | Very fast | Moderate (API calls) |
|
||||
| **Cost** | Free | Depends on provider |
|
||||
| **Accuracy** | Good for exact terms | Excellent for concepts |
|
||||
| **Dependencies** | None | Embedding model/API |
|
||||
| **Query Understanding** | Literal | Semantic |
|
||||
| **Best Use Case** | Technical docs, specific terms | Research, broad topics |
|
||||
|
||||
### Embedding Strategy Configuration
|
||||
|
||||
The embedding strategy offers fine-tuned control through several parameters:
|
||||
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
|
||||
# Model configuration
|
||||
embedding_model="sentence-transformers/all-MiniLM-L6-v2",
|
||||
embedding_llm_config=None, # Use for API-based embeddings
|
||||
|
||||
# Query expansion
|
||||
n_query_variations=10, # Number of query variations to generate
|
||||
|
||||
# Coverage parameters
|
||||
embedding_coverage_radius=0.2, # Distance threshold for coverage
|
||||
embedding_k_exp=3.0, # Exponential decay factor (higher = stricter)
|
||||
|
||||
# Stopping criteria
|
||||
embedding_min_relative_improvement=0.1, # Min improvement to continue
|
||||
embedding_validation_min_score=0.3, # Min validation score
|
||||
embedding_min_confidence_threshold=0.1, # Below this = irrelevant
|
||||
|
||||
# Link selection
|
||||
embedding_overlap_threshold=0.85, # Similarity for deduplication
|
||||
|
||||
# Display confidence mapping
|
||||
embedding_quality_min_confidence=0.7, # Min displayed confidence
|
||||
embedding_quality_max_confidence=0.95 # Max displayed confidence
|
||||
)
|
||||
```
|
||||
|
||||
### Handling Irrelevant Queries
|
||||
|
||||
The embedding strategy can detect when a query is completely unrelated to the content:
|
||||
|
||||
```python
|
||||
# This will stop quickly with low confidence
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="how to cook pasta" # Irrelevant to Python docs
|
||||
)
|
||||
|
||||
# Check if query was irrelevant
|
||||
if result.metrics.get('is_irrelevant', False):
|
||||
print("Query is unrelated to the content!")
|
||||
```
|
||||
|
||||
## When to Use Adaptive Crawling
|
||||
|
||||
### Perfect For:
|
||||
- **Research Tasks**: Finding comprehensive information about a topic
|
||||
- **Question Answering**: Gathering sufficient context to answer specific queries
|
||||
- **Knowledge Base Building**: Creating focused datasets for AI/ML applications
|
||||
- **Competitive Intelligence**: Collecting complete information about specific products/features
|
||||
|
||||
### Not Recommended For:
|
||||
- **Full Site Archiving**: When you need every page regardless of content
|
||||
- **Structured Data Extraction**: When targeting specific, known page patterns
|
||||
- **Real-time Monitoring**: When you need continuous updates
|
||||
|
||||
## Understanding the Output
|
||||
|
||||
### Confidence Score
|
||||
|
||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||
- **0.3-0.6**: Partial information, may answer basic queries
|
||||
- **0.6-0.7**: Good coverage, can answer most queries
|
||||
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||
|
||||
### Statistics Display
|
||||
|
||||
```python
|
||||
adaptive.print_stats(detailed=False) # Summary table
|
||||
adaptive.print_stats(detailed=True) # Detailed metrics
|
||||
```
|
||||
|
||||
The summary shows:
|
||||
- Pages crawled vs. confidence achieved
|
||||
- Coverage, consistency, and saturation scores
|
||||
- Crawling efficiency metrics
|
||||
|
||||
## Persistence and Resumption
|
||||
|
||||
### Saving Progress
|
||||
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
save_state=True,
|
||||
state_path="my_crawl_state.json"
|
||||
)
|
||||
|
||||
# Crawl will auto-save progress
|
||||
result = await adaptive.digest(start_url, query)
|
||||
```
|
||||
|
||||
### Resuming a Crawl
|
||||
|
||||
```python
|
||||
# Resume from saved state
|
||||
result = await adaptive.digest(
|
||||
start_url,
|
||||
query,
|
||||
resume_from="my_crawl_state.json"
|
||||
)
|
||||
```
|
||||
|
||||
### Exporting Knowledge Base
|
||||
|
||||
```python
|
||||
# Export collected pages to JSONL
|
||||
adaptive.export_knowledge_base("knowledge_base.jsonl")
|
||||
|
||||
# Import into another session
|
||||
new_adaptive = AdaptiveCrawler(crawler)
|
||||
new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### 1. Query Formulation
|
||||
- Use specific, descriptive queries
|
||||
- Include key terms you expect to find
|
||||
- Avoid overly broad queries
|
||||
|
||||
### 2. Threshold Tuning
|
||||
- Start with default (0.7) for general use
|
||||
- Lower to 0.5-0.6 for exploratory crawling
|
||||
- Raise to 0.8+ for exhaustive coverage
|
||||
|
||||
### 3. Performance Optimization
|
||||
- Use appropriate `max_pages` limits
|
||||
- Adjust `top_k_links` based on site structure
|
||||
- Enable caching for repeat crawls
|
||||
|
||||
### 4. Link Selection
|
||||
- The crawler prioritizes links based on:
|
||||
- Relevance to query
|
||||
- Expected information gain
|
||||
- URL structure and depth
|
||||
|
||||
## Examples
|
||||
|
||||
### Research Assistant
|
||||
|
||||
```python
|
||||
# Gather information about a programming concept
|
||||
result = await adaptive.digest(
|
||||
start_url="https://realpython.com",
|
||||
query="python decorators implementation patterns"
|
||||
)
|
||||
|
||||
# Get the most relevant excerpts
|
||||
for doc in adaptive.get_relevant_content(top_k=3):
|
||||
print(f"\nFrom: {doc['url']}")
|
||||
print(f"Relevance: {doc['score']:.2%}")
|
||||
print(doc['content'][:500] + "...")
|
||||
```
|
||||
|
||||
### Knowledge Base Builder
|
||||
|
||||
```python
|
||||
# Build a focused knowledge base about machine learning
|
||||
queries = [
|
||||
"supervised learning algorithms",
|
||||
"neural network architectures",
|
||||
"model evaluation metrics"
|
||||
]
|
||||
|
||||
for query in queries:
|
||||
await adaptive.digest(
|
||||
start_url="https://scikit-learn.org/stable/",
|
||||
query=query
|
||||
)
|
||||
|
||||
# Export combined knowledge base
|
||||
adaptive.export_knowledge_base("ml_knowledge.jsonl")
|
||||
```
|
||||
|
||||
### API Documentation Crawler
|
||||
|
||||
```python
|
||||
# Intelligently crawl API documentation
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.85, # Higher threshold for completeness
|
||||
max_pages=30
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
result = await adaptive.digest(
|
||||
start_url="https://api.example.com/docs",
|
||||
query="authentication endpoints rate limits"
|
||||
)
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
- Learn about [Advanced Adaptive Strategies](../advanced/adaptive-strategies.md)
|
||||
- Explore the [AdaptiveCrawler API Reference](../api/adaptive-crawler.md)
|
||||
- See more [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/adaptive_crawling)
|
||||
|
||||
## FAQ
|
||||
|
||||
**Q: How is this different from traditional crawling?**
|
||||
A: Traditional crawling follows fixed patterns (BFS/DFS). Adaptive crawling makes intelligent decisions about which links to follow and when to stop based on information gain.
|
||||
|
||||
**Q: Can I use this with JavaScript-heavy sites?**
|
||||
A: Yes! AdaptiveCrawler inherits all capabilities from AsyncWebCrawler, including JavaScript execution.
|
||||
|
||||
**Q: How does it handle large websites?**
|
||||
A: The algorithm naturally limits crawling to relevant sections. Use `max_pages` as a safety limit.
|
||||
|
||||
**Q: Can I customize the scoring algorithms?**
|
||||
A: Advanced users can implement custom strategies. See [Adaptive Strategies](../advanced/adaptive-strategies.md).
|
||||
@@ -252,7 +252,7 @@ The `clone()` method:
|
||||
### Key fields to note
|
||||
|
||||
1. **`provider`**:
|
||||
- Which LLM provider to use.
|
||||
- Which LLM provider to use.
|
||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||
|
||||
2. **`api_token`**:
|
||||
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
|
||||
from crawl4ai import JsonCssExtractionStrategy
|
||||
|
||||
async def main():
|
||||
@@ -298,7 +298,7 @@ async def main():
|
||||
# 3) Example LLM content filtering
|
||||
|
||||
gemini_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro"
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token = "env:GEMINI_API_TOKEN"
|
||||
)
|
||||
|
||||
@@ -322,8 +322,9 @@ async def main():
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
|
||||
# 4) Crawler run config: skip cache, use extraction
|
||||
run_conf = CrawlerRunConfig(
|
||||
|
||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
||||
|
||||
Want to learn by doing? We've got you covered:
|
||||
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||
|
||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
||||
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
### Running the Tutorial Locally
|
||||
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
- [Configuration Reference](#configuration-reference)
|
||||
- [Best Practices & Tips](#best-practices--tips)
|
||||
|
||||
## Installation
|
||||
The Crawl4AI CLI will be installed automatically when you install the library.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
|
||||
|
||||
@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
Our latest release candidate is `0.6.0-r2`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||
|
||||
```bash
|
||||
# Pull the release candidate (recommended for latest features)
|
||||
docker pull unclecode/crawl4ai:0.6.0-r1
|
||||
# Pull the release candidate (for testing new features)
|
||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||
|
||||
# Or pull the latest stable version
|
||||
# Or pull the current stable version (0.6.0)
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r2`)
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
|
||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||
* `SUFFIX`: Optional tag for release candidates and revisions (e.g., `r1`)
|
||||
* **`latest` Tag:** Points to the most recent stable version
|
||||
|
||||
@@ -28,7 +28,11 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
|
||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
||||
The `LinkPreviewConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
link_preview_config = LinkPreviewConfig(
|
||||
# BASIC SETTINGS
|
||||
|
||||
@@ -8,11 +8,10 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def crawl_web():
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/apple",
|
||||
@@ -33,13 +32,12 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def crawl_local_file():
|
||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||
file_url = f"file://{local_file_path}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url, config=config)
|
||||
@@ -93,8 +91,7 @@ import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
wikipedia_url = "https://en.wikipedia.org/wiki/apple"
|
||||
@@ -104,7 +101,7 @@ async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Step 1: Crawl the Web URL
|
||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
||||
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||
|
||||
if not result.success:
|
||||
@@ -119,7 +116,7 @@ async def main():
|
||||
# Step 2: Crawl from the Local HTML File
|
||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||
file_url = f"file://{html_file_path.resolve()}"
|
||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
||||
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||
|
||||
if not local_result.success:
|
||||
@@ -135,7 +132,7 @@ async def main():
|
||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||
raw_html_content = f.read()
|
||||
raw_html_url = f"raw:{raw_html_content}"
|
||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
||||
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||
|
||||
if not raw_result.success:
|
||||
|
||||
@@ -200,7 +200,8 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
|
||||
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
|
||||
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
|
||||
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
|
||||
- **`use_stemming`** *(default `True`)*: Whether to apply stemming to the query and content.
|
||||
- **`language (str)`**: Language for stemming (default: 'english').
|
||||
|
||||
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
|
||||
|
||||
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -255,9 +256,12 @@ async def main():
|
||||
chunk_token_threshold=4096, # Adjust based on your needs
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
|
||||
@@ -272,7 +272,43 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
## 7. Multi-URL Concurrency (Preview)
|
||||
## 7. Adaptive Crawling (New!)
|
||||
|
||||
Crawl4AI now includes intelligent adaptive crawling that automatically determines when sufficient information has been gathered. Here's a quick example:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def adaptive_example():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start adaptive crawling
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="async context managers"
|
||||
)
|
||||
|
||||
# View results
|
||||
adaptive.print_stats()
|
||||
print(f"Crawled {len(result.crawled_urls)} pages")
|
||||
print(f"Achieved {adaptive.confidence:.0%} confidence")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(adaptive_example())
|
||||
```
|
||||
|
||||
**What's special about adaptive crawling?**
|
||||
- **Automatic stopping**: Stops when sufficient information is gathered
|
||||
- **Intelligent link selection**: Follows only relevant links
|
||||
- **Confidence scoring**: Know how complete your information is
|
||||
|
||||
[Learn more about Adaptive Crawling →](adaptive-crawling.md)
|
||||
|
||||
---
|
||||
|
||||
## 8. Multi-URL Concurrency (Preview)
|
||||
|
||||
If you need to crawl multiple URLs in **parallel**, you can use `arun_many()`. By default, Crawl4AI employs a **MemoryAdaptiveDispatcher**, automatically adjusting concurrency based on system resources. Here’s a quick glimpse:
|
||||
|
||||
|
||||
@@ -31,9 +31,16 @@ if __name__ == "__main__":
|
||||
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.6),
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig(fit_markdown=True)
|
||||
config=config
|
||||
)
|
||||
|
||||
# Different content formats
|
||||
|
||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
||||
word_count_threshold=300 # Only substantial articles
|
||||
)
|
||||
|
||||
# Extract URLs and stream results as they come
|
||||
# Extract URLs and crawl them
|
||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||
|
||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
||||
|
||||
```python
|
||||
# Use both sources
|
||||
config = SeedingConfig(source="cc+sitemap")
|
||||
config = SeedingConfig(source="sitemap+cc")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
```
|
||||
|
||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
||||
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||
| `live_check` | bool | False | Verify URLs are accessible |
|
||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
||||
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||
| `verbose` | bool | False | Show detailed progress |
|
||||
| `query` | str | None | Search query for BM25 scoring |
|
||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
||||
```python
|
||||
# Find specific products
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Use both sources
|
||||
source="sitemap+cc", # Use both sources
|
||||
extract_head=True,
|
||||
query="wireless headphones noise canceling",
|
||||
scoring_method="bm25",
|
||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
||||
|
||||
# Step 1: Discover relevant URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Maximum coverage
|
||||
source="sitemap+cc", # Maximum coverage
|
||||
extract_head=True, # Get metadata
|
||||
query=topic, # Research topic
|
||||
scoring_method="bm25", # Smart scoring
|
||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
||||
# Extract URLs and crawl all articles
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
results = []
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||
async for result in crawl_results:
|
||||
if result.success:
|
||||
results.append({
|
||||
'url': result.url,
|
||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
||||
# When crawling many URLs
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Assuming urls is a list of URL strings
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
crawl_results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
# Process as they arrive
|
||||
async for result in results:
|
||||
async for result in crawl_results:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
||||
|
||||
# E-commerce product discovery
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
source="sitemap+cc",
|
||||
pattern="*/product/*",
|
||||
extract_head=True,
|
||||
live_check=True
|
||||
|
||||
@@ -218,7 +218,7 @@ import json
|
||||
import asyncio
|
||||
from typing import List
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
|
||||
class Entity(BaseModel):
|
||||
@@ -238,8 +238,8 @@ class KnowledgeGraph(BaseModel):
|
||||
async def main():
|
||||
# LLM extraction strategy
|
||||
llm_strat = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=KnowledgeGraph.schema_json(),
|
||||
llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract entities and relationships from the content. Return valid JSON.",
|
||||
chunk_token_threshold=1400,
|
||||
@@ -258,6 +258,10 @@ async def main():
|
||||
url = "https://www.nbcnews.com/business"
|
||||
result = await crawler.arun(url=url, config=crawl_config)
|
||||
|
||||
print("--- LLM RAW RESPONSE ---")
|
||||
print(result.extracted_content)
|
||||
print("--- END LLM RAW RESPONSE ---")
|
||||
|
||||
if result.success:
|
||||
with open("kb_result.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
|
||||
@@ -41,6 +41,17 @@
|
||||
alt="License"/>
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://x.com/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
|
||||
</a>
|
||||
<a href="https://www.linkedin.com/company/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
|
||||
</a>
|
||||
<a href="https://discord.gg/jP8KfhDhyN">
|
||||
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
|
||||
@@ -48,6 +59,12 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
||||
|
||||
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
|
||||
|
||||
## 🎯 New: Adaptive Web Crawling
|
||||
|
||||
Crawl4AI now features intelligent adaptive crawling that knows when to stop! Using advanced information foraging algorithms, it determines when sufficient information has been gathered to answer your query.
|
||||
|
||||
[Learn more about Adaptive Crawling →](core/adaptive-crawling.md)
|
||||
|
||||
|
||||
## Quick Start
|
||||
|
||||
|
||||
1584
docs/releases_review/crawl4ai_v0_7_0_showcase.py
Normal file
1584
docs/releases_review/crawl4ai_v0_7_0_showcase.py
Normal file
File diff suppressed because it is too large
Load Diff
408
docs/releases_review/demo_v0.7.0.py
Normal file
408
docs/releases_review/demo_v0.7.0.py
Normal file
@@ -0,0 +1,408 @@
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.0 Release Demo
|
||||
================================
|
||||
This demo showcases all major features introduced in v0.7.0 release.
|
||||
|
||||
Major Features:
|
||||
1. ✅ Adaptive Crawling - Intelligent crawling with confidence tracking
|
||||
2. ✅ Virtual Scroll Support - Handle infinite scroll pages
|
||||
3. ✅ Link Preview - Advanced link analysis with 3-layer scoring
|
||||
4. ✅ URL Seeder - Smart URL discovery and filtering
|
||||
5. ✅ C4A Script - Domain-specific language for web automation
|
||||
6. ✅ Chrome Extension Updates - Click2Crawl and instant schema extraction
|
||||
7. ✅ PDF Parsing Support - Extract content from PDF documents
|
||||
8. ✅ Nightly Builds - Automated nightly releases
|
||||
|
||||
Run this demo to see all features in action!
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import List, Dict
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich import box
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
CompilationResult
|
||||
)
|
||||
from crawl4ai.async_configs import VirtualScrollConfig, LinkPreviewConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
console = Console()
|
||||
|
||||
def print_section(title: str, description: str = ""):
    """Print a visually separated section header to the console.

    Args:
        title: Section title, rendered in bold yellow.
        description: Optional subtitle, rendered dim beneath the title.
    """
    rule = f"[bold cyan]{'=' * 60}[/bold cyan]"
    console.print(f"\n{rule}")
    console.print(f"[bold yellow]{title}[/bold yellow]")
    if description:
        console.print(f"[dim]{description}[/dim]")
    console.print(f"{rule}\n")
|
||||
|
||||
|
||||
async def demo_1_adaptive_crawling():
    """Demo 1: Adaptive Crawling - Intelligent content extraction.

    Runs an adaptive crawl that tracks confidence and stops once enough
    information has been gathered for the query.
    """
    print_section(
        "Demo 1: Adaptive Crawling",
        "Intelligently learns and adapts to website patterns"
    )

    # Tune the adaptive strategy: statistical scoring, stop at 70% confidence,
    # follow at most 3 links per page, give up below 10% information gain.
    adaptive_cfg = AdaptiveConfig(
        strategy="statistical",  # or "embedding"
        confidence_threshold=0.7,
        max_pages=10,
        top_k_links=3,
        min_gain_threshold=0.1
    )

    console.print("[cyan]Learning from product page patterns...[/cyan]")

    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler, adaptive_cfg)

        console.print("[cyan]Starting adaptive crawl...[/cyan]")
        # digest() keeps crawling until the confidence threshold or the
        # page budget is reached.
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/",
            query="python decorators tutorial"
        )

        console.print("[green]✓ Adaptive crawl completed[/green]")
        for line in (
            f" - Confidence Level: {adaptive.confidence:.0%}",
            f" - Pages Crawled: {len(result.crawled_urls)}",
            f" - Knowledge Base: {len(adaptive.state.knowledge_base)} documents",
        ):
            console.print(line)

        # Show the highest-relevance pages from the accumulated knowledge base.
        top_pages = adaptive.get_relevant_content(top_k=3)
        if top_pages:
            console.print("\nMost relevant pages:")
            for rank, page in enumerate(top_pages, 1):
                console.print(f" {rank}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
|
||||
async def demo_2_virtual_scroll():
    """Demo 2: Virtual Scroll - Handle infinite scroll pages.

    Configures a virtual-scroll session so content revealed while
    scrolling is captured in the final HTML.
    """
    print_section(
        "Demo 2: Virtual Scroll Support",
        "Capture content from modern infinite scroll pages"
    )

    # Scroll the <body> a few times; example.com has no dedicated feed container.
    scroll_config = VirtualScrollConfig(
        container_selector="body",  # Using body since example.com has simple structure
        scroll_count=3,  # Just 3 scrolls for demo
        scroll_by="container_height",  # or "page_height" or pixel value
        wait_after_scroll=0.5  # Wait 500ms after each scroll
    )

    run_cfg = CrawlerRunConfig(
        virtual_scroll_config=scroll_config,
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle"
    )

    console.print("[cyan]Virtual Scroll Configuration:[/cyan]")
    for label, value in (
        ("Container", scroll_config.container_selector),
        ("Scroll count", scroll_config.scroll_count),
        ("Scroll by", scroll_config.scroll_by),
    ):
        console.print(f" - {label}: {value}")
    console.print(f" - Wait after scroll: {scroll_config.wait_after_scroll}s")

    console.print("\n[dim]Note: Using example.com for demo - in production, use this[/dim]")
    console.print("[dim]with actual infinite scroll pages like social media feeds.[/dim]\n")

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=run_cfg
        )

        if result.success:
            console.print("[green]✓ Virtual scroll executed successfully![/green]")
            console.print(f" - Content length: {len(result.markdown)} chars")

        # Show how the same config maps onto real infinite-scroll sites.
        console.print("\n[yellow]Example for real infinite scroll sites:[/yellow]")
        console.print("""
# For Twitter-like feeds:
scroll_config = VirtualScrollConfig(
    container_selector="[data-testid='primaryColumn']",
    scroll_count=20,
    scroll_by="container_height",
    wait_after_scroll=1.0
)

# For Instagram-like grids:
scroll_config = VirtualScrollConfig(
    container_selector="main article",
    scroll_count=15,
    scroll_by=1000,  # Fixed pixel amount
    wait_after_scroll=1.5
)""")
|
||||
|
||||
|
||||
async def demo_3_link_preview():
    """Demo 3: Link Preview with 3-layer scoring.

    Crawls a page, scores its internal links (intrinsic quality,
    contextual relevance to a query, and a combined total), and renders
    the best ones in a table.
    """
    print_section(
        "Demo 3: Link Preview & Scoring",
        "Advanced link analysis with intrinsic, contextual, and total scoring"
    )

    # Contextual scoring is driven by the query; the threshold filters weak links.
    link_config = LinkPreviewConfig(
        include_internal=True,
        include_external=False,
        max_links=10,
        concurrency=5,
        query="python tutorial",  # For contextual scoring
        score_threshold=0.3,
        verbose=True
    )

    run_cfg = CrawlerRunConfig(
        link_preview_config=link_config,
        score_links=True,  # Enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )

    console.print("[cyan]Analyzing links with 3-layer scoring system...[/cyan]")

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://docs.python.org/3/", config=run_cfg)

        if result.success and result.links:
            # Keep only links that received a combined score, best first.
            internal_links = result.links.get("internal", [])
            scored_links = sorted(
                (link for link in internal_links if link.get("total_score")),
                key=lambda link: link.get("total_score", 0),
                reverse=True,
            )

            table = Table(title="Link Scoring Results", box=box.ROUNDED)
            table.add_column("Link Text", style="cyan", width=40)
            table.add_column("Intrinsic Score", justify="center")
            table.add_column("Contextual Score", justify="center")
            table.add_column("Total Score", justify="center", style="bold green")

            for link in scored_links[:5]:
                table.add_row(
                    link.get('text', 'No text')[:40],
                    f"{link.get('intrinsic_score', 0):.1f}/10",
                    f"{link.get('contextual_score', 0):.2f}/1",
                    f"{link.get('total_score', 0):.3f}"
                )

            console.print(table)
|
||||
|
||||
|
||||
async def demo_4_url_seeder():
    """Demo 4: URL Seeder - Smart URL discovery.

    Discovers candidate URLs (sitemap + Common Crawl), filters them by a
    pattern, and ranks them with BM25 relevance scoring before any page
    is actually crawled.
    """
    print_section(
        "Demo 4: URL Seeder",
        "Intelligent URL discovery and filtering"
    )

    # Configure seeding.
    # NOTE: "sitemap+cc" is the canonical combined-source value elsewhere in
    # the docs; "cc" and "sitemap" are also accepted individually.
    seeding_config = SeedingConfig(
        source="sitemap+cc",  # combine sitemap and Common Crawl
        pattern="*tutorial*",  # URL pattern filter
        max_urls=50,
        extract_head=True,  # Get metadata
        query="python programming",  # For relevance scoring
        scoring_method="bm25",
        score_threshold=0.2,
        force=True  # bypass the cache and fetch fresh data
    )

    console.print("[cyan]URL Seeder Configuration:[/cyan]")
    console.print(f" - Source: {seeding_config.source}")
    console.print(f" - Pattern: {seeding_config.pattern}")
    console.print(f" - Max URLs: {seeding_config.max_urls}")
    console.print(f" - Query: {seeding_config.query}")
    console.print(f" - Scoring: {seeding_config.scoring_method}")

    # Use URL seeder to discover (not crawl) URLs for the domain.
    async with AsyncUrlSeeder() as seeder:
        console.print("\n[cyan]Discovering URLs from Python docs...[/cyan]")
        urls = await seeder.urls("docs.python.org", seeding_config)

        console.print(f"\n[green]✓ Discovered {len(urls)} URLs[/green]")
        for i, url_info in enumerate(urls[:5], 1):
            console.print(f" {i}. {url_info['url']}")
            if url_info.get('relevance_score'):
                console.print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
|
||||
|
||||
async def demo_5_c4a_script():
    """Demo 5: C4A Script - Domain-specific language.

    Compiles a small C4A automation script to JavaScript and reports
    either the generated statements or the first compile error.
    """
    print_section(
        "Demo 5: C4A Script Language",
        "Domain-specific language for web automation"
    )

    # A minimal script: dismiss a cookie banner, run a search, wait for results.
    c4a_script = """
# Simple C4A script example
WAIT `body` 3
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
CLICK `.search-button`
TYPE "python tutorial"
PRESS Enter
WAIT `.results` 5
"""

    console.print("[cyan]C4A Script Example:[/cyan]")
    console.print(Panel(c4a_script, title="script.c4a", border_style="blue"))

    # Compile to JavaScript; failures carry structured error info.
    compiled = c4a_compile(c4a_script)

    if not compiled.success:
        console.print("[red]✗ Script compilation failed[/red]")
        if compiled.first_error:
            error = compiled.first_error
            console.print(f" Error at line {error.line}: {error.message}")
        return

    console.print("[green]✓ Script compiled successfully![/green]")
    console.print(f" - Generated {len(compiled.js_code)} JavaScript statements")
    console.print("\nFirst 3 JS statements:")
    for stmt in compiled.js_code[:3]:
        console.print(f" • {stmt}")
|
||||
|
||||
|
||||
async def demo_6_css_extraction():
    """Demo 6: Enhanced CSS/JSON extraction.

    Declares a JSON-CSS schema (page title + all paragraph texts) and
    extracts structured data from a crawled page.
    """
    print_section(
        "Demo 6: Enhanced Extraction",
        "Improved CSS selector and JSON extraction"
    )

    # Schema: one title field plus a list of paragraph texts.
    schema = {
        "name": "Example Page Data",
        "baseSelector": "body",
        "fields": [
            {
                "name": "title",
                "selector": "h1",
                "type": "text"
            },
            {
                "name": "paragraphs",
                "selector": "p",
                "type": "list",
                "fields": [
                    {"name": "text", "type": "text"}
                ]
            }
        ]
    }

    extraction_strategy = JsonCssExtractionStrategy(schema)

    console.print("[cyan]Extraction Schema:[/cyan]")
    console.print(json.dumps(schema, indent=2))

    run_cfg = CrawlerRunConfig(
        extraction_strategy=extraction_strategy,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://example.com",
            config=run_cfg
        )

        if result.success and result.extracted_content:
            console.print("\n[green]✓ Content extracted successfully![/green]")
            # Pretty-print a short preview of the extracted JSON.
            preview = json.dumps(json.loads(result.extracted_content), indent=2)
            console.print(f"Extracted: {preview[:200]}...")
|
||||
|
||||
|
||||
async def demo_7_performance_improvements():
    """Demo 7: Performance improvements.

    Crawls with a speed-oriented configuration (caching on, early
    DOM-ready wait, trimmed link/image sets) and reports elapsed time.
    """
    print_section(
        "Demo 7: Performance Improvements",
        "Faster crawling with better resource management"
    )

    # Performance-optimized configuration
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,  # Use caching
        wait_until="domcontentloaded",  # Faster than networkidle
        page_timeout=10000,  # 10 second timeout
        exclude_external_links=True,
        exclude_social_media_links=True,
        exclude_external_images=True
    )

    console.print("[cyan]Performance Configuration:[/cyan]")
    console.print(" - Cache: ENABLED")
    console.print(" - Wait: domcontentloaded (faster)")
    console.print(" - Timeout: 10s")
    console.print(" - Excluding: external links, images, social media")

    # Measure performance.
    # perf_counter() is the monotonic clock intended for interval timing;
    # time.time() can jump if the wall clock is adjusted mid-measurement.
    import time
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

    elapsed = time.perf_counter() - start_time

    if result.success:
        console.print(f"\n[green]✓ Page crawled in {elapsed:.2f} seconds[/green]")
|
||||
|
||||
|
||||
async def main():
    """Run every v0.7.0 demo in sequence, pausing between them.

    A failing demo is reported and skipped so the remaining demos still run.
    """
    console.print(Panel(
        "[bold cyan]Crawl4AI v0.7.0 Release Demo[/bold cyan]\n\n"
        "This demo showcases all major features introduced in v0.7.0.\n"
        "Each demo is self-contained and demonstrates a specific feature.",
        title="Welcome",
        border_style="blue"
    ))

    demos = (
        demo_1_adaptive_crawling,
        demo_2_virtual_scroll,
        demo_3_link_preview,
        demo_4_url_seeder,
        demo_5_c4a_script,
        demo_6_css_extraction,
        demo_7_performance_improvements,
    )

    for index, demo in enumerate(demos, 1):
        try:
            await demo()
            # Pause between demos so the output can be read.
            if index < len(demos):
                console.print("\n[dim]Press Enter to continue to next demo...[/dim]")
                input()
        except Exception as e:
            # Keep going: one broken demo should not abort the showcase.
            console.print(f"[red]Error in demo: {e}[/red]")
            continue

    console.print(Panel(
        "[bold green]Demo Complete![/bold green]\n\n"
        "Thank you for trying Crawl4AI v0.7.0!\n"
        "For more examples and documentation, visit:\n"
        "https://github.com/unclecode/crawl4ai",
        title="Complete",
        border_style="green"
    ))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the full demo suite under a fresh event loop.
    asyncio.run(main())
|
||||
280
docs/releases_review/v0_7_0_features_demo.py
Normal file
280
docs/releases_review/v0_7_0_features_demo.py
Normal file
@@ -0,0 +1,280 @@
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.0 Feature Demo
|
||||
================================
|
||||
This file demonstrates the major features introduced in v0.7.0 with practical examples.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
# New imports for v0.7.0
|
||||
VirtualScrollConfig,
|
||||
LinkPreviewConfig,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
)
|
||||
|
||||
|
||||
async def demo_link_preview():
    """
    Demo 1: Link Preview with 3-Layer Scoring

    Crawls a page, scores its internal links (intrinsic quality,
    contextual relevance to a query, and a combined total), and prints
    the top results with any extracted head metadata.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🔗 DEMO 1: Link Preview & Intelligent Scoring")
    print(banner)

    # The query drives contextual scoring; score_links adds intrinsic scores.
    run_cfg = CrawlerRunConfig(
        link_preview_config=LinkPreviewConfig(
            include_internal=True,
            include_external=False,
            max_links=10,
            concurrency=5,
            query="machine learning tutorials",  # For contextual scoring
            score_threshold=0.3,  # Minimum relevance
            verbose=True
        ),
        score_links=True,  # Enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://scikit-learn.org/stable/", config=run_cfg)

        if result.success:
            # Rank links that received a combined score, best first.
            internal_links = result.links.get("internal", [])
            ranked = sorted(
                (link for link in internal_links if link.get("total_score")),
                key=lambda link: link.get("total_score", 0),
                reverse=True,
            )

            print("\nTop 5 Most Relevant Links:")
            for rank, link in enumerate(ranked[:5], 1):
                print(f"\n{rank}. {link.get('text', 'No text')[:50]}...")
                print(f" URL: {link['href']}")
                print(f" Intrinsic Score: {link.get('intrinsic_score', 0):.2f}/10")
                print(f" Contextual Score: {link.get('contextual_score', 0):.3f}")
                print(f" Total Score: {link.get('total_score', 0):.3f}")

                # Surface page metadata when extract_head supplied it.
                if link.get('head_data'):
                    title = link['head_data'].get('title', 'No title')
                    print(f" Title: {title[:60]}...")
|
||||
|
||||
|
||||
async def demo_adaptive_crawling():
    """
    Demo 2: Adaptive Crawling

    Runs an adaptive crawl that stops on its own once the configured
    confidence threshold is reached, then reports what it learned.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("🎯 DEMO 2: Adaptive Crawling with Confidence Tracking")
    print(banner)

    # Statistical strategy; stop early at 70% confidence or when new pages
    # add less than 5% information gain.
    adaptive_cfg = AdaptiveConfig(
        strategy="statistical",  # or "embedding" for semantic understanding
        max_pages=10,
        confidence_threshold=0.7,  # Stop at 70% confidence
        top_k_links=3,  # Follow top 3 links per page
        min_gain_threshold=0.05  # Need 5% information gain to continue
    )

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler, adaptive_cfg)

        print("Starting adaptive crawl about Python decorators...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/glossary.html",
            query="python decorators functions wrapping"
        )

        print("\n✅ Crawling Complete!")
        print(f"• Confidence Level: {adaptive.confidence:.0%}")
        print(f"• Pages Crawled: {len(result.crawled_urls)}")
        print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")

        # Show the highest-relevance pages from the knowledge base.
        top_pages = adaptive.get_relevant_content(top_k=3)
        print("\nMost Relevant Pages:")
        for rank, page in enumerate(top_pages, 1):
            print(f"{rank}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
|
||||
async def demo_virtual_scroll():
    """
    Demo 3: Virtual Scroll for Modern Web Pages

    Shows how to capture content from pages with DOM recycling
    (Twitter, Instagram, infinite scroll).
    """
    # Hoisted from mid-function in the original: keep imports at the top.
    import re

    print("\n" + "=" * 60)
    print("📜 DEMO 3: Virtual Scroll Support")
    print("=" * 60)

    # Configure virtual scroll for a news site
    virtual_config = VirtualScrollConfig(
        container_selector="main, article, .content",  # Common containers
        scroll_count=20,               # Scroll up to 20 times
        scroll_by="container_height",  # Scroll by container height
        wait_after_scroll=0.5          # Wait 500ms after each scroll
    )

    config = CrawlerRunConfig(
        virtual_scroll_config=virtual_config,
        cache_mode=CacheMode.BYPASS,
        wait_for="css:article"  # Wait for articles to load
    )

    # Example with a real news site
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://news.ycombinator.com/",
            config=config
        )

        if result.success:
            # Count items captured: each HN story row carries class="athing".
            items = len(re.findall(r'class="athing"', result.html))
            print(f"\n✅ Captured {items} news items")
            print(f"• HTML size: {len(result.html):,} bytes")
            print("• Without virtual scroll, would only capture ~30 items")
        else:
            # Fix: the original silently ignored failed crawls; surface them.
            # NOTE(review): assumes CrawlResult exposes error_message — confirm.
            print(f"\n❌ Crawl failed: {result.error_message}")
async def demo_url_seeder():
    """
    Demo 4: URL Seeder for Intelligent Discovery

    Shows how to discover and filter URLs before crawling,
    with relevance scoring.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("🌱 DEMO 4: URL Seeder - Smart URL Discovery")
    print(divider)

    async with AsyncUrlSeeder() as seeder:
        # Sitemap-driven discovery, filtered by URL pattern and ranked
        # with BM25 relevance against the query.
        seed_cfg = SeedingConfig(
            source="sitemap",         # Use sitemap
            pattern="*python*",       # URL pattern filter
            extract_head=True,        # Get metadata
            query="python tutorial",  # For relevance scoring
            scoring_method="bm25",
            score_threshold=0.2,
            max_urls=10,
        )

        print("Discovering Python async tutorial URLs...")
        discovered = await seeder.urls("https://www.geeksforgeeks.org/", seed_cfg)

        print(f"\n✅ Found {len(discovered)} relevant URLs:")
        for rank, entry in enumerate(discovered[:5], 1):
            print(f"\n{rank}. {entry['url']}")
            score = entry.get('relevance_score')
            if score:
                print(f"   Relevance: {score:.3f}")
            title = entry.get('head_data', {}).get('title')
            if title:
                print(f"   Title: {title[:60]}...")
async def demo_c4a_script():
    """
    Demo 5: C4A Script Language

    Shows the domain-specific language for web automation
    with JavaScript transpilation: compiles a C4A script, prints the
    first transpiled JS statements, and builds a CrawlerRunConfig that
    carries the script.
    """
    print("\n" + "="*60)
    print("🎭 DEMO 5: C4A Script - Web Automation Language")
    print("="*60)

    # Example C4A script. The literal below IS the program being compiled;
    # its exact text (backtick selectors, keywords) is behavior.
    c4a_script = """
    # E-commerce automation script
    WAIT `body` 3

    # Handle cookie banner
    IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`

    # Search for product
    CLICK `.search-box`
    TYPE "wireless headphones"
    PRESS Enter

    # Wait for results
    WAIT `.product-grid` 10

    # Load more products
    REPEAT (SCROLL DOWN 500, `document.querySelectorAll('.product').length < 50`)

    # Apply filter
    IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
    """

    # Compile the script to JavaScript statements.
    print("Compiling C4A script...")
    result = c4a_compile(c4a_script)

    if result.success:
        print(f"✅ Successfully compiled to {len(result.js_code)} JavaScript statements!")
        print("\nFirst 3 JS statements:")
        for stmt in result.js_code[:3]:
            print(f"   • {stmt}")

        # Use with crawler: the raw C4A source can be passed directly and
        # is transpiled by the crawler itself.
        # NOTE(review): `config` is built but never passed to a crawler in
        # this demo — it only illustrates the wiring.
        config = CrawlerRunConfig(
            c4a_script=c4a_script,  # Pass C4A script directly
            cache_mode=CacheMode.BYPASS
        )

        print("\n✅ Script ready for use with AsyncWebCrawler!")
    else:
        # Compilation failures report the first error's message.
        print(f"❌ Compilation error: {result.first_error.message}")
async def main():
    """Run all demos"""
    print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
    print("=" * 60)

    demos = [
        ("Link Preview & Scoring", demo_link_preview),
        ("Adaptive Crawling", demo_adaptive_crawling),
        ("Virtual Scroll", demo_virtual_scroll),
        ("URL Seeder", demo_url_seeder),
        ("C4A Script", demo_c4a_script),
    ]

    # Run each demo in sequence; a failure in one must not stop the rest.
    for title, runner in demos:
        try:
            await runner()
        except Exception as exc:
            print(f"\n❌ Error in {title} demo: {exc}")

        # Pause between demos
        await asyncio.sleep(1)

    closing = "=" * 60
    print("\n" + closing)
    print("✅ All demos completed!")
    print("\nKey Takeaways:")
    for takeaway in (
        "• Link Preview: 3-layer scoring for intelligent link analysis",
        "• Adaptive Crawling: Stop when you have enough information",
        "• Virtual Scroll: Capture all content from modern web pages",
        "• URL Seeder: Pre-discover and filter URLs efficiently",
        "• C4A Script: Simple language for complex automations",
    ):
        print(takeaway)
if __name__ == "__main__":
    # Entry point: run every feature demo sequentially.
    asyncio.run(main())
@@ -1,4 +1,4 @@
|
||||
site_name: Crawl4AI Documentation (v0.6.x)
|
||||
site_name: Crawl4AI Documentation (v0.7.x)
|
||||
site_favicon: docs/md_v2/favicon.ico
|
||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
@@ -25,6 +25,8 @@ nav:
|
||||
- "Command Line Interface": "core/cli.md"
|
||||
- "Simple Crawling": "core/simple-crawling.md"
|
||||
- "Deep Crawling": "core/deep-crawling.md"
|
||||
- "Adaptive Crawling": "core/adaptive-crawling.md"
|
||||
- "URL Seeding": "core/url-seeding.md"
|
||||
- "C4A-Script": "core/c4a-script.md"
|
||||
- "Crawler Result": "core/crawler-result.md"
|
||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||
@@ -37,6 +39,7 @@ nav:
|
||||
- "Link & Media": "core/link-media.md"
|
||||
- Advanced:
|
||||
- "Overview": "advanced/advanced-features.md"
|
||||
- "Adaptive Strategies": "advanced/adaptive-strategies.md"
|
||||
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
||||
- "File Downloading": "advanced/file-downloading.md"
|
||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||
@@ -48,6 +51,7 @@ nav:
|
||||
- "Identity Based Crawling": "advanced/identity-based-crawling.md"
|
||||
- "SSL Certificate": "advanced/ssl-certificate.md"
|
||||
- "Network & Console Capture": "advanced/network-console-capture.md"
|
||||
- "PDF Parsing": "advanced/pdf-parsing.md"
|
||||
- Extraction:
|
||||
- "LLM-Free Strategies": "extraction/no-llm-strategies.md"
|
||||
- "LLM Strategies": "extraction/llm-strategies.md"
|
||||
|
||||
@@ -17,7 +17,7 @@ dependencies = [
|
||||
"lxml~=5.3",
|
||||
"litellm>=1.53.1",
|
||||
"numpy>=1.26.0,<3",
|
||||
"pillow~=10.4",
|
||||
"pillow>=10.4",
|
||||
"playwright>=1.49.0",
|
||||
"python-dotenv~=1.0",
|
||||
"requests~=2.26",
|
||||
@@ -32,7 +32,6 @@ dependencies = [
|
||||
"psutil>=6.1.1",
|
||||
"nltk>=3.9.1",
|
||||
"playwright",
|
||||
"aiofiles",
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx>=0.27.2",
|
||||
@@ -44,7 +43,9 @@ dependencies = [
|
||||
"aiohttp>=3.11.11",
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
"lark>=1.2.2"
|
||||
"lark>=1.2.2",
|
||||
"alphashape>=1.3.1",
|
||||
"shapely>=2.0.0"
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
@@ -60,8 +61,8 @@ classifiers = [
|
||||
[project.optional-dependencies]
|
||||
pdf = ["PyPDF2"]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"PyPDF2",
|
||||
@@ -70,8 +71,8 @@ all = [
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium",
|
||||
"PyPDF2"
|
||||
"sentence-transformers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -4,7 +4,7 @@ aiosqlite~=0.20
|
||||
lxml~=5.3
|
||||
litellm>=1.53.1
|
||||
numpy>=1.26.0,<3
|
||||
pillow~=10.4
|
||||
pillow>=10.4
|
||||
playwright>=1.49.0
|
||||
python-dotenv~=1.0
|
||||
requests~=2.26
|
||||
@@ -24,3 +24,9 @@ cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
fake-useragent>=2.2.0
|
||||
pdf2image>=1.17.0
|
||||
PyPDF2>=3.0.1
|
||||
98
tests/adaptive/compare_performance.py
Normal file
98
tests/adaptive/compare_performance.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
|
||||
Compare performance before and after optimizations
|
||||
"""
|
||||
|
||||
def read_baseline(path='performance_baseline.txt'):
    """Read baseline performance metrics from a saved report file.

    Args:
        path: Report file to parse. Defaults to the original hard-coded
            ``performance_baseline.txt`` for backward compatibility.

    Returns:
        dict mapping metric names (``total_time``, ``memory_mb``,
        ``validate_coverage_ms``, ``select_links_ms``,
        ``calculate_confidence_ms``) to floats. Metrics missing from
        the report are simply absent from the dict.
    """
    with open(path, 'r') as f:
        content = f.read()

    metrics = {}
    lines = content.split('\n')

    def _avg_ms(i):
        # Per-operation sections put "Avg Time: <x> ms" two lines below
        # the operation header. Fix: the original guarded the lookahead
        # with `i+1 < len(lines)` but indexed `lines[i+2]`, which could
        # raise IndexError near end-of-file; guard on i+2 instead.
        if i + 2 < len(lines) and 'Avg Time:' in lines[i + 2]:
            return float(lines[i + 2].split(':')[1].strip().split()[0])
        return None

    # Header lines look like "Total Time: 12.34 seconds"; take the first
    # whitespace-separated token after the colon and parse it as a float.
    op_keys = {
        'validate_coverage:': 'validate_coverage_ms',
        'select_links:': 'select_links_ms',
        'calculate_confidence:': 'calculate_confidence_ms',
    }
    for i, line in enumerate(lines):
        if 'Total Time:' in line:
            metrics['total_time'] = float(line.split(':')[1].strip().split()[0])
        elif 'Memory Used:' in line:
            metrics['memory_mb'] = float(line.split(':')[1].strip().split()[0])
        else:
            for marker, key in op_keys.items():
                if marker in line:
                    value = _avg_ms(i)
                    if value is not None:
                        metrics[key] = value
                    break

    return metrics
def print_comparison(before_metrics, after_metrics):
    """Print performance comparison.

    Args:
        before_metrics: dict from read_baseline() for the pre-optimization run.
        after_metrics: same-shaped dict for the post-optimization run.

    Both dicts must contain 'total_time' and 'memory_mb' (KeyError
    otherwise); the per-operation *_ms keys are optional and only
    compared when present in BOTH dicts.
    # NOTE(review): raises ZeroDivisionError if a 'before' value is 0 — confirm
    # whether reports can ever contain zeros.
    """
    print("\n" + "="*80)
    print("PERFORMANCE COMPARISON: BEFORE vs AFTER OPTIMIZATIONS")
    print("="*80)

    # Total time — positive improvement means the new run is faster.
    time_improvement = (before_metrics['total_time'] - after_metrics['total_time']) / before_metrics['total_time'] * 100
    print(f"\n📊 Total Time:")
    print(f"   Before: {before_metrics['total_time']:.2f} seconds")
    print(f"   After: {after_metrics['total_time']:.2f} seconds")
    print(f"   Improvement: {time_improvement:.1f}% faster ✅" if time_improvement > 0 else f"   Slower: {-time_improvement:.1f}% ❌")

    # Memory — positive improvement means less memory used.
    mem_improvement = (before_metrics['memory_mb'] - after_metrics['memory_mb']) / before_metrics['memory_mb'] * 100
    print(f"\n💾 Memory Usage:")
    print(f"   Before: {before_metrics['memory_mb']:.2f} MB")
    print(f"   After: {after_metrics['memory_mb']:.2f} MB")
    print(f"   Improvement: {mem_improvement:.1f}% less memory ✅" if mem_improvement > 0 else f"   More memory: {-mem_improvement:.1f}% ❌")

    # Key operations — each section only prints when both runs recorded it.
    print(f"\n⚡ Key Operations:")

    # Validate coverage
    if 'validate_coverage_ms' in before_metrics and 'validate_coverage_ms' in after_metrics:
        val_improvement = (before_metrics['validate_coverage_ms'] - after_metrics['validate_coverage_ms']) / before_metrics['validate_coverage_ms'] * 100
        print(f"\n  validate_coverage:")
        print(f"    Before: {before_metrics['validate_coverage_ms']:.1f} ms")
        print(f"    After: {after_metrics['validate_coverage_ms']:.1f} ms")
        print(f"    Improvement: {val_improvement:.1f}% faster ✅" if val_improvement > 0 else f"    Slower: {-val_improvement:.1f}% ❌")

    # Select links
    if 'select_links_ms' in before_metrics and 'select_links_ms' in after_metrics:
        sel_improvement = (before_metrics['select_links_ms'] - after_metrics['select_links_ms']) / before_metrics['select_links_ms'] * 100
        print(f"\n  select_links:")
        print(f"    Before: {before_metrics['select_links_ms']:.1f} ms")
        print(f"    After: {after_metrics['select_links_ms']:.1f} ms")
        print(f"    Improvement: {sel_improvement:.1f}% faster ✅" if sel_improvement > 0 else f"    Slower: {-sel_improvement:.1f}% ❌")

    # Calculate confidence
    if 'calculate_confidence_ms' in before_metrics and 'calculate_confidence_ms' in after_metrics:
        calc_improvement = (before_metrics['calculate_confidence_ms'] - after_metrics['calculate_confidence_ms']) / before_metrics['calculate_confidence_ms'] * 100
        print(f"\n  calculate_confidence:")
        print(f"    Before: {before_metrics['calculate_confidence_ms']:.1f} ms")
        print(f"    After: {after_metrics['calculate_confidence_ms']:.1f} ms")
        print(f"    Improvement: {calc_improvement:.1f}% faster ✅" if calc_improvement > 0 else f"    Slower: {-calc_improvement:.1f}% ❌")

    print("\n" + "="*80)

    # Overall assessment keyed off the total-time improvement only.
    if time_improvement > 50:
        print("🎉 EXCELLENT OPTIMIZATION! More than 50% performance improvement!")
    elif time_improvement > 30:
        print("✅ GOOD OPTIMIZATION! Significant performance improvement!")
    elif time_improvement > 10:
        print("👍 DECENT OPTIMIZATION! Noticeable performance improvement!")
    else:
        print("🤔 MINIMAL IMPROVEMENT. Further optimization may be needed.")

    print("="*80)
if __name__ == "__main__":
    # Example usage - you'll run this after implementing optimizations.
    # Only the "before" report is loaded here; print_comparison() is meant
    # to be invoked once "after" metrics exist as well.
    baseline = read_baseline()
    print("Baseline metrics loaded:")
    for k, v in baseline.items():
        print(f"  {k}: {v}")

    print("\n⚠️  Run the performance test again after optimizations to compare!")
    print("Then update this script with the new metrics to see the comparison.")
293
tests/adaptive/test_adaptive_crawler.py
Normal file
293
tests/adaptive/test_adaptive_crawler.py
Normal file
@@ -0,0 +1,293 @@
|
||||
"""
|
||||
Test and demo script for Adaptive Crawler
|
||||
|
||||
This script demonstrates the progressive crawling functionality
|
||||
with various configurations and use cases.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
import time
|
||||
from typing import Dict, List
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.progress import Progress
|
||||
from rich import print as rprint
|
||||
|
||||
# Add parent directory to path for imports
|
||||
import sys
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
CrawlState
|
||||
)
|
||||
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
|
||||
|
||||
def print_relevant_content(crawler: AdaptiveCrawler, top_k: int = 3):
    """Print most relevant content found"""
    top_docs = crawler.get_relevant_content(top_k=top_k)

    # Nothing crawled (or nothing matched) yet — say so and bail out.
    if not top_docs:
        console.print("[yellow]No relevant content found yet.[/yellow]")
        return

    console.print(f"\n[bold cyan]Top {len(top_docs)} Most Relevant Pages:[/bold cyan]")
    for rank, entry in enumerate(top_docs, 1):
        console.print(f"\n[green]{rank}. {entry['url']}[/green]")
        console.print(f"   Score: {entry['score']:.2f}")
        # Show a one-line snippet, truncated to 200 characters.
        text = entry['content'] or ""
        if len(text) > 200:
            snippet = text[:200].replace('\n', ' ') + "..."
        else:
            snippet = text
        console.print(f"   [dim]{snippet}[/dim]")
async def test_basic_progressive_crawl():
    """Test basic progressive crawling functionality.

    Crawls the asyncio docs adaptively, prints stats and the most relevant
    pages, then exercises the knowledge-base export round trip.
    """
    console.print("\n[bold yellow]Test 1: Basic Progressive Crawl[/bold yellow]")
    console.print("Testing on Python documentation with query about async/await")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=10,
        top_k_links=2,
        min_gain_threshold=0.1
    )

    # Create crawler
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        # Start progressive crawl and time it end to end.
        start_time = time.time()
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers"
        )
        elapsed = time.time() - start_time

        # Print results — summary first, then the detailed table.
        # NOTE(review): print_stats is intentionally called twice with
        # detailed=False/True; confirm both outputs are wanted.
        prog_crawler.print_stats(detailed=False)
        prog_crawler.print_stats(detailed=True)
        print_relevant_content(prog_crawler)

        console.print(f"\n[green]Crawl completed in {elapsed:.2f} seconds[/green]")
        console.print(f"Final confidence: {prog_crawler.confidence:.2%}")
        console.print(f"URLs crawled: {list(state.crawled_urls)[:5]}...")  # Show first 5

        # Test export functionality
        export_path = "knowledge_base_export.jsonl"
        prog_crawler.export_knowledge_base(export_path)
        console.print(f"[green]Knowledge base exported to {export_path}[/green]")

        # Clean up the exported file so repeated runs stay idempotent.
        Path(export_path).unlink(missing_ok=True)
async def test_with_persistence():
    """Test state persistence and resumption.

    Runs a bounded crawl that saves its state to disk, then starts a second
    crawler with a higher page budget and resumes from the saved state.
    """
    console.print("\n[bold yellow]Test 2: Persistence and Resumption[/bold yellow]")
    console.print("Testing state save/load functionality")

    state_path = "test_crawl_state.json"

    config = AdaptiveConfig(
        confidence_threshold=0.6,
        max_pages=5,
        top_k_links=2,
        save_state=True,       # persist crawl state to state_path
        state_path=state_path
    )

    # First crawl - partial (stops at the 5-page budget or earlier).
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        state1 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response"
        )

        console.print(f"[cyan]First crawl: {len(state1.crawled_urls)} pages[/cyan]")

    # Resume crawl with a larger budget; the same config object is reused.
    config.max_pages = 10  # Increase limit

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        # resume_from loads the state persisted by the first crawl, so
        # state2 should contain at least the pages from state1.
        state2 = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http headers response",
            resume_from=state_path
        )

        console.print(f"[green]Resumed crawl: {len(state2.crawled_urls)} total pages[/green]")

    # Clean up the persisted state so reruns start fresh.
    Path(state_path).unlink(missing_ok=True)
async def test_different_domains():
    """Test on different types of websites.

    Runs the same adaptive configuration against a documentation site and
    an API-reference site and prints per-site timing and stats.
    """
    console.print("\n[bold yellow]Test 3: Different Domain Types[/bold yellow]")

    test_cases = [
        {
            "name": "Documentation Site",
            "url": "https://docs.python.org/3/",
            "query": "decorators and context managers"
        },
        {
            "name": "API Documentation",
            "url": "https://httpbin.org",
            "query": "http authentication headers"
        }
    ]

    for test in test_cases:
        console.print(f"\n[cyan]Testing: {test['name']}[/cyan]")
        console.print(f"URL: {test['url']}")
        console.print(f"Query: {test['query']}")

        config = AdaptiveConfig(
            confidence_threshold=0.6,
            max_pages=5,
            top_k_links=2
        )

        async with AsyncWebCrawler() as crawler:
            prog_crawler = AdaptiveCrawler(
                crawler=crawler,
                config=config
            )

            start_time = time.time()
            await prog_crawler.digest(
                start_url=test['url'],
                query=test['query']
            )
            elapsed = time.time() - start_time

            # Summary using print_stats
            prog_crawler.print_stats(detailed=False)
            # Fix: `elapsed` was measured but never reported in the
            # original — surface the per-site timing.
            console.print(f"[green]Completed in {elapsed:.2f} seconds[/green]")
async def test_stopping_criteria():
    """Test different stopping criteria.

    4.1 uses a very high confidence threshold with a generous page budget;
    4.2 uses the same threshold but a tiny page budget so the page limit
    is expected to trigger first.
    """
    console.print("\n[bold yellow]Test 4: Stopping Criteria[/bold yellow]")

    # Test 1: High confidence threshold
    console.print("\n[cyan]4.1 High confidence threshold (0.9)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,  # Very high
        max_pages=20,
        top_k_links=3
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library"
        )

        console.print(f"Pages needed for 90% confidence: {len(state.crawled_urls)}")
        prog_crawler.print_stats(detailed=False)

    # Test 2: Page limit
    console.print("\n[cyan]4.2 Page limit (3 pages max)[/cyan]")
    config = AdaptiveConfig(
        confidence_threshold=0.9,
        max_pages=3,  # Very low limit
        top_k_links=2
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="python standard library modules"
        )

        # Heuristic: reaching >= max_pages implies the page limit stopped us.
        console.print(f"Stopped by: {'Page limit' if len(state.crawled_urls) >= 3 else 'Other'}")
        prog_crawler.print_stats(detailed=False)
async def test_crawl_patterns():
    """Analyze crawl patterns and link selection.

    After an adaptive crawl, prints the order in which pages were visited,
    the number of new terms each page contributed, and the final saturation
    metric.
    """
    console.print("\n[bold yellow]Test 5: Crawl Pattern Analysis[/bold yellow]")

    config = AdaptiveConfig(
        confidence_threshold=0.7,
        max_pages=8,
        top_k_links=2,
        min_gain_threshold=0.05
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        # Track crawl progress
        console.print("\n[cyan]Crawl Progress:[/cyan]")

        state = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="http methods post get"
        )

        # Show crawl order (the sequence the strategy chose to follow).
        console.print("\n[green]Crawl Order:[/green]")
        for i, url in enumerate(state.crawl_order, 1):
            console.print(f"{i}. {url}")

        # Show new terms discovered per page — a falling count signals
        # diminishing information gain.
        console.print("\n[green]New Terms Discovered:[/green]")
        for i, new_terms in enumerate(state.new_terms_history, 1):
            console.print(f"Page {i}: {new_terms} new terms")

        # Final metrics
        console.print(f"\n[yellow]Saturation reached: {state.metrics.get('saturation', 0):.2%}[/yellow]")
async def main():
    """Run all tests.

    Only the basic progressive-crawl test is enabled by default; the
    remaining tests hit live sites and are left commented out so a quick
    run stays fast. Uncomment to run the full suite.
    """
    console.print("[bold magenta]Adaptive Crawler Test Suite[/bold magenta]")
    console.print("=" * 50)

    try:
        # Run tests
        await test_basic_progressive_crawl()
        # await test_with_persistence()
        # await test_different_domains()
        # await test_stopping_criteria()
        # await test_crawl_patterns()

        console.print("\n[bold green]✅ All tests completed successfully![/bold green]")

    except Exception as e:
        # Any failure aborts the suite; print the traceback for debugging.
        console.print(f"\n[bold red]❌ Test failed with error: {e}[/bold red]")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Run the test suite
    asyncio.run(main())
182
tests/adaptive/test_confidence_debug.py
Normal file
182
tests/adaptive/test_confidence_debug.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Test script for debugging confidence calculation in adaptive crawler
|
||||
Focus: Testing why confidence decreases when crawling relevant URLs
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
import math
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import CrawlState, StatisticalStrategy
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
class ConfidenceTestHarness:
    """Test harness for analyzing confidence calculation.

    Crawls a fixed sequence of asyncio-related pages, feeds each one to a
    StatisticalStrategy, and prints the coverage/consistency/saturation
    components after every page to expose why confidence can decrease
    even when every crawled URL is relevant.
    """

    def __init__(self):
        # Strategy under test: term-statistics based confidence scoring.
        self.strategy = StatisticalStrategy()
        # All pages are topically relevant to the query, so confidence is
        # expected to rise (or at least not fall) as each is ingested.
        self.test_urls = [
            'https://docs.python.org/3/library/asyncio.html',
            'https://docs.python.org/3/library/asyncio-runner.html',
            'https://docs.python.org/3/library/asyncio-api-index.html',
            'https://docs.python.org/3/library/contextvars.html',
            'https://docs.python.org/3/library/asyncio-stream.html'
        ]
        self.query = "async await context manager"

    async def test_confidence_progression(self):
        """Test confidence calculation as we crawl each URL.

        Prints per-page term statistics, the three confidence components,
        a step-by-step coverage breakdown, and a warning whenever the
        confidence value drops relative to the previous page.
        """
        print(f"Testing confidence for query: '{self.query}'")
        print("=" * 80)

        # Initialize state
        state = CrawlState(query=self.query)

        # Create crawler
        async with AsyncWebCrawler() as crawler:
            for i, url in enumerate(self.test_urls, 1):
                print(f"\n{i}. Crawling: {url}")
                print("-" * 80)

                # Crawl the URL
                result = await crawler.arun(url=url)

                # Extract markdown content — arun may return a container
                # whose first _results entry is the actual result.
                if hasattr(result, '_results') and result._results:
                    result = result._results[0]

                # Create a mock CrawlResult carrying only the markdown the
                # strategy reads (built with type() to avoid the real
                # model's required fields).
                mock_result = type('CrawlResult', (), {
                    'markdown': type('Markdown', (), {
                        'raw_markdown': result.markdown.raw_markdown if hasattr(result, 'markdown') else ''
                    })(),
                    'url': url
                })()

                # Update state.
                # NOTE(review): mock_result is appended to knowledge_base
                # AND passed to update_state — confirm update_state does
                # not append it a second time.
                state.knowledge_base.append(mock_result)
                await self.strategy.update_state(state, [mock_result])

                # Calculate metrics
                confidence = await self.strategy.calculate_confidence(state)

                # Get individual components (populated by the strategy).
                coverage = state.metrics.get('coverage', 0)
                consistency = state.metrics.get('consistency', 0)
                saturation = state.metrics.get('saturation', 0)

                # Analyze term frequencies for each query token.
                query_terms = self.strategy._tokenize(self.query.lower())
                term_stats = {}
                for term in query_terms:
                    term_stats[term] = {
                        'tf': state.term_frequencies.get(term, 0),
                        'df': state.document_frequencies.get(term, 0)
                    }

                # Print detailed results
                print(f"State after crawl {i}:")
                print(f"  Total documents: {state.total_documents}")
                print(f"  Unique terms: {len(state.term_frequencies)}")
                print(f"  New terms added: {state.new_terms_history[-1] if state.new_terms_history else 0}")

                print(f"\nQuery term statistics:")
                for term, stats in term_stats.items():
                    print(f"  '{term}': tf={stats['tf']}, df={stats['df']}")

                print(f"\nMetrics:")
                print(f"  Coverage: {coverage:.3f}")
                print(f"  Consistency: {consistency:.3f}")
                print(f"  Saturation: {saturation:.3f}")
                print(f"  → Confidence: {confidence:.3f}")

                # Show coverage calculation details
                print(f"\nCoverage calculation details:")
                self._debug_coverage_calculation(state, query_terms)

                # Alert if confidence decreased relative to the last page —
                # the regression this harness exists to reproduce.
                if i > 1 and confidence < state.metrics.get('prev_confidence', 0):
                    print(f"\n⚠️  WARNING: Confidence decreased from {state.metrics.get('prev_confidence', 0):.3f} to {confidence:.3f}")

                state.metrics['prev_confidence'] = confidence

    def _debug_coverage_calculation(self, state: CrawlState, query_terms: List[str]):
        """Debug coverage calculation step by step.

        Re-implements the (presumed) current BM25-style coverage formula so
        each term's contribution can be printed, then shows the proposed
        IDF-free alternative for comparison.
        """
        coverage_score = 0.0
        max_possible_score = 0.0

        for term in query_terms:
            tf = state.term_frequencies.get(term, 0)
            df = state.document_frequencies.get(term, 0)

            if df > 0:
                # BM25-style IDF: shrinks as a term appears in more docs —
                # the suspected cause of confidence dropping on relevant pages.
                idf = math.log((state.total_documents - df + 0.5) / (df + 0.5) + 1)
                doc_coverage = df / state.total_documents
                tf_boost = min(tf / df, 3.0)
                term_score = doc_coverage * idf * (1 + 0.1 * math.log1p(tf_boost))

                print(f"  '{term}': doc_cov={doc_coverage:.2f}, idf={idf:.2f}, boost={1 + 0.1 * math.log1p(tf_boost):.2f} → score={term_score:.3f}")
                coverage_score += term_score
            else:
                print(f"  '{term}': not found → score=0.000")

            # Theoretical per-term maximum: doc_cov=1, idf=1, boost=1.1.
            max_possible_score += 1.0 * 1.0 * 1.1

        print(f"  Total: {coverage_score:.3f} / {max_possible_score:.3f} = {coverage_score/max_possible_score if max_possible_score > 0 else 0:.3f}")

        # New coverage calculation (proposed fix, IDF removed).
        print(f"\n  NEW Coverage calculation (without IDF):")
        new_coverage = self._calculate_coverage_new(state, query_terms)
        print(f"  → New Coverage: {new_coverage:.3f}")

    def _calculate_coverage_new(self, state: CrawlState, query_terms: List[str]) -> float:
        """New coverage calculation without IDF.

        Scores each query term by the fraction of documents containing it,
        boosted by a log-normalized frequency signal, and averages across
        terms — so adding relevant documents can no longer reduce coverage.
        """
        if not query_terms or state.total_documents == 0:
            return 0.0

        term_scores = []
        # Normalizer for the frequency signal: the most frequent term seen.
        max_tf = max(state.term_frequencies.values()) if state.term_frequencies else 1

        for term in query_terms:
            tf = state.term_frequencies.get(term, 0)
            df = state.document_frequencies.get(term, 0)

            if df > 0:
                # Document coverage: what fraction of docs contain this term
                doc_coverage = df / state.total_documents

                # Frequency signal: normalized log frequency
                freq_signal = math.log(1 + tf) / math.log(1 + max_tf) if max_tf > 0 else 0

                # Combined score: document coverage with frequency boost
                term_score = doc_coverage * (1 + 0.5 * freq_signal)

                print(f"  '{term}': doc_cov={doc_coverage:.2f}, freq_signal={freq_signal:.2f} → score={term_score:.3f}")
                term_scores.append(term_score)
            else:
                print(f"  '{term}': not found → score=0.000")
                term_scores.append(0.0)

        # Average across all query terms
        coverage = sum(term_scores) / len(term_scores)
        return coverage
async def main():
    """Run the confidence test"""
    harness = ConfidenceTestHarness()
    await harness.test_confidence_progression()

    divider = "=" * 80
    print("\n" + divider)
    print("Test complete!")
if __name__ == "__main__":
    # Entry point: run the confidence-progression harness.
    asyncio.run(main())
tests/adaptive/test_embedding_performance.py
Normal file
254
tests/adaptive/test_embedding_performance.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Performance test for Embedding Strategy optimizations
|
||||
Measures time and memory usage before and after optimizations
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
import tracemalloc
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import EmbeddingStrategy, CrawlState
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
class PerformanceMetrics:
    """Collects wall-clock time, peak memory, and per-operation timings.

    Usage: call start() before the measured section and end() after it,
    record individual calls with record_operation(), then read
    total_time / memory_used_mb or call print_summary().
    """

    def __init__(self):
        self.start_time = 0       # perf_counter value at start()
        self.end_time = 0         # perf_counter value at end()
        self.start_memory = 0     # traced bytes when measurement began
        self.peak_memory = 0      # peak traced bytes over the section
        # operation name -> list of per-call durations (seconds)
        self.operation_times = {}

    def start(self):
        """Begin measuring: start tracemalloc and note the clock."""
        tracemalloc.start()
        self.start_time = time.perf_counter()
        self.start_memory = tracemalloc.get_traced_memory()[0]

    def end(self):
        """Stop measuring: record peak memory and stop tracemalloc."""
        self.end_time = time.perf_counter()
        current, peak = tracemalloc.get_traced_memory()
        self.peak_memory = peak
        tracemalloc.stop()

    def record_operation(self, name: str, duration: float):
        """Append one timed call for *name* (duration in seconds)."""
        # setdefault replaces the original's manual key-presence check.
        self.operation_times.setdefault(name, []).append(duration)

    @property
    def total_time(self):
        """Elapsed seconds between start() and end()."""
        return self.end_time - self.start_time

    @property
    def memory_used_mb(self):
        """Peak memory growth (MB) over the measured section."""
        return (self.peak_memory - self.start_memory) / 1024 / 1024

    def print_summary(self, label: str):
        """Print a human-readable report of all collected metrics.

        Args:
            label: Heading identifying the measured run.
        """
        print(f"\n{'='*60}")
        print(f"Performance Summary: {label}")
        print(f"{'='*60}")
        print(f"Total Time: {self.total_time:.3f} seconds")
        print(f"Memory Used: {self.memory_used_mb:.2f} MB")

        if self.operation_times:
            print("\nOperation Breakdown:")
            for op, times in self.operation_times.items():
                avg_time = sum(times) / len(times)
                total_time = sum(times)
                print(f"  {op}:")
                print(f"    - Calls: {len(times)}")
                print(f"    - Avg Time: {avg_time*1000:.2f} ms")
                print(f"    - Total Time: {total_time:.3f} s")
async def create_mock_crawl_results(n: int) -> list:
    """Create *n* mock crawl results for testing.

    Each result mimics the attributes the strategy reads from a real
    CrawlResult: ``url``, ``success`` and ``markdown.raw_markdown``.

    Args:
        n: Number of mock results to build (0 yields an empty list).

    Returns:
        list: mock result objects with deterministic URLs and content.
    """

    # Define the mock classes once rather than on every loop iteration
    # (the original redefined them per result, creating n class objects).
    class MockMarkdown:
        def __init__(self, content):
            self.raw_markdown = content

    class MockResult:
        def __init__(self, url, content):
            self.url = url
            self.markdown = MockMarkdown(content)
            self.success = True

    results = []
    for i in range(n):
        # Repeat a topical sentence to get a realistically sized document.
        content = f"This is test content {i} about async await coroutines event loops. " * 50
        results.append(MockResult(f"https://example.com/page{i}", content))
    return results
|
||||
|
||||
|
||||
async def test_embedding_performance():
    """Benchmark the EmbeddingStrategy's core operations.

    Builds a mock knowledge base, then times: query-space mapping, batched
    state updates, repeated confidence calculations, coverage-gap finding,
    validation, and candidate-link selection. All timings are accumulated
    in a PerformanceMetrics instance.

    Returns:
        PerformanceMetrics: timings and memory recorded for the whole run.
    """

    # Configuration
    n_kb_docs = 30  # Number of documents in knowledge base
    n_queries = 10  # Number of query variations
    n_links = 50  # Number of candidate links
    n_iterations = 5  # Number of calculation iterations

    print(f"\nTest Configuration:")
    print(f"- Knowledge Base Documents: {n_kb_docs}")
    print(f"- Query Variations: {n_queries}")
    print(f"- Candidate Links: {n_links}")
    print(f"- Iterations: {n_iterations}")

    # Create embedding strategy
    config = AdaptiveConfig(
        strategy="embedding",
        max_pages=50,
        n_query_variations=n_queries,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions
    )

    # Set up API key if available; otherwise fall back to a dummy token so the
    # strategy can still be constructed (calls may fail later without a key).
    if os.getenv('OPENAI_API_KEY'):
        config.embedding_llm_config = {
            'provider': 'openai/text-embedding-3-small',
            'api_token': os.getenv('OPENAI_API_KEY'),
            'embedding_model': 'text-embedding-3-small'
        }
    else:
        config.embedding_llm_config = {
            'provider': 'openai/gpt-4o-mini',
            'api_token': 'dummy-key'
        }

    strategy = EmbeddingStrategy(
        embedding_model=config.embedding_model,
        llm_config=config.embedding_llm_config
    )
    strategy.config = config

    # Initialize crawl state with the benchmark query.
    state = CrawlState()
    state.query = "async await coroutines event loops tasks"

    # Start performance monitoring (wall clock + tracemalloc).
    metrics = PerformanceMetrics()
    metrics.start()

    # 1. Generate query embeddings (also produces expanded query variations).
    print("\n1. Generating query embeddings...")
    start = time.perf_counter()
    query_embeddings, expanded_queries = await strategy.map_query_semantic_space(
        state.query,
        config.n_query_variations
    )
    state.query_embeddings = query_embeddings
    state.expanded_queries = expanded_queries
    metrics.record_operation("query_embedding", time.perf_counter() - start)
    print(f"   Generated {len(query_embeddings)} query embeddings")

    # 2. Build knowledge base incrementally
    print("\n2. Building knowledge base...")
    mock_results = await create_mock_crawl_results(n_kb_docs)

    for i in range(0, n_kb_docs, 5):  # Add 5 documents at a time
        batch = mock_results[i:i+5]
        start = time.perf_counter()
        await strategy.update_state(state, batch)
        metrics.record_operation("update_state", time.perf_counter() - start)
        state.knowledge_base.extend(batch)

    # NOTE(review): assumes len(state.kb_embeddings) equals the number of
    # embedded documents — confirm against EmbeddingStrategy.update_state.
    print(f"   Knowledge base has {len(state.kb_embeddings)} documents")

    # 3. Test repeated confidence calculations
    print(f"\n3. Testing {n_iterations} confidence calculations...")
    for i in range(n_iterations):
        start = time.perf_counter()
        confidence = await strategy.calculate_confidence(state)
        metrics.record_operation("calculate_confidence", time.perf_counter() - start)
        print(f"   Iteration {i+1}: {confidence:.3f} ({(time.perf_counter() - start)*1000:.1f} ms)")

    # 4. Test coverage gap calculations
    print(f"\n4. Testing coverage gap calculations...")
    for i in range(n_iterations):
        start = time.perf_counter()
        gaps = strategy.find_coverage_gaps(state.kb_embeddings, state.query_embeddings)
        metrics.record_operation("find_coverage_gaps", time.perf_counter() - start)
        print(f"   Iteration {i+1}: {len(gaps)} gaps ({(time.perf_counter() - start)*1000:.1f} ms)")

    # 5. Test validation
    print(f"\n5. Testing validation coverage...")
    for i in range(n_iterations):
        start = time.perf_counter()
        val_score = await strategy.validate_coverage(state)
        metrics.record_operation("validate_coverage", time.perf_counter() - start)
        print(f"   Iteration {i+1}: {val_score:.3f} ({(time.perf_counter() - start)*1000:.1f} ms)")

    # 6. Create mock links for ranking
    from crawl4ai.models import Link
    mock_links = []
    for i in range(n_links):
        link = Link(
            href=f"https://example.com/new{i}",
            text=f"Link about async programming {i}",
            title=f"Async Guide {i}"
        )
        mock_links.append(link)

    # 7. Test link selection — reuses `gaps` from the final iteration of
    # step 4 above; the loop always runs at least once (n_iterations = 5).
    print(f"\n6. Testing link selection with {n_links} candidates...")
    start = time.perf_counter()
    scored_links = await strategy.select_links_for_expansion(
        mock_links,
        gaps,
        state.kb_embeddings
    )
    metrics.record_operation("select_links", time.perf_counter() - start)
    print(f"   Scored {len(scored_links)} links in {(time.perf_counter() - start)*1000:.1f} ms")

    # End monitoring
    metrics.end()

    return metrics
|
||||
|
||||
|
||||
async def main():
    """Run performance tests before and after optimizations.

    Executes the embedding-strategy benchmark once, prints its summary,
    and reports the per-call averages of the three operations identified
    as the main bottlenecks.
    """

    print("="*80)
    print("EMBEDDING STRATEGY PERFORMANCE TEST")
    print("="*80)

    # Test current implementation
    print("\n📊 Testing CURRENT Implementation...")
    metrics_before = await test_embedding_performance()
    metrics_before.print_summary("BEFORE Optimizations")

    # Store key metrics for comparison
    total_time_before = metrics_before.total_time
    memory_before = metrics_before.memory_used_mb

    def _avg(name: str) -> float:
        """Average recorded duration (seconds) for *name*; 0.0 if absent.

        Replaces the original ``len(...get(name, [1]))`` hack, which read
        the dict twice and relied on a dummy default to dodge ZeroDivisionError.
        """
        times = metrics_before.operation_times.get(name, [])
        return sum(times) / len(times) if times else 0.0

    # Calculate specific operation costs
    calc_conf_avg = _avg("calculate_confidence")
    find_gaps_avg = _avg("find_coverage_gaps")
    validate_avg = _avg("validate_coverage")

    print(f"\n🔍 Key Bottlenecks Identified:")
    print(f"   - calculate_confidence: {calc_conf_avg*1000:.1f} ms per call")
    print(f"   - find_coverage_gaps: {find_gaps_avg*1000:.1f} ms per call")
    print(f"   - validate_coverage: {validate_avg*1000:.1f} ms per call")

    print("\n" + "="*80)
    print("EXPECTED IMPROVEMENTS AFTER OPTIMIZATION:")
    print("- Distance calculations: 80-90% faster (vectorization)")
    print("- Memory usage: 20-30% reduction (deduplication)")
    print("- Overall performance: 60-70% improvement")
    print("="*80)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
634
tests/adaptive/test_embedding_strategy.py
Normal file
634
tests/adaptive/test_embedding_strategy.py
Normal file
@@ -0,0 +1,634 @@
|
||||
"""
|
||||
Test and demo script for Embedding-based Adaptive Crawler
|
||||
|
||||
This script demonstrates the embedding-based adaptive crawling
|
||||
with semantic space coverage and gap-driven expansion.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
import time
|
||||
from rich.console import Console
|
||||
from rich import print as rprint
|
||||
import sys
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
CrawlState
|
||||
)
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
async def test_basic_embedding_crawl():
    """Test basic embedding-based adaptive crawling.

    Runs one adaptive crawl of the Python asyncio docs with the embedding
    strategy and prints timing, stats, query-expansion samples, semantic
    gaps, and the final confidence/sufficiency verdict.
    """
    console.print("\n[bold yellow]Test 1: Basic Embedding-based Crawl[/bold yellow]")
    console.print("Testing semantic space coverage with query expansion")

    # Configure with embedding strategy
    config = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.7,  # Not used for stopping in embedding strategy
        min_gain_threshold=0.01,
        max_pages=15,
        top_k_links=3,
        n_query_variations=8,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2"  # Fast, good quality
    )

    # For query expansion, we need an LLM config
    llm_config = {
        'provider': 'openai/gpt-4o-mini',
        'api_token': os.getenv('OPENAI_API_KEY')
    }

    if not llm_config['api_token']:
        console.print("[red]Warning: OPENAI_API_KEY not set. Using mock data for demo.[/red]")
        # Continue with mock for demo purposes

    config.embedding_llm_config = llm_config

    # Create crawler
    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(
            crawler=crawler,
            config=config
        )

        # Start adaptive crawl and time it end-to-end.
        start_time = time.time()
        console.print("\n[cyan]Starting semantic adaptive crawl...[/cyan]")

        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await coroutines event loops"
        )
        elapsed = time.time() - start_time

        # Print results
        console.print(f"\n[green]Crawl completed in {elapsed:.2f} seconds[/green]")
        prog_crawler.print_stats(detailed=False)

        # Show semantic coverage details
        console.print("\n[bold cyan]Semantic Coverage Details:[/bold cyan]")
        if state.expanded_queries:
            console.print(f"Query expanded to {len(state.expanded_queries)} variations")
            console.print("Sample variations:")
            # Show only the first three variations to keep output short.
            for i, q in enumerate(state.expanded_queries[:3], 1):
                console.print(f"  {i}. {q}")

        if state.semantic_gaps:
            console.print(f"\nSemantic gaps identified: {len(state.semantic_gaps)}")

        console.print(f"\nFinal confidence: {prog_crawler.confidence:.2%}")
        console.print(f"Is Sufficient: {'Yes (Validated)' if prog_crawler.is_sufficient else 'No'}")
        console.print(f"Pages needed: {len(state.crawled_urls)}")
|
||||
|
||||
|
||||
async def test_embedding_vs_statistical(use_openai=False):
    """Compare embedding strategy with statistical strategy.

    Crawls the same URL/query with both strategies and prints pages used,
    elapsed time, confidence, and sufficiency side by side.

    Args:
        use_openai: When True and OPENAI_API_KEY is set, use OpenAI
            embeddings for the embedding strategy.
    """
    console.print("\n[bold yellow]Test 2: Embedding vs Statistical Strategy Comparison[/bold yellow]")

    test_url = "https://httpbin.org"
    test_query = "http headers authentication api"

    # Test 1: Statistical strategy
    console.print("\n[cyan]1. Statistical Strategy:[/cyan]")
    config_stat = AdaptiveConfig(
        strategy="statistical",
        confidence_threshold=0.7,
        max_pages=10
    )

    async with AsyncWebCrawler() as crawler:
        stat_crawler = AdaptiveCrawler(crawler=crawler, config=config_stat)

        start_time = time.time()
        state_stat = await stat_crawler.digest(start_url=test_url, query=test_query)
        stat_time = time.time() - start_time

        stat_pages = len(state_stat.crawled_urls)
        stat_confidence = stat_crawler.confidence

    # Test 2: Embedding strategy
    console.print("\n[cyan]2. Embedding Strategy:[/cyan]")
    config_emb = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.7,  # Not used for stopping
        max_pages=10,
        n_query_variations=5,
        min_gain_threshold=0.01
    )

    # Use OpenAI if available or requested
    if use_openai and os.getenv('OPENAI_API_KEY'):
        config_emb.embedding_llm_config = {
            'provider': 'openai/text-embedding-3-small',
            'api_token': os.getenv('OPENAI_API_KEY'),
            'embedding_model': 'text-embedding-3-small'
        }
        console.print("[cyan]Using OpenAI embeddings[/cyan]")
    else:
        # Default config will try sentence-transformers
        config_emb.embedding_llm_config = {
            'provider': 'openai/gpt-4o-mini',
            'api_token': os.getenv('OPENAI_API_KEY', 'dummy-key')
        }

    async with AsyncWebCrawler() as crawler:
        emb_crawler = AdaptiveCrawler(crawler=crawler, config=config_emb)

        start_time = time.time()
        state_emb = await emb_crawler.digest(start_url=test_url, query=test_query)
        emb_time = time.time() - start_time

        emb_pages = len(state_emb.crawled_urls)
        emb_confidence = emb_crawler.confidence

    # Compare results (crawlers are closed by now, but the collected
    # numbers and crawler objects remain accessible).
    console.print("\n[bold green]Comparison Results:[/bold green]")
    console.print(f"Statistical: {stat_pages} pages in {stat_time:.2f}s, confidence: {stat_confidence:.2%}, sufficient: {stat_crawler.is_sufficient}")
    console.print(f"Embedding: {emb_pages} pages in {emb_time:.2f}s, confidence: {emb_confidence:.2%}, sufficient: {emb_crawler.is_sufficient}")

    if emb_pages < stat_pages:
        efficiency = ((stat_pages - emb_pages) / stat_pages) * 100
        console.print(f"\n[green]Embedding strategy used {efficiency:.0f}% fewer pages![/green]")

    # Show validation info for embedding
    if hasattr(state_emb, 'metrics') and 'validation_confidence' in state_emb.metrics:
        console.print(f"Embedding validation score: {state_emb.metrics['validation_confidence']:.2%}")
|
||||
|
||||
|
||||
async def test_custom_embedding_provider():
    """Test with different embedding providers.

    Configures the embedding strategy to use OpenAI embeddings instead of
    the default sentence-transformers model; skips gracefully when no
    OPENAI_API_KEY is available.
    """
    console.print("\n[bold yellow]Test 3: Custom Embedding Provider[/bold yellow]")

    # Example with OpenAI embeddings
    config = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.8,  # Not used for stopping
        max_pages=10,
        min_gain_threshold=0.01,
        n_query_variations=5
    )

    # Configure to use OpenAI embeddings instead of sentence-transformers
    config.embedding_llm_config = {
        'provider': 'openai/text-embedding-3-small',
        'api_token': os.getenv('OPENAI_API_KEY'),
        'embedding_model': 'text-embedding-3-small'
    }

    # Guard clause: without a key there is nothing meaningful to run.
    if not config.embedding_llm_config['api_token']:
        console.print("[yellow]Skipping OpenAI embedding test - no API key[/yellow]")
        return

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        console.print("Using OpenAI embeddings for semantic analysis...")
        state = await prog_crawler.digest(
            start_url="https://httpbin.org",
            query="api endpoints json response"
        )

        prog_crawler.print_stats(detailed=False)
|
||||
|
||||
|
||||
async def test_knowledge_export_import():
    """Test exporting and importing semantic knowledge bases.

    Performs a first crawl, exports its knowledge base (with embeddings)
    to a JSONL file, imports it into a fresh crawler, extends it with a
    new query, and finally removes the export file.
    """
    console.print("\n[bold yellow]Test 4: Semantic Knowledge Base Export/Import[/bold yellow]")

    config = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.7,  # Not used for stopping
        max_pages=5,
        min_gain_threshold=0.01,
        n_query_variations=4
    )

    # First crawl
    async with AsyncWebCrawler() as crawler:
        crawler1 = AdaptiveCrawler(crawler=crawler, config=config)

        console.print("\n[cyan]Building initial knowledge base...[/cyan]")
        state1 = await crawler1.digest(
            start_url="https://httpbin.org",
            query="http methods headers"
        )

        # Export to a JSONL file in the current working directory.
        export_path = "semantic_kb.jsonl"
        crawler1.export_knowledge_base(export_path)
        console.print(f"[green]Exported {len(state1.knowledge_base)} documents with embeddings[/green]")

    # Import and continue
    async with AsyncWebCrawler() as crawler:
        crawler2 = AdaptiveCrawler(crawler=crawler, config=config)

        console.print("\n[cyan]Importing knowledge base...[/cyan]")
        crawler2.import_knowledge_base(export_path)

        # Continue with new query - should be faster
        console.print("\n[cyan]Extending with new query...[/cyan]")
        state2 = await crawler2.digest(
            start_url="https://httpbin.org",
            query="authentication oauth tokens"
        )

        console.print(f"[green]Total knowledge base: {len(state2.knowledge_base)} documents[/green]")

    # Cleanup — missing_ok avoids an error if the export never happened.
    Path(export_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
async def test_gap_visualization():
    """Visualize semantic gaps and coverage.

    Crawls the Python library index for a concurrency query, then prints
    the number of query variations, knowledge documents, identified gaps,
    the largest gap distances, and the gap-driven crawl order.
    """
    console.print("\n[bold yellow]Test 5: Semantic Gap Analysis[/bold yellow]")

    config = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.9,  # Not used for stopping
        max_pages=8,
        n_query_variations=6,
        min_gain_threshold=0.01
    )

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        # Initial crawl
        state = await prog_crawler.digest(
            start_url="https://docs.python.org/3/library/",
            query="concurrency threading multiprocessing"
        )

        # Analyze gaps
        console.print("\n[bold cyan]Semantic Gap Analysis:[/bold cyan]")
        console.print(f"Query variations: {len(state.expanded_queries)}")
        console.print(f"Knowledge documents: {len(state.knowledge_base)}")
        console.print(f"Identified gaps: {len(state.semantic_gaps)}")

        if state.semantic_gaps:
            console.print("\n[yellow]Gap sizes (distance from coverage):[/yellow]")
            # Each gap is a (embedding, distance) pair; only the distance
            # is displayed here, for the five largest/first gaps.
            for i, (_, distance) in enumerate(state.semantic_gaps[:5], 1):
                console.print(f"  Gap {i}: {distance:.3f}")

        # Show crawl progression
        console.print("\n[cyan]Crawl Order (gap-driven selection):[/cyan]")
        for i, url in enumerate(state.crawl_order[:5], 1):
            console.print(f"  {i}. {url}")
|
||||
|
||||
|
||||
async def test_fast_convergence_with_relevant_query():
    """Test that both strategies reach high confidence quickly with relevant queries.

    For each test case, runs the embedding strategy and then the
    statistical strategy, printing a per-batch (3 pages each) confidence
    estimate and the final confidence/sufficiency verdict.

    NOTE(review): the per-batch confidence is a simplified linear scaling
    of the final confidence by pages crawled — it is not the confidence
    the crawler actually had at that point.
    """
    console.print("\n[bold yellow]Test 7: Fast Convergence with Relevant Query[/bold yellow]")
    console.print("Testing that strategies reach 80%+ confidence within 2-3 batches")

    # Test scenarios
    test_cases = [
        {
            "name": "Python Async Documentation",
            "url": "https://docs.python.org/3/library/asyncio.html",
            "query": "async await coroutines event loops tasks"
        }
    ]

    for test_case in test_cases:
        console.print(f"\n[bold cyan]Testing: {test_case['name']}[/bold cyan]")
        console.print(f"URL: {test_case['url']}")
        console.print(f"Query: {test_case['query']}")

        # Test Embedding Strategy
        console.print("\n[yellow]Embedding Strategy:[/yellow]")
        config_emb = AdaptiveConfig(
            strategy="embedding",
            confidence_threshold=0.8,
            max_pages=9,
            top_k_links=3,
            min_gain_threshold=0.01,
            n_query_variations=5
        )

        # Configure embeddings
        config_emb.embedding_llm_config = {
            'provider': 'openai/gpt-4o-mini',
            'api_token': os.getenv('OPENAI_API_KEY'),
        }

        async with AsyncWebCrawler() as crawler:
            emb_crawler = AdaptiveCrawler(crawler=crawler, config=config_emb)

            start_time = time.time()
            state = await emb_crawler.digest(
                start_url=test_case['url'],
                query=test_case['query']
            )

            # Get batch breakdown (every 3 pages); loop is skipped when no
            # pages were crawled, so the division below is safe.
            total_pages = len(state.crawled_urls)
            for i in range(0, total_pages, 3):
                batch_num = (i // 3) + 1
                batch_pages = min(3, total_pages - i)
                pages_so_far = i + batch_pages
                estimated_confidence = state.metrics.get('confidence', 0) * (pages_so_far / total_pages)

                console.print(f"Batch {batch_num}: {batch_pages} pages → Confidence: {estimated_confidence:.1%} {'✅' if estimated_confidence >= 0.8 else '❌'}")

            final_confidence = emb_crawler.confidence
            console.print(f"[green]Final: {total_pages} pages → Confidence: {final_confidence:.1%} {'✅ (Sufficient!)' if emb_crawler.is_sufficient else '❌'}[/green]")

            # Show learning metrics for embedding
            if 'avg_min_distance' in state.metrics:
                console.print(f"[dim]Avg gap distance: {state.metrics['avg_min_distance']:.3f}[/dim]")
            if 'validation_confidence' in state.metrics:
                console.print(f"[dim]Validation score: {state.metrics['validation_confidence']:.1%}[/dim]")

        # Test Statistical Strategy
        console.print("\n[yellow]Statistical Strategy:[/yellow]")
        config_stat = AdaptiveConfig(
            strategy="statistical",
            confidence_threshold=0.8,
            max_pages=9,
            top_k_links=3,
            min_gain_threshold=0.01
        )

        async with AsyncWebCrawler() as crawler:
            stat_crawler = AdaptiveCrawler(crawler=crawler, config=config_stat)

            # Track batch progress
            # NOTE(review): batch_results and current_pages are never used
            # below — likely leftovers from an abandoned tracking approach.
            batch_results = []
            current_pages = 0

            # Custom batch tracking
            start_time = time.time()
            state = await stat_crawler.digest(
                start_url=test_case['url'],
                query=test_case['query']
            )

            # Get batch breakdown (every 3 pages)
            total_pages = len(state.crawled_urls)
            for i in range(0, total_pages, 3):
                batch_num = (i // 3) + 1
                batch_pages = min(3, total_pages - i)
                # Estimate confidence at this point (simplified)
                pages_so_far = i + batch_pages
                estimated_confidence = state.metrics.get('confidence', 0) * (pages_so_far / total_pages)

                console.print(f"Batch {batch_num}: {batch_pages} pages → Confidence: {estimated_confidence:.1%} {'✅' if estimated_confidence >= 0.8 else '❌'}")

            final_confidence = stat_crawler.confidence
            console.print(f"[green]Final: {total_pages} pages → Confidence: {final_confidence:.1%} {'✅ (Sufficient!)' if stat_crawler.is_sufficient else '❌'}[/green]")
|
||||
|
||||
|
||||
|
||||
|
||||
async def test_irrelevant_query_behavior():
    """Test how embedding strategy handles completely irrelevant queries.

    Crawls the Python asyncio docs with a cooking query and inspects the
    resulting confidence, distance metrics, query variations, crawl
    progression, validation score, and stopping reason. Expected outcome:
    very low confidence and an early irrelevance stop.
    """
    console.print("\n[bold yellow]Test 8: Irrelevant Query Behavior[/bold yellow]")
    console.print("Testing embedding strategy with a query that has no semantic relevance to the content")

    # Test with irrelevant query on Python async documentation
    test_case = {
        "name": "Irrelevant Query on Python Docs",
        "url": "https://docs.python.org/3/library/asyncio.html",
        "query": "how to cook fried rice with vegetables"
    }

    console.print(f"\n[bold cyan]Testing: {test_case['name']}[/bold cyan]")
    console.print(f"URL: {test_case['url']} (Python async documentation)")
    console.print(f"Query: '{test_case['query']}' (completely irrelevant)")
    console.print("\n[dim]Expected behavior: Low confidence, high distances, no convergence[/dim]")

    # Configure embedding strategy
    config_emb = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.8,
        max_pages=9,
        top_k_links=3,
        min_gain_threshold=0.01,
        n_query_variations=5,
        embedding_min_relative_improvement=0.05,  # Lower threshold to see more iterations
        embedding_min_confidence_threshold=0.1  # Will stop if confidence < 10%
    )

    # Configure embeddings using the correct format
    config_emb.embedding_llm_config = {
        'provider': 'openai/gpt-4o-mini',
        'api_token': os.getenv('OPENAI_API_KEY'),
    }

    async with AsyncWebCrawler() as crawler:
        emb_crawler = AdaptiveCrawler(crawler=crawler, config=config_emb)

        start_time = time.time()
        state = await emb_crawler.digest(
            start_url=test_case['url'],
            query=test_case['query']
        )
        elapsed = time.time() - start_time

        # Analyze results
        console.print(f"\n[bold]Results after {elapsed:.1f} seconds:[/bold]")

        # Basic metrics
        total_pages = len(state.crawled_urls)
        final_confidence = emb_crawler.confidence

        console.print(f"\nPages crawled: {total_pages}")
        console.print(f"Final confidence: {final_confidence:.1%} {'✅' if emb_crawler.is_sufficient else '❌'}")

        # Distance metrics (only present when the strategy recorded them)
        if 'avg_min_distance' in state.metrics:
            console.print(f"\n[yellow]Distance Metrics:[/yellow]")
            console.print(f"  Average minimum distance: {state.metrics['avg_min_distance']:.3f}")
            console.print(f"  Close neighbors (<0.3): {state.metrics.get('avg_close_neighbors', 0):.1f}")
            console.print(f"  Very close neighbors (<0.2): {state.metrics.get('avg_very_close_neighbors', 0):.1f}")

            # Interpret distances: bucket the average gap distance into a
            # human-readable quality verdict.
            avg_dist = state.metrics['avg_min_distance']
            if avg_dist > 0.8:
                console.print(f"  [red]→ Very poor match (distance > 0.8)[/red]")
            elif avg_dist > 0.6:
                console.print(f"  [yellow]→ Poor match (distance > 0.6)[/yellow]")
            elif avg_dist > 0.4:
                console.print(f"  [blue]→ Moderate match (distance > 0.4)[/blue]")
            else:
                console.print(f"  [green]→ Good match (distance < 0.4)[/green]")

        # Show sample expanded queries
        if state.expanded_queries:
            console.print(f"\n[yellow]Sample Query Variations Generated:[/yellow]")
            for i, q in enumerate(state.expanded_queries[:3], 1):
                console.print(f"  {i}. {q}")

        # Show crawl progression (first five URLs, then a count of the rest)
        console.print(f"\n[yellow]Crawl Progression:[/yellow]")
        for i, url in enumerate(state.crawl_order[:5], 1):
            console.print(f"  {i}. {url}")
        if len(state.crawl_order) > 5:
            console.print(f"  ... and {len(state.crawl_order) - 5} more")

        # Validation score
        if 'validation_confidence' in state.metrics:
            console.print(f"\n[yellow]Validation:[/yellow]")
            console.print(f"  Validation score: {state.metrics['validation_confidence']:.1%}")

        # Why it stopped: prefer the strategy's own reason; otherwise infer
        # the max-pages limit from the page count.
        if 'stopped_reason' in state.metrics:
            console.print(f"\n[yellow]Stopping Reason:[/yellow] {state.metrics['stopped_reason']}")
            if state.metrics.get('is_irrelevant', False):
                console.print("[red]→ Query and content are completely unrelated![/red]")
        elif total_pages >= config_emb.max_pages:
            console.print(f"\n[yellow]Stopping Reason:[/yellow] Reached max pages limit ({config_emb.max_pages})")

        # Summary
        console.print(f"\n[bold]Summary:[/bold]")
        if final_confidence < 0.2:
            console.print("[red]✗ As expected: Query is completely irrelevant to content[/red]")
            console.print("[green]✓ The embedding strategy correctly identified no semantic match[/green]")
        else:
            console.print(f"[yellow]⚠ Unexpected: Got {final_confidence:.1%} confidence for irrelevant query[/yellow]")
            console.print("[yellow]  This may indicate the query variations are too broad[/yellow]")
|
||||
|
||||
|
||||
async def test_high_dimensional_handling():
    """Test handling of high-dimensional embedding spaces.

    Runs a small crawl with a 384-dimensional embedding model and reports
    how the coverage shape is modelled (centroid + radius) — or catches
    the failure to illustrate why alpha shapes break in high dimensions.
    """
    console.print("\n[bold yellow]Test 6: High-Dimensional Embedding Space Handling[/bold yellow]")
    console.print("Testing how the system handles 384+ dimensional embeddings")

    config = AdaptiveConfig(
        strategy="embedding",
        confidence_threshold=0.8,  # Not used for stopping
        max_pages=5,
        n_query_variations=8,  # Will create 9 points total
        min_gain_threshold=0.01,
        embedding_model="sentence-transformers/all-MiniLM-L6-v2"  # 384 dimensions
    )

    # Use OpenAI if available, otherwise mock
    if os.getenv('OPENAI_API_KEY'):
        config.embedding_llm_config = {
            'provider': 'openai/text-embedding-3-small',
            'api_token': os.getenv('OPENAI_API_KEY'),
            'embedding_model': 'text-embedding-3-small'
        }
    else:
        config.embedding_llm_config = {
            'provider': 'openai/gpt-4o-mini',
            'api_token': 'mock-key'
        }

    async with AsyncWebCrawler() as crawler:
        prog_crawler = AdaptiveCrawler(crawler=crawler, config=config)

        console.print("\n[cyan]Testing with high-dimensional embeddings (384D)...[/cyan]")

        try:
            state = await prog_crawler.digest(
                start_url="https://httpbin.org",
                query="api endpoints json"
            )

            console.print(f"[green]✓ Successfully handled {len(state.expanded_queries)} queries in 384D space[/green]")
            console.print(f"Coverage shape type: {type(state.coverage_shape)}")

            if isinstance(state.coverage_shape, dict):
                console.print(f"Coverage model: centroid + radius")
                console.print(f"  - Center shape: {state.coverage_shape['center'].shape if 'center' in state.coverage_shape else 'N/A'}")
                # NOTE(review): the :.3f spec raises ValueError when 'radius'
                # is absent and the 'N/A' string is formatted; the broad
                # except below masks that as a generic error.
                console.print(f"  - Radius: {state.coverage_shape.get('radius', 'N/A'):.3f}")

        except Exception as e:
            # Intentionally broad: this test exists to surface (not crash on)
            # high-dimensional failures.
            console.print(f"[red]Error: {e}[/red]")
            console.print("[yellow]This demonstrates why alpha shapes don't work in high dimensions[/yellow]")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all embedding strategy tests"""
|
||||
console.print("[bold magenta]Embedding-based Adaptive Crawler Test Suite[/bold magenta]")
|
||||
console.print("=" * 60)
|
||||
|
||||
try:
|
||||
# Check if we have required dependencies
|
||||
has_sentence_transformers = True
|
||||
has_numpy = True
|
||||
|
||||
try:
|
||||
import numpy
|
||||
console.print("[green]✓ NumPy installed[/green]")
|
||||
except ImportError:
|
||||
has_numpy = False
|
||||
console.print("[red]Missing numpy[/red]")
|
||||
|
||||
# Try to import sentence_transformers but catch numpy compatibility errors
|
||||
try:
|
||||
import sentence_transformers
|
||||
console.print("[green]✓ Sentence-transformers installed[/green]")
|
||||
except (ImportError, RuntimeError, ValueError) as e:
|
||||
has_sentence_transformers = False
|
||||
console.print(f"[yellow]Warning: sentence-transformers not available[/yellow]")
|
||||
console.print("[yellow]Tests will use OpenAI embeddings if available or mock data[/yellow]")
|
||||
|
||||
# Run tests based on available dependencies
|
||||
if has_numpy:
|
||||
# Check if we should use OpenAI for embeddings
|
||||
use_openai = not has_sentence_transformers and os.getenv('OPENAI_API_KEY')
|
||||
|
||||
if not has_sentence_transformers and not os.getenv('OPENAI_API_KEY'):
|
||||
console.print("\n[red]Neither sentence-transformers nor OpenAI API key available[/red]")
|
||||
console.print("[yellow]Please set OPENAI_API_KEY or fix sentence-transformers installation[/yellow]")
|
||||
return
|
||||
|
||||
# Run all tests
|
||||
# await test_basic_embedding_crawl()
|
||||
# await test_embedding_vs_statistical(use_openai=use_openai)
|
||||
|
||||
# Run the fast convergence test - this is the most important one
|
||||
# await test_fast_convergence_with_relevant_query()
|
||||
|
||||
# Test with irrelevant query
|
||||
await test_irrelevant_query_behavior()
|
||||
|
||||
# Only run OpenAI-specific test if we have API key
|
||||
# if os.getenv('OPENAI_API_KEY'):
|
||||
# await test_custom_embedding_provider()
|
||||
|
||||
# # Skip tests that require sentence-transformers when it's not available
|
||||
# if has_sentence_transformers:
|
||||
# await test_knowledge_export_import()
|
||||
# await test_gap_visualization()
|
||||
# else:
|
||||
# console.print("\n[yellow]Skipping tests that require sentence-transformers due to numpy compatibility issues[/yellow]")
|
||||
|
||||
# This test should work with mock data
|
||||
# await test_high_dimensional_handling()
|
||||
else:
|
||||
console.print("\n[red]Cannot run tests without NumPy[/red]")
|
||||
return
|
||||
|
||||
console.print("\n[bold green]✅ All tests completed![/bold green]")
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"\n[bold red]❌ Test failed: {e}[/bold red]")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
75
tests/deep_crwaling/test_filter.py
Normal file
75
tests/deep_crwaling/test_filter.py
Normal file
@@ -0,0 +1,75 @@
|
||||
# // File: tests/deep_crawling/test_filters.py
|
||||
import pytest
|
||||
from urllib.parse import urlparse
|
||||
from crawl4ai import ContentTypeFilter, URLFilter
|
||||
|
||||
# Minimal URLFilter base class stub if not already importable directly for tests
|
||||
# In a real scenario, this would be imported from the library
|
||||
# NOTE(review): this fallback rebinds only the *module-local* name
# URLFilter; the ContentTypeFilter imported above continues to use the
# library's own base class — confirm this stub is actually exercised.
if not hasattr(URLFilter, '_update_stats'):  # Check if it's a basic stub
    class URLFilter:  # Basic stub for testing if needed
        def __init__(self, name=None): self.name = name
        def apply(self, url: str) -> bool: raise NotImplementedError
        def _update_stats(self, passed: bool): pass  # Mock implementation
|
||||
|
||||
# Assume ContentTypeFilter is structured as discussed. If its definition is not fully
|
||||
# available for direct import in the test environment, a more elaborate stub or direct
|
||||
# instantiation of the real class (if possible) would be needed.
|
||||
# For this example, we assume ContentTypeFilter can be imported and used.
|
||||
|
||||
class TestContentTypeFilter:
    """Tests for ContentTypeFilter's extension-based MIME matching."""

    @pytest.mark.parametrize(
        "url, allowed_types, expected",
        [
            # Existing tests (examples)
            ("http://example.com/page.html", ["text/html"], True),
            ("http://example.com/page.json", ["application/json"], True),
            ("http://example.com/image.png", ["text/html"], False),
            ("http://example.com/document.pdf", ["application/pdf"], True),
            ("http://example.com/page", ["text/html"], True),  # No extension, allowed
            # NOTE(review): identical input to the row above but with the
            # opposite expectation — one of these two rows must fail. The
            # "disallowed" row presumably intended a different allowed_types
            # value; confirm intent before relying on this suite.
            ("http://example.com/page", ["text/html"], False),  # No extension, disallowed
            ("http://example.com/page.unknown", ["text/html"], False),  # Unknown extension

            # Tests for PHP extensions
            ("http://example.com/index.php", ["application/x-httpd-php"], True),
            ("http://example.com/script.php3", ["application/x-httpd-php"], True),
            ("http://example.com/legacy.php4", ["application/x-httpd-php"], True),
            ("http://example.com/main.php5", ["application/x-httpd-php"], True),
            ("http://example.com/api.php7", ["application/x-httpd-php"], True),
            ("http://example.com/index.phtml", ["application/x-httpd-php"], True),
            ("http://example.com/source.phps", ["application/x-httpd-php-source"], True),

            # Test rejection of PHP extensions
            ("http://example.com/index.php", ["text/html"], False),
            ("http://example.com/script.php3", ["text/plain"], False),
            ("http://example.com/source.phps", ["application/x-httpd-php"], False),  # Mismatch MIME
            ("http://example.com/source.php", ["application/x-httpd-php-source"], False),  # Mismatch MIME for .php

            # Test case-insensitivity of extensions in URL
            ("http://example.com/PAGE.HTML", ["text/html"], True),
            ("http://example.com/INDEX.PHP", ["application/x-httpd-php"], True),
            ("http://example.com/SOURCE.PHPS", ["application/x-httpd-php-source"], True),

            # Test case-insensitivity of allowed_types
            ("http://example.com/index.php", ["APPLICATION/X-HTTPD-PHP"], True),
        ],
    )
    def test_apply(self, url, allowed_types, expected):
        # apply() should accept a URL iff its extension maps to an allowed MIME type.
        content_filter = ContentTypeFilter(
            allowed_types=allowed_types
        )
        assert content_filter.apply(url) == expected

    @pytest.mark.parametrize(
        "url, expected_extension",
        [
            ("http://example.com/file.html", "html"),
            ("http://example.com/file.tar.gz", "gz"),
            ("http://example.com/path/", ""),
            ("http://example.com/nodot", ""),
            ("http://example.com/.config", "config"),  # hidden file with extension
            ("http://example.com/path/to/archive.BIG.zip", "zip"),  # Case test
        ]
    )
    def test_extract_extension(self, url, expected_extension):
        # Test the static method directly
        assert ContentTypeFilter._extract_extension(url) == expected_extension
|
||||
345
tests/docker/simple_api_test.py
Normal file
345
tests/docker/simple_api_test.py
Normal file
@@ -0,0 +1,345 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||
Uses only built-in Python modules to test all endpoints.
|
||||
"""
|
||||
|
||||
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from typing import Dict, List, Optional
|
||||
|
||||
# Configuration
|
||||
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||
TEST_TIMEOUT = 30
|
||||
|
||||
class SimpleApiTester:
    """Exercises the Crawl4AI Docker server's HTTP API using only the stdlib.

    Results of each call are accumulated in ``self.results`` and summarised
    (and written to a JSON report file) by :meth:`print_summary`.
    """

    def __init__(self, base_url: str = "http://localhost:11234"):
        # Default mirrors the module-level BASE_URL constant.
        self.base_url = base_url
        self.token = None      # bearer token, set after a successful /token call
        self.results = []      # per-endpoint result dicts, in execution order

    def log(self, message: str):
        """Print an informational message."""
        print(f"[INFO] {message}")

    def _request(self, endpoint: str, method: str, payload: Optional[Dict] = None) -> Dict:
        """Shared GET/POST driver.

        Returns a uniform result dict with keys: endpoint, method, status
        ("PASS"/"FAIL"), status_code, response_time, and either data or error.
        """
        url = f"{self.base_url}{endpoint}"
        start_time = time.time()
        try:
            if payload is not None:
                body = json.dumps(payload).encode('utf-8')
                req = urllib.request.Request(url, data=body, method=method)
                req.add_header('Content-Type', 'application/json')
            else:
                req = urllib.request.Request(url)
            if self.token:
                req.add_header('Authorization', f'Bearer {self.token}')

            with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
                response_time = time.time() - start_time
                status_code = response.getcode()
                content = response.read().decode('utf-8')

            # Try to parse JSON; fall back to a short raw-text preview.
            try:
                data = json.loads(content)
            except json.JSONDecodeError:
                data = {"raw_response": content[:200]}

            return {
                "endpoint": endpoint,
                "method": method,
                "status": "PASS" if status_code < 400 else "FAIL",
                "status_code": status_code,
                "response_time": response_time,
                "data": data,
            }
        except urllib.error.HTTPError as e:
            # urlopen raises for 4xx/5xx, so without this clause the real
            # status code would be lost in the generic handler below.
            return {
                "endpoint": endpoint,
                "method": method,
                "status": "FAIL",
                "status_code": e.code,
                "response_time": time.time() - start_time,
                "error": str(e),
            }
        except Exception as e:
            # Network/timeout/decoding failures: no status code available.
            return {
                "endpoint": endpoint,
                "method": method,
                "status": "FAIL",
                "status_code": None,
                "response_time": time.time() - start_time,
                "error": str(e),
            }

    def test_get_endpoint(self, endpoint: str) -> Dict:
        """Test a GET endpoint."""
        return self._request(endpoint, "GET")

    def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
        """Test a POST endpoint with a JSON payload."""
        return self._request(endpoint, "POST", payload)

    def print_result(self, result: Dict):
        """Print a formatted test result and record it in self.results."""
        status_color = {
            "PASS": "✅",
            "FAIL": "❌",
            "SKIP": "⏭️"
        }

        print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
              f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")

        if result['status'] == 'FAIL' and 'error' in result:
            print(f"   Error: {result['error']}")

        self.results.append(result)

    def run_all_tests(self):
        """Run all API tests, printing each result as it completes."""
        print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
        print(f"📡 Testing server at: {self.base_url}")
        print("=" * 60)

        # NOTE: the basic-endpoint (/health, /schema, /metrics, /) and
        # authentication (/token) checks are currently disabled.

        # Test core APIs
        print("\n=== CORE APIs ===")

        test_url = "https://example.com"

        # Test markdown endpoint
        md_payload = {
            "url": test_url,
            "f": "fit",
            "q": "test query",
            "c": "0"
        }
        result = self.test_post_endpoint("/md", md_payload)
        self.print_result(result)

        # Test HTML endpoint
        html_payload = {"url": test_url}
        result = self.test_post_endpoint("/html", html_payload)
        self.print_result(result)

        # Test screenshot endpoint
        screenshot_payload = {
            "url": test_url,
            "screenshot_wait_for": 2
        }
        result = self.test_post_endpoint("/screenshot", screenshot_payload)
        self.print_result(result)

        # Test PDF endpoint
        pdf_payload = {"url": test_url}
        result = self.test_post_endpoint("/pdf", pdf_payload)
        self.print_result(result)

        # Test JavaScript execution
        js_payload = {
            "url": test_url,
            "scripts": ["(() => document.title)()"]
        }
        result = self.test_post_endpoint("/execute_js", js_payload)
        self.print_result(result)

        # Test crawl endpoint
        crawl_payload = {
            "urls": [test_url],
            "browser_config": {},
            "crawler_config": {}
        }
        result = self.test_post_endpoint("/crawl", crawl_payload)
        self.print_result(result)

        # Test config dump
        config_payload = {"code": "CrawlerRunConfig()"}
        result = self.test_post_endpoint("/config/dump", config_payload)
        self.print_result(result)

        # Test LLM endpoint
        llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
        result = self.test_get_endpoint(llm_endpoint)
        self.print_result(result)

        # Test ask endpoint
        ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
        result = self.test_get_endpoint(ask_endpoint)
        print(result)
        self.print_result(result)

        # Test job APIs
        print("\n=== JOB APIs ===")

        # Test LLM job
        llm_job_payload = {
            "url": test_url,
            "q": "Extract main content",
            "cache": False
        }
        result = self.test_post_endpoint("/llm/job", llm_job_payload)
        self.print_result(result)

        # Test crawl job
        crawl_job_payload = {
            "urls": [test_url],
            "browser_config": {},
            "crawler_config": {}
        }
        result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
        self.print_result(result)

        # Test MCP
        print("\n=== MCP APIs ===")

        # Test MCP schema
        result = self.test_get_endpoint("/mcp/schema")
        self.print_result(result)

        # Test error handling
        print("\n=== ERROR HANDLING ===")

        # Test invalid URL
        invalid_payload = {"url": "invalid-url", "f": "fit"}
        result = self.test_post_endpoint("/md", invalid_payload)
        self.print_result(result)

        # Test invalid endpoint
        result = self.test_get_endpoint("/nonexistent")
        self.print_result(result)

        # Print summary
        self.print_summary()

    def print_summary(self):
        """Print a results summary and save a detailed JSON report."""
        print("\n" + "=" * 60)
        print("📊 TEST RESULTS SUMMARY")
        print("=" * 60)

        total = len(self.results)
        passed = sum(1 for r in self.results if r['status'] == 'PASS')
        failed = sum(1 for r in self.results if r['status'] == 'FAIL')

        print(f"Total Tests: {total}")
        print(f"✅ Passed: {passed}")
        print(f"❌ Failed: {failed}")
        # Guard against ZeroDivisionError when no tests were recorded.
        success_rate = (passed / total) * 100 if total else 0.0
        print(f"📈 Success Rate: {success_rate:.1f}%")

        if failed > 0:
            print("\n❌ FAILED TESTS:")
            for result in self.results:
                if result['status'] == 'FAIL':
                    print(f"  • {result['method']} {result['endpoint']}")
                    if 'error' in result:
                        print(f"    Error: {result['error']}")

        # Performance statistics
        response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
        if response_times:
            avg_time = sum(response_times) / len(response_times)
            max_time = max(response_times)
            print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
            print(f"⏱️ Max Response Time: {max_time:.3f}s")

        # Save detailed report
        report_file = f"crawl4ai_test_report_{int(time.time())}.json"
        with open(report_file, 'w') as f:
            json.dump({
                "timestamp": time.time(),
                "server_url": self.base_url,
                "version": "0.7.0",
                "summary": {
                    "total": total,
                    "passed": passed,
                    "failed": failed
                },
                "results": self.results
            }, f, indent=2)

        print(f"\n📄 Detailed report saved to: {report_file}")
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and run the full API suite."""
    import argparse

    parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
    parser.add_argument('--url', default=BASE_URL, help='Base URL of the server')
    cli_args = parser.parse_args()

    runner = SimpleApiTester(cli_args.url)

    try:
        runner.run_all_tests()
    except KeyboardInterrupt:
        print("\n🛑 Test suite interrupted by user")
    except Exception as e:
        print(f"\n💥 Test suite failed with error: {e}")
        sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -105,7 +105,7 @@ def test_docker_deployment(version="basic"):
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 10,
|
||||
"session_id": "test",
|
||||
}
|
||||
@@ -119,7 +119,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
||||
def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl (Sync) ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 10,
|
||||
"session_id": "test",
|
||||
}
|
||||
@@ -134,7 +134,7 @@ def test_basic_crawl_sync(tester: Crawl4AiTester):
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -151,7 +151,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -188,7 +188,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"urls": ["https://www.coinbase.com/explore"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -223,7 +223,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"urls": ["https://openai.com/api/pricing"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -270,7 +270,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -297,7 +297,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
@@ -323,7 +323,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
|
||||
@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
|
||||
if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
|
||||
CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
|
||||
|
||||
@pytest.fixture
def basic_html():
    # Minimal well-formed HTML document used by the raw:-URL tests, which
    # compare the crawler's returned HTML against this exact string.
    return """
<html lang="en">
<head>
<title>Basic HTML</title>
</head>
<body>
<h1>Main Heading</h1>
<main>
<div class="container">
<p>Basic HTML document for testing purposes.</p>
</div>
</main>
</body>
</html>
"""
|
||||
|
||||
# Test Config Files
|
||||
@pytest.fixture
|
||||
def basic_browser_config():
|
||||
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
|
||||
)
|
||||
assert response.status_code == 200
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
async def test_raw_urls(crawler_strategy, basic_html, prefix):
    # Both raw: URL prefixes should make the crawler return the inline
    # payload back verbatim as the page HTML.
    url = f"{prefix}{basic_html}"
    response = await crawler_strategy.crawl(url, CrawlerRunConfig())
    assert response.html == basic_html
|
||||
|
||||
# Error Handling Tests
|
||||
@pytest.mark.asyncio
|
||||
async def test_invalid_url():
|
||||
|
||||
34
tests/general/test_download_file.py
Normal file
34
tests/general/test_download_file.py
Normal file
@@ -0,0 +1,34 @@
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, BrowserConfig
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
async def test_basic_download():
    """Trigger an .exe download from python.org and report what was saved."""
    # Custom folder (otherwise defaults to ~/.crawl4ai/downloads)
    downloads_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(downloads_path, exist_ok=True)

    browser_config = BrowserConfig(
        accept_downloads=True,
        downloads_path=downloads_path,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Click the first Windows installer link, then give the browser a
        # few seconds for the download to start before returning.
        run_config = CrawlerRunConfig(
            js_code="""
const link = document.querySelector('a[href$=".exe"]');
if (link) { link.click(); }
""",
            delay_before_return_html=5,
        )
        result = await crawler.arun("https://www.python.org/downloads/", config=run_config)

        downloaded = result.downloaded_files
        if not downloaded:
            print("No files downloaded.")
        else:
            print("Downloaded files:")
            for file_path in downloaded:
                print("•", file_path)


if __name__ == "__main__":
    asyncio.run(test_basic_download())
|
||||
|
||||
115
tests/general/test_max_scroll.py
Normal file
115
tests/general/test_max_scroll.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Sample script to test the max_scroll_steps parameter implementation
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Get the grandparent directory
|
||||
grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
sys.path.append(grandparent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def test_max_scroll_steps():
    """
    Test the max_scroll_steps parameter with different configurations.

    Covers: unlimited scrolling (None), a positive limit, config
    serialization round-trip, and the max_scroll_steps=0 edge case.
    """
    print("🚀 Testing max_scroll_steps parameter implementation")
    print("=" * 60)

    async with AsyncWebCrawler(verbose=True) as crawler:

        # Test 1: Without max_scroll_steps (unlimited scrolling)
        # Bug fix: these section headers printed a literal "\n" because the
        # strings contained an escaped backslash ("\\n"); now real newlines.
        print("\n📋 Test 1: Unlimited scrolling (max_scroll_steps=None)")
        config1 = CrawlerRunConfig(
            scan_full_page=True,
            scroll_delay=0.1,
            max_scroll_steps=None,  # Default behavior
            verbose=True
        )

        print(f"Config: scan_full_page={config1.scan_full_page}, max_scroll_steps={config1.max_scroll_steps}")

        try:
            result1 = await crawler.arun(
                url="https://example.com",  # Simple page for testing
                config=config1
            )
            print(f"✅ Test 1 Success: Crawled {len(result1.markdown)} characters")
        except Exception as e:
            print(f"❌ Test 1 Failed: {e}")

        # Test 2: With limited scroll steps
        print("\n📋 Test 2: Limited scrolling (max_scroll_steps=3)")
        config2 = CrawlerRunConfig(
            scan_full_page=True,
            scroll_delay=0.1,
            max_scroll_steps=3,  # Limit to 3 scroll steps
            verbose=True
        )

        print(f"Config: scan_full_page={config2.scan_full_page}, max_scroll_steps={config2.max_scroll_steps}")

        try:
            result2 = await crawler.arun(
                url="https://techcrunch.com/",  # Another test page
                config=config2
            )
            print(f"✅ Test 2 Success: Crawled {len(result2.markdown)} characters")
        except Exception as e:
            print(f"❌ Test 2 Failed: {e}")

        # Test 3: Test serialization/deserialization
        print("\n📋 Test 3: Configuration serialization test")
        config3 = CrawlerRunConfig(
            scan_full_page=True,
            max_scroll_steps=5,
            scroll_delay=0.2
        )

        # Test to_dict
        config_dict = config3.to_dict()
        print(f"Serialized max_scroll_steps: {config_dict.get('max_scroll_steps')}")

        # Test from_kwargs
        config4 = CrawlerRunConfig.from_kwargs({
            'scan_full_page': True,
            'max_scroll_steps': 7,
            'scroll_delay': 0.3
        })
        print(f"Deserialized max_scroll_steps: {config4.max_scroll_steps}")
        print("✅ Test 3 Success: Serialization works correctly")

        # Test 4: Edge case - max_scroll_steps = 0
        print("\n📋 Test 4: Edge case (max_scroll_steps=0)")
        config5 = CrawlerRunConfig(
            scan_full_page=True,
            max_scroll_steps=0,  # Should not scroll at all
            verbose=True
        )

        try:
            result5 = await crawler.arun(
                url="https://techcrunch.com/",
                config=config5
            )
            print(f"✅ Test 4 Success: No scrolling performed, crawled {len(result5.markdown)} characters")
        except Exception as e:
            print(f"❌ Test 4 Failed: {e}")

    print("\n" + "=" * 60)
    print("🎉 All tests completed!")
    print("\nThe max_scroll_steps parameter is working correctly:")
    print("- None: Unlimited scrolling (default behavior)")
    print("- Positive integer: Limits scroll steps to that number")
    print("- 0: No scrolling performed")
    print("- Properly serializes/deserializes in config")


if __name__ == "__main__":
    print("Starting max_scroll_steps test...")
    asyncio.run(test_max_scroll_steps())
|
||||
85
tests/general/test_url_pattern.py
Normal file
85
tests/general/test_url_pattern.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import sys
import os

# Make the repository root importable when this file is run directly.
# Get the grandparent directory
grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(grandparent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

import asyncio
from crawl4ai.deep_crawling.filters import URLPatternFilter
|
||||
def test_prefix_boundary_matching():
    """Test that prefix patterns respect path boundaries"""
    print("=== Testing URLPatternFilter Prefix Boundary Fix ===")

    pattern_filter = URLPatternFilter(patterns=['https://langchain-ai.github.io/langgraph/*'])

    # (url, should_match) — the langgraphjs variants must NOT match the
    # langgraph prefix, and foreign hosts never match.
    cases = (
        ('https://langchain-ai.github.io/langgraph/', True),
        ('https://langchain-ai.github.io/langgraph/concepts/', True),
        ('https://langchain-ai.github.io/langgraph/tutorials/', True),
        ('https://langchain-ai.github.io/langgraph?param=1', True),
        ('https://langchain-ai.github.io/langgraph#section', True),
        ('https://langchain-ai.github.io/langgraphjs/', False),
        ('https://langchain-ai.github.io/langgraphjs/concepts/', False),
        ('https://other-site.com/langgraph/', False),
    )

    all_passed = True
    for url, expected in cases:
        actual = pattern_filter.apply(url)
        ok = actual == expected
        if not ok:
            all_passed = False
        status = "PASS" if ok else "FAIL"
        print(f"{status:4} | Expected: {expected:5} | Got: {actual:5} | {url}")

    return all_passed
|
||||
|
||||
|
||||
def test_edge_cases():
    """Test edge cases for path boundary matching"""
    print("\n=== Testing Edge Cases ===")

    # Each entry: (pattern, [(url, should_match), ...])
    suites = [
        ('/api/*', [
            ('/api/', True),
            ('/api/v1', True),
            ('/api?param=1', True),
            ('/apiv2/', False),
            ('/api_old/', False),
        ]),

        ('*/docs/*', [
            ('example.com/docs/', True),
            ('example.com/docs/guide', True),
            ('example.com/documentation/', False),
            ('example.com/docs_old/', False),
        ]),
    ]

    all_passed = True
    for pattern, cases in suites:
        print(f"\nPattern: {pattern}")
        pattern_filter = URLPatternFilter(patterns=[pattern])

        for url, expected in cases:
            actual = pattern_filter.apply(url)
            ok = actual == expected
            if not ok:
                all_passed = False
            status = "PASS" if ok else "FAIL"
            print(f"  {status:4} | Expected: {expected:5} | Got: {actual:5} | {url}")

    return all_passed
|
||||
|
||||
if __name__ == "__main__":
|
||||
test1_passed = test_prefix_boundary_matching()
|
||||
test2_passed = test_edge_cases()
|
||||
|
||||
if test1_passed and test2_passed:
|
||||
print("\n✅ All tests passed!")
|
||||
sys.exit(0)
|
||||
else:
|
||||
print("\n❌ Some tests failed!")
|
||||
sys.exit(1)
|
||||
151
tests/releases/test_release_0.6.4.py
Normal file
151
tests/releases/test_release_0.6.4.py
Normal file
@@ -0,0 +1,151 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
import time
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig, CacheMode
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_wait_for_timeout_separate_from_page_timeout():
    """Test that wait_for has its own timeout separate from page_timeout."""
    browser_config = BrowserConfig(headless=True)

    # Test with short wait_for_timeout but longer page_timeout; the selector
    # never appears, so only wait_for_timeout should govern the wait.
    config = CrawlerRunConfig(
        wait_for="css:.nonexistent-element",
        wait_for_timeout=2000,  # 2 seconds
        page_timeout=10000,  # 10 seconds
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        start_time = time.time()
        result = await crawler.arun("https://example.com", config=config)
        elapsed = time.time() - start_time

        # Should timeout after ~2 seconds (wait_for_timeout), not 10 seconds.
        # NOTE(review): wall-clock bound includes network time — flaky on
        # slow connections.
        assert elapsed < 5, f"Expected timeout around 2s, but took {elapsed:.2f}s"
        assert result.success, "Crawl should still succeed even if wait_for times out"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_wait_for_timeout_with_existing_element():
    """Test that wait_for_timeout works correctly when element exists."""
    browser_config = BrowserConfig(headless=True)

    config = CrawlerRunConfig(
        wait_for="css:body",  # This should exist quickly
        wait_for_timeout=5000,
        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        start_time = time.time()
        result = await crawler.arun("https://example.com", config=config)
        elapsed = time.time() - start_time

        # Should complete quickly since the body element exists — the full
        # 5s timeout must not be consumed.
        assert elapsed < 3, f"Expected quick completion, but took {elapsed:.2f}s"
        assert result.success
        assert "<body" in result.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_javascript_wait_for_timeout():
|
||||
"""Test wait_for_timeout with JavaScript condition"""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
wait_for="js:() => window.nonExistentVariable === true",
|
||||
wait_for_timeout=2000,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
start_time = time.time()
|
||||
result = await crawler.arun("https://example.com", config=config)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
# Should timeout after ~2 seconds
|
||||
assert elapsed < 4, f"Expected timeout around 2s, but took {elapsed:.2f}s"
|
||||
assert result.success
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_google_analytics_integration():
|
||||
"""Test that Google Analytics scripts are properly integrated"""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
# Test with a simple HTML page that we can control
|
||||
html_content = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test GA Integration</title>
|
||||
<!-- Google tag (gtag.js) -->
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=GA_MEASUREMENT_ID"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
gtag('config', 'GA_MEASUREMENT_ID');
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test Page</h1>
|
||||
<p>Testing Google Analytics integration</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(f"raw://{html_content}", config=config)
|
||||
|
||||
assert result.success
|
||||
# Check that GA scripts are preserved in the HTML
|
||||
assert "googletagmanager.com/gtag/js" in result.html
|
||||
assert "dataLayer" in result.html
|
||||
assert "gtag('config'" in result.html
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_mkdocs_no_duplicate_gtag():
|
||||
"""Test that there are no duplicate gtag.js entries in documentation"""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
# Simulate MkDocs-like HTML structure
|
||||
html_content = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Crawl4AI Documentation</title>
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=G-XXXXXXXXXX"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
gtag('config', 'G-XXXXXXXXXX');
|
||||
</script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Crawl4AI Documentation</h1>
|
||||
<p>Welcome to the documentation</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(f"raw://{html_content}", config=config)
|
||||
|
||||
assert result.success
|
||||
# Count occurrences of gtag.js to ensure no duplicates
|
||||
gtag_count = result.html.count("googletagmanager.com/gtag/js")
|
||||
assert gtag_count <= 1, f"Found {gtag_count} gtag.js scripts, expected at most 1"
|
||||
|
||||
# Ensure the analytics functionality is still there
|
||||
if gtag_count == 1:
|
||||
assert "dataLayer" in result.html
|
||||
assert "gtag('config'" in result.html
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
317
tests/releases/test_release_0.7.0.py
Normal file
317
tests/releases/test_release_0.7.0.py
Normal file
@@ -0,0 +1,317 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import asyncio
|
||||
import pytest
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy, LLMConfig
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.async_url_seeder import AsyncUrlSeeder
|
||||
from crawl4ai.utils import RobotsParser
|
||||
|
||||
|
||||
class TestCrawl4AIv070:
|
||||
"""Test suite for Crawl4AI v0.7.0 changes"""
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_raw_url_parsing(self):
|
||||
"""Test raw:// URL parsing logic fix"""
|
||||
html_content = "<html><body><h1>Test Content</h1><p>This is a test paragraph.</p></body></html>"
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Test raw:// prefix
|
||||
result1 = await crawler.arun(f"raw://{html_content}")
|
||||
assert result1.success
|
||||
assert "Test Content" in result1.markdown
|
||||
|
||||
# Test raw: prefix
|
||||
result2 = await crawler.arun(f"raw:{html_content}")
|
||||
assert result2.success
|
||||
assert "Test Content" in result2.markdown
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_max_pages_limit_batch_processing(self):
|
||||
"""Test max_pages limit is respected during batch processing"""
|
||||
urls = [
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/json",
|
||||
"https://httpbin.org/xml"
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
max_pages=2
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
# Should only process 2 pages due to max_pages limit
|
||||
successful_results = [r for r in results if r.success]
|
||||
assert len(successful_results) <= 2
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_navigation_abort_handling(self):
|
||||
"""Test handling of navigation aborts during file downloads"""
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Test with a URL that might cause navigation issues
|
||||
result = await crawler.arun(
|
||||
"https://httpbin.org/status/404",
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
# Should not crash even with navigation issues
|
||||
assert result is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_screenshot_capture_fix(self):
|
||||
"""Test screenshot capture improvements"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
assert result.screenshot is not None
|
||||
assert len(result.screenshot) > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redirect_status_codes(self):
|
||||
"""Test that real redirect status codes are surfaced"""
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Test with a redirect URL
|
||||
result = await crawler.arun(
|
||||
"https://httpbin.org/redirect/1",
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
assert result.success
|
||||
# Should have redirect information
|
||||
assert result.status_code in [200, 301, 302, 303, 307, 308]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_local_file_processing(self):
|
||||
"""Test local file processing with captured_console initialization"""
|
||||
with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
|
||||
f.write("<html><body><h1>Local File Test</h1></body></html>")
|
||||
temp_file = f.name
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(f"file://{temp_file}")
|
||||
assert result.success
|
||||
assert "Local File Test" in result.markdown
|
||||
finally:
|
||||
os.unlink(temp_file)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_robots_txt_wildcard_support(self):
|
||||
"""Test robots.txt wildcard rules support"""
|
||||
parser = RobotsParser()
|
||||
|
||||
# Test wildcard patterns
|
||||
robots_content = "User-agent: *\nDisallow: /admin/*\nDisallow: *.pdf"
|
||||
|
||||
# This should work without throwing exceptions
|
||||
assert parser is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_exclude_external_images(self):
|
||||
"""Test exclude_external_images flag"""
|
||||
html_with_images = '''
|
||||
<html><body>
|
||||
<img src="/local-image.jpg" alt="Local">
|
||||
<img src="https://external.com/image.jpg" alt="External">
|
||||
</body></html>
|
||||
'''
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
exclude_external_images=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(f"raw://{html_with_images}", config=config)
|
||||
assert result.success
|
||||
# External images should be excluded
|
||||
assert "external.com" not in result.cleaned_html
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_llm_extraction_strategy_fix(self):
|
||||
"""Test LLM extraction strategy choices error fix"""
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
pytest.skip("OpenAI API key not available")
|
||||
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token=os.getenv("OPENAI_API_KEY")
|
||||
)
|
||||
|
||||
strategy = LLMExtractionStrategy(
|
||||
llm_config=llm_config,
|
||||
instruction="Extract the main heading",
|
||||
extraction_type="block"
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
extraction_strategy=strategy
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
# Should not throw 'str' object has no attribute 'choices' error
|
||||
assert result.extracted_content is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_wait_for_timeout(self):
|
||||
"""Test separate timeout for wait_for condition"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_for="css:non-existent-element",
|
||||
wait_for_timeout=1000 # 1 second timeout
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
# Should timeout gracefully and still return result
|
||||
assert result is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_bm25_content_filter_language_parameter(self):
|
||||
"""Test BM25 filter with language parameter for stemming"""
|
||||
content_filter = BM25ContentFilter(
|
||||
user_query="test content",
|
||||
language="english",
|
||||
use_stemming=True
|
||||
)
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=content_filter
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=markdown_generator
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
assert result.markdown is not None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_url_normalization(self):
|
||||
"""Test URL normalization for invalid schemes and trailing slashes"""
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Test with trailing slash
|
||||
result = await crawler.arun(
|
||||
"https://httpbin.org/html/",
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
assert result.success
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_max_scroll_steps(self):
|
||||
"""Test max_scroll_steps parameter for full page scanning"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scan_full_page=True,
|
||||
max_scroll_steps=3
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_url_seeder(self):
|
||||
"""Test AsyncUrlSeeder functionality"""
|
||||
seeder = AsyncUrlSeeder(
|
||||
base_url="https://httpbin.org",
|
||||
max_depth=1,
|
||||
max_urls=5
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
urls = await seeder.seed(crawler)
|
||||
assert isinstance(urls, list)
|
||||
assert len(urls) <= 5
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_pdf_processing_timeout(self):
|
||||
"""Test PDF processing with timeout"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
pdf_timeout=10000 # 10 seconds
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
# PDF might be None for HTML pages, but should not hang
|
||||
assert result.pdf is not None or result.pdf is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_browser_session_management(self):
|
||||
"""Test improved browser session management"""
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
use_persistent_context=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://httpbin.org/html",
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
assert result.success
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_memory_management(self):
|
||||
"""Test memory management features"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
memory_threshold_percent=80.0,
|
||||
check_interval=1.0,
|
||||
memory_wait_timeout=600 # 10 minutes default
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_virtual_scroll_support(self):
|
||||
"""Test virtual scroll support for modern web scraping"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scan_full_page=True,
|
||||
virtual_scroll=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_adaptive_crawling(self):
|
||||
"""Test adaptive crawling feature"""
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
adaptive_crawling=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/html", config=config)
|
||||
assert result.success
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Run the tests
|
||||
pytest.main([__file__, "-v"])
|
||||
@@ -74,7 +74,7 @@ def test_docker_deployment(version="basic"):
|
||||
|
||||
def test_basic_crawl(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
||||
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
@@ -85,7 +85,7 @@ def test_basic_crawl(tester: Crawl4AiTester):
|
||||
def test_js_execution(tester: Crawl4AiTester):
|
||||
print("\n=== Testing JS Execution ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -102,7 +102,7 @@ def test_js_execution(tester: Crawl4AiTester):
|
||||
def test_css_selector(tester: Crawl4AiTester):
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -139,7 +139,7 @@ def test_structured_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"urls": ["https://www.coinbase.com/explore"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -174,7 +174,7 @@ def test_llm_extraction(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://openai.com/api/pricing",
|
||||
"urls": ["https://openai.com/api/pricing"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -221,7 +221,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -248,7 +248,7 @@ def test_llm_with_ollama(tester: Crawl4AiTester):
|
||||
def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Cosine Extraction ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
@@ -274,7 +274,7 @@ def test_cosine_extraction(tester: Crawl4AiTester):
|
||||
def test_screenshot(tester: Crawl4AiTester):
|
||||
print("\n=== Testing Screenshot ===")
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
|
||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
||||
|
||||
from crawl4ai.models import Link
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
print(" Usage:")
|
||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
||||
print(" from crawl4ai import LinkPreviewConfig")
|
||||
print(" config = CrawlerRunConfig(")
|
||||
print(" link_preview_config=LinkPreviewConfig(")
|
||||
for key, value in config_dict.items():
|
||||
|
||||
@@ -54,7 +54,7 @@ class NBCNewsAPITest:
|
||||
async def test_basic_crawl():
|
||||
print("\n=== Testing Basic Crawl ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {"urls": "https://www.nbcnews.com/business", "priority": 10}
|
||||
request = {"urls": ["https://www.nbcnews.com/business"], "priority": 10}
|
||||
task_id = await api.submit_crawl(request)
|
||||
result = await api.wait_for_task(task_id)
|
||||
print(f"Basic crawl result length: {len(result['result']['markdown'])}")
|
||||
@@ -67,7 +67,7 @@ async def test_js_execution():
|
||||
print("\n=== Testing JS Execution ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
@@ -86,7 +86,7 @@ async def test_css_selector():
|
||||
print("\n=== Testing CSS Selector ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 7,
|
||||
"css_selector": ".wide-tease-item__description",
|
||||
}
|
||||
@@ -120,7 +120,7 @@ async def test_structured_extraction():
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 9,
|
||||
"extraction_config": {"type": "json_css", "params": {"schema": schema}},
|
||||
}
|
||||
@@ -177,7 +177,7 @@ async def test_llm_extraction():
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 8,
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
@@ -209,7 +209,7 @@ async def test_screenshot():
|
||||
print("\n=== Testing Screenshot ===")
|
||||
async with NBCNewsAPITest() as api:
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 5,
|
||||
"screenshot": True,
|
||||
"crawler_params": {"headless": True},
|
||||
@@ -227,7 +227,7 @@ async def test_priority_handling():
|
||||
async with NBCNewsAPITest() as api:
|
||||
# Submit low priority task first
|
||||
low_priority = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"urls": ["https://www.nbcnews.com/business"],
|
||||
"priority": 1,
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
@@ -235,7 +235,7 @@ async def test_priority_handling():
|
||||
|
||||
# Submit high priority task
|
||||
high_priority = {
|
||||
"urls": "https://www.nbcnews.com/business/consumer",
|
||||
"urls": ["https://www.nbcnews.com/business/consumer"],
|
||||
"priority": 10,
|
||||
"crawler_params": {"headless": True},
|
||||
}
|
||||
|
||||
91
tests/test_normalize_url.py
Normal file
91
tests/test_normalize_url.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import unittest
|
||||
from crawl4ai.utils import normalize_url
|
||||
|
||||
class TestNormalizeUrl(unittest.TestCase):
|
||||
|
||||
def test_basic_relative_path(self):
|
||||
self.assertEqual(normalize_url("path/to/page.html", "http://example.com/base/"), "http://example.com/base/path/to/page.html")
|
||||
|
||||
def test_base_url_with_trailing_slash(self):
|
||||
self.assertEqual(normalize_url("page.html", "http://example.com/base/"), "http://example.com/base/page.html")
|
||||
|
||||
def test_base_url_without_trailing_slash(self):
|
||||
# If normalize_url correctly uses urljoin, "base" is treated as a file.
|
||||
self.assertEqual(normalize_url("page.html", "http://example.com/base"), "http://example.com/page.html")
|
||||
|
||||
def test_absolute_url_as_href(self):
|
||||
self.assertEqual(normalize_url("http://another.com/page.html", "http://example.com/"), "http://another.com/page.html")
|
||||
|
||||
def test_href_with_leading_trailing_spaces(self):
|
||||
self.assertEqual(normalize_url(" page.html ", "http://example.com/"), "http://example.com/page.html")
|
||||
|
||||
def test_empty_href(self):
|
||||
# urljoin with an empty href and base ending in '/' returns the base.
|
||||
self.assertEqual(normalize_url("", "http://example.com/base/"), "http://example.com/base/")
|
||||
# urljoin with an empty href and base not ending in '/' also returns base.
|
||||
self.assertEqual(normalize_url("", "http://example.com/base"), "http://example.com/base")
|
||||
|
||||
def test_href_with_query_parameters(self):
|
||||
self.assertEqual(normalize_url("page.html?query=test", "http://example.com/"), "http://example.com/page.html?query=test")
|
||||
|
||||
def test_href_with_fragment(self):
|
||||
self.assertEqual(normalize_url("page.html#section", "http://example.com/"), "http://example.com/page.html#section")
|
||||
|
||||
def test_different_scheme_in_href(self):
|
||||
self.assertEqual(normalize_url("https://secure.example.com/page.html", "http://example.com/"), "https://secure.example.com/page.html")
|
||||
|
||||
def test_parent_directory_in_href(self):
|
||||
self.assertEqual(normalize_url("../otherpage.html", "http://example.com/base/current/"), "http://example.com/base/otherpage.html")
|
||||
|
||||
def test_root_relative_href(self):
|
||||
self.assertEqual(normalize_url("/otherpage.html", "http://example.com/base/current/"), "http://example.com/otherpage.html")
|
||||
|
||||
def test_base_url_with_path_and_no_trailing_slash(self):
|
||||
# If normalize_url correctly uses urljoin, "path" is treated as a file.
|
||||
self.assertEqual(normalize_url("file.html", "http://example.com/path"), "http://example.com/file.html")
|
||||
|
||||
def test_base_url_is_just_domain(self):
|
||||
self.assertEqual(normalize_url("page.html", "http://example.com"), "http://example.com/page.html")
|
||||
|
||||
def test_href_is_only_query(self):
|
||||
self.assertEqual(normalize_url("?query=true", "http://example.com/page.html"), "http://example.com/page.html?query=true")
|
||||
|
||||
def test_href_is_only_fragment(self):
|
||||
self.assertEqual(normalize_url("#fragment", "http://example.com/page.html"), "http://example.com/page.html#fragment")
|
||||
|
||||
def test_relative_link_from_base_file_url(self):
|
||||
"""
|
||||
Tests the specific bug report: relative links from a base URL that is a file.
|
||||
Example:
|
||||
Page URL: http://example.com/path/to/document.html
|
||||
Link on page: <a href="./file.xlsx">
|
||||
Expected: http://example.com/path/to/file.xlsx
|
||||
"""
|
||||
base_url_file = "http://example.com/zwgk/fdzdgk/zdxx/spaq/t19360680.shtml"
|
||||
href_relative_current_dir = "./P020241203375994691134.xlsx"
|
||||
expected_url1 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/P020241203375994691134.xlsx"
|
||||
self.assertEqual(normalize_url(href_relative_current_dir, base_url_file), expected_url1)
|
||||
|
||||
# Test with a relative link that doesn't start with "./"
|
||||
href_relative_no_dot_slash = "another.doc"
|
||||
expected_url2 = "http://example.com/zwgk/fdzdgk/zdxx/spaq/another.doc"
|
||||
self.assertEqual(normalize_url(href_relative_no_dot_slash, base_url_file), expected_url2)
|
||||
|
||||
def test_invalid_base_url_scheme(self):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
normalize_url("page.html", "ftp://example.com/")
|
||||
self.assertIn("Invalid base URL format", str(context.exception))
|
||||
|
||||
def test_invalid_base_url_netloc(self):
|
||||
with self.assertRaises(ValueError) as context:
|
||||
normalize_url("page.html", "http:///path/")
|
||||
self.assertIn("Invalid base URL format", str(context.exception))
|
||||
|
||||
def test_base_url_with_port(self):
|
||||
self.assertEqual(normalize_url("path/file.html", "http://example.com:8080/base/"), "http://example.com:8080/base/path/file.html")
|
||||
|
||||
def test_href_with_special_characters(self):
|
||||
self.assertEqual(normalize_url("path%20with%20spaces/file.html", "http://example.com/"), "http://example.com/path%20with%20spaces/file.html")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user