Compare commits
126 Commits
next-JUN
...
bug/proxy_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
89cf5aba2b | ||
|
|
a5bcac4c9d | ||
|
|
45d8327d23 | ||
|
|
437395e490 | ||
|
|
fddae303fb | ||
|
|
ff6ea41ac3 | ||
|
|
31a435fb0e | ||
|
|
5de6a28055 | ||
|
|
de1561ad14 | ||
|
|
337b588732 | ||
|
|
7a6ad547f0 | ||
|
|
e6692b987d | ||
|
|
307fe28b32 | ||
|
|
438a103b17 | ||
|
|
a03e68fa2f | ||
|
|
864d87afb2 | ||
|
|
508b6fc233 | ||
|
|
e3281935bc | ||
|
|
48647300b4 | ||
|
|
9f9ea3bb3b | ||
|
|
d58b93c207 | ||
|
|
e2b4705010 | ||
|
|
4a1abd5086 | ||
|
|
04258cd4f2 | ||
|
|
84e462d9f8 | ||
|
|
9546773a07 | ||
|
|
66a979ad11 | ||
|
|
0c31e91b53 | ||
|
|
1b6a31f88f | ||
|
|
b8c261780f | ||
|
|
db6ad7a79d | ||
|
|
004d514f33 | ||
|
|
3a9e2c716e | ||
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
805c498adf | ||
|
|
6a728cbe5b | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
5c33cbcca2 | ||
|
|
dd5ee752cf | ||
|
|
bde1bba6a2 | ||
|
|
7b80eb6b99 | ||
|
|
14f690d751 | ||
|
|
7b9ba3015f | ||
|
|
0c8bb742b7 | ||
|
|
ba2ed53ff1 | ||
|
|
a93efcb650 | ||
|
|
8794852a26 | ||
|
|
fb25a4a769 | ||
|
|
afe852935e | ||
|
|
0ebce590f8 | ||
|
|
026e96a2df | ||
|
|
36429a63de | ||
|
|
a3d41c7951 | ||
|
|
fee4c5c783 | ||
|
|
0f210f6e02 | ||
|
|
ee25c771d8 | ||
|
|
02f3127ded | ||
|
|
414f16e975 | ||
|
|
b7a6e02236 | ||
|
|
9332326457 | ||
|
|
6cd34b3157 | ||
|
|
871d4f1158 | ||
|
|
dc85481180 | ||
|
|
5d9213a0e9 | ||
|
|
4679ee023d | ||
|
|
f9b7090084 | ||
|
|
9442597f81 | ||
|
|
74b06d4b80 | ||
|
|
b4bb0ccea0 | ||
|
|
5ac19a61d7 | ||
|
|
022cc2d92a | ||
|
|
fcc2abe4db | ||
|
|
cc95d3abd4 | ||
|
|
5ce3e682f3 | ||
|
|
28125c1980 | ||
|
|
773ed7b281 | ||
|
|
58c1e17170 | ||
|
|
b55e27d2ef | ||
|
|
3d46d89759 | ||
|
|
da8f0dbb93 | ||
|
|
33a0c7a17a | ||
|
|
984524ca1c | ||
|
|
cb8d581e47 | ||
|
|
a55c2b3f88 | ||
|
|
ce09648af1 | ||
|
|
a97654270b | ||
|
|
b4fc60a555 | ||
|
|
137ac014fb | ||
|
|
faa98eefbc | ||
|
|
22725ca87b | ||
|
|
e0fbd2b0a0 | ||
|
|
32966bea11 | ||
|
|
a3b0cab52a | ||
|
|
137556b3dc | ||
|
|
260e2dc347 | ||
|
|
25d97d56e4 | ||
|
|
98a56e6e01 | ||
|
|
1af3d1c2e0 | ||
|
|
c1041b9bbe | ||
|
|
f6e25e2a6b | ||
|
|
ee93acbd06 | ||
|
|
2b17f234f8 | ||
|
|
eebb8c84f0 | ||
|
|
12783fabda | ||
|
|
39e3b792a1 | ||
|
|
e0cd3e10de | ||
|
|
1d6a2b9979 | ||
|
|
039be1b1ce | ||
|
|
53245e4e0e | ||
|
|
094201ab2a | ||
|
|
14a31456ef | ||
|
|
0886153d6a | ||
|
|
0ec3c4a788 | ||
|
|
05085b6e3d | ||
|
|
1f3b1251d0 | ||
|
|
7b9aabc64a | ||
|
|
27af4cc27b |
13
.github/workflows/main.yml
vendored
13
.github/workflows/main.yml
vendored
@@ -9,16 +9,26 @@ on:
|
||||
types: [opened]
|
||||
discussion:
|
||||
types: [created]
|
||||
watch:
|
||||
types: [started]
|
||||
|
||||
jobs:
|
||||
notify-discord:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Send to Google Apps Script (Stars only)
|
||||
if: github.event_name == 'watch'
|
||||
run: |
|
||||
curl -fSs -X POST "${{ secrets.GOOGLE_SCRIPT_ENDPOINT }}" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d '{"url":"${{ github.event.sender.html_url }}"}'
|
||||
- name: Set webhook based on event type
|
||||
id: set-webhook
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" == "discussion" ]; then
|
||||
echo "webhook=${{ secrets.DISCORD_DISCUSSIONS_WEBHOOK }}" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ github.event_name }}" == "watch" ]; then
|
||||
echo "webhook=${{ secrets.DISCORD_STAR_GAZERS }}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "webhook=${{ secrets.DISCORD_WEBHOOK }}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
@@ -31,5 +41,6 @@ jobs:
|
||||
args: |
|
||||
${{ github.event_name == 'issues' && format('📣 New issue created: **{0}** by {1} - {2}', github.event.issue.title, github.event.issue.user.login, github.event.issue.html_url) ||
|
||||
github.event_name == 'issue_comment' && format('💬 New comment on issue **{0}** by {1} - {2}', github.event.issue.title, github.event.comment.user.login, github.event.comment.html_url) ||
|
||||
github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
|
||||
github.event_name == 'pull_request' && format('🔄 New PR opened: **{0}** by {1} - {2}', github.event.pull_request.title, github.event.pull_request.user.login, github.event.pull_request.html_url) ||
|
||||
github.event_name == 'watch' && format('⭐ {0} starred Crawl4AI 🥳! Check out their profile: {1}', github.event.sender.login, github.event.sender.html_url) ||
|
||||
format('💬 New discussion started: **{0}** by {1} - {2}', github.event.discussion.title, github.event.discussion.user.login, github.event.discussion.html_url) }}
|
||||
|
||||
142
.github/workflows/release.yml
vendored
Normal file
142
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
name: Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
- '!test-v*' # Exclude test tags
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # Required for creating releases
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
name: Test Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'test-v*'
|
||||
|
||||
jobs:
|
||||
test-release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Testing with version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to Test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to Test PyPI..."
|
||||
twine upload --repository testpypi dist/* || {
|
||||
if [ $? -eq 1 ]; then
|
||||
echo "⚠️ Upload failed - likely version already exists on Test PyPI"
|
||||
echo "Continuing anyway for test purposes..."
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "✅ Test PyPI step complete"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Docker test images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:test-latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||
15
CHANGELOG.md
15
CHANGELOG.md
@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **Flexible LLM Provider Configuration** (Docker):
|
||||
- Support for `LLM_PROVIDER` environment variable to override default provider
|
||||
- Per-request provider override via optional `provider` parameter in API endpoints
|
||||
- Automatic provider validation with clear error messages
|
||||
- Updated Docker documentation and examples
|
||||
|
||||
### Changed
|
||||
- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
|
||||
- Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
|
||||
- `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
|
||||
- All existing code using `WebScrapingStrategy` continues to work without modification
|
||||
- Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance
|
||||
|
||||
### Added
|
||||
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
|
||||
- Discover URLs from sitemaps and Common Crawl index
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
FROM python:3.12-slim-bookworm AS build
|
||||
|
||||
# C4ai version
|
||||
ARG C4AI_VER=0.6.0
|
||||
ARG C4AI_VER=0.7.0-r1
|
||||
ENV C4AI_VERSION=$C4AI_VER
|
||||
LABEL c4ai.version=$C4AI_VER
|
||||
|
||||
|
||||
119
README.md
119
README.md
@@ -11,19 +11,24 @@
|
||||
[](https://pypi.org/project/crawl4ai/)
|
||||
[](https://pepy.tech/project/crawl4ai)
|
||||
|
||||
<!-- [](https://crawl4ai.readthedocs.io/) -->
|
||||
[](https://github.com/unclecode/crawl4ai/blob/main/LICENSE)
|
||||
[](https://github.com/psf/black)
|
||||
[](https://github.com/PyCQA/bandit)
|
||||
[](code_of_conduct.md)
|
||||
|
||||
<p align="center">
|
||||
<a href="https://x.com/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
|
||||
</a>
|
||||
<a href="https://www.linkedin.com/company/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
|
||||
</a>
|
||||
<a href="https://discord.gg/jP8KfhDhyN">
|
||||
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
|
||||
</a>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
|
||||
|
||||
[✨ Check out latest update v0.6.0](#-recent-updates)
|
||||
[✨ Check out latest update v0.7.0](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.6.0 is now available!** This release candidate introduces World-aware Crawling with geolocation and locale settings, Table-to-DataFrame extraction, Browser pooling with pre-warming, Network and console traffic capture, MCP integration for AI tools, and a completely revamped Docker deployment! [Read the release notes →](https://docs.crawl4ai.com/blog)
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
@@ -269,8 +274,8 @@ The new Docker implementation includes:
|
||||
|
||||
```bash
|
||||
# Pull and run the latest release candidate
|
||||
docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
docker pull unclecode/crawl4ai:0.7.0
|
||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
|
||||
|
||||
# Visit the playground at http://localhost:11235/playground
|
||||
```
|
||||
@@ -291,12 +296,20 @@ import requests
|
||||
# Submit a crawl job
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
json={"urls": "https://example.com", "priority": 10}
|
||||
json={"urls": ["https://example.com"], "priority": 10}
|
||||
)
|
||||
task_id = response.json()["task_id"]
|
||||
|
||||
# Continue polling until the task is complete (status="completed")
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
if response.status_code == 200:
|
||||
print("Crawl job submitted successfully.")
|
||||
|
||||
if "results" in response.json():
|
||||
results = response.json()["results"]
|
||||
print("Crawl job completed. Results:")
|
||||
for result in results:
|
||||
print(result)
|
||||
else:
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Crawl job submitted. Task ID:: {task_id}")
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
```
|
||||
|
||||
For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://docs.crawl4ai.com/basic/docker-deployment/).
|
||||
@@ -505,7 +518,72 @@ async def test_news_crawl():
|
||||
|
||||
## ✨ Recent Updates
|
||||
|
||||
### Version 0.6.0 Release Highlights
|
||||
### Version 0.7.0 Release Highlights - The Adaptive Intelligence Update
|
||||
|
||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||
max_depth=5, # Maximum crawl depth
|
||||
max_pages=20, # Maximum number of pages to crawl
|
||||
strategy="statistical"
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||
state = await adaptive_crawler.digest(
|
||||
start_url="https://news.example.com",
|
||||
query="latest news content"
|
||||
)
|
||||
# Crawler learns patterns and improves extraction over time
|
||||
```
|
||||
|
||||
- **🌊 Virtual Scroll Support**: Complete content extraction from infinite scroll pages:
|
||||
```python
|
||||
scroll_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='feed']",
|
||||
scroll_count=20,
|
||||
scroll_by="container_height",
|
||||
wait_after_scroll=1.0
|
||||
)
|
||||
|
||||
result = await crawler.arun(url, config=CrawlerRunConfig(
|
||||
virtual_scroll_config=scroll_config
|
||||
))
|
||||
```
|
||||
|
||||
- **🔗 Intelligent Link Analysis**: 3-layer scoring system for smart link prioritization:
|
||||
```python
|
||||
link_config = LinkPreviewConfig(
|
||||
query="machine learning tutorials",
|
||||
score_threshold=0.3,
|
||||
concurrent_requests=10
|
||||
)
|
||||
|
||||
result = await crawler.arun(url, config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
))
|
||||
# Links ranked by relevance and quality
|
||||
```
|
||||
|
||||
- **🎣 Async URL Seeder**: Discover thousands of URLs in seconds:
|
||||
```python
|
||||
seeder = AsyncUrlSeeder(SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
pattern="*/blog/*",
|
||||
query="python tutorials",
|
||||
score_threshold=0.4
|
||||
))
|
||||
|
||||
urls = await seeder.discover("https://example.com")
|
||||
```
|
||||
|
||||
- **⚡ Performance Boost**: Up to 3x faster with optimized resource handling and memory efficiency
|
||||
|
||||
Read the full details in our [0.7.0 Release Notes](https://docs.crawl4ai.com/blog/release-v0.7.0) or check the [CHANGELOG](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
### Previous Version: 0.6.0 Release Highlights
|
||||
|
||||
- **🌎 World-aware Crawling**: Set geolocation, language, and timezone for authentic locale-specific content:
|
||||
```python
|
||||
@@ -540,16 +618,16 @@ async def test_news_crawl():
|
||||
# Process results
|
||||
raw_df = pd.DataFrame()
|
||||
for result in results:
|
||||
if result.success and result.media["tables"]:
|
||||
if result.success and result.tables:
|
||||
raw_df = pd.DataFrame(
|
||||
result.media["tables"][0]["rows"],
|
||||
columns=result.media["tables"][0]["headers"],
|
||||
result.tables[0]["rows"],
|
||||
columns=result.tables[0]["headers"],
|
||||
)
|
||||
break
|
||||
print(raw_df.head())
|
||||
|
||||
finally:
|
||||
await crawler.stop()
|
||||
await crawler.close()
|
||||
```
|
||||
|
||||
- **🚀 Browser Pooling**: Pages launch hot with pre-warmed browser instances for lower latency and memory usage
|
||||
@@ -575,7 +653,6 @@ async def test_news_crawl():
|
||||
|
||||
- **📱 Multi-stage Build System**: Optimized Dockerfile with platform-specific performance enhancements
|
||||
|
||||
Read the full details in our [0.6.0 Release Notes](https://docs.crawl4ai.com/blog/releases/0.6.0.html) or check the [CHANGELOG](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md).
|
||||
|
||||
### Previous Version: 0.5.0 Major Release Highlights
|
||||
|
||||
|
||||
@@ -3,12 +3,12 @@ import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
WebScrapingStrategy,
|
||||
LXMLWebScrapingStrategy,
|
||||
WebScrapingStrategy, # Backward compatibility alias
|
||||
)
|
||||
from .async_logger import (
|
||||
AsyncLoggerBase,
|
||||
@@ -88,6 +88,13 @@ from .script import (
|
||||
ErrorDetail
|
||||
)
|
||||
|
||||
# Browser Adapters
|
||||
from .browser_adapter import (
|
||||
BrowserAdapter,
|
||||
PlaywrightAdapter,
|
||||
UndetectedAdapter
|
||||
)
|
||||
|
||||
from .utils import (
|
||||
start_colab_display_server,
|
||||
setup_colab_environment
|
||||
@@ -132,6 +139,7 @@ __all__ = [
|
||||
"CrawlResult",
|
||||
"CrawlerHub",
|
||||
"CacheMode",
|
||||
"MatchMode",
|
||||
"ContentScrapingStrategy",
|
||||
"WebScrapingStrategy",
|
||||
"LXMLWebScrapingStrategy",
|
||||
@@ -173,6 +181,11 @@ __all__ = [
|
||||
"CompilationResult",
|
||||
"ValidationResult",
|
||||
"ErrorDetail",
|
||||
# Browser Adapters
|
||||
"BrowserAdapter",
|
||||
"PlaywrightAdapter",
|
||||
"UndetectedAdapter",
|
||||
"LinkPreviewConfig"
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# crawl4ai/__version__.py
|
||||
|
||||
# This is the version that will be used for stable releases
|
||||
__version__ = "0.6.3"
|
||||
__version__ = "0.7.2"
|
||||
|
||||
# For nightly builds, this gets set during build process
|
||||
__nightly_version__ = None
|
||||
|
||||
@@ -18,17 +18,24 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
from typing import Union, List
|
||||
from typing import Union, List, Callable
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
# Type alias for URL matching
|
||||
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
|
||||
|
||||
class MatchMode(Enum):
|
||||
OR = "or"
|
||||
AND = "and"
|
||||
|
||||
# from .proxy_strategy import ProxyConfig
|
||||
|
||||
|
||||
@@ -383,6 +390,8 @@ class BrowserConfig:
|
||||
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
||||
extra_args (list): Additional command-line arguments passed to the browser.
|
||||
Default: [].
|
||||
enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
|
||||
Cannot be used with use_undetected browser mode. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -423,6 +432,7 @@ class BrowserConfig:
|
||||
extra_args: list = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
enable_stealth: bool = False,
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
@@ -438,6 +448,10 @@ class BrowserConfig:
|
||||
self.chrome_channel = ""
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
if isinstance(self.proxy_config, dict):
|
||||
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
||||
if isinstance(self.proxy_config, str):
|
||||
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
||||
|
||||
|
||||
self.viewport_width = viewport_width
|
||||
@@ -463,6 +477,7 @@ class BrowserConfig:
|
||||
self.verbose = verbose
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
self.enable_stealth = enable_stealth
|
||||
|
||||
fa_user_agenr_generator = ValidUAGenerator()
|
||||
if self.user_agent_mode == "random":
|
||||
@@ -494,6 +509,13 @@ class BrowserConfig:
|
||||
# If persistent context is requested, ensure managed browser is enabled
|
||||
if self.use_persistent_context:
|
||||
self.use_managed_browser = True
|
||||
|
||||
# Validate stealth configuration
|
||||
if self.enable_stealth and self.use_managed_browser and self.browser_mode == "builtin":
|
||||
raise ValueError(
|
||||
"enable_stealth cannot be used with browser_mode='builtin'. "
|
||||
"Stealth mode requires a dedicated browser instance."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||
@@ -530,6 +552,7 @@ class BrowserConfig:
|
||||
extra_args=kwargs.get("extra_args", []),
|
||||
debugging_port=kwargs.get("debugging_port", 9222),
|
||||
host=kwargs.get("host", "localhost"),
|
||||
enable_stealth=kwargs.get("enable_stealth", False),
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
@@ -564,6 +587,7 @@ class BrowserConfig:
|
||||
"verbose": self.verbose,
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
"enable_stealth": self.enable_stealth,
|
||||
}
|
||||
|
||||
|
||||
@@ -862,7 +886,7 @@ class CrawlerRunConfig():
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
Default: LXMLWebScrapingStrategy.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
@@ -926,6 +950,8 @@ class CrawlerRunConfig():
|
||||
Default: False.
|
||||
scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
|
||||
Default: 0.2.
|
||||
max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan.
|
||||
If None, scrolls until the entire page is loaded. Default: None.
|
||||
process_iframes (bool): If True, attempts to process and inline iframe content.
|
||||
Default: False.
|
||||
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
|
||||
@@ -1066,6 +1092,7 @@ class CrawlerRunConfig():
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
scroll_delay: float = 0.2,
|
||||
max_scroll_steps: Optional[int] = None,
|
||||
process_iframes: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
simulate_user: bool = False,
|
||||
@@ -1110,6 +1137,9 @@ class CrawlerRunConfig():
|
||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||
# Virtual Scroll Parameters
|
||||
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
||||
# URL Matching Parameters
|
||||
url_matcher: Optional[UrlMatcher] = None,
|
||||
match_mode: MatchMode = MatchMode.OR,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -1133,6 +1163,11 @@ class CrawlerRunConfig():
|
||||
self.parser_type = parser_type
|
||||
self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy()
|
||||
self.proxy_config = proxy_config
|
||||
if isinstance(proxy_config, dict):
|
||||
self.proxy_config = ProxyConfig.from_dict(proxy_config)
|
||||
if isinstance(proxy_config, str):
|
||||
self.proxy_config = ProxyConfig.from_string(proxy_config)
|
||||
|
||||
self.proxy_rotation_strategy = proxy_rotation_strategy
|
||||
|
||||
# Browser Location and Identity Parameters
|
||||
@@ -1170,6 +1205,7 @@ class CrawlerRunConfig():
|
||||
self.ignore_body_visibility = ignore_body_visibility
|
||||
self.scan_full_page = scan_full_page
|
||||
self.scroll_delay = scroll_delay
|
||||
self.max_scroll_steps = max_scroll_steps
|
||||
self.process_iframes = process_iframes
|
||||
self.remove_overlay_elements = remove_overlay_elements
|
||||
self.simulate_user = simulate_user
|
||||
@@ -1262,6 +1298,10 @@ class CrawlerRunConfig():
|
||||
else:
|
||||
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
||||
|
||||
# URL Matching Parameters
|
||||
self.url_matcher = url_matcher
|
||||
self.match_mode = match_mode
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
@@ -1317,6 +1357,51 @@ class CrawlerRunConfig():
|
||||
if "compilation error" not in str(e).lower():
|
||||
raise ValueError(f"Failed to compile C4A script: {str(e)}")
|
||||
raise
|
||||
|
||||
def is_match(self, url: str) -> bool:
|
||||
"""Check if this config matches the given URL.
|
||||
|
||||
Args:
|
||||
url: The URL to check against this config's matcher
|
||||
|
||||
Returns:
|
||||
bool: True if this config should be used for the URL or if no matcher is set.
|
||||
"""
|
||||
if self.url_matcher is None:
|
||||
return True
|
||||
|
||||
if callable(self.url_matcher):
|
||||
# Single function matcher
|
||||
return self.url_matcher(url)
|
||||
|
||||
elif isinstance(self.url_matcher, str):
|
||||
# Single pattern string
|
||||
from fnmatch import fnmatch
|
||||
return fnmatch(url, self.url_matcher)
|
||||
|
||||
elif isinstance(self.url_matcher, list):
|
||||
# List of mixed matchers
|
||||
if not self.url_matcher: # Empty list
|
||||
return False
|
||||
|
||||
results = []
|
||||
for matcher in self.url_matcher:
|
||||
if callable(matcher):
|
||||
results.append(matcher(url))
|
||||
elif isinstance(matcher, str):
|
||||
from fnmatch import fnmatch
|
||||
results.append(fnmatch(url, matcher))
|
||||
else:
|
||||
# Skip invalid matchers
|
||||
continue
|
||||
|
||||
# Apply match mode logic
|
||||
if self.match_mode == MatchMode.OR:
|
||||
return any(results) if results else False
|
||||
else: # AND mode
|
||||
return all(results) if results else False
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def __getattr__(self, name):
|
||||
@@ -1387,6 +1472,7 @@ class CrawlerRunConfig():
|
||||
ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
|
||||
scan_full_page=kwargs.get("scan_full_page", False),
|
||||
scroll_delay=kwargs.get("scroll_delay", 0.2),
|
||||
max_scroll_steps=kwargs.get("max_scroll_steps"),
|
||||
process_iframes=kwargs.get("process_iframes", False),
|
||||
remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
|
||||
simulate_user=kwargs.get("simulate_user", False),
|
||||
@@ -1438,6 +1524,9 @@ class CrawlerRunConfig():
|
||||
# Link Extraction Parameters
|
||||
link_preview_config=kwargs.get("link_preview_config"),
|
||||
url=kwargs.get("url"),
|
||||
# URL Matching Parameters
|
||||
url_matcher=kwargs.get("url_matcher"),
|
||||
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
)
|
||||
@@ -1499,6 +1588,7 @@ class CrawlerRunConfig():
|
||||
"ignore_body_visibility": self.ignore_body_visibility,
|
||||
"scan_full_page": self.scan_full_page,
|
||||
"scroll_delay": self.scroll_delay,
|
||||
"max_scroll_steps": self.max_scroll_steps,
|
||||
"process_iframes": self.process_iframes,
|
||||
"remove_overlay_elements": self.remove_overlay_elements,
|
||||
"simulate_user": self.simulate_user,
|
||||
@@ -1534,6 +1624,8 @@ class CrawlerRunConfig():
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
||||
"url": self.url,
|
||||
"url_matcher": self.url_matcher,
|
||||
"match_mode": self.match_mode,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
@@ -1653,22 +1745,57 @@ class SeedingConfig:
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
source: str = "sitemap+cc", # Options: "sitemap", "cc", "sitemap+cc"
|
||||
pattern: Optional[str] = "*", # URL pattern to filter discovered URLs (e.g., "*example.com/blog/*")
|
||||
live_check: bool = False, # Whether to perform HEAD requests to verify URL liveness
|
||||
extract_head: bool = False, # Whether to fetch and parse <head> section for metadata
|
||||
max_urls: int = -1, # Maximum number of URLs to discover (default: -1 for no limit)
|
||||
concurrency: int = 1000, # Maximum concurrent requests for live checks/head extraction
|
||||
hits_per_sec: int = 5, # Rate limit in requests per second
|
||||
force: bool = False, # If True, bypasses the AsyncUrlSeeder's internal .jsonl cache
|
||||
base_directory: Optional[str] = None, # Base directory for UrlSeeder's cache files (.jsonl)
|
||||
llm_config: Optional[LLMConfig] = None, # Forward LLM config for future use (e.g., relevance scoring)
|
||||
verbose: Optional[bool] = None, # Override crawler's general verbose setting
|
||||
query: Optional[str] = None, # Search query for relevance scoring
|
||||
score_threshold: Optional[float] = None, # Minimum relevance score to include URL (0.0-1.0)
|
||||
scoring_method: str = "bm25", # Scoring method: "bm25" (default), future: "semantic"
|
||||
filter_nonsense_urls: bool = True, # Filter out utility URLs like robots.txt, sitemap.xml, etc.
|
||||
source: str = "sitemap+cc",
|
||||
pattern: Optional[str] = "*",
|
||||
live_check: bool = False,
|
||||
extract_head: bool = False,
|
||||
max_urls: int = -1,
|
||||
concurrency: int = 1000,
|
||||
hits_per_sec: int = 5,
|
||||
force: bool = False,
|
||||
base_directory: Optional[str] = None,
|
||||
llm_config: Optional[LLMConfig] = None,
|
||||
verbose: Optional[bool] = None,
|
||||
query: Optional[str] = None,
|
||||
score_threshold: Optional[float] = None,
|
||||
scoring_method: str = "bm25",
|
||||
filter_nonsense_urls: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize URL seeding configuration.
|
||||
|
||||
Args:
|
||||
source: Discovery source(s) to use. Options: "sitemap", "cc" (Common Crawl),
|
||||
or "sitemap+cc" (both). Default: "sitemap+cc"
|
||||
pattern: URL pattern to filter discovered URLs (e.g., "*example.com/blog/*").
|
||||
Supports glob-style wildcards. Default: "*" (all URLs)
|
||||
live_check: Whether to perform HEAD requests to verify URL liveness.
|
||||
Default: False
|
||||
extract_head: Whether to fetch and parse <head> section for metadata extraction.
|
||||
Required for BM25 relevance scoring. Default: False
|
||||
max_urls: Maximum number of URLs to discover. Use -1 for no limit.
|
||||
Default: -1
|
||||
concurrency: Maximum concurrent requests for live checks/head extraction.
|
||||
Default: 1000
|
||||
hits_per_sec: Rate limit in requests per second to avoid overwhelming servers.
|
||||
Default: 5
|
||||
force: If True, bypasses the AsyncUrlSeeder's internal .jsonl cache and
|
||||
re-fetches URLs. Default: False
|
||||
base_directory: Base directory for UrlSeeder's cache files (.jsonl).
|
||||
If None, uses default ~/.crawl4ai/. Default: None
|
||||
llm_config: LLM configuration for future features (e.g., semantic scoring).
|
||||
Currently unused. Default: None
|
||||
verbose: Override crawler's general verbose setting for seeding operations.
|
||||
Default: None (inherits from crawler)
|
||||
query: Search query for BM25 relevance scoring (e.g., "python tutorials").
|
||||
Requires extract_head=True. Default: None
|
||||
score_threshold: Minimum relevance score (0.0-1.0) to include URL.
|
||||
Only applies when query is provided. Default: None
|
||||
scoring_method: Scoring algorithm to use. Currently only "bm25" is supported.
|
||||
Future: "semantic". Default: "bm25"
|
||||
filter_nonsense_urls: Filter out utility URLs like robots.txt, sitemap.xml,
|
||||
ads.txt, favicon.ico, etc. Default: True
|
||||
"""
|
||||
self.source = source
|
||||
self.pattern = pattern
|
||||
self.live_check = live_check
|
||||
|
||||
2450
crawl4ai/async_crawler_strategy.back.py
Normal file
2450
crawl4ai/async_crawler_strategy.back.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -21,6 +21,7 @@ from .async_logger import AsyncLogger
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .user_agent_generator import ValidUAGenerator
|
||||
from .browser_manager import BrowserManager
|
||||
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
|
||||
|
||||
import aiofiles
|
||||
import aiohttp
|
||||
@@ -71,7 +72,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
|
||||
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, browser_adapter: BrowserAdapter = None, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
|
||||
@@ -80,11 +81,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
browser_config (BrowserConfig): Configuration object containing browser settings.
|
||||
If None, will be created from kwargs for backwards compatibility.
|
||||
logger: Logger instance for recording events and errors.
|
||||
browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations.
|
||||
If None, defaults to PlaywrightAdapter.
|
||||
**kwargs: Additional arguments for backwards compatibility and extending functionality.
|
||||
"""
|
||||
# Initialize browser config, either from provided object or kwargs
|
||||
self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs)
|
||||
self.logger = logger
|
||||
|
||||
# Initialize browser adapter
|
||||
self.adapter = browser_adapter or PlaywrightAdapter()
|
||||
|
||||
# Initialize session management
|
||||
self._downloaded_files = []
|
||||
@@ -104,7 +110,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Initialize browser manager with config
|
||||
self.browser_manager = BrowserManager(
|
||||
browser_config=self.browser_config, logger=self.logger
|
||||
browser_config=self.browser_config,
|
||||
logger=self.logger,
|
||||
use_undetected=isinstance(self.adapter, UndetectedAdapter)
|
||||
)
|
||||
|
||||
async def __aenter__(self):
|
||||
@@ -322,7 +330,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
try:
|
||||
result = await page.evaluate(wrapper_js)
|
||||
result = await self.adapter.evaluate(page, wrapper_js)
|
||||
return result
|
||||
except Exception as e:
|
||||
if "Error evaluating condition" in str(e):
|
||||
@@ -367,7 +375,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Replace the iframe with a div containing the extracted content
|
||||
_iframe = iframe_content.replace("`", "\\`")
|
||||
await page.evaluate(
|
||||
await self.adapter.evaluate(page,
|
||||
f"""
|
||||
() => {{
|
||||
const iframe = document.getElementById('iframe-{i}');
|
||||
@@ -445,6 +453,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return await self._crawl_web(url, config)
|
||||
|
||||
elif url.startswith("file://"):
|
||||
# initialize empty lists for console messages
|
||||
captured_console = []
|
||||
|
||||
# Process local file
|
||||
local_file_path = url[7:] # Remove 'file://' prefix
|
||||
if not os.path.exists(local_file_path):
|
||||
@@ -466,9 +477,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
console_messages=captured_console,
|
||||
)
|
||||
|
||||
elif url.startswith("raw:") or url.startswith("raw://"):
|
||||
#####
|
||||
# Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
|
||||
# Fix: Check for "raw://" first, then "raw:"
|
||||
# Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
|
||||
#####
|
||||
elif url.startswith("raw://") or url.startswith("raw:"):
|
||||
# Process raw HTML content
|
||||
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
# raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
||||
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||
html = raw_html
|
||||
if config.screenshot:
|
||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||
@@ -619,91 +636,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
page.on("requestfailed", handle_request_failed_capture)
|
||||
|
||||
# Console Message Capturing
|
||||
handle_console = None
|
||||
handle_error = None
|
||||
if config.capture_console_messages:
|
||||
def handle_console_capture(msg):
|
||||
try:
|
||||
message_type = "unknown"
|
||||
try:
|
||||
message_type = msg.type
|
||||
except:
|
||||
pass
|
||||
|
||||
message_text = "unknown"
|
||||
try:
|
||||
message_text = msg.text
|
||||
except:
|
||||
pass
|
||||
|
||||
# Basic console message with minimal content
|
||||
entry = {
|
||||
"type": message_type,
|
||||
"text": message_text,
|
||||
"timestamp": time.time()
|
||||
}
|
||||
|
||||
captured_console.append(entry)
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE")
|
||||
# Still add something to the list even on error
|
||||
captured_console.append({
|
||||
"type": "console_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
def handle_pageerror_capture(err):
|
||||
try:
|
||||
error_message = "Unknown error"
|
||||
try:
|
||||
error_message = err.message
|
||||
except:
|
||||
pass
|
||||
|
||||
error_stack = ""
|
||||
try:
|
||||
error_stack = err.stack
|
||||
except:
|
||||
pass
|
||||
|
||||
captured_console.append({
|
||||
"type": "error",
|
||||
"text": error_message,
|
||||
"stack": error_stack,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE")
|
||||
captured_console.append({
|
||||
"type": "pageerror_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
# Add event listeners directly
|
||||
page.on("console", handle_console_capture)
|
||||
page.on("pageerror", handle_pageerror_capture)
|
||||
# Set up console capture using adapter
|
||||
handle_console = await self.adapter.setup_console_capture(page, captured_console)
|
||||
handle_error = await self.adapter.setup_error_capture(page, captured_console)
|
||||
|
||||
# Set up console logging if requested
|
||||
if config.log_console:
|
||||
def log_consol(
|
||||
msg, console_log_type="debug"
|
||||
): # Corrected the parameter syntax
|
||||
if console_log_type == "error":
|
||||
self.logger.error(
|
||||
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE"
|
||||
)
|
||||
elif console_log_type == "debug":
|
||||
self.logger.debug(
|
||||
message=f"Console: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE"
|
||||
)
|
||||
|
||||
page.on("console", log_consol)
|
||||
page.on("pageerror", lambda e: log_consol(e, "error"))
|
||||
# Note: For undetected browsers, console logging won't work directly
|
||||
# but captured messages can still be logged after retrieval
|
||||
|
||||
try:
|
||||
# Get SSL certificate information if requested and URL is HTTPS
|
||||
@@ -741,18 +683,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
)
|
||||
redirected_url = page.url
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
|
||||
self.logger.info(
|
||||
message=f"Navigation aborted, likely due to file download: {url}",
|
||||
tag="GOTO",
|
||||
params={"url": url},
|
||||
)
|
||||
response = None
|
||||
else:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
await self.execute_hook(
|
||||
"after_goto", page, context=context, url=url, response=response, config=config
|
||||
)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# Walk the redirect chain. Playwright returns only the last
|
||||
# hop, so we trace the `request.redirected_from` links until the
|
||||
# first response that differs from the final one and surface its
|
||||
# status-code.
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
if response is None:
|
||||
status_code = 200
|
||||
response_headers = {}
|
||||
else:
|
||||
status_code = response.status
|
||||
response_headers = response.headers
|
||||
first_resp = response
|
||||
req = response.request
|
||||
while req and req.redirected_from:
|
||||
prev_req = req.redirected_from
|
||||
prev_resp = await prev_req.response()
|
||||
if prev_resp: # keep earliest
|
||||
first_resp = prev_resp
|
||||
req = prev_req
|
||||
|
||||
status_code = first_resp.status
|
||||
response_headers = first_resp.headers
|
||||
# if response is None:
|
||||
# status_code = 200
|
||||
# response_headers = {}
|
||||
# else:
|
||||
# status_code = response.status
|
||||
# response_headers = response.headers
|
||||
|
||||
else:
|
||||
status_code = 200
|
||||
@@ -784,7 +757,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
except Error:
|
||||
visibility_info = await self.check_visibility(page)
|
||||
|
||||
if self.browser_config.config.verbose:
|
||||
if self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
message="Body visibility info: {info}",
|
||||
tag="DEBUG",
|
||||
@@ -896,7 +869,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Handle full page scanning
|
||||
if config.scan_full_page:
|
||||
await self._handle_full_page_scan(page, config.scroll_delay)
|
||||
# await self._handle_full_page_scan(page, config.scroll_delay)
|
||||
await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps)
|
||||
|
||||
# Handle virtual scroll if configured
|
||||
if config.virtual_scroll_config:
|
||||
@@ -957,7 +931,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await page.wait_for_load_state("domcontentloaded", timeout=5)
|
||||
except PlaywrightTimeoutError:
|
||||
pass
|
||||
await page.evaluate(update_image_dimensions_js)
|
||||
await self.adapter.evaluate(page, update_image_dimensions_js)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error updating image dimensions: {error}",
|
||||
@@ -986,7 +960,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
content = await page.evaluate(
|
||||
content = await self.adapter.evaluate(page,
|
||||
f"""Array.from(document.querySelectorAll("{selector}"))
|
||||
.map(el => el.outerHTML)
|
||||
.join('')"""
|
||||
@@ -1044,6 +1018,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await asyncio.sleep(delay)
|
||||
return await page.content()
|
||||
|
||||
# For undetected browsers, retrieve console messages before returning
|
||||
if config.capture_console_messages and hasattr(self.adapter, 'retrieve_console_messages'):
|
||||
final_messages = await self.adapter.retrieve_console_messages(page)
|
||||
captured_console.extend(final_messages)
|
||||
|
||||
# Return complete response
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
@@ -1082,13 +1061,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
page.remove_listener("response", handle_response_capture)
|
||||
page.remove_listener("requestfailed", handle_request_failed_capture)
|
||||
if config.capture_console_messages:
|
||||
page.remove_listener("console", handle_console_capture)
|
||||
page.remove_listener("pageerror", handle_pageerror_capture)
|
||||
# Retrieve any final console messages for undetected browsers
|
||||
if hasattr(self.adapter, 'retrieve_console_messages'):
|
||||
final_messages = await self.adapter.retrieve_console_messages(page)
|
||||
captured_console.extend(final_messages)
|
||||
|
||||
# Clean up console capture
|
||||
await self.adapter.cleanup_console_capture(page, handle_console, handle_error)
|
||||
|
||||
# Close the page
|
||||
await page.close()
|
||||
|
||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||
# async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
|
||||
async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None):
|
||||
"""
|
||||
Helper method to handle full page scanning.
|
||||
|
||||
@@ -1103,6 +1088,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Args:
|
||||
page (Page): The Playwright page object
|
||||
scroll_delay (float): The delay between page scrolls
|
||||
max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end.
|
||||
|
||||
"""
|
||||
try:
|
||||
@@ -1127,9 +1113,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
dimensions = await self.get_page_dimensions(page)
|
||||
total_height = dimensions["height"]
|
||||
|
||||
scroll_step_count = 0
|
||||
while current_position < total_height:
|
||||
####
|
||||
# NEW FEATURE: Check if we've reached the maximum allowed scroll steps
|
||||
# This prevents infinite scrolling on very long pages or infinite scroll scenarios
|
||||
# If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
|
||||
####
|
||||
if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps:
|
||||
break
|
||||
current_position = min(current_position + viewport_height, total_height)
|
||||
await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
|
||||
|
||||
# Increment the step counter for max_scroll_steps tracking
|
||||
scroll_step_count += 1
|
||||
|
||||
# await page.evaluate(f"window.scrollTo(0, {current_position})")
|
||||
# await asyncio.sleep(scroll_delay)
|
||||
|
||||
@@ -1299,7 +1297,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
# Execute virtual scroll capture
|
||||
result = await page.evaluate(virtual_scroll_js, config.to_dict())
|
||||
result = await self.adapter.evaluate(page, virtual_scroll_js, config.to_dict())
|
||||
|
||||
if result.get("replaced", False):
|
||||
self.logger.success(
|
||||
@@ -1383,7 +1381,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
remove_overlays_js = load_js_script("remove_overlay_elements")
|
||||
|
||||
try:
|
||||
await page.evaluate(
|
||||
await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(() => {{
|
||||
try {{
|
||||
@@ -1616,12 +1614,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
num_segments = (page_height // viewport_height) + 1
|
||||
for i in range(num_segments):
|
||||
y_offset = i * viewport_height
|
||||
# Special handling for the last segment
|
||||
if i == num_segments - 1:
|
||||
last_part_height = page_height % viewport_height
|
||||
|
||||
# If page_height is an exact multiple of viewport_height,
|
||||
# we don't need an extra segment
|
||||
if last_part_height == 0:
|
||||
# Skip last segment if page height is exact multiple of viewport
|
||||
break
|
||||
|
||||
# Adjust viewport to exactly match the remaining content height
|
||||
await page.set_viewport_size({"width": page_width, "height": last_part_height})
|
||||
|
||||
await page.evaluate(f"window.scrollTo(0, {y_offset})")
|
||||
await asyncio.sleep(0.01) # wait for render
|
||||
seg_shot = await page.screenshot(full_page=False)
|
||||
|
||||
# Capture the current segment
|
||||
# Note: Using compression options (format, quality) would go here
|
||||
seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
|
||||
# seg_shot = await page.screenshot(full_page=False)
|
||||
img = Image.open(BytesIO(seg_shot)).convert("RGB")
|
||||
segments.append(img)
|
||||
|
||||
# Reset viewport to original size after capturing segments
|
||||
await page.set_viewport_size({"width": page_width, "height": viewport_height})
|
||||
|
||||
total_height = sum(img.height for img in segments)
|
||||
stitched = Image.new("RGB", (segments[0].width, total_height))
|
||||
offset = 0
|
||||
@@ -1750,12 +1768,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# then wait for the new page to load before continuing
|
||||
result = None
|
||||
try:
|
||||
result = await page.evaluate(
|
||||
# OLD VERSION:
|
||||
# result = await page.evaluate(
|
||||
# f"""
|
||||
# (async () => {{
|
||||
# try {{
|
||||
# const script_result = {script};
|
||||
# return {{ success: true, result: script_result }};
|
||||
# }} catch (err) {{
|
||||
# return {{ success: false, error: err.toString(), stack: err.stack }};
|
||||
# }}
|
||||
# }})();
|
||||
# """
|
||||
# )
|
||||
|
||||
# """ NEW VERSION:
|
||||
# When {script} contains statements (e.g., const link = …; link.click();),
|
||||
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.
|
||||
# """
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(async () => {{
|
||||
try {{
|
||||
const script_result = {script};
|
||||
return {{ success: true, result: script_result }};
|
||||
return await (async () => {{
|
||||
{script}
|
||||
}})();
|
||||
}} catch (err) {{
|
||||
return {{ success: false, error: err.toString(), stack: err.stack }};
|
||||
}}
|
||||
@@ -1871,7 +1908,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
for script in scripts:
|
||||
try:
|
||||
# Execute the script and wait for network idle
|
||||
result = await page.evaluate(
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(() => {{
|
||||
return new Promise((resolve) => {{
|
||||
@@ -1955,7 +1992,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Returns:
|
||||
Boolean indicating visibility
|
||||
"""
|
||||
return await page.evaluate(
|
||||
return await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const element = document.body;
|
||||
@@ -1996,7 +2033,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Dict containing scroll status and position information
|
||||
"""
|
||||
try:
|
||||
result = await page.evaluate(
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""() => {{
|
||||
try {{
|
||||
const startX = window.scrollX;
|
||||
@@ -2053,7 +2090,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Returns:
|
||||
Dict containing width and height of the page
|
||||
"""
|
||||
return await page.evaluate(
|
||||
return await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const {scrollWidth, scrollHeight} = document.documentElement;
|
||||
@@ -2073,7 +2110,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
bool: True if page needs scrolling
|
||||
"""
|
||||
try:
|
||||
need_scroll = await page.evaluate(
|
||||
need_scroll = await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const scrollHeight = document.documentElement.scrollHeight;
|
||||
@@ -2092,265 +2129,3 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return True # Default to scrolling if check fails
|
||||
|
||||
|
||||
####################################################################################################
|
||||
# HTTP Crawler Strategy
|
||||
####################################################################################################
|
||||
|
||||
class HTTPCrawlerError(Exception):
|
||||
"""Base error class for HTTP crawler specific exceptions"""
|
||||
pass
|
||||
|
||||
|
||||
class ConnectionTimeoutError(HTTPCrawlerError):
|
||||
"""Raised when connection timeout occurs"""
|
||||
pass
|
||||
|
||||
|
||||
class HTTPStatusError(HTTPCrawlerError):
|
||||
"""Raised for unexpected status codes"""
|
||||
def __init__(self, status_code: int, message: str):
|
||||
self.status_code = status_code
|
||||
super().__init__(f"HTTP {status_code}: {message}")
|
||||
|
||||
|
||||
class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
|
||||
"""
|
||||
|
||||
__slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config')
|
||||
|
||||
DEFAULT_TIMEOUT: Final[int] = 30
|
||||
DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024
|
||||
DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4)
|
||||
DEFAULT_DNS_CACHE_TTL: Final[int] = 300
|
||||
VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'})
|
||||
|
||||
_BASE_HEADERS: Final = MappingProxyType({
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
})
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
browser_config: Optional[HTTPCrawlerConfig] = None,
|
||||
logger: Optional[AsyncLogger] = None,
|
||||
max_connections: int = DEFAULT_MAX_CONNECTIONS,
|
||||
dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE
|
||||
):
|
||||
"""Initialize the HTTP crawler with config"""
|
||||
self.browser_config = browser_config or HTTPCrawlerConfig()
|
||||
self.logger = logger
|
||||
self.max_connections = max_connections
|
||||
self.dns_cache_ttl = dns_cache_ttl
|
||||
self.chunk_size = chunk_size
|
||||
self._session: Optional[aiohttp.ClientSession] = None
|
||||
|
||||
self.hooks = {
|
||||
k: partial(self._execute_hook, k)
|
||||
for k in ('before_request', 'after_request', 'on_error')
|
||||
}
|
||||
|
||||
# Set default hooks
|
||||
self.set_hook('before_request', lambda *args, **kwargs: None)
|
||||
self.set_hook('after_request', lambda *args, **kwargs: None)
|
||||
self.set_hook('on_error', lambda *args, **kwargs: None)
|
||||
|
||||
|
||||
async def __aenter__(self) -> AsyncHTTPCrawlerStrategy:
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
await self.close()
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def _session_context(self):
|
||||
try:
|
||||
if not self._session:
|
||||
await self.start()
|
||||
yield self._session
|
||||
finally:
|
||||
pass
|
||||
|
||||
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
|
||||
if hook_type in self.hooks:
|
||||
self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
|
||||
else:
|
||||
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||
|
||||
async def _execute_hook(
|
||||
self,
|
||||
hook_type: str,
|
||||
hook_func: Callable,
|
||||
*args: Any,
|
||||
**kwargs: Any
|
||||
) -> Any:
|
||||
if asyncio.iscoroutinefunction(hook_func):
|
||||
return await hook_func(*args, **kwargs)
|
||||
return hook_func(*args, **kwargs)
|
||||
|
||||
async def start(self) -> None:
|
||||
if not self._session:
|
||||
connector = aiohttp.TCPConnector(
|
||||
limit=self.max_connections,
|
||||
ttl_dns_cache=self.dns_cache_ttl,
|
||||
use_dns_cache=True,
|
||||
force_close=False
|
||||
)
|
||||
self._session = aiohttp.ClientSession(
|
||||
headers=dict(self._BASE_HEADERS),
|
||||
connector=connector,
|
||||
timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT)
|
||||
)
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._session and not self._session.closed:
|
||||
try:
|
||||
await asyncio.wait_for(self._session.close(), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Session cleanup timed out",
|
||||
tag="CLEANUP"
|
||||
)
|
||||
finally:
|
||||
self._session = None
|
||||
|
||||
async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]:
|
||||
async with aiofiles.open(path, mode='rb') as f:
|
||||
while chunk := await f.read(self.chunk_size):
|
||||
yield memoryview(chunk)
|
||||
|
||||
async def _handle_file(self, path: str) -> AsyncCrawlResponse:
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Local file not found: {path}")
|
||||
|
||||
chunks = []
|
||||
async for chunk in self._stream_file(path):
|
||||
chunks.append(chunk.tobytes().decode('utf-8', errors='replace'))
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=''.join(chunks),
|
||||
response_headers={},
|
||||
status_code=200
|
||||
)
|
||||
|
||||
async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
|
||||
return AsyncCrawlResponse(
|
||||
html=content,
|
||||
response_headers={},
|
||||
status_code=200
|
||||
)
|
||||
|
||||
|
||||
async def _handle_http(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig
|
||||
) -> AsyncCrawlResponse:
|
||||
async with self._session_context() as session:
|
||||
timeout = ClientTimeout(
|
||||
total=config.page_timeout or self.DEFAULT_TIMEOUT,
|
||||
connect=10,
|
||||
sock_read=30
|
||||
)
|
||||
|
||||
headers = dict(self._BASE_HEADERS)
|
||||
if self.browser_config.headers:
|
||||
headers.update(self.browser_config.headers)
|
||||
|
||||
request_kwargs = {
|
||||
'timeout': timeout,
|
||||
'allow_redirects': self.browser_config.follow_redirects,
|
||||
'ssl': self.browser_config.verify_ssl,
|
||||
'headers': headers
|
||||
}
|
||||
|
||||
if self.browser_config.method == "POST":
|
||||
if self.browser_config.data:
|
||||
request_kwargs['data'] = self.browser_config.data
|
||||
if self.browser_config.json:
|
||||
request_kwargs['json'] = self.browser_config.json
|
||||
|
||||
await self.hooks['before_request'](url, request_kwargs)
|
||||
|
||||
try:
|
||||
async with session.request(self.browser_config.method, url, **request_kwargs) as response:
|
||||
content = memoryview(await response.read())
|
||||
|
||||
if not (200 <= response.status < 300):
|
||||
raise HTTPStatusError(
|
||||
response.status,
|
||||
f"Unexpected status code for {url}"
|
||||
)
|
||||
|
||||
encoding = response.charset
|
||||
if not encoding:
|
||||
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
||||
|
||||
result = AsyncCrawlResponse(
|
||||
html=content.tobytes().decode(encoding, errors='replace'),
|
||||
response_headers=dict(response.headers),
|
||||
status_code=response.status,
|
||||
redirected_url=str(response.url)
|
||||
)
|
||||
|
||||
await self.hooks['after_request'](result)
|
||||
return result
|
||||
|
||||
except aiohttp.ServerTimeoutError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
|
||||
|
||||
except aiohttp.ClientConnectorError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionError(f"Connection failed: {str(e)}")
|
||||
|
||||
except aiohttp.ClientError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
|
||||
|
||||
except asyncio.exceptions.TimeoutError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
|
||||
|
||||
async def crawl(
|
||||
self,
|
||||
url: str,
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
**kwargs
|
||||
) -> AsyncCrawlResponse:
|
||||
config = config or CrawlerRunConfig.from_kwargs(kwargs)
|
||||
|
||||
parsed = urlparse(url)
|
||||
scheme = parsed.scheme.rstrip('/')
|
||||
|
||||
if scheme not in self.VALID_SCHEMES:
|
||||
raise ValueError(f"Unsupported URL scheme: {scheme}")
|
||||
|
||||
try:
|
||||
if scheme == 'file':
|
||||
return await self._handle_file(parsed.path)
|
||||
elif scheme == 'raw':
|
||||
return await self._handle_raw(parsed.path)
|
||||
else: # http or https
|
||||
return await self._handle_http(url, config)
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Crawl failed: {error}",
|
||||
tag="CRAWL",
|
||||
params={"error": str(e), "url": url}
|
||||
)
|
||||
raise
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Dict, Optional, List, Tuple
|
||||
from typing import Dict, Optional, List, Tuple, Union
|
||||
from .async_configs import CrawlerRunConfig
|
||||
from .models import (
|
||||
CrawlResult,
|
||||
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .memory_utils import get_true_memory_usage_percent
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
def __init__(
|
||||
@@ -96,11 +98,37 @@ class BaseDispatcher(ABC):
|
||||
self.rate_limiter = rate_limiter
|
||||
self.monitor = monitor
|
||||
|
||||
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
|
||||
"""Select the appropriate config for a given URL.
|
||||
|
||||
Args:
|
||||
url: The URL to match against
|
||||
configs: Single config or list of configs to choose from
|
||||
|
||||
Returns:
|
||||
The matching config, or None if no match found
|
||||
"""
|
||||
# Single config - return as is
|
||||
if isinstance(configs, CrawlerRunConfig):
|
||||
return configs
|
||||
|
||||
# Empty list - return None
|
||||
if not configs:
|
||||
return None
|
||||
|
||||
# Find first matching config
|
||||
for config in configs:
|
||||
if config.is_match(url):
|
||||
return config
|
||||
|
||||
# No match found - return None to indicate URL should be skipped
|
||||
return None
|
||||
|
||||
@abstractmethod
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -111,7 +139,7 @@ class BaseDispatcher(ABC):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
pass
|
||||
@@ -147,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def _memory_monitor_task(self):
|
||||
"""Background task to continuously monitor memory usage and update state"""
|
||||
while True:
|
||||
self.current_memory_percent = psutil.virtual_memory().percent
|
||||
self.current_memory_percent = get_true_memory_usage_percent()
|
||||
|
||||
# Enter memory pressure mode if we cross the threshold
|
||||
if self.current_memory_percent >= self.memory_threshold_percent:
|
||||
@@ -200,7 +228,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
retry_count: int = 0,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -208,6 +236,37 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message,
|
||||
retry_count=retry_count
|
||||
)
|
||||
|
||||
# Get starting memory for accurate measurement
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -257,8 +316,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
retry_count=retry_count + 1
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
# Execute the crawl with selected config
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
|
||||
# Measure memory usage
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -316,7 +375,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -470,7 +529,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -572,7 +631,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
semaphore: asyncio.Semaphore = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -580,6 +639,36 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
try:
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
@@ -592,7 +681,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async with semaphore:
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
memory_usage = peak_memory = end_memory - start_memory
|
||||
@@ -654,7 +743,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
self,
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
urls: List[str],
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
if self.monitor:
|
||||
|
||||
@@ -39,6 +39,7 @@ class LogColor(str, Enum):
|
||||
YELLOW = "yellow"
|
||||
MAGENTA = "magenta"
|
||||
DIM_MAGENTA = "dim magenta"
|
||||
RED = "red"
|
||||
|
||||
def __str__(self):
|
||||
"""Automatically convert rich color to string."""
|
||||
|
||||
@@ -424,10 +424,21 @@ class AsyncUrlSeeder:
|
||||
self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
|
||||
params={"domain": domain, "count": len(results)}, tag="URL_SEED")
|
||||
|
||||
# Sort by relevance score if query was provided
|
||||
# Apply BM25 scoring if query was provided
|
||||
if query and extract_head and scoring_method == "bm25":
|
||||
results.sort(key=lambda x: x.get(
|
||||
"relevance_score", 0.0), reverse=True)
|
||||
# Apply collective BM25 scoring across all documents
|
||||
results = await self._apply_bm25_scoring(results, config)
|
||||
|
||||
# Filter by score threshold if specified
|
||||
if score_threshold is not None:
|
||||
original_count = len(results)
|
||||
results = [r for r in results if r.get("relevance_score", 0) >= score_threshold]
|
||||
if original_count > len(results):
|
||||
self._log("info", "Filtered {filtered} URLs below score threshold {threshold}",
|
||||
params={"filtered": original_count - len(results), "threshold": score_threshold}, tag="URL_SEED")
|
||||
|
||||
# Sort by relevance score
|
||||
results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
|
||||
self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
|
||||
params={"count": len(results), "query": query}, tag="URL_SEED")
|
||||
elif query and not extract_head:
|
||||
@@ -818,7 +829,7 @@ class AsyncUrlSeeder:
|
||||
|
||||
async def _iter_sitemap(self, url: str):
|
||||
try:
|
||||
r = await self.client.get(url, timeout=15)
|
||||
r = await self.client.get(url, timeout=15, follow_redirects=True)
|
||||
r.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
||||
@@ -982,28 +993,6 @@ class AsyncUrlSeeder:
|
||||
"head_data": head_data,
|
||||
}
|
||||
|
||||
# Apply BM25 scoring if query is provided and head data exists
|
||||
if query and ok and scoring_method == "bm25" and head_data:
|
||||
text_context = self._extract_text_context(head_data)
|
||||
if text_context:
|
||||
# Calculate BM25 score for this single document
|
||||
# scores = self._calculate_bm25_score(query, [text_context])
|
||||
scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
|
||||
relevance_score = scores[0] if scores else 0.0
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
else:
|
||||
# No text context, use URL-based scoring as fallback
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, entry["url"])
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
elif query:
|
||||
# Query provided but no head data - we reject this entry
|
||||
self._log("debug", "No head data for {url}, using URL-based scoring",
|
||||
params={"url": url}, tag="URL_SEED")
|
||||
return
|
||||
# relevance_score = self._calculate_url_relevance_score(query, entry["url"])
|
||||
# entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
elif live:
|
||||
self._log("debug", "Performing live check for {url}", params={
|
||||
"url": url}, tag="URL_SEED")
|
||||
@@ -1013,35 +1002,13 @@ class AsyncUrlSeeder:
|
||||
params={"status": status.upper(), "url": url}, tag="URL_SEED")
|
||||
entry = {"url": url, "status": status, "head_data": {}}
|
||||
|
||||
# Apply URL-based scoring if query is provided
|
||||
if query:
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, url)
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
else:
|
||||
entry = {"url": url, "status": "unknown", "head_data": {}}
|
||||
|
||||
# Apply URL-based scoring if query is provided
|
||||
if query:
|
||||
relevance_score = self._calculate_url_relevance_score(
|
||||
query, url)
|
||||
entry["relevance_score"] = float(relevance_score)
|
||||
|
||||
# Now decide whether to add the entry based on score threshold
|
||||
if query and "relevance_score" in entry:
|
||||
if score_threshold is None or entry["relevance_score"] >= score_threshold:
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
else:
|
||||
self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
|
||||
params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
|
||||
else:
|
||||
# No query or no scoring - add as usual
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
# Add entry to results (scoring will be done later)
|
||||
if live or extract:
|
||||
await self._cache_set(cache_kind, url, entry)
|
||||
res_list.append(entry)
|
||||
|
||||
async def _head_ok(self, url: str, timeout: int) -> bool:
|
||||
try:
|
||||
@@ -1436,8 +1403,19 @@ class AsyncUrlSeeder:
|
||||
scores = bm25.get_scores(query_tokens)
|
||||
|
||||
# Normalize scores to 0-1 range
|
||||
max_score = max(scores) if max(scores) > 0 else 1.0
|
||||
normalized_scores = [score / max_score for score in scores]
|
||||
# BM25 can return negative scores, so we need to handle the full range
|
||||
if len(scores) == 0:
|
||||
return []
|
||||
|
||||
min_score = min(scores)
|
||||
max_score = max(scores)
|
||||
|
||||
# If all scores are the same, return 0.5 for all
|
||||
if max_score == min_score:
|
||||
return [0.5] * len(scores)
|
||||
|
||||
# Normalize to 0-1 range using min-max normalization
|
||||
normalized_scores = [(score - min_score) / (max_score - min_score) for score in scores]
|
||||
|
||||
return normalized_scores
|
||||
except Exception as e:
|
||||
|
||||
@@ -363,7 +363,7 @@ class AsyncWebCrawler:
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html=True if url.startswith("raw:") else False,
|
||||
redirected_url=async_response.redirected_url,
|
||||
redirected_url=async_response.redirected_url,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
@@ -502,11 +502,14 @@ class AsyncWebCrawler:
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
tables = media.pop("tables", [])
|
||||
links = result.links.model_dump()
|
||||
# media = result.media.model_dump()
|
||||
# tables = media.pop("tables", [])
|
||||
# links = result.links.model_dump()
|
||||
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
|
||||
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
|
||||
metadata = result.metadata
|
||||
|
||||
|
||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||
|
||||
################################
|
||||
@@ -588,11 +591,13 @@ class AsyncWebCrawler:
|
||||
# Choose content based on input_format
|
||||
content_format = config.extraction_strategy.input_format
|
||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||
self.logger.warning(
|
||||
message="Fit markdown requested but not available. Falling back to raw markdown.",
|
||||
tag="EXTRACT",
|
||||
params={"url": _url},
|
||||
)
|
||||
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - t1,
|
||||
tag="EXTRACT",
|
||||
)
|
||||
content_format = "markdown"
|
||||
|
||||
content = {
|
||||
@@ -616,11 +621,12 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Log extraction completion
|
||||
self.logger.info(
|
||||
message="Completed for {url:.50}... | Time: {timing}s",
|
||||
tag="EXTRACT",
|
||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||
)
|
||||
self.logger.url_status(
|
||||
url=_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - t1,
|
||||
tag="EXTRACT",
|
||||
)
|
||||
|
||||
# Apply HTML formatting if requested
|
||||
if config.prettiify:
|
||||
@@ -647,7 +653,7 @@ class AsyncWebCrawler:
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
@@ -668,7 +674,9 @@ class AsyncWebCrawler:
|
||||
|
||||
Args:
|
||||
urls: List of URLs to crawl
|
||||
config: Configuration object controlling crawl behavior for all URLs
|
||||
config: Configuration object(s) controlling crawl behavior. Can be:
|
||||
- Single CrawlerRunConfig: Used for all URLs
|
||||
- List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
|
||||
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
|
||||
[other parameters maintained for backwards compatibility]
|
||||
|
||||
@@ -733,7 +741,11 @@ class AsyncWebCrawler:
|
||||
or task_result.result
|
||||
)
|
||||
|
||||
stream = config.stream
|
||||
# Handle stream setting - use first config's stream setting if config is a list
|
||||
if isinstance(config, list):
|
||||
stream = config[0].stream if config else False
|
||||
else:
|
||||
stream = config.stream
|
||||
|
||||
if stream:
|
||||
|
||||
|
||||
293
crawl4ai/browser_adapter.py
Normal file
293
crawl4ai/browser_adapter.py
Normal file
@@ -0,0 +1,293 @@
|
||||
# browser_adapter.py
|
||||
"""
|
||||
Browser adapter for Crawl4AI to support both Playwright and undetected browsers
|
||||
with minimal changes to existing codebase.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
import time
|
||||
import json
|
||||
|
||||
# Import both, but use conditionally
|
||||
try:
|
||||
from playwright.async_api import Page
|
||||
except ImportError:
|
||||
Page = Any
|
||||
|
||||
try:
|
||||
from patchright.async_api import Page as UndetectedPage
|
||||
except ImportError:
|
||||
UndetectedPage = Any
|
||||
|
||||
|
||||
class BrowserAdapter(ABC):
|
||||
"""Abstract adapter for browser-specific operations"""
|
||||
|
||||
@abstractmethod
|
||||
async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
|
||||
"""Execute JavaScript in the page"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console message capturing, returns handler function if needed"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capturing, returns handler function if needed"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def retrieve_console_messages(self, page: Page) -> List[Dict]:
|
||||
"""Retrieve captured console messages (for undetected browsers)"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Clean up console event listeners"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_imports(self) -> tuple:
|
||||
"""Get the appropriate imports for this adapter"""
|
||||
pass
|
||||
|
||||
|
||||
class PlaywrightAdapter(BrowserAdapter):
|
||||
"""Adapter for standard Playwright"""
|
||||
|
||||
async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
|
||||
"""Standard Playwright evaluate"""
|
||||
if arg is not None:
|
||||
return await page.evaluate(expression, arg)
|
||||
return await page.evaluate(expression)
|
||||
|
||||
async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console capture using Playwright's event system"""
|
||||
def handle_console_capture(msg):
|
||||
try:
|
||||
message_type = "unknown"
|
||||
try:
|
||||
message_type = msg.type
|
||||
except:
|
||||
pass
|
||||
|
||||
message_text = "unknown"
|
||||
try:
|
||||
message_text = msg.text
|
||||
except:
|
||||
pass
|
||||
|
||||
entry = {
|
||||
"type": message_type,
|
||||
"text": message_text,
|
||||
"timestamp": time.time()
|
||||
}
|
||||
|
||||
captured_console.append(entry)
|
||||
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "console_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("console", handle_console_capture)
|
||||
return handle_console_capture
|
||||
|
||||
async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capture using Playwright's event system"""
|
||||
def handle_pageerror_capture(err):
|
||||
try:
|
||||
error_message = "Unknown error"
|
||||
try:
|
||||
error_message = err.message
|
||||
except:
|
||||
pass
|
||||
|
||||
error_stack = ""
|
||||
try:
|
||||
error_stack = err.stack
|
||||
except:
|
||||
pass
|
||||
|
||||
captured_console.append({
|
||||
"type": "error",
|
||||
"text": error_message,
|
||||
"stack": error_stack,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "pageerror_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("pageerror", handle_pageerror_capture)
|
||||
return handle_pageerror_capture
|
||||
|
||||
async def retrieve_console_messages(self, page: Page) -> List[Dict]:
|
||||
"""Not needed for Playwright - messages are captured via events"""
|
||||
return []
|
||||
|
||||
async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Remove event listeners"""
|
||||
if handle_console:
|
||||
page.remove_listener("console", handle_console)
|
||||
if handle_error:
|
||||
page.remove_listener("pageerror", handle_error)
|
||||
|
||||
def get_imports(self) -> tuple:
|
||||
"""Return Playwright imports"""
|
||||
from playwright.async_api import Page, Error
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class UndetectedAdapter(BrowserAdapter):
|
||||
"""Adapter for undetected browser automation with stealth features"""
|
||||
|
||||
def __init__(self):
|
||||
self._console_script_injected = {}
|
||||
|
||||
async def evaluate(self, page: UndetectedPage, expression: str, arg: Any = None) -> Any:
|
||||
"""Undetected browser evaluate with isolated context"""
|
||||
# For most evaluations, use isolated context for stealth
|
||||
# Only use non-isolated when we need to access our injected console capture
|
||||
isolated = not (
|
||||
"__console" in expression or
|
||||
"__captured" in expression or
|
||||
"__error" in expression or
|
||||
"window.__" in expression
|
||||
)
|
||||
|
||||
if arg is not None:
|
||||
return await page.evaluate(expression, arg, isolated_context=isolated)
|
||||
return await page.evaluate(expression, isolated_context=isolated)
|
||||
|
||||
async def setup_console_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console capture using JavaScript injection for undetected browsers"""
|
||||
if not self._console_script_injected.get(page, False):
|
||||
await page.add_init_script("""
|
||||
// Initialize console capture
|
||||
window.__capturedConsole = [];
|
||||
window.__capturedErrors = [];
|
||||
|
||||
// Store original console methods
|
||||
const originalConsole = {};
|
||||
['log', 'info', 'warn', 'error', 'debug'].forEach(method => {
|
||||
originalConsole[method] = console[method];
|
||||
console[method] = function(...args) {
|
||||
try {
|
||||
window.__capturedConsole.push({
|
||||
type: method,
|
||||
text: args.map(arg => {
|
||||
try {
|
||||
if (typeof arg === 'object') {
|
||||
return JSON.stringify(arg);
|
||||
}
|
||||
return String(arg);
|
||||
} catch (e) {
|
||||
return '[Object]';
|
||||
}
|
||||
}).join(' '),
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently to avoid detection
|
||||
}
|
||||
|
||||
// Call original method
|
||||
originalConsole[method].apply(console, args);
|
||||
};
|
||||
});
|
||||
""")
|
||||
self._console_script_injected[page] = True
|
||||
|
||||
return None # No handler function needed for undetected browser
|
||||
|
||||
async def setup_error_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capture using JavaScript injection for undetected browsers"""
|
||||
if not self._console_script_injected.get(page, False):
|
||||
await page.add_init_script("""
|
||||
// Capture errors
|
||||
window.addEventListener('error', (event) => {
|
||||
try {
|
||||
window.__capturedErrors.push({
|
||||
type: 'error',
|
||||
text: event.message,
|
||||
stack: event.error ? event.error.stack : '',
|
||||
filename: event.filename,
|
||||
lineno: event.lineno,
|
||||
colno: event.colno,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently
|
||||
}
|
||||
});
|
||||
|
||||
// Capture unhandled promise rejections
|
||||
window.addEventListener('unhandledrejection', (event) => {
|
||||
try {
|
||||
window.__capturedErrors.push({
|
||||
type: 'unhandledrejection',
|
||||
text: event.reason ? String(event.reason) : 'Unhandled Promise Rejection',
|
||||
stack: event.reason && event.reason.stack ? event.reason.stack : '',
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently
|
||||
}
|
||||
});
|
||||
""")
|
||||
self._console_script_injected[page] = True
|
||||
|
||||
return None # No handler function needed for undetected browser
|
||||
|
||||
async def retrieve_console_messages(self, page: UndetectedPage) -> List[Dict]:
|
||||
"""Retrieve captured console messages and errors from the page"""
|
||||
messages = []
|
||||
|
||||
try:
|
||||
# Get console messages
|
||||
console_messages = await page.evaluate(
|
||||
"() => { const msgs = window.__capturedConsole || []; window.__capturedConsole = []; return msgs; }",
|
||||
isolated_context=False
|
||||
)
|
||||
messages.extend(console_messages)
|
||||
|
||||
# Get errors
|
||||
errors = await page.evaluate(
|
||||
"() => { const errs = window.__capturedErrors || []; window.__capturedErrors = []; return errs; }",
|
||||
isolated_context=False
|
||||
)
|
||||
messages.extend(errors)
|
||||
|
||||
# Convert timestamps from JS to Python format
|
||||
for msg in messages:
|
||||
if 'timestamp' in msg and isinstance(msg['timestamp'], (int, float)):
|
||||
msg['timestamp'] = msg['timestamp'] / 1000.0 # Convert from ms to seconds
|
||||
|
||||
except Exception:
|
||||
# If retrieval fails, return empty list
|
||||
pass
|
||||
|
||||
return messages
|
||||
|
||||
async def cleanup_console_capture(self, page: UndetectedPage, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Clean up for undetected browser - retrieve final messages"""
|
||||
# For undetected browser, we don't have event listeners to remove
|
||||
# but we should retrieve any final messages
|
||||
final_messages = await self.retrieve_console_messages(page)
|
||||
return final_messages
|
||||
|
||||
def get_imports(self) -> tuple:
|
||||
"""Return undetected browser imports"""
|
||||
from patchright.async_api import Page, Error
|
||||
from patchright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
@@ -14,23 +14,8 @@ import hashlib
|
||||
from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from playwright_stealth import StealthConfig
|
||||
from .utils import get_chromium_path
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
chrome_csi=True,
|
||||
chrome_load_times=True,
|
||||
chrome_runtime=True,
|
||||
navigator_languages=True,
|
||||
navigator_plugins=True,
|
||||
navigator_permissions=True,
|
||||
webgl_vendor=True,
|
||||
outerdimensions=True,
|
||||
navigator_hardware_concurrency=True,
|
||||
media_codecs=True,
|
||||
)
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
"--disable-background-networking",
|
||||
@@ -588,21 +573,26 @@ class BrowserManager:
|
||||
_playwright_instance = None
|
||||
|
||||
@classmethod
|
||||
async def get_playwright(cls):
|
||||
from playwright.async_api import async_playwright
|
||||
async def get_playwright(cls, use_undetected: bool = False):
|
||||
if use_undetected:
|
||||
from patchright.async_api import async_playwright
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
cls._playwright_instance = await async_playwright().start()
|
||||
return cls._playwright_instance
|
||||
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None):
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False):
|
||||
"""
|
||||
Initialize the BrowserManager with a browser configuration.
|
||||
|
||||
Args:
|
||||
browser_config (BrowserConfig): Configuration object containing all browser settings
|
||||
logger: Logger instance for recording events and errors
|
||||
use_undetected (bool): Whether to use undetected browser (Patchright)
|
||||
"""
|
||||
self.config: BrowserConfig = browser_config
|
||||
self.logger = logger
|
||||
self.use_undetected = use_undetected
|
||||
|
||||
# Browser state
|
||||
self.browser = None
|
||||
@@ -616,7 +606,11 @@ class BrowserManager:
|
||||
|
||||
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
||||
self.contexts_by_config = {}
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
|
||||
# Stealth-related attributes
|
||||
self._stealth_instance = None
|
||||
self._stealth_cm = None
|
||||
|
||||
# Initialize ManagedBrowser if needed
|
||||
if self.config.use_managed_browser:
|
||||
@@ -645,9 +639,21 @@ class BrowserManager:
|
||||
if self.playwright is not None:
|
||||
await self.close()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
if self.use_undetected:
|
||||
from patchright.async_api import async_playwright
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
self.playwright = await async_playwright().start()
|
||||
# Initialize playwright with or without stealth
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
# Import stealth only when needed
|
||||
from playwright_stealth import Stealth
|
||||
# Use the recommended stealth wrapper approach
|
||||
self._stealth_instance = Stealth()
|
||||
self._stealth_cm = self._stealth_instance.use_async(async_playwright())
|
||||
self.playwright = await self._stealth_cm.__aenter__()
|
||||
else:
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
@@ -1109,5 +1115,19 @@ class BrowserManager:
|
||||
self.managed_browser = None
|
||||
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
# Handle stealth context manager cleanup if it exists
|
||||
if hasattr(self, '_stealth_cm') and self._stealth_cm is not None:
|
||||
try:
|
||||
await self._stealth_cm.__aexit__(None, None, None)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error closing stealth context: {error}",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self._stealth_cm = None
|
||||
self._stealth_instance = None
|
||||
else:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
@@ -480,7 +480,7 @@ class BrowserProfiler:
|
||||
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
|
||||
exit_option = "4"
|
||||
|
||||
self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
||||
self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
|
||||
choice = input()
|
||||
|
||||
if choice == "1":
|
||||
@@ -637,9 +637,18 @@ class BrowserProfiler:
|
||||
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
|
||||
self.logger.info(f"Headless mode: {headless}", tag="CDP")
|
||||
|
||||
# create browser config
|
||||
browser_config = BrowserConfig(
|
||||
browser_type=browser_type,
|
||||
headless=headless,
|
||||
user_data_dir=profile_path,
|
||||
debugging_port=debugging_port,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Create managed browser instance
|
||||
managed_browser = ManagedBrowser(
|
||||
browser_type=browser_type,
|
||||
browser_config=browser_config,
|
||||
user_data_dir=profile_path,
|
||||
headless=headless,
|
||||
logger=self.logger,
|
||||
|
||||
@@ -27,7 +27,10 @@ from crawl4ai import (
|
||||
PruningContentFilter,
|
||||
BrowserProfiler,
|
||||
DefaultMarkdownGenerator,
|
||||
LLMConfig
|
||||
LLMConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
DFSDeepCrawlStrategy,
|
||||
BestFirstCrawlingStrategy,
|
||||
)
|
||||
from crawl4ai.config import USER_SETTINGS
|
||||
from litellm import completion
|
||||
@@ -1010,13 +1013,15 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
|
||||
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
|
||||
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
|
||||
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
|
||||
@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling")
|
||||
@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl a website and extract content
|
||||
|
||||
Simple Usage:
|
||||
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
|
||||
|
||||
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
||||
|
||||
# Handle deep crawling configuration
|
||||
if deep_crawl:
|
||||
if deep_crawl == "bfs":
|
||||
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "dfs":
|
||||
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "best-first":
|
||||
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
if verbose:
|
||||
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
|
||||
|
||||
config = get_global_config()
|
||||
|
||||
browser_cfg.verbose = config.get("VERBOSE", False)
|
||||
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
|
||||
verbose
|
||||
)
|
||||
|
||||
# Handle deep crawl results (list) vs single result
|
||||
if isinstance(result, list):
|
||||
if len(result) == 0:
|
||||
click.echo("No results found during deep crawling")
|
||||
return
|
||||
# Use the first result for question answering and output
|
||||
main_result = result[0]
|
||||
all_results = result
|
||||
else:
|
||||
# Single result from regular crawling
|
||||
main_result = result
|
||||
all_results = [result]
|
||||
|
||||
# Handle question
|
||||
if question:
|
||||
provider, token = setup_llm_config()
|
||||
markdown = result.markdown.raw_markdown
|
||||
markdown = main_result.markdown.raw_markdown
|
||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||
return
|
||||
|
||||
# Handle output
|
||||
if not output_file:
|
||||
if output == "all":
|
||||
click.echo(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
click.echo(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
click.echo(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
print(result.extracted_content)
|
||||
extracted_items = json.loads(result.extracted_content)
|
||||
print(main_result.extracted_content)
|
||||
extracted_items = json.loads(main_result.extracted_content)
|
||||
click.echo(json.dumps(extracted_items, indent=2))
|
||||
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(result.markdown.raw_markdown)
|
||||
click.echo(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(result.markdown.fit_markdown)
|
||||
click.echo(main_result.markdown.fit_markdown)
|
||||
else:
|
||||
if output == "all":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
f.write(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
f.write(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.extracted_content)
|
||||
f.write(main_result.extracted_content)
|
||||
elif output in ["markdown", "md"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.raw_markdown)
|
||||
f.write(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.fit_markdown)
|
||||
f.write(main_result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
@@ -1354,9 +1401,11 @@ def profiles_cmd():
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl4AI CLI - Web content extraction tool
|
||||
|
||||
Simple Usage:
|
||||
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
||||
bypass_cache=bypass_cache,
|
||||
question=question,
|
||||
verbose=verbose,
|
||||
profile=profile
|
||||
profile=profile,
|
||||
deep_crawl=deep_crawl,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
def main():
|
||||
|
||||
@@ -98,20 +98,20 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
LXML-based implementation for fast web content scraping.
|
||||
|
||||
This is the primary scraping strategy in Crawl4AI, providing high-performance
|
||||
HTML parsing and content extraction using the lxml library.
|
||||
|
||||
Note: WebScrapingStrategy is now an alias for this class to maintain
|
||||
backward compatibility.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||
"""Helper method to safely use logger."""
|
||||
@@ -132,7 +132,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(actual_url, html, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -196,376 +196,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
|
||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
||||
"""
|
||||
Determine if a table element is a data table (not a layout table).
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
**kwargs: Additional keyword arguments including table_score_threshold
|
||||
|
||||
Returns:
|
||||
bool: True if the table is a data table, False otherwise
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.select('thead')) > 0
|
||||
has_tbody = len(table.select('tbody')) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.select('th'))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.select('table')) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get('role', '').lower()
|
||||
if role in {'presentation', 'none'}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.select('tr')
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
col_counts = [len(row.select('td, th')) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.select('caption'):
|
||||
score += 2
|
||||
if table.has_attr('summary') and table['summary']:
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get('table_score_threshold', 7)
|
||||
return score >= threshold
|
||||
|
||||
def extract_table_data(self, table: Tag) -> dict:
|
||||
"""
|
||||
Extract structured data from a table element.
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
|
||||
Returns:
|
||||
dict: Dictionary containing table data (headers, rows, caption, summary)
|
||||
"""
|
||||
caption_elem = table.select_one('caption')
|
||||
caption = caption_elem.get_text().strip() if caption_elem else ""
|
||||
summary = table.get('summary', '').strip()
|
||||
|
||||
# Extract headers with colspan handling
|
||||
headers = []
|
||||
thead_rows = table.select('thead tr')
|
||||
if thead_rows:
|
||||
header_cells = thead_rows[0].select('th')
|
||||
for cell in header_cells:
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
else:
|
||||
first_row = table.select('tr:first-child')
|
||||
if first_row:
|
||||
for cell in first_row[0].select('th, td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
|
||||
# Extract rows with colspan handling
|
||||
rows = []
|
||||
all_rows = table.select('tr')
|
||||
thead = table.select_one('thead')
|
||||
tbody_rows = []
|
||||
|
||||
if thead:
|
||||
thead_rows = thead.select('tr')
|
||||
tbody_rows = [row for row in all_rows if row not in thead_rows]
|
||||
else:
|
||||
if all_rows and all_rows[0].select('th'):
|
||||
tbody_rows = all_rows[1:]
|
||||
else:
|
||||
tbody_rows = all_rows
|
||||
|
||||
for row in tbody_rows:
|
||||
# for row in table.select('tr:not(:has(ancestor::thead))'):
|
||||
row_data = []
|
||||
for cell in row.select('td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
row_data.extend([text] * colspan)
|
||||
if row_data:
|
||||
rows.append(row_data)
|
||||
|
||||
# Align rows with headers
|
||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
||||
aligned_rows = []
|
||||
for row in rows:
|
||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
||||
aligned_rows.append(aligned)
|
||||
|
||||
if not headers:
|
||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
||||
|
||||
return {
|
||||
"headers": headers,
|
||||
"rows": aligned_rows,
|
||||
"caption": caption,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in a HTML tree.
|
||||
|
||||
Args:
|
||||
node (Tag): The root node of the HTML tree.
|
||||
|
||||
Returns:
|
||||
Tag: The flattened HTML tree.
|
||||
"""
|
||||
if isinstance(node, NavigableString):
|
||||
return node
|
||||
if (
|
||||
len(node.contents) == 1
|
||||
and isinstance(node.contents[0], Tag)
|
||||
and node.contents[0].name == node.name
|
||||
):
|
||||
return self.flatten_nested_elements(node.contents[0])
|
||||
node.contents = [self.flatten_nested_elements(child) for child in node.contents]
|
||||
return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
||||
"""
|
||||
Find the closest parent with useful text.
|
||||
|
||||
Args:
|
||||
tag (Tag): The starting tag to search from.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Tag: The closest parent with useful text, or None if not found.
|
||||
"""
|
||||
image_description_min_word_threshold = kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
)
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
current_tag = current_tag.parent
|
||||
# Get the text content of the parent tag
|
||||
if current_tag:
|
||||
text_content = current_tag.get_text(separator=" ", strip=True)
|
||||
# Check if the text content has at least word_count_threshold
|
||||
if len(text_content.split()) >= image_description_min_word_threshold:
|
||||
return text_content
|
||||
return None
|
||||
|
||||
def remove_unwanted_attributes(
|
||||
self, element, important_attrs, keep_data_attributes=False
|
||||
):
|
||||
"""
|
||||
Remove unwanted attributes from an HTML element.
|
||||
|
||||
Args:
|
||||
element (Tag): The HTML element to remove attributes from.
|
||||
important_attrs (list): List of important attributes to keep.
|
||||
keep_data_attributes (bool): Whether to keep data attributes.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
if attr not in important_attrs:
|
||||
if keep_data_attributes:
|
||||
if not attr.startswith("data-"):
|
||||
attrs_to_remove.append(attr)
|
||||
else:
|
||||
attrs_to_remove.append(attr)
|
||||
|
||||
for attr in attrs_to_remove:
|
||||
del element[attr]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
|
||||
"""
|
||||
Process an image element.
|
||||
|
||||
How it works:
|
||||
1. Check if the image has valid display and inside undesired html elements.
|
||||
2. Score an image for it's usefulness.
|
||||
3. Extract image file metadata to extract size and extension.
|
||||
4. Generate a dictionary with the processed image information.
|
||||
5. Return the processed image information.
|
||||
|
||||
Args:
|
||||
img (Tag): The image element to process.
|
||||
url (str): The URL of the page containing the image.
|
||||
index (int): The index of the image in the list of images.
|
||||
total_images (int): The total number of images in the list.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed image information.
|
||||
"""
|
||||
# parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
# if ' ' in u else None}
|
||||
# for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
|
||||
# Constants for checks
|
||||
classes_to_check = frozenset(["button", "icon", "logo"])
|
||||
tags_to_check = frozenset(["button", "input"])
|
||||
image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
|
||||
|
||||
# Pre-fetch commonly used attributes
|
||||
style = img.get("style", "")
|
||||
alt = img.get("alt", "")
|
||||
src = img.get("src", "")
|
||||
data_src = img.get("data-src", "")
|
||||
srcset = img.get("srcset", "")
|
||||
data_srcset = img.get("data-srcset", "")
|
||||
width = img.get("width")
|
||||
height = img.get("height")
|
||||
parent = img.parent
|
||||
parent_classes = parent.get("class", [])
|
||||
|
||||
# Quick validation checks
|
||||
if (
|
||||
"display:none" in style
|
||||
or parent.name in tags_to_check
|
||||
or any(c in cls for c in parent_classes for cls in classes_to_check)
|
||||
or any(c in src for c in classes_to_check)
|
||||
or any(c in alt for c in classes_to_check)
|
||||
):
|
||||
return None
|
||||
|
||||
# Quick score calculation
|
||||
score = 0
|
||||
if width and width.isdigit():
|
||||
width_val = int(width)
|
||||
score += 1 if width_val > 150 else 0
|
||||
if height and height.isdigit():
|
||||
height_val = int(height)
|
||||
score += 1 if height_val > 150 else 0
|
||||
if alt:
|
||||
score += 1
|
||||
score += index / total_images < 0.5
|
||||
|
||||
# image_format = ''
|
||||
# if "data:image/" in src:
|
||||
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
|
||||
# else:
|
||||
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
|
||||
|
||||
# if image_format in ('jpg', 'png', 'webp', 'avif'):
|
||||
# score += 1
|
||||
|
||||
# Check for image format in all possible sources
|
||||
def has_image_format(url):
|
||||
return any(fmt in url.lower() for fmt in image_formats)
|
||||
|
||||
# Score for having proper image sources
|
||||
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
|
||||
score += 1
|
||||
if srcset or data_srcset:
|
||||
score += 1
|
||||
if img.find_parent("picture"):
|
||||
score += 1
|
||||
|
||||
# Detect format from any available source
|
||||
detected_format = None
|
||||
for url in [src, data_src, srcset, data_srcset]:
|
||||
if url:
|
||||
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
|
||||
if format_matches:
|
||||
detected_format = format_matches[0]
|
||||
break
|
||||
|
||||
if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
|
||||
return None
|
||||
|
||||
# Use set for deduplication
|
||||
unique_urls = set()
|
||||
image_variants = []
|
||||
|
||||
# Generate a unique group ID for this set of variants
|
||||
group_id = index
|
||||
|
||||
# Base image info template
|
||||
base_info = {
|
||||
"alt": alt,
|
||||
"desc": self.find_closest_parent_with_useful_text(img, **kwargs),
|
||||
"score": score,
|
||||
"type": "image",
|
||||
"group_id": group_id, # Group ID for this set of variants
|
||||
"format": detected_format,
|
||||
}
|
||||
|
||||
# Inline function for adding variants
|
||||
def add_variant(src, width=None):
|
||||
if src and not src.startswith("data:") and src not in unique_urls:
|
||||
unique_urls.add(src)
|
||||
image_variants.append({**base_info, "src": src, "width": width})
|
||||
|
||||
# Process all sources
|
||||
add_variant(src)
|
||||
add_variant(data_src)
|
||||
|
||||
# Handle srcset and data-srcset in one pass
|
||||
for attr in ("srcset", "data-srcset"):
|
||||
if value := img.get(attr):
|
||||
for source in parse_srcset(value):
|
||||
add_variant(source["url"], source["width"])
|
||||
|
||||
# Quick picture element check
|
||||
if picture := img.find_parent("picture"):
|
||||
for source in picture.find_all("source"):
|
||||
if srcset := source.get("srcset"):
|
||||
for src in parse_srcset(srcset):
|
||||
add_variant(src["url"], src["width"])
|
||||
|
||||
# Framework-specific attributes in one pass
|
||||
for attr, value in img.attrs.items():
|
||||
if (
|
||||
attr.startswith("data-")
|
||||
and ("src" in attr or "srcset" in attr)
|
||||
and "http" in value
|
||||
):
|
||||
add_variant(value)
|
||||
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
@@ -577,7 +210,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
element (lhtml.HtmlElement): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
@@ -595,509 +228,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"external_links_dict": external_links_dict,
|
||||
}
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url,
|
||||
element: PageElement,
|
||||
media: Dict[str, Any],
|
||||
internal_links_dict: Dict[str, Any],
|
||||
external_links_dict: Dict[str, Any],
|
||||
**kwargs,
|
||||
) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
element.extract()
|
||||
return False
|
||||
|
||||
# if element.name == 'img':
|
||||
# process_image(element, url, 0, 1)
|
||||
# return True
|
||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
||||
|
||||
if element.name in ["script", "style", "link", "meta", "noscript"]:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
# Special case for table elements - always preserve structure
|
||||
if element.name in ["tr", "td", "th"]:
|
||||
keep_element = True
|
||||
|
||||
exclude_domains = kwargs.get("exclude_domains", [])
|
||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
# exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
try:
|
||||
if element.name == "a" and element.get("href"):
|
||||
href = element.get("href", "").strip()
|
||||
if not href: # Skip empty hrefs
|
||||
return False
|
||||
|
||||
# url_base = url.split("/")[2]
|
||||
|
||||
# Normalize the URL
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
except ValueError:
|
||||
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
|
||||
return False
|
||||
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": element.get_text().strip(),
|
||||
"title": element.get("title", "").strip(),
|
||||
"base_domain": base_domain,
|
||||
}
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
|
||||
keep_element = True
|
||||
|
||||
# Handle external link exclusions
|
||||
if is_external:
|
||||
link_base_domain = get_base_domain(normalized_href)
|
||||
link_data["base_domain"] = link_base_domain
|
||||
if kwargs.get("exclude_external_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# elif kwargs.get('exclude_social_media_links', False):
|
||||
# if link_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
elif exclude_domains:
|
||||
if link_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
if is_external:
|
||||
if normalized_href not in external_links_dict:
|
||||
external_links_dict[normalized_href] = link_data
|
||||
else:
|
||||
if kwargs.get("exclude_internal_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
if normalized_href not in internal_links_dict:
|
||||
internal_links_dict[normalized_href] = link_data
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Error processing links: {str(e)}")
|
||||
|
||||
try:
|
||||
if element.name == "img":
|
||||
potential_sources = [
|
||||
"src",
|
||||
"data-src",
|
||||
"srcset" "data-lazy-src",
|
||||
"data-original",
|
||||
]
|
||||
src = element.get("src", "")
|
||||
while not src and potential_sources:
|
||||
src = element.get(potential_sources.pop(0), "")
|
||||
if not src:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# If it is srcset pick up the first image
|
||||
if "srcset" in element.attrs:
|
||||
src = element.attrs["srcset"].split(",")[0].split(" ")[0]
|
||||
|
||||
# If image src is internal, then skip
|
||||
if not is_external_url(src, base_domain):
|
||||
return True
|
||||
|
||||
image_src_base_domain = get_base_domain(src)
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if url_base not in src_url_base:
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if any(domain in src for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
# Handle exclude domains
|
||||
if exclude_domains:
|
||||
if image_src_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
return True # Always keep image elements
|
||||
except Exception:
|
||||
raise "Error processing images"
|
||||
|
||||
# Check if flag to remove all forms is set
|
||||
if kwargs.get("remove_forms", False) and element.name == "form":
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
if element.name in ["video", "audio"]:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": element.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
source_tags = element.find_all("source")
|
||||
for source_tag in source_tags:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": source_tag.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
if kwargs.get("only_text", False):
|
||||
element.replace_with(element.get_text())
|
||||
|
||||
try:
|
||||
self.remove_unwanted_attributes(
|
||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
except Exception as e:
|
||||
# print('Error removing unwanted attributes:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error removing unwanted attributes: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
if isinstance(child, NavigableString) and not isinstance(
|
||||
child, Comment
|
||||
):
|
||||
if len(child.strip()) > 0:
|
||||
keep_element = True
|
||||
else:
|
||||
if self._process_element(
|
||||
url,
|
||||
child,
|
||||
media,
|
||||
internal_links_dict,
|
||||
external_links_dict,
|
||||
**kwargs,
|
||||
):
|
||||
keep_element = True
|
||||
|
||||
# Check word count
|
||||
word_count_threshold = kwargs.get(
|
||||
"word_count_threshold", MIN_WORD_THRESHOLD
|
||||
)
|
||||
if not keep_element:
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
keep_element = word_count >= word_count_threshold
|
||||
|
||||
if not keep_element:
|
||||
element.decompose()
|
||||
|
||||
return keep_element
|
||||
except Exception as e:
|
||||
# print('Error processing element:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error processing element: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
return False
|
||||
|
||||
def _scrap(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
parser_type = kwargs.get("parser", "lxml")
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
if body is None:
|
||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This happens before any processing to minimize memory usage
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.find_all('img'):
|
||||
img.decompose()
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"error",
|
||||
message="Error extracting metadata: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
meta = {}
|
||||
|
||||
# Handle tag-based removal first - faster than CSS selection
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if excluded_tags:
|
||||
for element in body.find_all(lambda tag: tag.name in excluded_tags):
|
||||
element.extract()
|
||||
|
||||
# Handle CSS selector-based removal
|
||||
excluded_selector = kwargs.get("excluded_selector", "")
|
||||
if excluded_selector:
|
||||
is_single_selector = (
|
||||
"," not in excluded_selector and " " not in excluded_selector
|
||||
)
|
||||
if is_single_selector:
|
||||
while element := body.select_one(excluded_selector):
|
||||
element.extract()
|
||||
else:
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(copy.deepcopy(el))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
)
|
||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||
if kwargs.get("exclude_social_media_links", False):
|
||||
kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
|
||||
kwargs["exclude_social_media_domains"]
|
||||
)
|
||||
|
||||
result_obj = self.process_element(
|
||||
url,
|
||||
body,
|
||||
word_count_threshold=word_count_threshold,
|
||||
base_domain=base_domain,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
links = {"internal": [], "external": []}
|
||||
media = result_obj["media"]
|
||||
internal_links_dict = result_obj["internal_links_dict"]
|
||||
external_links_dict = result_obj["external_links_dict"]
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
|
||||
media["images"] = [
|
||||
img
|
||||
for result in (
|
||||
self.process_image(img, url, i, len(imgs), **kwargs)
|
||||
for i, img in enumerate(imgs)
|
||||
)
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
# Process tables if not excluded
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.find_all('table')
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
body = self.flatten_nested_elements(body)
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
for img in imgs:
|
||||
src = img.get("src", "")
|
||||
if base64_pattern.match(src):
|
||||
# Replace base64 data with empty string
|
||||
img["src"] = base64_pattern.sub("", src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str_body = content_element.encode_contents().decode("utf-8")
|
||||
except Exception:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
body = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Create a new div with a special ID
|
||||
error_div = body.new_tag("div", id="crawl4ai_error_message")
|
||||
error_div.string = """
|
||||
Crawl4AI Error: This page is not fully supported.
|
||||
|
||||
Possible reasons:
|
||||
1. The page may have restrictions that prevent crawling.
|
||||
2. The page might not be fully loaded.
|
||||
|
||||
Suggestions:
|
||||
- Try calling the crawl function with these parameters:
|
||||
magic=True,
|
||||
- Set headless=False to visualize what's happening on the page.
|
||||
|
||||
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||
"""
|
||||
|
||||
# Append the error div to the body
|
||||
body.append(error_div)
|
||||
str_body = body.encode_contents().decode("utf-8")
|
||||
|
||||
print(
|
||||
"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
|
||||
)
|
||||
self._log(
|
||||
"error",
|
||||
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
|
||||
tag="SCRAPE",
|
||||
)
|
||||
|
||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
def __init__(self, logger=None):
|
||||
super().__init__(logger)
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url: str,
|
||||
@@ -1140,10 +270,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
@@ -1857,3 +987,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
}
|
||||
|
||||
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = LXMLWebScrapingStrategy
|
||||
|
||||
@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
# Calculate how many more URLs we can process in this batch
|
||||
remaining = self.max_pages - self._pages_crawled
|
||||
batch_size = min(BATCH_SIZE, remaining)
|
||||
if batch_size <= 0:
|
||||
# No more pages to crawl
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
batch: List[Tuple[float, int, str, Optional[str]]] = []
|
||||
# Retrieve up to BATCH_SIZE items from the priority queue.
|
||||
for _ in range(BATCH_SIZE):
|
||||
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
yield result
|
||||
|
||||
|
||||
@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
results: List[CrawlResult] = []
|
||||
|
||||
while current_level and not self._cancel_event.is_set():
|
||||
# Check if we've already reached max_pages before starting a new level
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
|
||||
break
|
||||
|
||||
next_level: List[Tuple[str, Optional[str]]] = []
|
||||
urls = [url for url, _ in current_level]
|
||||
|
||||
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
# Count only successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
results_count += 1
|
||||
yield result
|
||||
|
||||
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
# Only discover links from successful crawls
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
# and only discover links from successful crawls
|
||||
if result.success:
|
||||
self._pages_crawled += 1
|
||||
# Check if we've reached the limit during batch processing
|
||||
if self._pages_crawled >= self.max_pages:
|
||||
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
|
||||
break # Exit the generator
|
||||
|
||||
new_links: List[Tuple[str, Optional[str]]] = []
|
||||
await self.link_discovery(result, url, depth, visited, new_links, depths)
|
||||
|
||||
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
|
||||
# Prefix check (/foo/*)
|
||||
if self._simple_prefixes:
|
||||
path = url.split("?")[0]
|
||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
# if any(path.startswith(p) for p in self._simple_prefixes):
|
||||
# result = True
|
||||
# self._update_stats(result)
|
||||
# return not result if self._reverse else result
|
||||
####
|
||||
# Modified the prefix matching logic to ensure path boundary checking:
|
||||
# - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path
|
||||
# - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/`
|
||||
####
|
||||
for prefix in self._simple_prefixes:
|
||||
if path.startswith(prefix):
|
||||
if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
# Complex patterns
|
||||
if self._path_patterns:
|
||||
@@ -337,6 +348,15 @@ class ContentTypeFilter(URLFilter):
|
||||
"sqlite": "application/vnd.sqlite3",
|
||||
# Placeholder
|
||||
"unknown": "application/octet-stream", # Fallback for unknown file types
|
||||
# php
|
||||
"php": "application/x-httpd-php",
|
||||
"php3": "application/x-httpd-php",
|
||||
"php4": "application/x-httpd-php",
|
||||
"php5": "application/x-httpd-php",
|
||||
"php7": "application/x-httpd-php",
|
||||
"phtml": "application/x-httpd-php",
|
||||
"phps": "application/x-httpd-php-source",
|
||||
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
|
||||
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
|
||||
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
|
||||
"""Prepare request data from configs."""
|
||||
if self._token:
|
||||
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
|
||||
return {
|
||||
"urls": urls,
|
||||
"browser_config": browser_config.dump() if browser_config else {},
|
||||
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
|
||||
crawler_config: Optional[CrawlerRunConfig] = None
|
||||
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
"""Execute a crawl operation."""
|
||||
if not self._token:
|
||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
||||
await self._check_server()
|
||||
|
||||
data = self._prepare_request(urls, browser_config, crawler_config)
|
||||
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
|
||||
|
||||
async def get_schema(self) -> Dict[str, Any]:
|
||||
"""Retrieve configuration schemas."""
|
||||
if not self._token:
|
||||
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
|
||||
response = await self._request("GET", "/schema")
|
||||
return response.json()
|
||||
|
||||
@@ -167,4 +165,4 @@ async def main():
|
||||
print(schema)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.total_usage.total_tokens += usage.total_tokens
|
||||
|
||||
try:
|
||||
response = response.choices[0].message.content
|
||||
content = response.choices[0].message.content
|
||||
blocks = None
|
||||
|
||||
if self.force_json_response:
|
||||
blocks = json.loads(response)
|
||||
blocks = json.loads(content)
|
||||
if isinstance(blocks, dict):
|
||||
# If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
|
||||
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
|
||||
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
blocks = blocks
|
||||
else:
|
||||
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
|
||||
blocks = extract_xml_data(["blocks"], response)["blocks"]
|
||||
blocks = extract_xml_data(["blocks"], content)["blocks"]
|
||||
blocks = json.loads(blocks)
|
||||
|
||||
for block in blocks:
|
||||
|
||||
@@ -119,6 +119,32 @@ def install_playwright():
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
|
||||
)
|
||||
|
||||
# Install Patchright browsers for undetected browser support
|
||||
logger.info("Installing Patchright browsers for undetected mode...", tag="INIT")
|
||||
try:
|
||||
subprocess.check_call(
|
||||
[
|
||||
sys.executable,
|
||||
"-m",
|
||||
"patchright",
|
||||
"install",
|
||||
"--with-deps",
|
||||
"--force",
|
||||
"chromium",
|
||||
]
|
||||
)
|
||||
logger.success(
|
||||
"Patchright installation completed successfully.", tag="COMPLETE"
|
||||
)
|
||||
except subprocess.CalledProcessError:
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation."
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation."
|
||||
)
|
||||
|
||||
|
||||
def run_migration():
|
||||
|
||||
@@ -11,7 +11,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
|
||||
79
crawl4ai/memory_utils.py
Normal file
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import psutil
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
def get_true_available_memory_gb() -> float:
|
||||
"""Get truly available memory including inactive pages (cross-platform)"""
|
||||
vm = psutil.virtual_memory()
|
||||
|
||||
if platform.system() == 'Darwin': # macOS
|
||||
# On macOS, we need to include inactive memory too
|
||||
try:
|
||||
# Use vm_stat to get accurate values
|
||||
result = subprocess.run(['vm_stat'], capture_output=True, text=True)
|
||||
lines = result.stdout.split('\n')
|
||||
|
||||
page_size = 16384 # macOS page size
|
||||
pages = {}
|
||||
|
||||
for line in lines:
|
||||
if 'Pages free:' in line:
|
||||
pages['free'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages inactive:' in line:
|
||||
pages['inactive'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages speculative:' in line:
|
||||
pages['speculative'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages purgeable:' in line:
|
||||
pages['purgeable'] = int(line.split()[-1].rstrip('.'))
|
||||
|
||||
# Calculate total available (free + inactive + speculative + purgeable)
|
||||
total_available_pages = (
|
||||
pages.get('free', 0) +
|
||||
pages.get('inactive', 0) +
|
||||
pages.get('speculative', 0) +
|
||||
pages.get('purgeable', 0)
|
||||
)
|
||||
available_gb = (total_available_pages * page_size) / (1024**3)
|
||||
|
||||
return available_gb
|
||||
except:
|
||||
# Fallback to psutil
|
||||
return vm.available / (1024**3)
|
||||
else:
|
||||
# For Windows and Linux, psutil.available is accurate
|
||||
return vm.available / (1024**3)
|
||||
|
||||
|
||||
def get_true_memory_usage_percent() -> float:
|
||||
"""
|
||||
Get memory usage percentage that accounts for platform differences.
|
||||
|
||||
Returns:
|
||||
float: Memory usage percentage (0-100)
|
||||
"""
|
||||
vm = psutil.virtual_memory()
|
||||
total_gb = vm.total / (1024**3)
|
||||
available_gb = get_true_available_memory_gb()
|
||||
|
||||
# Calculate used percentage based on truly available memory
|
||||
used_percent = 100.0 * (total_gb - available_gb) / total_gb
|
||||
|
||||
# Ensure it's within valid range
|
||||
return max(0.0, min(100.0, used_percent))
|
||||
|
||||
|
||||
def get_memory_stats() -> Tuple[float, float, float]:
|
||||
"""
|
||||
Get comprehensive memory statistics.
|
||||
|
||||
Returns:
|
||||
Tuple[float, float, float]: (used_percent, available_gb, total_gb)
|
||||
"""
|
||||
vm = psutil.virtual_memory()
|
||||
total_gb = vm.total / (1024**3)
|
||||
available_gb = get_true_available_memory_gb()
|
||||
used_percent = get_true_memory_usage_percent()
|
||||
|
||||
return used_percent, available_gb, total_gb
|
||||
@@ -1056,7 +1056,7 @@ Your output must:
|
||||
</output_requirements>
|
||||
"""
|
||||
|
||||
GENERATE_SCRIPT_PROMPT = """You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
GENERATE_SCRIPT_PROMPT = r"""You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
|
||||
Your scripts run **before the crawl** to handle dynamic content, user interactions, and other obstacles. You are a master of two tools: raw **JavaScript** and the high-level **Crawl4ai Script (c4a)**.
|
||||
|
||||
|
||||
@@ -23,8 +23,9 @@ SeedingConfig = Union['SeedingConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
|
||||
# Proxy types
|
||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||
@@ -114,7 +115,6 @@ if TYPE_CHECKING:
|
||||
# Content scraping imports
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||
WebScrapingStrategy as WebScrapingStrategyType,
|
||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||
)
|
||||
|
||||
|
||||
@@ -50,6 +50,29 @@ from urllib.parse import (
|
||||
)
|
||||
|
||||
|
||||
# Monkey patch to fix wildcard handling in urllib.robotparser
|
||||
from urllib.robotparser import RuleLine
|
||||
import re
|
||||
|
||||
original_applies_to = RuleLine.applies_to
|
||||
|
||||
def patched_applies_to(self, filename):
|
||||
# Handle wildcards in paths
|
||||
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
|
||||
pattern = self.path.replace('%2A', '*')
|
||||
pattern = re.escape(pattern).replace('\\*', '.*')
|
||||
pattern = '^' + pattern
|
||||
if pattern.endswith('\\$'):
|
||||
pattern = pattern[:-2] + '$'
|
||||
try:
|
||||
return bool(re.match(pattern, filename))
|
||||
except re.error:
|
||||
return original_applies_to(self, filename)
|
||||
return original_applies_to(self, filename)
|
||||
|
||||
RuleLine.applies_to = patched_applies_to
|
||||
# Monkey patch ends
|
||||
|
||||
def chunk_documents(
|
||||
documents: Iterable[str],
|
||||
chunk_token_threshold: int,
|
||||
@@ -318,7 +341,7 @@ class RobotsParser:
|
||||
robots_url = f"{scheme}://{domain}/robots.txt"
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(robots_url, timeout=2) as response:
|
||||
async with session.get(robots_url, timeout=2, ssl=False) as response:
|
||||
if response.status == 200:
|
||||
rules = await response.text()
|
||||
self._cache_rules(domain, rules)
|
||||
@@ -1494,8 +1517,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
head = head[0]
|
||||
|
||||
# Title - using XPath
|
||||
# title = head.xpath(".//title/text()")
|
||||
# metadata["title"] = title[0].strip() if title else None
|
||||
|
||||
# === Title Extraction - New Approach ===
|
||||
# Attempt to extract <title> using XPath
|
||||
title = head.xpath(".//title/text()")
|
||||
metadata["title"] = title[0].strip() if title else None
|
||||
title = title[0] if title else None
|
||||
|
||||
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||
if not title:
|
||||
title_el = doc.find(".//title")
|
||||
title = title_el.text if title_el is not None else None
|
||||
|
||||
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||
if not title:
|
||||
title_candidates = (
|
||||
doc.xpath("//meta[@property='og:title']/@content") or
|
||||
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||
)
|
||||
title = title_candidates[0] if title_candidates else None
|
||||
|
||||
# Strip and assign title
|
||||
metadata["title"] = title.strip() if title else None
|
||||
|
||||
# Meta description - using XPath with multiple attribute conditions
|
||||
description = head.xpath('.//meta[@name="description"]/@content')
|
||||
@@ -1524,6 +1568,14 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
# Article metadata
|
||||
article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
|
||||
for tag in article_tags:
|
||||
property_name = tag.get("property", "").strip()
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
return metadata
|
||||
|
||||
@@ -1599,7 +1651,15 @@ def extract_metadata(html, soup=None):
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
|
||||
# Article metadata
|
||||
article_tags = head.find_all("meta", attrs={"property": re.compile(r"^article:")})
|
||||
for tag in article_tags:
|
||||
property_name = tag.get("property", "").strip()
|
||||
content = tag.get("content", "").strip()
|
||||
if property_name and content:
|
||||
metadata[property_name] = content
|
||||
|
||||
return metadata
|
||||
|
||||
|
||||
@@ -2068,14 +2128,16 @@ def normalize_url(href, base_url):
|
||||
parsed_base = urlparse(base_url)
|
||||
if not parsed_base.scheme or not parsed_base.netloc:
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
|
||||
# Ensure base_url ends with a trailing slash if it's a directory path
|
||||
if not base_url.endswith('/'):
|
||||
base_url = base_url + '/'
|
||||
|
||||
if parsed_base.scheme.lower() not in ["http", "https"]:
|
||||
# Handle special protocols
|
||||
raise ValueError(f"Invalid base URL format: {base_url}")
|
||||
cleaned_href = href.strip()
|
||||
|
||||
# Use urljoin to handle all cases
|
||||
normalized = urljoin(base_url, href.strip())
|
||||
return normalized
|
||||
return urljoin(base_url, cleaned_href)
|
||||
|
||||
|
||||
|
||||
|
||||
def normalize_url(
|
||||
@@ -3301,7 +3363,13 @@ async def get_text_embeddings(
|
||||
# Default: use sentence-transformers
|
||||
else:
|
||||
# Lazy load to avoid importing heavy libraries unless needed
|
||||
from sentence_transformers import SentenceTransformer
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for local embeddings. "
|
||||
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||
)
|
||||
|
||||
# Cache the model in function attribute to avoid reloading
|
||||
if not hasattr(get_text_embeddings, '_models'):
|
||||
|
||||
@@ -5,4 +5,9 @@ ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
|
||||
# Optional: Override the default LLM provider
|
||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
Our latest release candidate is `0.6.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||
|
||||
```bash
|
||||
# Pull the release candidate (recommended for latest features)
|
||||
docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
# Pull the release candidate (for testing new features)
|
||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||
|
||||
# Or pull the latest stable version
|
||||
# Or pull the current stable version (0.6.0)
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
@@ -99,7 +101,7 @@ EOL
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
```
|
||||
|
||||
* **With LLM support:**
|
||||
@@ -110,7 +112,7 @@ EOL
|
||||
--name crawl4ai \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
|
||||
unclecode/crawl4ai:0.7.0-r1
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
|
||||
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r1`)
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
|
||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||
* `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
|
||||
* **`latest` Tag:** Points to the most recent stable version
|
||||
@@ -152,6 +154,29 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the provider.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -160,7 +185,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
|
||||
```bash
|
||||
# Pulls and runs the release candidate from Docker Hub
|
||||
# Automatically selects the correct architecture
|
||||
IMAGE=unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number docker compose up -d
|
||||
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
|
||||
```
|
||||
|
||||
* **Build and Run Locally:**
|
||||
@@ -666,7 +691,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -39,7 +40,9 @@ from utils import (
|
||||
get_base_url,
|
||||
is_task_id,
|
||||
should_cleanup_task,
|
||||
decode_redis_hash
|
||||
decode_redis_hash,
|
||||
get_llm_api_key,
|
||||
validate_llm_provider
|
||||
)
|
||||
|
||||
import psutil, time
|
||||
@@ -88,10 +91,12 @@ async def handle_llm_qa(
|
||||
|
||||
Answer:"""
|
||||
|
||||
# api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
|
||||
response = perform_completion_with_backoff(
|
||||
provider=config["llm"]["provider"],
|
||||
prompt_with_variables=prompt,
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
api_token=get_llm_api_key(config)
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
@@ -109,19 +114,23 @@ async def process_llm_extraction(
|
||||
url: str,
|
||||
instruction: str,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0"
|
||||
cache: str = "0",
|
||||
provider: Optional[str] = None
|
||||
) -> None:
|
||||
"""Process LLM extraction in background."""
|
||||
try:
|
||||
# If config['llm'] has api_key then ignore the api_key_env
|
||||
api_key = ""
|
||||
if "api_key" in config["llm"]:
|
||||
api_key = config["llm"]["api_key"]
|
||||
else:
|
||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||
# Validate provider
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": error_msg
|
||||
})
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=api_key
|
||||
),
|
||||
instruction=instruction,
|
||||
@@ -168,10 +177,19 @@ async def handle_markdown_request(
|
||||
filter_type: FilterType,
|
||||
query: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
try:
|
||||
# Validate provider if using LLM filter
|
||||
if filter_type == FilterType.LLM:
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=error_msg
|
||||
)
|
||||
decoded_url = unquote(url)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
@@ -184,8 +202,8 @@ async def handle_markdown_request(
|
||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=get_llm_api_key(config, provider),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
@@ -229,7 +247,8 @@ async def handle_llm_request(
|
||||
query: Optional[str] = None,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Handle LLM extraction requests."""
|
||||
base_url = get_base_url(request)
|
||||
@@ -259,7 +278,8 @@ async def handle_llm_request(
|
||||
schema,
|
||||
cache,
|
||||
base_url,
|
||||
config
|
||||
config,
|
||||
provider
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -303,7 +323,8 @@ async def create_new_task(
|
||||
schema: Optional[str],
|
||||
cache: str,
|
||||
base_url: str,
|
||||
config: dict
|
||||
config: dict,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Create and initialize a new task."""
|
||||
decoded_url = unquote(input_path)
|
||||
@@ -327,7 +348,8 @@ async def create_new_task(
|
||||
decoded_url,
|
||||
query,
|
||||
schema,
|
||||
cache
|
||||
cache,
|
||||
provider
|
||||
)
|
||||
|
||||
return JSONResponse({
|
||||
@@ -371,6 +393,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
server_memory_mb = _get_memory_mb()
|
||||
result_dict = result.model_dump()
|
||||
result_dict['server_memory_mb'] = server_memory_mb
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||
yield data.encode('utf-8')
|
||||
@@ -443,10 +468,19 @@ async def handle_crawl_request(
|
||||
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
|
||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||
|
||||
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results],
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
@@ -459,7 +493,7 @@ async def handle_crawl_request(
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||
logger.error(f"Error closing crawler during exception handling: {close_e}")
|
||||
logger.error(f"Error closing crawler during exception handling: {str(e)}")
|
||||
|
||||
# Measure memory even on error if possible
|
||||
end_mem_mb_error = _get_memory_mb()
|
||||
@@ -518,7 +552,7 @@ async def handle_stream_crawl_request(
|
||||
# await crawler.close()
|
||||
# except Exception as close_e:
|
||||
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
|
||||
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
|
||||
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
|
||||
# Raising HTTPException here will prevent streaming response
|
||||
raise HTTPException(
|
||||
|
||||
@@ -332,7 +332,7 @@ The `clone()` method:
|
||||
### Key fields to note
|
||||
|
||||
1. **`provider`**:
|
||||
- Which LLM provoder to use.
|
||||
- Which LLM provider to use.
|
||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||
|
||||
2. **`api_token`**:
|
||||
@@ -403,7 +403,7 @@ async def main():
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
options={"ignore_links": True})
|
||||
|
||||
# 4) Crawler run config: skip cache, use extraction
|
||||
run_conf = CrawlerRunConfig(
|
||||
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_web():
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/apple",
|
||||
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_local_file():
|
||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||
file_url = f"file://{local_file_path}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url, config=config)
|
||||
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_raw_html():
|
||||
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
|
||||
raw_html_url = f"raw:{raw_html}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=raw_html_url, config=config)
|
||||
@@ -3845,7 +3845,7 @@ import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
@@ -3856,7 +3856,7 @@ async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Step 1: Crawl the Web URL
|
||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
||||
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||
|
||||
if not result.success:
|
||||
@@ -3871,7 +3871,7 @@ async def main():
|
||||
# Step 2: Crawl from the Local HTML File
|
||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||
file_url = f"file://{html_file_path.resolve()}"
|
||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
||||
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||
|
||||
if not local_result.success:
|
||||
@@ -3887,7 +3887,7 @@ async def main():
|
||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||
raw_html_content = f.read()
|
||||
raw_html_url = f"raw:{raw_html_content}"
|
||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
||||
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||
|
||||
if not raw_result.success:
|
||||
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -4175,8 +4175,13 @@ async def main():
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
||||
```python
|
||||
import os, asyncio
|
||||
from base64 import b64decode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
pdf=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Save screenshot
|
||||
print(f"Screenshot data present: {result.screenshot is not None}")
|
||||
print(f"PDF data present: {result.pdf is not None}")
|
||||
|
||||
if result.screenshot:
|
||||
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
|
||||
with open("wikipedia_screenshot.png", "wb") as f:
|
||||
f.write(b64decode(result.screenshot))
|
||||
|
||||
# Save PDF
|
||||
else:
|
||||
print("[WARN] Screenshot data is None.")
|
||||
|
||||
if result.pdf:
|
||||
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
|
||||
with open("wikipedia_page.pdf", "wb") as f:
|
||||
f.write(result.pdf)
|
||||
|
||||
print("[OK] PDF & screenshot captured.")
|
||||
else:
|
||||
print("[WARN] PDF data is None.")
|
||||
|
||||
else:
|
||||
print("[ERROR]", result.error_message)
|
||||
|
||||
|
||||
@@ -36,6 +36,7 @@ class LlmJobPayload(BaseModel):
|
||||
q: str
|
||||
schema: Optional[str] = None
|
||||
cache: bool = False
|
||||
provider: Optional[str] = None
|
||||
|
||||
|
||||
class CrawlJobPayload(BaseModel):
|
||||
@@ -61,6 +62,7 @@ async def llm_job_enqueue(
|
||||
schema=payload.schema,
|
||||
cache=payload.cache,
|
||||
config=_config,
|
||||
provider=payload.provider,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -12,10 +12,10 @@ class CrawlRequest(BaseModel):
|
||||
class MarkdownRequest(BaseModel):
|
||||
"""Request body for the /md endpoint."""
|
||||
url: str = Field(..., description="Absolute http/https URL to fetch")
|
||||
f: FilterType = Field(FilterType.FIT,
|
||||
description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
|
||||
|
||||
class RawCode(BaseModel):
|
||||
|
||||
@@ -241,7 +241,7 @@ async def get_markdown(
|
||||
raise HTTPException(
|
||||
400, "URL must be absolute and start with http/https")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config
|
||||
body.url, body.f, body.q, body.c, config, body.provider
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
|
||||
@@ -671,6 +671,16 @@
|
||||
method: 'GET',
|
||||
headers: { 'Accept': 'application/json' }
|
||||
});
|
||||
responseData = await response.json();
|
||||
const time = Math.round(performance.now() - startTime);
|
||||
if (!response.ok) {
|
||||
updateStatus('error', time);
|
||||
throw new Error(responseData.error || 'Request failed');
|
||||
}
|
||||
updateStatus('success', time);
|
||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||
document.querySelector('#response-content code').className = 'json hljs';
|
||||
forceHighlightElement(document.querySelector('#response-content code'));
|
||||
} else if (endpoint === 'crawl_stream') {
|
||||
// Stream processing
|
||||
response = await fetch(api, {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import dns.resolver
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@@ -19,10 +20,24 @@ class FilterType(str, Enum):
|
||||
LLM = "llm"
|
||||
|
||||
def load_config() -> Dict:
|
||||
"""Load and return application configuration."""
|
||||
"""Load and return application configuration with environment variable overrides."""
|
||||
config_path = Path(__file__).parent / "config.yml"
|
||||
with open(config_path, "r") as config_file:
|
||||
return yaml.safe_load(config_file)
|
||||
config = yaml.safe_load(config_file)
|
||||
|
||||
# Override LLM provider from environment if set
|
||||
llm_provider = os.environ.get("LLM_PROVIDER")
|
||||
if llm_provider:
|
||||
config["llm"]["provider"] = llm_provider
|
||||
logging.info(f"LLM provider overridden from environment: {llm_provider}")
|
||||
|
||||
# Also support direct API key from environment if the provider-specific key isn't set
|
||||
llm_api_key = os.environ.get("LLM_API_KEY")
|
||||
if llm_api_key and "api_key" not in config["llm"]:
|
||||
config["llm"]["api_key"] = llm_api_key
|
||||
logging.info("LLM API key loaded from LLM_API_KEY environment variable")
|
||||
|
||||
return config
|
||||
|
||||
def setup_logging(config: Dict) -> None:
|
||||
"""Configure application logging."""
|
||||
@@ -56,6 +71,52 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
||||
|
||||
|
||||
|
||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
|
||||
"""Get the appropriate API key based on the LLM provider.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
The API key for the provider, or empty string if not found
|
||||
"""
|
||||
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Check if direct API key is configured
|
||||
if "api_key" in config["llm"]:
|
||||
return config["llm"]["api_key"]
|
||||
|
||||
# Fall back to the configured api_key_env if no match
|
||||
return os.environ.get(config["llm"].get("api_key_env", ""), "")
|
||||
|
||||
|
||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
||||
"""Validate that the LLM provider has an associated API key.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Get the API key for this provider
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
|
||||
if not api_key:
|
||||
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
|
||||
|
||||
return True, ""
|
||||
|
||||
|
||||
def verify_email_domain(email: str) -> bool:
|
||||
try:
|
||||
domain = email.split('@')[1]
|
||||
|
||||
@@ -14,6 +14,7 @@ x-base-config: &base-config
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm # Chromium performance
|
||||
deploy:
|
||||
|
||||
343
docs/blog/release-v0.7.0.md
Normal file
343
docs/blog/release-v0.7.0.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# 🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update
|
||||
|
||||
*January 28, 2025 • 10 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This release introduces fundamental improvements in how Crawl4AI handles modern web complexity through adaptive learning, intelligent content discovery, and advanced extraction capabilities.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
|
||||
**The Problem:** Websites change. Class names shift. IDs disappear. Your carefully crafted selectors break at 3 AM, and you wake up to empty datasets and angry stakeholders.
|
||||
|
||||
**My Solution:** I implemented an adaptive learning system that observes patterns, builds confidence scores, and adjusts extraction strategies on the fly. It's like having a junior developer who gets better at their job with every page they scrape.
|
||||
|
||||
### Technical Deep-Dive
|
||||
|
||||
The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Pattern success rates
|
||||
- Selector stability over time
|
||||
- Content structure variations
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **News Aggregation**: Maintain 95%+ extraction accuracy even as news sites update their templates
|
||||
- **E-commerce Monitoring**: Track product changes across hundreds of stores without constant maintenance
|
||||
- **Research Data Collection**: Build robust academic datasets that survive website redesigns
|
||||
- **Reduced Maintenance**: Cut selector update time by 80% for frequently-changing sites
|
||||
|
||||
## 🌊 Virtual Scroll: Complete Content Capture
|
||||
|
||||
**The Problem:** Modern web apps only render what's visible. Scroll down, new content appears, old content vanishes into the void. Traditional crawlers capture that first viewport and miss 90% of the content. It's like reading only the first page of every book.
|
||||
|
||||
**My Solution:** I built Virtual Scroll support that mimics human browsing behavior, capturing content as it loads and preserving it before the browser's garbage collector strikes.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
```python
|
||||
from crawl4ai import VirtualScrollConfig
|
||||
|
||||
# For social media feeds (Twitter/X style)
|
||||
twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://twitter.com/trending",
|
||||
config=CrawlerRunConfig(
|
||||
virtual_scroll_config=twitter_config,
|
||||
# Combine with other features
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"tweets": {
|
||||
"selector": "[data-testid='tweet']",
|
||||
"fields": {
|
||||
"text": {"selector": "[data-testid='tweetText']", "type": "text"},
|
||||
"likes": {"selector": "[data-testid='like']", "type": "text"}
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Captured {len(result.extracted_content['tweets'])} tweets")
|
||||
```
|
||||
|
||||
**Key Capabilities:**
|
||||
- **DOM Recycling Awareness**: Detects and handles virtual DOM element recycling
|
||||
- **Smart Scroll Physics**: Three modes - container height, page height, or fixed pixels
|
||||
- **Content Preservation**: Captures content before it's destroyed
|
||||
- **Intelligent Stopping**: Stops when no new content appears
|
||||
- **Memory Efficient**: Streams content instead of holding everything in memory
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Social Media Analysis**: Capture entire Twitter threads with hundreds of replies, not just top 10
|
||||
- **E-commerce Scraping**: Extract 500+ products from infinite scroll catalogs vs. 20-50 with traditional methods
|
||||
- **News Aggregation**: Get all articles from modern news sites, not just above-the-fold content
|
||||
- **Research Applications**: Complete data extraction from academic databases using virtual pagination
|
||||
|
||||
## 🔗 Link Preview: Intelligent Link Analysis and Scoring
|
||||
|
||||
**The Problem:** You crawl a page and get 200 links. Which ones matter? Which lead to the content you actually want? Traditional crawlers force you to follow everything or build complex filters.
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
- **Competitive Analysis**: Automatically identify important pages on competitor sites
|
||||
- **Content Discovery**: Build topic-focused crawlers that stay on track
|
||||
- **SEO Audits**: Identify and prioritize high-value internal linking opportunities
|
||||
|
||||
## 🎣 Async URL Seeder: Automated URL Discovery at Scale
|
||||
|
||||
**The Problem:** You want to crawl an entire domain but only have the homepage. Or worse, you want specific content types across thousands of pages. Manual URL discovery? That's a job for machines, not humans.
|
||||
|
||||
**My Solution:** I built Async URL Seeder—a turbocharged URL discovery engine that combines multiple sources with intelligent filtering and relevance scoring.
|
||||
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
- **Sitemap Mining**: Parses robots.txt and all linked sitemaps
|
||||
- **Common Crawl**: Queries the Common Crawl index for historical URLs
|
||||
- **Intelligent Crawling**: Follows links with smart depth control
|
||||
- **Pattern Analysis**: Learns URL structures and generates variations
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Migration Projects**: Discover 10,000+ URLs from legacy sites in under 60 seconds
|
||||
- **Market Research**: Map entire competitor ecosystems automatically
|
||||
- **Academic Research**: Build comprehensive datasets without manual URL collection
|
||||
- **SEO Audits**: Find every indexable page with content scoring
|
||||
- **Content Archival**: Ensure no content is left behind during site migrations
|
||||
|
||||
## ⚡ Performance Optimizations
|
||||
|
||||
This release includes significant performance improvements through optimized resource handling, better concurrency management, and reduced memory footprint.
|
||||
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
- **Startup Time**: 70% faster browser initialization
|
||||
- **Page Loading**: 40% reduction with smart resource blocking
|
||||
- **Extraction**: 3x faster with compiled CSS selectors
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
### Breaking Changes
|
||||
- `link_extractor` renamed to `link_preview` (better reflects functionality)
|
||||
- Minimum Python version now 3.9
|
||||
- `CrawlerConfig` split into `CrawlerRunConfig` and `BrowserConfig`
|
||||
|
||||
### Migration Guide
|
||||
```python
|
||||
# Old (v0.6.x)
|
||||
from crawl4ai import CrawlerConfig
|
||||
config = CrawlerConfig(timeout=30000)
|
||||
|
||||
# New (v0.7.0)
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig
|
||||
browser_config = BrowserConfig(timeout=30000)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
```
|
||||
|
||||
## 🤖 Coming Soon: Intelligent Web Automation
|
||||
|
||||
I'm currently working on bringing advanced automation capabilities to Crawl4AI. This includes:
|
||||
|
||||
- **Crawl Agents**: Autonomous crawlers that understand your goals and adapt their strategies
|
||||
- **Auto JS Generation**: Automatic JavaScript code generation for complex interactions
|
||||
- **Smart Form Handling**: Intelligent form detection and filling
|
||||
- **Context-Aware Actions**: Crawlers that understand page context and make decisions
|
||||
|
||||
These features are under active development and will revolutionize how we approach web automation. Stay tuned!
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.0
|
||||
```
|
||||
|
||||
Check out the [updated documentation](https://docs.crawl4ai.com).
|
||||
|
||||
Questions? Issues? I'm always listening:
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
|
||||
---
|
||||
|
||||
*P.S. If you're using Crawl4AI in production, I'd love to hear about it. Your use cases inspire the next features.*
|
||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
@@ -3,8 +3,8 @@ C4A-Script API Usage Examples
|
||||
Shows how to use the new Result-based API in various scenarios
|
||||
"""
|
||||
|
||||
from c4a_compile import compile, validate, compile_file
|
||||
from c4a_result import CompilationResult, ValidationResult
|
||||
from crawl4ai.script.c4a_compile import compile, validate, compile_file
|
||||
from crawl4ai.script.c4a_result import CompilationResult, ValidationResult
|
||||
import json
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World
|
||||
A concise example showing how to use the C4A-Script compiler
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define your C4A-Script
|
||||
script = """
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World - Error Example
|
||||
Shows how error handling works
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define a script with an error (missing THEN)
|
||||
script = """
|
||||
|
||||
303
docs/examples/demo_multi_config_clean.py
Normal file
303
docs/examples/demo_multi_config_clean.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how to use different crawler configurations for different URL patterns
|
||||
in a single crawl batch with Crawl4AI's multi-config feature.
|
||||
|
||||
Part 1: Understanding URL Matching (Pattern Testing)
|
||||
Part 2: Practical Example with Real Crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
MatchMode
|
||||
)
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
def print_section(title):
|
||||
"""Print a formatted section header"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"{title}")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
|
||||
def test_url_matching(config, test_urls, config_name):
|
||||
"""Test URL matching for a config and show results"""
|
||||
print(f"Config: {config_name}")
|
||||
print(f"Matcher: {config.url_matcher}")
|
||||
if hasattr(config, 'match_mode'):
|
||||
print(f"Mode: {config.match_mode.value}")
|
||||
print("-" * 40)
|
||||
|
||||
for url in test_urls:
|
||||
matches = config.is_match(url)
|
||||
symbol = "✓" if matches else "✗"
|
||||
print(f"{symbol} {url}")
|
||||
print()
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 1: Understanding URL Matching
|
||||
# ==============================================================================
|
||||
|
||||
def demo_part1_pattern_matching():
|
||||
"""Part 1: Learn how URL matching works without crawling"""
|
||||
|
||||
print_section("PART 1: Understanding URL Matching")
|
||||
print("Let's explore different ways to match URLs with configs.\n")
|
||||
|
||||
# Test URLs we'll use throughout
|
||||
test_urls = [
|
||||
"https://example.com/report.pdf",
|
||||
"https://example.com/data.json",
|
||||
"https://example.com/blog/post-1",
|
||||
"https://example.com/article/news",
|
||||
"https://api.example.com/v1/users",
|
||||
"https://example.com/about"
|
||||
]
|
||||
|
||||
# 1.1 Simple String Pattern
|
||||
print("1.1 Simple String Pattern Matching")
|
||||
print("-" * 40)
|
||||
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf"
|
||||
)
|
||||
|
||||
test_url_matching(pdf_config, test_urls, "PDF Config")
|
||||
|
||||
|
||||
# 1.2 Multiple String Patterns
|
||||
print("1.2 Multiple String Patterns (OR logic)")
|
||||
print("-" * 40)
|
||||
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # This is default, shown for clarity
|
||||
)
|
||||
|
||||
test_url_matching(blog_config, test_urls, "Blog/Article Config")
|
||||
|
||||
|
||||
# 1.3 Single Function Matcher
|
||||
print("1.3 Function-based Matching")
|
||||
print("-" * 40)
|
||||
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json')
|
||||
)
|
||||
|
||||
test_url_matching(api_config, test_urls, "API Config")
|
||||
|
||||
|
||||
# 1.4 List of Functions
|
||||
print("1.4 Multiple Functions with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be HTTPS AND contain 'api' AND have version number
|
||||
secure_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'),
|
||||
lambda url: 'api' in url,
|
||||
lambda url: '/v' in url # Version indicator
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(secure_api_config, test_urls, "Secure API Config")
|
||||
|
||||
|
||||
# 1.5 Mixed: String and Function Together
|
||||
print("1.5 Mixed Patterns: String + Function")
|
||||
print("-" * 40)
|
||||
|
||||
# Match JSON files OR any API endpoint
|
||||
json_or_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern
|
||||
lambda url: 'api' in url # Function
|
||||
],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
|
||||
test_url_matching(json_or_api_config, test_urls, "JSON or API Config")
|
||||
|
||||
|
||||
# 1.6 Complex: Multiple Strings + Multiple Functions
|
||||
print("1.6 Complex Matcher: Mixed Types with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be: HTTPS AND (.com domain) AND (blog OR article) AND NOT a PDF
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Function: HTTPS check
|
||||
"*.com/*", # String: .com domain
|
||||
lambda url: any(pattern in url for pattern in ['/blog/', '/article/']), # Function: Blog OR article
|
||||
lambda url: not url.endswith('.pdf') # Function: Not PDF
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(complex_config, test_urls, "Complex Mixed Config")
|
||||
|
||||
print("\n✅ Key Takeaway: First matching config wins when passed to arun_many()!")
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 2: Practical Multi-URL Crawling
|
||||
# ==============================================================================
|
||||
|
||||
async def demo_part2_practical_crawling():
|
||||
"""Part 2: Real-world example with different content types"""
|
||||
|
||||
print_section("PART 2: Practical Multi-URL Crawling")
|
||||
print("Now let's see multi-config in action with real URLs.\n")
|
||||
|
||||
# Create specialized configs for different content types
|
||||
configs = [
|
||||
# Config 1: PDF documents - only match files ending with .pdf
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Config 2: Blog/article pages with content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Config 3: Dynamic pages requiring JavaScript
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);" # Scroll to load content
|
||||
),
|
||||
|
||||
# Config 4: Mixed matcher - API endpoints (string OR function)
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern for JSON files
|
||||
lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints
|
||||
],
|
||||
match_mode=MatchMode.OR,
|
||||
),
|
||||
|
||||
# Config 5: Complex matcher - Secure documentation sites
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # String: .org domain
|
||||
lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']), # Has docs
|
||||
lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON
|
||||
],
|
||||
match_mode=MatchMode.AND,
|
||||
# wait_for="css:.content, css:article" # Wait for content to load
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
# CrawlerRunConfig() # No url_matcher means it matches everything (use it as fallback)
|
||||
]
|
||||
|
||||
# URLs to crawl - each will use a different config
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → PDF config
|
||||
"https://blog.python.org/", # → Blog config with content filter
|
||||
"https://github.com/microsoft/playwright", # → JS config
|
||||
"https://httpbin.org/json", # → Mixed matcher config (API)
|
||||
"https://docs.python.org/3/reference/", # → Complex matcher config
|
||||
"https://www.w3schools.com/", # → Default config, if you uncomment the default config line above, if not you will see `Error: No matching configuration`
|
||||
]
|
||||
|
||||
print("URLs to crawl:")
|
||||
for i, url in enumerate(urls, 1):
|
||||
print(f"{i}. {url}")
|
||||
|
||||
print("\nCrawling with appropriate config for each URL...\n")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs
|
||||
)
|
||||
|
||||
# Display results
|
||||
print("Results:")
|
||||
print("-" * 60)
|
||||
|
||||
for result in results:
|
||||
if result.success:
|
||||
# Determine which config was used
|
||||
config_type = "Default"
|
||||
if result.url.endswith('.pdf'):
|
||||
config_type = "PDF Strategy"
|
||||
elif any(pattern in result.url for pattern in ['blog', 'python.org']) and 'docs' not in result.url:
|
||||
config_type = "Blog + Content Filter"
|
||||
elif 'github.com' in result.url:
|
||||
config_type = "JavaScript Enabled"
|
||||
elif 'httpbin.org' in result.url or result.url.endswith('.json'):
|
||||
config_type = "Mixed Matcher (API)"
|
||||
elif 'docs.python.org' in result.url:
|
||||
config_type = "Complex Matcher (Secure Docs)"
|
||||
|
||||
print(f"\n✓ {result.url}")
|
||||
print(f" Config used: {config_type}")
|
||||
print(f" Content size: {len(result.markdown)} chars")
|
||||
|
||||
# Show if we have fit_markdown (from content filter)
|
||||
if hasattr(result.markdown, 'fit_markdown') and result.markdown.fit_markdown:
|
||||
print(f" Fit markdown size: {len(result.markdown.fit_markdown)} chars")
|
||||
reduction = (1 - len(result.markdown.fit_markdown) / len(result.markdown)) * 100
|
||||
print(f" Content reduced by: {reduction:.1f}%")
|
||||
|
||||
# Show extracted data if using extraction strategy
|
||||
if hasattr(result, 'extracted_content') and result.extracted_content:
|
||||
print(f" Extracted data: {str(result.extracted_content)[:100]}...")
|
||||
else:
|
||||
print(f"\n✗ {result.url}")
|
||||
print(f" Error: {result.error_message}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ Multi-config crawling complete!")
|
||||
print("\nBenefits demonstrated:")
|
||||
print("- PDFs handled with specialized scraper")
|
||||
print("- Blog content filtered for relevance")
|
||||
print("- JavaScript executed only where needed")
|
||||
print("- Mixed matchers (string + function) for flexible matching")
|
||||
print("- Complex matchers for precise URL targeting")
|
||||
print("- Each URL got optimal configuration automatically!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run both parts of the demo"""
|
||||
|
||||
print("""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how Crawl4AI can use different configurations
|
||||
for different URLs in a single batch.
|
||||
""")
|
||||
|
||||
# Part 1: Pattern matching
|
||||
demo_part1_pattern_matching()
|
||||
|
||||
print("\nPress Enter to continue to Part 2...")
|
||||
try:
|
||||
input()
|
||||
except EOFError:
|
||||
# Running in non-interactive mode, skip input
|
||||
pass
|
||||
|
||||
# Part 2: Practical crawling
|
||||
await demo_part2_practical_crawling()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
57
docs/examples/hello_world_undetected.py
Normal file
57
docs/examples/hello_world_undetected.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
CrawlResult,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Enable console capture to test adapter
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -18,7 +18,7 @@ Usage:
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
|
||||
|
||||
@@ -1,43 +1,55 @@
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
url = "https://openai.com/api/pricing/"
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from typing import Dict
|
||||
import os
|
||||
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||
output_fee: str = Field(
|
||||
..., description="Fee for output token for the OpenAI model."
|
||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
|
||||
|
||||
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
|
||||
print(f"\n--- Extracting Structured Data with {provider} ---")
|
||||
|
||||
if api_token is None and provider != "ollama":
|
||||
print(f"API token is required for {provider}. Skipping this example.")
|
||||
return
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
|
||||
if extra_headers:
|
||||
extra_args["extra_headers"] = extra_headers
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(provider=provider, api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
Do not miss any models in the entire content.""",
|
||||
extra_args=extra_args,
|
||||
),
|
||||
)
|
||||
|
||||
async def main():
|
||||
# Use AsyncWebCrawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||
"fees for input and output tokens. Make sure not to miss anything in the entire content. "
|
||||
"One extracted model JSON format should look like this: "
|
||||
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }',
|
||||
),
|
||||
url="https://openai.com/api/pricing/",
|
||||
config=crawler_config
|
||||
)
|
||||
print("Success:", result.success)
|
||||
model_fees = json.loads(result.extracted_content)
|
||||
print(len(model_fees))
|
||||
|
||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
print(result.extracted_content)
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(
|
||||
extract_structured_data_using_llm(
|
||||
provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY")
|
||||
)
|
||||
)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
@@ -57,7 +58,7 @@ methods_to_profile = [
|
||||
|
||||
|
||||
# Apply decorators to both strategies
|
||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
||||
for strategy, name in [(LXMLWebScrapingStrategy, "LXML")]:
|
||||
for method in methods_to_profile:
|
||||
apply_decorators(strategy, method, name)
|
||||
|
||||
@@ -85,7 +86,7 @@ def generate_large_html(n_elements=1000):
|
||||
|
||||
def test_scraping():
|
||||
# Initialize both scrapers
|
||||
original_scraper = WebScrapingStrategy()
|
||||
original_scraper = LXMLWebScrapingStrategy()
|
||||
selected_scraper = LXMLWebScrapingStrategy()
|
||||
|
||||
# Generate test HTML
|
||||
|
||||
59
docs/examples/simple_anti_bot_examples.py
Normal file
59
docs/examples/simple_anti_bot_examples.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Example 1: Stealth Mode
|
||||
async def stealth_mode_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 2: Undetected Browser
|
||||
async def undetected_browser_example():
|
||||
browser_config = BrowserConfig(
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 3: Both Combined
|
||||
async def combined_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Run examples
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(stealth_mode_example())
|
||||
asyncio.run(undetected_browser_example())
|
||||
asyncio.run(combined_example())
|
||||
522
docs/examples/stealth_mode_example.py
Normal file
522
docs/examples/stealth_mode_example.py
Normal file
@@ -0,0 +1,522 @@
|
||||
"""
|
||||
Stealth Mode Example with Crawl4AI
|
||||
|
||||
This example demonstrates how to use the stealth mode feature to bypass basic bot detection.
|
||||
The stealth mode uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
that are commonly used to detect automated browsers.
|
||||
|
||||
Key features demonstrated:
|
||||
1. Comparing crawling with and without stealth mode
|
||||
2. Testing against bot detection sites
|
||||
3. Accessing sites that block automated browsers
|
||||
4. Best practices for stealth crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Initialize colorama for colored output
|
||||
init()
|
||||
|
||||
# Create a logger for better output
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
|
||||
async def test_bot_detection(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against a bot detection service"""
|
||||
|
||||
logger.info(
|
||||
f"Testing bot detection with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
# Configure browser with or without stealth
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Use False to see the browser in action
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# JavaScript to extract bot detection results
|
||||
detection_script = """
|
||||
// Comprehensive bot detection checks
|
||||
(() => {
|
||||
const detectionResults = {
|
||||
// Basic WebDriver detection
|
||||
webdriver: navigator.webdriver,
|
||||
|
||||
// Chrome specific
|
||||
chrome: !!window.chrome,
|
||||
chromeRuntime: !!window.chrome?.runtime,
|
||||
|
||||
// Automation indicators
|
||||
automationControlled: navigator.webdriver,
|
||||
|
||||
// Permissions API
|
||||
permissionsPresent: !!navigator.permissions?.query,
|
||||
|
||||
// Plugins
|
||||
pluginsLength: navigator.plugins.length,
|
||||
pluginsArray: Array.from(navigator.plugins).map(p => p.name),
|
||||
|
||||
// Languages
|
||||
languages: navigator.languages,
|
||||
language: navigator.language,
|
||||
|
||||
// User agent
|
||||
userAgent: navigator.userAgent,
|
||||
|
||||
// Screen and window properties
|
||||
screen: {
|
||||
width: screen.width,
|
||||
height: screen.height,
|
||||
availWidth: screen.availWidth,
|
||||
availHeight: screen.availHeight,
|
||||
colorDepth: screen.colorDepth,
|
||||
pixelDepth: screen.pixelDepth
|
||||
},
|
||||
|
||||
// WebGL vendor
|
||||
webglVendor: (() => {
|
||||
try {
|
||||
const canvas = document.createElement('canvas');
|
||||
const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
|
||||
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
||||
return gl.getParameter(ext.UNMASKED_VENDOR_WEBGL);
|
||||
} catch (e) {
|
||||
return 'Error';
|
||||
}
|
||||
})(),
|
||||
|
||||
// Platform
|
||||
platform: navigator.platform,
|
||||
|
||||
// Hardware concurrency
|
||||
hardwareConcurrency: navigator.hardwareConcurrency,
|
||||
|
||||
// Device memory
|
||||
deviceMemory: navigator.deviceMemory,
|
||||
|
||||
// Connection
|
||||
connection: navigator.connection?.effectiveType
|
||||
};
|
||||
|
||||
// Log results for console capture
|
||||
console.log('DETECTION_RESULTS:', JSON.stringify(detectionResults, null, 2));
|
||||
|
||||
// Return results
|
||||
return detectionResults;
|
||||
})();
|
||||
"""
|
||||
|
||||
# Crawl bot detection test page
|
||||
config = CrawlerRunConfig(
|
||||
js_code=detection_script,
|
||||
capture_console_messages=True,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0 # Give time for all checks to complete
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Extract detection results from console
|
||||
detection_data = None
|
||||
for msg in result.console_messages or []:
|
||||
if "DETECTION_RESULTS:" in msg.get("text", ""):
|
||||
try:
|
||||
json_str = msg["text"].replace("DETECTION_RESULTS:", "").strip()
|
||||
detection_data = json.loads(json_str)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Also try to get from JavaScript execution result
|
||||
if not detection_data and result.js_execution_result:
|
||||
detection_data = result.js_execution_result
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"url": result.url,
|
||||
"detection_data": detection_data,
|
||||
"page_title": result.metadata.get("title", ""),
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"error": result.error_message,
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def test_cloudflare_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test accessing a Cloudflare-protected site"""
|
||||
|
||||
logger.info(
|
||||
f"Testing Cloudflare site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Cloudflare detection works better in headless mode with stealth
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
page_timeout=30000, # 30 seconds
|
||||
delay_before_return_html=3.0
|
||||
)
|
||||
|
||||
# Test on a site that often shows Cloudflare challenges
|
||||
result = await crawler.arun(
|
||||
url="https://nowsecure.nl",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check if we hit Cloudflare challenge
|
||||
cloudflare_detected = False
|
||||
if result.html:
|
||||
cloudflare_indicators = [
|
||||
"Checking your browser",
|
||||
"Just a moment",
|
||||
"cf-browser-verification",
|
||||
"cf-challenge",
|
||||
"ray ID"
|
||||
]
|
||||
cloudflare_detected = any(indicator in result.html for indicator in cloudflare_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success,
|
||||
"url": result.url,
|
||||
"cloudflare_challenge": cloudflare_detected,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth,
|
||||
"html_snippet": result.html[:500] if result.html else ""
|
||||
}
|
||||
|
||||
|
||||
async def test_anti_bot_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against sites with anti-bot measures"""
|
||||
|
||||
logger.info(
|
||||
f"Testing anti-bot site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=use_stealth,
|
||||
# Additional browser arguments that help with stealth
|
||||
extra_args=[
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-features=site-per-process"
|
||||
] if not use_stealth else [] # These are automatically applied with stealth
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Some sites check for specific behaviors
|
||||
behavior_script = """
|
||||
(async () => {
|
||||
// Simulate human-like behavior
|
||||
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
|
||||
|
||||
// Random mouse movement
|
||||
const moveX = Math.random() * 100;
|
||||
const moveY = Math.random() * 100;
|
||||
|
||||
// Simulate reading time
|
||||
await sleep(1000 + Math.random() * 2000);
|
||||
|
||||
// Scroll slightly
|
||||
window.scrollBy(0, 100 + Math.random() * 200);
|
||||
|
||||
console.log('Human behavior simulation complete');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=5.0, # Longer delay to appear more human
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
# Test on a site that implements anti-bot measures
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check for common anti-bot blocks
|
||||
blocked_indicators = [
|
||||
"Access Denied",
|
||||
"403 Forbidden",
|
||||
"Security Check",
|
||||
"Verify you are human",
|
||||
"captcha",
|
||||
"challenge"
|
||||
]
|
||||
|
||||
blocked = False
|
||||
if result.html:
|
||||
blocked = any(indicator.lower() in result.html.lower() for indicator in blocked_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success and not blocked,
|
||||
"url": result.url,
|
||||
"blocked": blocked,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def compare_results():
|
||||
"""Run all tests with and without stealth mode and compare results"""
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Crawl4AI Stealth Mode Comparison{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Test 1: Bot Detection
|
||||
print(f"{Fore.YELLOW}1. Bot Detection Test (bot.sannysoft.com){Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_detection = await test_bot_detection(use_stealth=False)
|
||||
if regular_detection["success"] and regular_detection["detection_data"]:
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
data = regular_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# With stealth
|
||||
stealth_detection = await test_bot_detection(use_stealth=True)
|
||||
if stealth_detection["success"] and stealth_detection["detection_data"]:
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
data = stealth_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# Test 2: Cloudflare Site
|
||||
print(f"\n\n{Fore.YELLOW}2. Cloudflare Protected Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_cf = await test_cloudflare_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {regular_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {regular_cf['status_code']}")
|
||||
print(f" • Page Title: {regular_cf['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_cf = await test_cloudflare_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {stealth_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {stealth_cf['status_code']}")
|
||||
print(f" • Page Title: {stealth_cf['page_title']}")
|
||||
|
||||
# Test 3: Anti-bot Site
|
||||
print(f"\n\n{Fore.YELLOW}3. Anti-Bot Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_antibot = await test_anti_bot_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_antibot['success']}")
|
||||
print(f" • Blocked: {regular_antibot['blocked']}")
|
||||
print(f" • Status Code: {regular_antibot['status_code']}")
|
||||
print(f" • Page Title: {regular_antibot['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_antibot = await test_anti_bot_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_antibot['success']}")
|
||||
print(f" • Blocked: {stealth_antibot['blocked']}")
|
||||
print(f" • Status Code: {stealth_antibot['status_code']}")
|
||||
print(f" • Page Title: {stealth_antibot['page_title']}")
|
||||
|
||||
# Summary
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Summary:{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"\nStealth mode helps bypass basic bot detection by:")
|
||||
print(f" • Hiding webdriver property")
|
||||
print(f" • Modifying browser fingerprints")
|
||||
print(f" • Adjusting navigator properties")
|
||||
print(f" • Emulating real browser plugin behavior")
|
||||
print(f"\n{Fore.YELLOW}Note:{Style.RESET_ALL} Stealth mode is not a silver bullet.")
|
||||
print(f"Advanced anti-bot systems may still detect automation.")
|
||||
print(f"Always respect robots.txt and website terms of service.")
|
||||
|
||||
|
||||
async def stealth_best_practices():
|
||||
"""Demonstrate best practices for using stealth mode"""
|
||||
|
||||
print(f"\n\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Stealth Mode Best Practices{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Best Practice 1: Combine with realistic behavior
|
||||
print(f"{Fore.YELLOW}1. Combine with Realistic Behavior:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Simulate human-like behavior
|
||||
human_behavior_script = """
|
||||
(async () => {
|
||||
// Wait random time between actions
|
||||
const randomWait = () => Math.random() * 2000 + 1000;
|
||||
|
||||
// Simulate reading
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
|
||||
// Smooth scroll
|
||||
const smoothScroll = async () => {
|
||||
const totalHeight = document.body.scrollHeight;
|
||||
const viewHeight = window.innerHeight;
|
||||
let currentPosition = 0;
|
||||
|
||||
while (currentPosition < totalHeight - viewHeight) {
|
||||
const scrollAmount = Math.random() * 300 + 100;
|
||||
window.scrollBy({
|
||||
top: scrollAmount,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
currentPosition += scrollAmount;
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
}
|
||||
};
|
||||
|
||||
await smoothScroll();
|
||||
console.log('Human-like behavior simulation completed');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=human_behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=3.0,
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Simulated human-like scrolling and reading patterns")
|
||||
print(f" ✓ Added random delays between actions")
|
||||
print(f" ✓ Result: {result.success}")
|
||||
|
||||
# Best Practice 2: Use appropriate viewport and user agent
|
||||
print(f"\n{Fore.YELLOW}2. Use Realistic Viewport and User Agent:{Style.RESET_ALL}")
|
||||
|
||||
# Get a realistic user agent
|
||||
from crawl4ai.user_agent_generator import UserAgentGenerator
|
||||
ua_generator = UserAgentGenerator()
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=ua_generator.generate(device_type="desktop", browser_type="chrome")
|
||||
)
|
||||
|
||||
print(f" ✓ Using realistic viewport: 1920x1080")
|
||||
print(f" ✓ Using current Chrome user agent")
|
||||
print(f" ✓ Stealth mode will ensure consistency")
|
||||
|
||||
# Best Practice 3: Manage request rate
|
||||
print(f"\n{Fore.YELLOW}3. Manage Request Rate:{Style.RESET_ALL}")
|
||||
print(f" ✓ Add delays between requests")
|
||||
print(f" ✓ Randomize timing patterns")
|
||||
print(f" ✓ Respect robots.txt")
|
||||
|
||||
# Best Practice 4: Session management
|
||||
print(f"\n{Fore.YELLOW}4. Use Session Management:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Create a session for multiple requests
|
||||
session_id = "stealth_session_1"
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
# First request
|
||||
result1 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Subsequent request reuses the same browser context
|
||||
result2 = await crawler.arun(
|
||||
url="https://example.com/about",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Reused browser session for multiple requests")
|
||||
print(f" ✓ Maintains cookies and state between requests")
|
||||
print(f" ✓ More efficient and realistic browsing pattern")
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
|
||||
# Run comparison tests
|
||||
await compare_results()
|
||||
|
||||
# Show best practices
|
||||
await stealth_best_practices()
|
||||
|
||||
print(f"\n{Fore.GREEN}Examples completed!{Style.RESET_ALL}")
|
||||
print(f"\n{Fore.YELLOW}Remember:{Style.RESET_ALL}")
|
||||
print(f"• Stealth mode helps with basic bot detection")
|
||||
print(f"• Always respect website terms of service")
|
||||
print(f"• Consider rate limiting and ethical scraping practices")
|
||||
print(f"• For advanced protection, consider additional measures")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
215
docs/examples/stealth_mode_quick_start.py
Normal file
215
docs/examples/stealth_mode_quick_start.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Quick Start: Using Stealth Mode in Crawl4AI
|
||||
|
||||
This example shows practical use cases for the stealth mode feature.
|
||||
Stealth mode helps bypass basic bot detection mechanisms.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def example_1_basic_stealth():
|
||||
"""Example 1: Basic stealth mode usage"""
|
||||
print("\n=== Example 1: Basic Stealth Mode ===")
|
||||
|
||||
# Enable stealth mode in browser config
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # This is the key parameter
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
print(f"✓ Crawled {result.url} successfully")
|
||||
print(f"✓ Title: {result.metadata.get('title', 'N/A')}")
|
||||
|
||||
|
||||
async def example_2_stealth_with_screenshot():
|
||||
"""Example 2: Stealth mode with screenshot to show detection results"""
|
||||
print("\n=== Example 2: Stealth Mode Visual Verification ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False # Set to False to see the browser
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully crawled bot detection site")
|
||||
print(f"✓ With stealth enabled, many detection tests should show as passed")
|
||||
|
||||
if result.screenshot:
|
||||
# Save screenshot for verification
|
||||
import base64
|
||||
with open("stealth_detection_results.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f"✓ Screenshot saved as 'stealth_detection_results.png'")
|
||||
print(f" Check the screenshot to see detection results!")
|
||||
|
||||
|
||||
async def example_3_stealth_for_protected_sites():
|
||||
"""Example 3: Using stealth for sites with bot protection"""
|
||||
print("\n=== Example 3: Stealth for Protected Sites ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Add human-like behavior
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0, # Wait 2 seconds
|
||||
js_code="""
|
||||
// Simulate human-like scrolling
|
||||
window.scrollTo({
|
||||
top: document.body.scrollHeight / 2,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
"""
|
||||
)
|
||||
|
||||
# Try accessing a site that might have bot protection
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/products/slack/reviews",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully accessed protected site")
|
||||
print(f"✓ Retrieved {len(result.html)} characters of HTML")
|
||||
else:
|
||||
print(f"✗ Failed to access site: {result.error_message}")
|
||||
|
||||
|
||||
async def example_4_stealth_with_sessions():
|
||||
"""Example 4: Stealth mode with session management"""
|
||||
print("\n=== Example 4: Stealth + Session Management ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
session_id = "my_stealth_session"
|
||||
|
||||
# First request - establish session
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
result1 = await crawler.arun(
|
||||
url="https://news.ycombinator.com",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ First request completed: {result1.url}")
|
||||
|
||||
# Second request - reuse session
|
||||
await asyncio.sleep(2) # Brief delay between requests
|
||||
|
||||
result2 = await crawler.arun(
|
||||
url="https://news.ycombinator.com/best",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ Second request completed: {result2.url}")
|
||||
print(f"✓ Session reused, maintaining cookies and state")
|
||||
|
||||
|
||||
async def example_5_stealth_comparison():
|
||||
"""Example 5: Compare results with and without stealth using screenshots"""
|
||||
print("\n=== Example 5: Stealth Mode Comparison ===")
|
||||
|
||||
test_url = "https://bot.sannysoft.com"
|
||||
|
||||
# First test WITHOUT stealth
|
||||
print("\nWithout stealth:")
|
||||
regular_config = BrowserConfig(
|
||||
enable_stealth=False,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=regular_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_without_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_without_stealth.png")
|
||||
print(f" Many tests will show as FAILED (red)")
|
||||
|
||||
# Then test WITH stealth
|
||||
print("\nWith stealth:")
|
||||
stealth_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=stealth_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_with_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_with_stealth.png")
|
||||
print(f" More tests should show as PASSED (green)")
|
||||
|
||||
print("\nCompare the two screenshots to see the difference!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
print("Crawl4AI Stealth Mode Examples")
|
||||
print("==============================")
|
||||
|
||||
# Run basic example
|
||||
await example_1_basic_stealth()
|
||||
|
||||
# Run screenshot verification example
|
||||
await example_2_stealth_with_screenshot()
|
||||
|
||||
# Run protected site example
|
||||
await example_3_stealth_for_protected_sites()
|
||||
|
||||
# Run session example
|
||||
await example_4_stealth_with_sessions()
|
||||
|
||||
# Run comparison example
|
||||
await example_5_stealth_comparison()
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("Tips for using stealth mode effectively:")
|
||||
print("- Use realistic viewport sizes (1920x1080, 1366x768)")
|
||||
print("- Add delays between requests to appear more human")
|
||||
print("- Combine with session management for better results")
|
||||
print("- Remember: stealth mode is for legitimate scraping only")
|
||||
print("="*50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
62
docs/examples/stealth_test_simple.py
Normal file
62
docs/examples/stealth_test_simple.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Simple test to verify stealth mode is working
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def test_stealth():
|
||||
"""Test stealth mode effectiveness"""
|
||||
|
||||
# Test WITHOUT stealth
|
||||
print("=== WITHOUT Stealth ===")
|
||||
config1 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config1) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("without_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: without_stealth.png")
|
||||
|
||||
# Test WITH stealth
|
||||
print("\n=== WITH Stealth ===")
|
||||
config2 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config2) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("with_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: with_stealth.png")
|
||||
|
||||
print("\nCheck the screenshots to see the difference in bot detection results!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_stealth())
|
||||
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Basic Undetected Browser Test
|
||||
Simple example to test if undetected mode works
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def test_regular_mode():
|
||||
"""Test with regular browser"""
|
||||
print("Testing Regular Browser Mode...")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Regular Mode - Success: {result.success}")
|
||||
print(f"Regular Mode - Status: {result.status_code}")
|
||||
print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def test_undetected_mode():
|
||||
"""Test with undetected browser"""
|
||||
print("\nTesting Undetected Browser Mode...")
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Undetected Mode - Success: {result.success}")
|
||||
print(f"Undetected Mode - Status: {result.status_code}")
|
||||
print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def main():
|
||||
"""Run both tests"""
|
||||
print("🤖 Crawl4AI Basic Adapter Test\n")
|
||||
|
||||
# Test regular mode
|
||||
regular_success = await test_regular_mode()
|
||||
|
||||
# Test undetected mode
|
||||
undetected_success = await test_undetected_mode()
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*50)
|
||||
print("Summary:")
|
||||
print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}")
|
||||
print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}")
|
||||
print("="*50)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Bot Detection Test - Compare Regular vs Undetected
|
||||
Tests browser fingerprinting differences at bot.sannysoft.com
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Bot detection test site
|
||||
TEST_URL = "https://bot.sannysoft.com"
|
||||
|
||||
def analyze_bot_detection(result: CrawlResult) -> dict:
|
||||
"""Analyze bot detection results from the page"""
|
||||
detections = {
|
||||
"webdriver": False,
|
||||
"headless": False,
|
||||
"automation": False,
|
||||
"user_agent": False,
|
||||
"total_tests": 0,
|
||||
"failed_tests": 0
|
||||
}
|
||||
|
||||
if not result.success or not result.html:
|
||||
return detections
|
||||
|
||||
# Look for specific test results in the HTML
|
||||
html_lower = result.html.lower()
|
||||
|
||||
# Check for common bot indicators
|
||||
if "webdriver" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["webdriver"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "headless" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["headless"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "automation" in html_lower and "detected" in html_lower:
|
||||
detections["automation"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
# Count total tests (approximate)
|
||||
detections["total_tests"] = html_lower.count("test") + html_lower.count("check")
|
||||
|
||||
return detections
|
||||
|
||||
async def test_browser_mode(adapter_name: str, adapter=None):
|
||||
"""Test a browser mode and return results"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing: {adapter_name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Run in headed mode for better results
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
if adapter:
|
||||
# Use undetected mode
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
crawler = AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
)
|
||||
else:
|
||||
# Use regular mode
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
async with crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Let detection scripts run
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=False, # Don't simulate for accurate detection
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
|
||||
if result.success:
|
||||
# Analyze detection results
|
||||
detections = analyze_bot_detection(result)
|
||||
|
||||
print(f"\n🔍 Bot Detection Analysis:")
|
||||
print(f" - WebDriver Detected: {'❌ Yes' if detections['webdriver'] else '✅ No'}")
|
||||
print(f" - Headless Detected: {'❌ Yes' if detections['headless'] else '✅ No'}")
|
||||
print(f" - Automation Detected: {'❌ Yes' if detections['automation'] else '✅ No'}")
|
||||
print(f" - Failed Tests: {detections['failed_tests']}")
|
||||
|
||||
# Show some content
|
||||
if result.markdown.raw_markdown:
|
||||
print(f"\nContent preview:")
|
||||
lines = result.markdown.raw_markdown.split('\n')
|
||||
for line in lines[:20]: # Show first 20 lines
|
||||
if any(keyword in line.lower() for keyword in ['test', 'pass', 'fail', 'yes', 'no']):
|
||||
print(f" {line.strip()}")
|
||||
|
||||
return result, detections if result.success else {}
|
||||
|
||||
async def main():
|
||||
"""Run the comparison"""
|
||||
print("🤖 Crawl4AI - Bot Detection Test")
|
||||
print(f"Testing at: {TEST_URL}")
|
||||
print("This site runs various browser fingerprinting tests\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result, regular_detections = await test_browser_mode("Regular Browser")
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_result, undetected_detections = await test_browser_mode(
|
||||
"Undetected Browser",
|
||||
undetected_adapter
|
||||
)
|
||||
|
||||
# Summary comparison
|
||||
print(f"\n{'='*60}")
|
||||
print("COMPARISON SUMMARY")
|
||||
print(f"{'='*60}")
|
||||
|
||||
print(f"\n{'Test':<25} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*55}")
|
||||
|
||||
if regular_detections and undetected_detections:
|
||||
print(f"{'WebDriver Detection':<25} {'❌ Detected' if regular_detections['webdriver'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['webdriver'] else '✅ Passed':<15}")
|
||||
print(f"{'Headless Detection':<25} {'❌ Detected' if regular_detections['headless'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['headless'] else '✅ Passed':<15}")
|
||||
print(f"{'Automation Detection':<25} {'❌ Detected' if regular_detections['automation'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['automation'] else '✅ Passed':<15}")
|
||||
print(f"{'Failed Tests':<25} {regular_detections['failed_tests']:<15} {undetected_detections['failed_tests']:<15}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
|
||||
if undetected_detections.get('failed_tests', 0) < regular_detections.get('failed_tests', 1):
|
||||
print("✅ Undetected browser performed better at evading detection!")
|
||||
else:
|
||||
print("ℹ️ Both browsers had similar detection results")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Undetected Browser Test - Cloudflare Protected Site
|
||||
Tests the difference between regular and undetected modes on a Cloudflare-protected site
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Test URL with Cloudflare protection
|
||||
TEST_URL = "https://nowsecure.nl"
|
||||
|
||||
async def test_regular_browser():
|
||||
"""Test with regular browser - likely to be blocked"""
|
||||
print("=" * 60)
|
||||
print("Testing with Regular Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
magic=True, # Try with magic mode too
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def test_undetected_browser():
|
||||
"""Test with undetected browser - should bypass Cloudflare"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with Undetected Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless is easier to detect
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Compare regular vs undetected browser"""
|
||||
print("🤖 Crawl4AI - Cloudflare Bypass Test")
|
||||
print(f"Testing URL: {TEST_URL}\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result = await test_regular_browser()
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_result = await test_undetected_browser()
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("SUMMARY")
|
||||
print("=" * 60)
|
||||
print(f"Regular Browser:")
|
||||
print(f" - Success: {regular_result.success}")
|
||||
print(f" - Content Length: {len(regular_result.markdown.raw_markdown) if regular_result.markdown else 0}")
|
||||
|
||||
print(f"\nUndetected Browser:")
|
||||
print(f" - Success: {undetected_result.success}")
|
||||
print(f" - Content Length: {len(undetected_result.markdown.raw_markdown) if undetected_result.markdown else 0}")
|
||||
|
||||
if undetected_result.success and len(undetected_result.markdown.raw_markdown) > len(regular_result.markdown.raw_markdown):
|
||||
print("\n✅ Undetected browser successfully bypassed protection!")
|
||||
print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Undetected vs Regular Browser Comparison
|
||||
This example demonstrates the difference between regular and undetected browser modes
|
||||
when accessing sites with bot detection services.
|
||||
|
||||
Based on tested anti-bot services:
|
||||
- Cloudflare
|
||||
- Kasada
|
||||
- Akamai
|
||||
- DataDome
|
||||
- Bet365
|
||||
- And others
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
PlaywrightAdapter,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
# Test URLs for various bot detection services
|
||||
TEST_SITES = {
|
||||
"Cloudflare Protected": "https://nowsecure.nl",
|
||||
# "Bot Detection Test": "https://bot.sannysoft.com",
|
||||
# "Fingerprint Test": "https://fingerprint.com/products/bot-detection",
|
||||
# "Browser Scan": "https://browserscan.net",
|
||||
# "CreepJS": "https://abrahamjuliot.github.io/creepjs",
|
||||
}
|
||||
|
||||
|
||||
async def test_with_adapter(url: str, adapter_name: str, adapter):
|
||||
"""Test a URL with a specific adapter"""
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Better for avoiding detection
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with the adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing with {adapter_name} adapter")
|
||||
print(f"URL: {url}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Give page time to load
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=True, # Add user simulation
|
||||
)
|
||||
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url=url,
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
# Check results
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ Success: {result.success}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
print(f"✓ Markdown Length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for common bot detection indicators
|
||||
detection_indicators = [
|
||||
"Access denied",
|
||||
"Please verify you are human",
|
||||
"Checking your browser",
|
||||
"Enable JavaScript",
|
||||
"captcha",
|
||||
"403 Forbidden",
|
||||
"Bot detection",
|
||||
"Security check"
|
||||
]
|
||||
|
||||
content_lower = result.markdown.raw_markdown.lower()
|
||||
detected = False
|
||||
for indicator in detection_indicators:
|
||||
if indicator.lower() in content_lower:
|
||||
print(f"⚠️ Possible detection: Found '{indicator}'")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected:
|
||||
print("✅ No obvious bot detection triggered!")
|
||||
# Show first 200 chars of content
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
|
||||
return result.success and not detected
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
async def compare_adapters(url: str, site_name: str):
|
||||
"""Compare regular and undetected adapters on the same URL"""
|
||||
print(f"\n{'#'*60}")
|
||||
print(f"# Testing: {site_name}")
|
||||
print(f"{'#'*60}")
|
||||
|
||||
# Test with regular adapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
regular_success = await test_with_adapter(url, "Regular", regular_adapter)
|
||||
|
||||
# Small delay between tests
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_success = await test_with_adapter(url, "Undetected", undetected_adapter)
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Summary for {site_name}:")
|
||||
print(f"Regular Adapter: {'✅ Passed' if regular_success else '❌ Blocked/Detected'}")
|
||||
print(f"Undetected Adapter: {'✅ Passed' if undetected_success else '❌ Blocked/Detected'}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
return regular_success, undetected_success
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run comparison tests on multiple sites"""
|
||||
print("🤖 Crawl4AI Browser Adapter Comparison")
|
||||
print("Testing regular vs undetected browser modes\n")
|
||||
|
||||
results = {}
|
||||
|
||||
# Test each site
|
||||
for site_name, url in TEST_SITES.items():
|
||||
regular, undetected = await compare_adapters(url, site_name)
|
||||
results[site_name] = {
|
||||
"regular": regular,
|
||||
"undetected": undetected
|
||||
}
|
||||
|
||||
# Delay between different sites
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# Final summary
|
||||
print(f"\n{'#'*60}")
|
||||
print("# FINAL RESULTS")
|
||||
print(f"{'#'*60}")
|
||||
print(f"{'Site':<30} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*60}")
|
||||
|
||||
for site, result in results.items():
|
||||
regular_status = "✅ Passed" if result["regular"] else "❌ Blocked"
|
||||
undetected_status = "✅ Passed" if result["undetected"] else "❌ Blocked"
|
||||
print(f"{site:<30} {regular_status:<15} {undetected_status:<15}")
|
||||
|
||||
# Calculate success rates
|
||||
regular_success = sum(1 for r in results.values() if r["regular"])
|
||||
undetected_success = sum(1 for r in results.values() if r["undetected"])
|
||||
total = len(results)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Success Rates:")
|
||||
print(f"Regular Adapter: {regular_success}/{total} ({regular_success/total*100:.1f}%)")
|
||||
print(f"Undetected Adapter: {undetected_success}/{total} ({undetected_success/total*100:.1f}%)")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Note: This example may take a while to run as it tests multiple sites
|
||||
# You can comment out sites in TEST_SITES to run faster tests
|
||||
asyncio.run(main())
|
||||
118
docs/examples/undetected_simple_demo.py
Normal file
118
docs/examples/undetected_simple_demo.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Simple Undetected Browser Demo
|
||||
Demonstrates the basic usage of undetected browser mode
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def crawl_with_regular_browser(url: str):
|
||||
"""Crawl with regular browser"""
|
||||
print("\n[Regular Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def crawl_with_undetected_browser(url: str):
|
||||
"""Crawl with undetected browser"""
|
||||
print("\n[Undetected Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create undetected adapter and strategy
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Demo comparing regular vs undetected modes"""
|
||||
print("🤖 Crawl4AI Undetected Browser Demo")
|
||||
print("="*50)
|
||||
|
||||
# Test URLs - you can change these
|
||||
test_urls = [
|
||||
"https://www.example.com", # Simple site
|
||||
"https://httpbin.org/headers", # Shows request headers
|
||||
]
|
||||
|
||||
for url in test_urls:
|
||||
print(f"\n📍 Testing URL: {url}")
|
||||
|
||||
# Test with regular browser
|
||||
regular_result = await crawl_with_regular_browser(url)
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected browser
|
||||
undetected_result = await crawl_with_undetected_browser(url)
|
||||
|
||||
# Compare results
|
||||
print(f"\n📊 Comparison for {url}:")
|
||||
print(f"Regular browser content: {len(regular_result.markdown.raw_markdown)} chars")
|
||||
print(f"Undetected browser content: {len(undetected_result.markdown.raw_markdown)} chars")
|
||||
|
||||
if url == "https://httpbin.org/headers":
|
||||
# Show headers for comparison
|
||||
print("\nHeaders seen by server:")
|
||||
print("Regular:", regular_result.markdown.raw_markdown[:500])
|
||||
print("\nUndetected:", undetected_result.markdown.raw_markdown[:500])
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -66,29 +66,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
|
||||
```python
|
||||
import os, asyncio
|
||||
from base64 import b64decode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
pdf=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Save screenshot
|
||||
print(f"Screenshot data present: {result.screenshot is not None}")
|
||||
print(f"PDF data present: {result.pdf is not None}")
|
||||
|
||||
if result.screenshot:
|
||||
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
|
||||
with open("wikipedia_screenshot.png", "wb") as f:
|
||||
f.write(b64decode(result.screenshot))
|
||||
|
||||
# Save PDF
|
||||
else:
|
||||
print("[WARN] Screenshot data is None.")
|
||||
|
||||
if result.pdf:
|
||||
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
|
||||
with open("wikipedia_page.pdf", "wb") as f:
|
||||
f.write(result.pdf)
|
||||
|
||||
print("[OK] PDF & screenshot captured.")
|
||||
else:
|
||||
print("[WARN] PDF data is None.")
|
||||
|
||||
else:
|
||||
print("[ERROR]", result.error_message)
|
||||
|
||||
@@ -349,9 +358,77 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
## 7. Anti-Bot Features (Stealth Mode & Undetected Browser)
|
||||
|
||||
Crawl4AI provides two powerful features to bypass bot detection:
|
||||
|
||||
### 7.1 Stealth Mode
|
||||
|
||||
Stealth mode uses playwright-stealth to modify browser fingerprints and behaviors. Enable it with a simple flag:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Activates stealth mode
|
||||
headless=False
|
||||
)
|
||||
```
|
||||
|
||||
**When to use**: Sites with basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
|
||||
### 7.2 Undetected Browser
|
||||
|
||||
For advanced bot detection, use the undetected browser adapter:
|
||||
|
||||
```python
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=strategy, config=browser_config) as crawler:
|
||||
# Your crawling code
|
||||
```
|
||||
|
||||
**When to use**: Sites with sophisticated bot detection (Cloudflare, DataDome, etc.)
|
||||
|
||||
### 7.3 Combining Both
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter() # Use undetected browser
|
||||
```
|
||||
|
||||
### Choosing the Right Approach
|
||||
|
||||
| Detection Level | Recommended Approach |
|
||||
|----------------|---------------------|
|
||||
| No protection | Regular browser |
|
||||
| Basic checks | Regular + Stealth mode |
|
||||
| Advanced protection | Undetected browser |
|
||||
| Maximum evasion | Undetected + Stealth mode |
|
||||
|
||||
**Best Practice**: Start with regular browser + stealth mode. Only use undetected browser if needed, as it may be slightly slower.
|
||||
|
||||
See [Undetected Browser Mode](undetected-browser.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion & Next Steps
|
||||
|
||||
You’ve now explored several **advanced** features:
|
||||
You've now explored several **advanced** features:
|
||||
|
||||
- **Proxy Usage**
|
||||
- **PDF & Screenshot** capturing for large or critical pages
|
||||
@@ -359,7 +436,10 @@ You’ve now explored several **advanced** features:
|
||||
- **Custom Headers** for language or specialized requests
|
||||
- **Session Persistence** via storage state
|
||||
- **Robots.txt Compliance**
|
||||
- **Anti-Bot Features** (Stealth Mode & Undetected Browser)
|
||||
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, manage sessions across multiple runs, and bypass bot detection—streamlining your entire data collection pipeline.
|
||||
|
||||
**Last Updated**: 2025-01-01
|
||||
**Note**: In future versions, we may enable stealth mode and undetected browser by default. For now, users should explicitly enable these features when needed.
|
||||
|
||||
**Last Updated**: 2025-01-17
|
||||
@@ -404,7 +404,182 @@ for result in results:
|
||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||
```
|
||||
|
||||
## 6. Summary
|
||||
## 6. URL-Specific Configurations
|
||||
|
||||
When crawling diverse content types, you often need different configurations for different URLs. For example:
|
||||
- PDFs need specialized extraction
|
||||
- Blog pages benefit from content filtering
|
||||
- Dynamic sites need JavaScript execution
|
||||
- API endpoints need JSON parsing
|
||||
|
||||
### 6.1 Basic URL Pattern Matching
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def crawl_mixed_content():
|
||||
# Configure different strategies for different content
|
||||
configs = [
|
||||
# PDF files - specialized extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
),
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custome settings for JSON extraction
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||
]
|
||||
|
||||
# Mixed URLs
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
"https://blog.python.org/",
|
||||
"https://github.com/microsoft/playwright",
|
||||
"https://httpbin.org/json",
|
||||
"https://example.com/"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs # Pass list of configs
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"{result.url}: {len(result.markdown)} chars")
|
||||
```
|
||||
|
||||
### 6.2 Advanced Pattern Matching
|
||||
|
||||
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||
|
||||
The `url_matcher` parameter supports three types of patterns:
|
||||
|
||||
#### Glob Patterns (Strings)
|
||||
```python
|
||||
# Simple patterns
|
||||
"*.pdf" # Any PDF file
|
||||
"*/api/*" # Any URL with /api/ in path
|
||||
"https://*.example.com/*" # Subdomain matching
|
||||
"*://example.com/blog/*" # Any protocol
|
||||
```
|
||||
|
||||
#### Custom Functions
|
||||
```python
|
||||
# Complex logic with lambdas
|
||||
lambda url: url.startswith('https://') and 'secure' in url
|
||||
lambda url: len(url) > 50 and url.count('/') > 5
|
||||
lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.'])
|
||||
```
|
||||
|
||||
#### Mixed Lists with AND/OR Logic
|
||||
```python
|
||||
# Combine multiple conditions
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: 'internal' in url, # Must contain 'internal'
|
||||
lambda url: not url.endswith('.pdf') # Must not be PDF
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
```
|
||||
|
||||
### 6.3 Practical Example: News Site Crawler
|
||||
|
||||
```python
|
||||
async def crawl_news_site():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0,
|
||||
rate_limiter=RateLimiter(base_delay=(1.0, 2.0))
|
||||
)
|
||||
|
||||
configs = [
|
||||
# Homepage - light extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com',
|
||||
css_selector="nav, .headline",
|
||||
extraction_strategy=None
|
||||
),
|
||||
|
||||
# Article pages - full extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/article/*",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="article content",
|
||||
word_count_threshold=100
|
||||
),
|
||||
screenshot=True,
|
||||
excluded_tags=["nav", "aside", "footer"]
|
||||
),
|
||||
|
||||
# Author pages - metadata focus
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/author/*",
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"name": "h1.author-name",
|
||||
"bio": ".author-bio",
|
||||
"articles": "article.post-card h2"
|
||||
})
|
||||
),
|
||||
|
||||
# Everything else
|
||||
CrawlerRunConfig()
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=news_urls,
|
||||
config=configs,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### 6.4 Best Practices
|
||||
|
||||
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||
2. **Default Config Behavior**:
|
||||
- A config without `url_matcher` matches ALL URLs
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||
```python
|
||||
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||
|
||||
default_config = CrawlerRunConfig() # No url_matcher
|
||||
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||
```
|
||||
4. **Optimize for Performance**:
|
||||
- Disable JS for static content
|
||||
- Skip screenshots for data APIs
|
||||
- Use appropriate extraction strategies
|
||||
|
||||
## 7. Summary
|
||||
|
||||
1. **Two Dispatcher Types**:
|
||||
|
||||
|
||||
201
docs/md_v2/advanced/pdf-parsing.md
Normal file
201
docs/md_v2/advanced/pdf-parsing.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# PDF Processing Strategies
|
||||
|
||||
Crawl4AI provides specialized strategies for handling and extracting content from PDF files. These strategies allow you to seamlessly integrate PDF processing into your crawling workflows, whether the PDFs are hosted online or stored locally.
|
||||
|
||||
## `PDFCrawlerStrategy`
|
||||
|
||||
### Overview
|
||||
`PDFCrawlerStrategy` is an implementation of `AsyncCrawlerStrategy` designed specifically for PDF documents. Instead of interpreting the input URL as an HTML webpage, this strategy treats it as a pointer to a PDF file. It doesn't perform deep crawling or HTML parsing itself but rather prepares the PDF source for a dedicated PDF scraping strategy. Its primary role is to identify the PDF source (web URL or local file) and pass it along the processing pipeline in a way that `AsyncWebCrawler` can handle.
|
||||
|
||||
### When to Use
|
||||
Use `PDFCrawlerStrategy` when you need to:
|
||||
- Process PDF files using the `AsyncWebCrawler`.
|
||||
- Handle PDFs from both web URLs (e.g., `https://example.com/document.pdf`) and local file paths (e.g., `file:///path/to/your/document.pdf`).
|
||||
- Integrate PDF content extraction into a unified `CrawlResult` object, allowing consistent handling of PDF data alongside web page data.
|
||||
|
||||
### Key Methods and Their Behavior
|
||||
- **`__init__(self, logger: AsyncLogger = None)`**:
|
||||
- Initializes the strategy.
|
||||
- `logger`: An optional `AsyncLogger` instance (from `crawl4ai.async_logger`) for logging purposes.
|
||||
- **`async crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`**:
|
||||
- This method is called by the `AsyncWebCrawler` during the `arun` process.
|
||||
- It takes the `url` (which should point to a PDF) and creates a minimal `AsyncCrawlResponse`.
|
||||
- The `html` attribute of this response is typically empty or a placeholder, as the actual PDF content processing is deferred to the `PDFContentScrapingStrategy` (or a similar PDF-aware scraping strategy).
|
||||
- It sets `response_headers` to indicate "application/pdf" and `status_code` to 200.
|
||||
- **`async close(self)`**:
|
||||
- A method for cleaning up any resources used by the strategy. For `PDFCrawlerStrategy`, this is usually minimal.
|
||||
- **`async __aenter__(self)` / `async __aexit__(self, exc_type, exc_val, exc_tb)`**:
|
||||
- Enables asynchronous context management for the strategy, allowing it to be used with `async with`.
|
||||
|
||||
### Example Usage
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
|
||||
async def main():
|
||||
# Initialize the PDF crawler strategy
|
||||
pdf_crawler_strategy = PDFCrawlerStrategy()
|
||||
|
||||
# PDFCrawlerStrategy is typically used in conjunction with PDFContentScrapingStrategy
|
||||
# The scraping strategy handles the actual PDF content extraction
|
||||
pdf_scraping_strategy = PDFContentScrapingStrategy()
|
||||
run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
|
||||
# Example with a remote PDF URL
|
||||
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" # A public PDF from arXiv
|
||||
|
||||
print(f"Attempting to process PDF: {pdf_url}")
|
||||
result = await crawler.arun(url=pdf_url, config=run_config)
|
||||
|
||||
if result.success:
|
||||
print(f"Successfully processed PDF: {result.url}")
|
||||
print(f"Metadata Title: {result.metadata.get('title', 'N/A')}")
|
||||
# Further processing of result.markdown, result.media, etc.
|
||||
# would be done here, based on what PDFContentScrapingStrategy extracts.
|
||||
if result.markdown and hasattr(result.markdown, 'raw_markdown'):
|
||||
print(f"Extracted text (first 200 chars): {result.markdown.raw_markdown[:200]}...")
|
||||
else:
|
||||
print("No markdown (text) content extracted.")
|
||||
else:
|
||||
print(f"Failed to process PDF: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Pros and Cons
|
||||
**Pros:**
|
||||
- Enables `AsyncWebCrawler` to handle PDF sources directly using familiar `arun` calls.
|
||||
- Provides a consistent interface for specifying PDF sources (URLs or local paths).
|
||||
- Abstracts the source handling, allowing a separate scraping strategy to focus on PDF content parsing.
|
||||
|
||||
**Cons:**
|
||||
- Does not perform any PDF data extraction itself; it strictly relies on a compatible scraping strategy (like `PDFContentScrapingStrategy`) to process the PDF.
|
||||
- Has limited utility on its own; most of its value comes from being paired with a PDF-specific content scraping strategy.
|
||||
|
||||
---
|
||||
|
||||
## `PDFContentScrapingStrategy`
|
||||
|
||||
### Overview
|
||||
`PDFContentScrapingStrategy` is an implementation of `ContentScrapingStrategy` designed to extract text, metadata, and optionally images from PDF documents. It is intended to be used in conjunction with a crawler strategy that can provide it with a PDF source, such as `PDFCrawlerStrategy`. This strategy uses the `NaivePDFProcessorStrategy` internally to perform the low-level PDF parsing.
|
||||
|
||||
### When to Use
|
||||
Use `PDFContentScrapingStrategy` when your `AsyncWebCrawler` (often configured with `PDFCrawlerStrategy`) needs to:
|
||||
- Extract textual content page by page from a PDF document.
|
||||
- Retrieve standard metadata embedded within the PDF (e.g., title, author, subject, creation date, page count).
|
||||
- Optionally, extract images contained within the PDF pages. These images can be saved to a local directory or made available for further processing.
|
||||
- Produce a `ScrapingResult` that can be converted into a `CrawlResult`, making PDF content accessible in a manner similar to HTML web content (e.g., text in `result.markdown`, metadata in `result.metadata`).
|
||||
|
||||
### Key Configuration Attributes
|
||||
When initializing `PDFContentScrapingStrategy`, you can configure its behavior using the following attributes:
|
||||
- **`extract_images: bool = False`**: If `True`, the strategy will attempt to extract images from the PDF.
|
||||
- **`save_images_locally: bool = False`**: If `True` (and `extract_images` is also `True`), extracted images will be saved to disk in the `image_save_dir`. If `False`, image data might be available in another form (e.g., base64, depending on the underlying processor) but not saved as separate files by this strategy.
|
||||
- **`image_save_dir: str = None`**: Specifies the directory where extracted images should be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
|
||||
- **`batch_size: int = 4`**: Defines how many PDF pages are processed in a single batch. This can be useful for managing memory when dealing with very large PDF documents.
|
||||
- **`logger: AsyncLogger = None`**: An optional `AsyncLogger` instance for logging.
|
||||
|
||||
### Key Methods and Their Behavior
|
||||
- **`__init__(self, save_images_locally: bool = False, extract_images: bool = False, image_save_dir: str = None, batch_size: int = 4, logger: AsyncLogger = None)`**:
|
||||
- Initializes the strategy with configurations for image handling, batch processing, and logging. It sets up an internal `NaivePDFProcessorStrategy` instance which performs the actual PDF parsing.
|
||||
- **`scrap(self, url: str, html: str, **params) -> ScrapingResult`**:
|
||||
- This is the primary synchronous method called by the crawler (via `ascrap`) to process the PDF.
|
||||
- `url`: The path or URL to the PDF file (provided by `PDFCrawlerStrategy` or similar).
|
||||
- `html`: Typically an empty string when used with `PDFCrawlerStrategy`, as the content is a PDF, not HTML.
|
||||
- It first ensures the PDF is accessible locally (downloads it to a temporary file if `url` is remote).
|
||||
- It then uses its internal PDF processor to extract text, metadata, and images (if configured).
|
||||
- The extracted information is compiled into a `ScrapingResult` object:
|
||||
- `cleaned_html`: Contains an HTML-like representation of the PDF, where each page's content is often wrapped in a `<div>` with page number information.
|
||||
- `media`: A dictionary where `media["images"]` will contain information about extracted images if `extract_images` was `True`.
|
||||
- `links`: A dictionary where `links["urls"]` can contain URLs found within the PDF content.
|
||||
- `metadata`: A dictionary holding PDF metadata (e.g., title, author, num_pages).
|
||||
- **`async ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult`**:
|
||||
- The asynchronous version of `scrap`. Under the hood, it typically runs the synchronous `scrap` method in a separate thread using `asyncio.to_thread` to avoid blocking the event loop.
|
||||
- **`_get_pdf_path(self, url: str) -> str`**:
|
||||
- A private helper method to manage PDF file access. If the `url` is remote (http/https), it downloads the PDF to a temporary local file and returns its path. If `url` indicates a local file (`file://` or a direct path), it resolves and returns the local path.
|
||||
|
||||
### Example Usage
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
import os # For creating image directory
|
||||
|
||||
async def main():
|
||||
# Define the directory for saving extracted images
|
||||
image_output_dir = "./my_pdf_images"
|
||||
os.makedirs(image_output_dir, exist_ok=True)
|
||||
|
||||
# Configure the PDF content scraping strategy
|
||||
# Enable image extraction and specify where to save them
|
||||
pdf_scraping_cfg = PDFContentScrapingStrategy(
|
||||
extract_images=True,
|
||||
save_images_locally=True,
|
||||
image_save_dir=image_output_dir,
|
||||
batch_size=2 # Process 2 pages at a time for demonstration
|
||||
)
|
||||
|
||||
# The PDFCrawlerStrategy is needed to tell AsyncWebCrawler how to "crawl" a PDF
|
||||
pdf_crawler_cfg = PDFCrawlerStrategy()
|
||||
|
||||
# Configure the overall crawl run
|
||||
run_cfg = CrawlerRunConfig(
|
||||
scraping_strategy=pdf_scraping_cfg # Use our PDF scraping strategy
|
||||
)
|
||||
|
||||
# Initialize the crawler with the PDF-specific crawler strategy
|
||||
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_cfg) as crawler:
|
||||
pdf_url = "https://arxiv.org/pdf/2310.06825.pdf" # Example PDF
|
||||
|
||||
print(f"Starting PDF processing for: {pdf_url}")
|
||||
result = await crawler.arun(url=pdf_url, config=run_cfg)
|
||||
|
||||
if result.success:
|
||||
print("\n--- PDF Processing Successful ---")
|
||||
print(f"Processed URL: {result.url}")
|
||||
|
||||
print("\n--- Metadata ---")
|
||||
for key, value in result.metadata.items():
|
||||
print(f" {key.replace('_', ' ').title()}: {value}")
|
||||
|
||||
if result.markdown and hasattr(result.markdown, 'raw_markdown'):
|
||||
print(f"\n--- Extracted Text (Markdown Snippet) ---")
|
||||
print(result.markdown.raw_markdown[:500].strip() + "...")
|
||||
else:
|
||||
print("\nNo text (markdown) content extracted.")
|
||||
|
||||
if result.media and result.media.get("images"):
|
||||
print(f"\n--- Image Extraction ---")
|
||||
print(f"Extracted {len(result.media['images'])} image(s).")
|
||||
for i, img_info in enumerate(result.media["images"][:2]): # Show info for first 2 images
|
||||
print(f" Image {i+1}:")
|
||||
print(f" Page: {img_info.get('page')}")
|
||||
print(f" Format: {img_info.get('format', 'N/A')}")
|
||||
if img_info.get('path'):
|
||||
print(f" Saved at: {img_info.get('path')}")
|
||||
else:
|
||||
print("\nNo images were extracted (or extract_images was False).")
|
||||
else:
|
||||
print(f"\n--- PDF Processing Failed ---")
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Pros and Cons
|
||||
|
||||
**Pros:**
|
||||
- Provides a comprehensive way to extract text, metadata, and (optionally) images from PDF documents.
|
||||
- Handles both remote PDFs (via URL) and local PDF files.
|
||||
- Configurable image extraction allows saving images to disk or accessing their data.
|
||||
- Integrates smoothly with the `CrawlResult` object structure, making PDF-derived data accessible in a way consistent with web-scraped data.
|
||||
- The `batch_size` parameter can help in managing memory consumption when processing large or numerous PDF pages.
|
||||
|
||||
**Cons:**
|
||||
- Extraction quality and performance can vary significantly depending on the PDF's complexity, encoding, and whether it's image-based (scanned) or text-based.
|
||||
- Image extraction can be resource-intensive (both CPU and disk space if `save_images_locally` is true).
|
||||
- Relies on `NaivePDFProcessorStrategy` internally, which might have limitations with very complex layouts, encrypted PDFs, or forms compared to more sophisticated PDF parsing libraries. Scanned PDFs will not yield text unless an OCR step is performed (which is not part of this strategy by default).
|
||||
- Link extraction from PDFs can be basic and depends on how hyperlinks are embedded in the document.
|
||||
@@ -25,44 +25,70 @@ Use an authenticated proxy with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
proxy_config = {
|
||||
"server": "http://proxy.example.com:8080",
|
||||
"username": "user",
|
||||
"password": "pass"
|
||||
}
|
||||
|
||||
browser_config = BrowserConfig(proxy_config=proxy_config)
|
||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
Here's the corrected documentation:
|
||||
|
||||
## Rotating Proxies
|
||||
|
||||
Example using a proxy rotation service dynamically:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def get_next_proxy():
|
||||
# Your proxy rotation logic here
|
||||
return {"server": "http://next.proxy.com:8080"}
|
||||
|
||||
import re
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
RoundRobinProxyStrategy,
|
||||
)
|
||||
import asyncio
|
||||
from crawl4ai import ProxyConfig
|
||||
async def main():
|
||||
browser_config = BrowserConfig()
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# For each URL, create a new run config with different proxy
|
||||
for url in urls:
|
||||
proxy = await get_next_proxy()
|
||||
# Clone the config and update proxy - this creates a new browser context
|
||||
current_config = run_config.clone(proxy_config=proxy)
|
||||
result = await crawler.arun(url=url, config=current_config)
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = ProxyConfig.from_env()
|
||||
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
|
||||
if not proxies:
|
||||
print("No proxies found in environment. Set PROXIES env variable!")
|
||||
return
|
||||
|
||||
proxy_strategy = RoundRobinProxyStrategy(proxies)
|
||||
|
||||
# Create configs
|
||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
proxy_rotation_strategy=proxy_strategy
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
|
||||
|
||||
print("\n📈 Initializing crawler with proxy rotation...")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
print("\n🚀 Starting batch crawl with proxy rotation...")
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=run_config
|
||||
)
|
||||
for result in results:
|
||||
if result.success:
|
||||
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
|
||||
current_proxy = run_config.proxy_config if run_config.proxy_config else None
|
||||
|
||||
if current_proxy and ip_match:
|
||||
print(f"URL {result.url}")
|
||||
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}")
|
||||
verified = ip_match.group(0) == current_proxy.ip
|
||||
if verified:
|
||||
print(f"✅ Proxy working! IP matches: {current_proxy.ip}")
|
||||
else:
|
||||
print("❌ Proxy failed or IP mismatch!")
|
||||
print("---")
|
||||
|
||||
asyncio.run(main())
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
|
||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
session_id = "github_commits_session"
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
all_commits = []
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
session_id = "wait_for_session"
|
||||
all_commits = []
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
||||
"fields": [{
|
||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
||||
}],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
js_next_page = """
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length > 0) {
|
||||
window.lastCommit = commits[0].textContent.trim();
|
||||
}
|
||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||
if (button) {button.click(); console.log('button clicked') }
|
||||
"""
|
||||
|
||||
# JavaScript and wait configurations
|
||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
||||
|
||||
# Crawl multiple pages
|
||||
wait_for = """() => {
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length === 0) return false;
|
||||
const firstCommit = commits[0].textContent.trim();
|
||||
return firstCommit !== window.lastCommit;
|
||||
}"""
|
||||
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li[data-testid='commit-row-item']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h4 a",
|
||||
"type": "text",
|
||||
"transform": "strip",
|
||||
},
|
||||
],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
for page in range(3):
|
||||
config = CrawlerRunConfig(
|
||||
url=url,
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
css_selector="li[data-testid='commit-row-item']",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
wait_for=wait_for if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
capture_console_messages=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(config=config)
|
||||
if result.success:
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.console_messages:
|
||||
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||
|
||||
if result.extracted_content:
|
||||
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||
commits = json.loads(result.extracted_content)
|
||||
all_commits.extend(commits)
|
||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||
else:
|
||||
print(f"Page {page + 1}: No content extracted")
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
return all_commits
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
394
docs/md_v2/advanced/undetected-browser.md
Normal file
394
docs/md_v2/advanced/undetected-browser.md
Normal file
@@ -0,0 +1,394 @@
|
||||
# Undetected Browser Mode
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection:
|
||||
|
||||
1. **Stealth Mode** - Uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
2. **Undetected Browser Mode** - Advanced browser adapter with deep-level patches for sophisticated bot detection
|
||||
|
||||
This guide covers both features and helps you choose the right approach for your needs.
|
||||
|
||||
## Anti-Bot Features Comparison
|
||||
|
||||
| Feature | Regular Browser | Stealth Mode | Undetected Browser |
|
||||
|---------|----------------|--------------|-------------------|
|
||||
| WebDriver Detection | ❌ | ✅ | ✅ |
|
||||
| Navigator Properties | ❌ | ✅ | ✅ |
|
||||
| Plugin Emulation | ❌ | ✅ | ✅ |
|
||||
| CDP Detection | ❌ | Partial | ✅ |
|
||||
| Deep Browser Patches | ❌ | ❌ | ✅ |
|
||||
| Performance Impact | None | Minimal | Moderate |
|
||||
| Setup Complexity | None | None | Minimal |
|
||||
|
||||
## When to Use Each Approach
|
||||
|
||||
### Use Regular Browser + Stealth Mode When:
|
||||
- Sites have basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
- You need good performance with basic protection
|
||||
- Sites check for common automation indicators
|
||||
|
||||
### Use Undetected Browser When:
|
||||
- Sites employ sophisticated bot detection services (Cloudflare, DataDome, etc.)
|
||||
- Stealth mode alone isn't sufficient
|
||||
- You're willing to trade some performance for better evasion
|
||||
|
||||
### Best Practice: Progressive Enhancement
|
||||
1. **Start with**: Regular browser + Stealth mode
|
||||
2. **If blocked**: Switch to Undetected browser
|
||||
3. **If still blocked**: Combine Undetected browser + Stealth mode
|
||||
|
||||
## Stealth Mode
|
||||
|
||||
Stealth mode is the simpler anti-bot solution that works with both regular and undetected browsers:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
# Enable stealth mode with regular browser
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Simple flag to enable
|
||||
headless=False # Better for avoiding detection
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
```
|
||||
|
||||
### What Stealth Mode Does:
|
||||
- Removes `navigator.webdriver` flag
|
||||
- Modifies browser fingerprints
|
||||
- Emulates realistic plugin behavior
|
||||
- Adjusts navigator properties
|
||||
- Fixes common automation leaks
|
||||
|
||||
## Undetected Browser Mode
|
||||
|
||||
For sites with sophisticated bot detection that stealth mode can't bypass, use the undetected browser adapter:
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Drop-in Replacement**: Uses the same API as regular browser mode
|
||||
- **Enhanced Stealth**: Built-in patches to evade common detection methods
|
||||
- **Browser Adapter Pattern**: Seamlessly switch between regular and undetected modes
|
||||
- **Automatic Installation**: `crawl4ai-setup` installs all necessary browser dependencies
|
||||
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def main():
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless mode can be detected easier
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Your crawling code here
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Combining Both Features
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create browser config with stealth enabled
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth mode
|
||||
headless=False
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with both features
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://protected-site.com")
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Basic Stealth Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def test_stealth_mode():
|
||||
# Simple stealth mode configuration
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(screenshot=True)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("✓ Successfully accessed bot detection test site")
|
||||
# Save screenshot to verify detection results
|
||||
if result.screenshot:
|
||||
import base64
|
||||
with open("stealth_test.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("✓ Screenshot saved - check for green (passed) tests")
|
||||
|
||||
asyncio.run(test_stealth_mode())
|
||||
```
|
||||
|
||||
### Example 2: Undetected Browser Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Test adapter console capture
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Browser Adapter Pattern
|
||||
|
||||
The undetected browser support is implemented using an adapter pattern, allowing seamless switching between different browser implementations:
|
||||
|
||||
```python
|
||||
# Regular browser adapter (default)
|
||||
from crawl4ai import PlaywrightAdapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
|
||||
# Undetected browser adapter
|
||||
from crawl4ai import UndetectedAdapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
```
|
||||
|
||||
The adapter handles:
|
||||
- JavaScript execution
|
||||
- Console message capture
|
||||
- Error handling
|
||||
- Browser-specific optimizations
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Avoid Headless Mode**: Detection is easier in headless mode
|
||||
```python
|
||||
browser_config = BrowserConfig(headless=False)
|
||||
```
|
||||
|
||||
2. **Use Reasonable Delays**: Don't rush through pages
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
wait_time=3.0, # Wait 3 seconds after page load
|
||||
delay_before_return_html=2.0 # Additional delay
|
||||
)
|
||||
```
|
||||
|
||||
3. **Rotate User Agents**: You can customize user agents
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headers={"User-Agent": "your-user-agent"}
|
||||
)
|
||||
```
|
||||
|
||||
4. **Handle Failures Gracefully**: Some sites may still detect and block
|
||||
```python
|
||||
if not result.success:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
|
||||
## Advanced Usage Tips
|
||||
|
||||
### Progressive Detection Handling
|
||||
|
||||
```python
|
||||
async def crawl_with_progressive_evasion(url):
|
||||
# Step 1: Try regular browser with stealth
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
if result.success and "Access Denied" not in result.html:
|
||||
return result
|
||||
|
||||
# Step 2: If blocked, try undetected browser
|
||||
print("Regular + stealth blocked, trying undetected browser...")
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
return result
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
The undetected browser dependencies are automatically installed when you run:
|
||||
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
This command installs all necessary browser dependencies for both regular and undetected modes.
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Performance**: Slightly slower than regular mode due to additional patches
|
||||
- **Headless Detection**: Some sites can still detect headless mode
|
||||
- **Resource Usage**: May use more resources than regular mode
|
||||
- **Not 100% Guaranteed**: Advanced anti-bot services are constantly evolving
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Browser Not Found
|
||||
|
||||
Run the setup command:
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
### Detection Still Occurring
|
||||
|
||||
Try combining with other features:
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
simulate_user=True, # Add user simulation
|
||||
magic=True, # Enable magic mode
|
||||
wait_time=5.0, # Longer waits
|
||||
)
|
||||
```
|
||||
|
||||
### Performance Issues
|
||||
|
||||
If experiencing slow performance:
|
||||
```python
|
||||
# Use selective undetected mode only for protected sites
|
||||
if is_protected_site(url):
|
||||
adapter = UndetectedAdapter()
|
||||
else:
|
||||
adapter = PlaywrightAdapter() # Default adapter
|
||||
```
|
||||
|
||||
## Future Plans
|
||||
|
||||
**Note**: In future versions of Crawl4AI, we may enable stealth mode and undetected browser by default to provide better out-of-the-box success rates. For now, users should explicitly enable these features when needed.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI provides flexible anti-bot solutions:
|
||||
|
||||
1. **Start Simple**: Use regular browser + stealth mode for most sites
|
||||
2. **Escalate if Needed**: Switch to undetected browser for sophisticated protection
|
||||
3. **Combine for Maximum Effect**: Use both features together when facing the toughest challenges
|
||||
|
||||
Remember:
|
||||
- Always respect robots.txt and website terms of service
|
||||
- Use appropriate delays to avoid overwhelming servers
|
||||
- Consider the performance trade-offs of each approach
|
||||
- Test progressively to find the minimum necessary evasion level
|
||||
|
||||
## See Also
|
||||
|
||||
- [Advanced Features](advanced-features.md) - Overview of all advanced features
|
||||
- [Proxy & Security](proxy-security.md) - Using proxies with anti-bot features
|
||||
- [Session Management](session-management.md) - Maintaining sessions across requests
|
||||
- [Identity Based Crawling](identity-based-crawling.md) - Additional anti-detection strategies
|
||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
||||
wait_after_scroll=1.0 # Twitter needs time to load
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config,
|
||||
# Optional: Set headless=False to watch it work
|
||||
# browser_config=BrowserConfig(headless=False)
|
||||
virtual_scroll_config=virtual_config
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://twitter.com/search?q=AI",
|
||||
config=config
|
||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
||||
Virtual Scroll works seamlessly with extraction strategies:
|
||||
|
||||
```python
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
||||
scroll_count=20
|
||||
),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||
schema=schema
|
||||
)
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
```python
|
||||
async def arun_many(
|
||||
urls: Union[List[str], List[Any]],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
...
|
||||
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
@@ -15,7 +15,9 @@ async def arun_many(
|
||||
Crawl multiple URLs concurrently or in batches.
|
||||
|
||||
:param urls: A list of URLs (or tasks) to crawl.
|
||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||
:param config: (Optional) Either:
|
||||
- A single `CrawlerRunConfig` applying to all URLs
|
||||
- A list of `CrawlerRunConfig` objects with url_matcher patterns
|
||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||
...
|
||||
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||
@@ -95,10 +97,70 @@ results = await crawler.arun_many(
|
||||
)
|
||||
```
|
||||
|
||||
### URL-Specific Configurations
|
||||
|
||||
Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
# PDF files - specialized extraction
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
)
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
github_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
)
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custome settings for JSON extraction
|
||||
)
|
||||
|
||||
# Default fallback config
|
||||
default_config = CrawlerRunConfig() # No url_matcher means it never matches except as fallback
|
||||
|
||||
# Pass the list of configs - first match wins!
|
||||
results = await crawler.arun_many(
|
||||
urls=[
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config
|
||||
"https://blog.python.org/", # → blog_config
|
||||
"https://github.com/microsoft/playwright", # → github_config
|
||||
"https://httpbin.org/json", # → api_config
|
||||
"https://example.com/" # → default_config
|
||||
],
|
||||
config=[pdf_config, blog_config, github_config, api_config, default_config]
|
||||
)
|
||||
```
|
||||
|
||||
**URL Matching Features**:
|
||||
- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
|
||||
- **Function matchers**: `lambda url: 'api' in url`
|
||||
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
|
||||
- **First match wins**: Configs are evaluated in order
|
||||
|
||||
**Key Points**:
|
||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
|
||||
|
||||
### Return Value
|
||||
|
||||
|
||||
@@ -208,6 +208,71 @@ config = CrawlerRunConfig(
|
||||
|
||||
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
### I) **URL Matching Configuration**
|
||||
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||
|
||||
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Simple string pattern (glob-style)
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Multiple patterns with OR logic (default)
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # Any pattern matches
|
||||
)
|
||||
|
||||
# Function matcher
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Other settings like extraction_strategy
|
||||
)
|
||||
|
||||
# Mixed: String + Function with AND logic
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # Must be .org domain
|
||||
lambda url: 'docs' in url # Must contain 'docs'
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
|
||||
# Combined patterns and functions with AND logic
|
||||
secure_docs = CrawlerRunConfig(
|
||||
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||
)
|
||||
|
||||
# Default config - matches ALL URLs
|
||||
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||
```
|
||||
|
||||
**UrlMatcher Types:**
|
||||
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
||||
|
||||
**Important Behavior:**
|
||||
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
|
||||
---## 2.2 Helper Methods
|
||||
|
||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||
@@ -298,7 +363,7 @@ LLMConfig is useful to pass LLM provider config to strategies and functions that
|
||||
## 3.1 Parameters
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|-----------------------|----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provoder to use.
|
||||
| **`provider`** | `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)* | Which LLM provider to use.
|
||||
| **`api_token`** |1.Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables <br/> 2. API token of LLM provider <br/> eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` <br/> 3. Environment variable - use with prefix "env:" <br/> eg:`api_token = "env: GROQ_API_KEY"` | API token to use for the given provider
|
||||
| **`base_url`** |Optional. Custom API endpoint | If your provider has a custom endpoint
|
||||
|
||||
|
||||
@@ -20,14 +20,28 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
||||
|
||||
## Latest Release
|
||||
|
||||
Here’s the blog index entry for **v0.6.0**, written to match the exact tone and structure of your previous entries:
|
||||
### [Crawl4AI v0.7.0 – The Adaptive Intelligence Update](releases/0.7.0.md)
|
||||
*January 28, 2025*
|
||||
|
||||
Crawl4AI v0.7.0 introduces groundbreaking intelligence features that transform how crawlers understand and adapt to websites. This release brings Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, and the powerful Async URL Seeder for massive URL discovery.
|
||||
|
||||
Key highlights:
|
||||
- **Adaptive Crawling**: Crawlers that learn and adapt to website structures automatically
|
||||
- **Virtual Scroll Support**: Complete content extraction from modern infinite scroll pages
|
||||
- **Link Preview**: 3-layer scoring system for intelligent link prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with smart filtering
|
||||
- **Performance Boost**: Up to 3x faster with optimized resource handling
|
||||
|
||||
[Read full release notes →](releases/0.7.0.md)
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
## Previous Releases
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*December 23, 2024*
|
||||
|
||||
Crawl4AI v0.6.0 brought major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
@@ -45,8 +59,6 @@ Other key changes:
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to auto-update the actual file or just paste this into the markdown.
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
@@ -140,5 +152,4 @@ Curious about how Crawl4AI has evolved? Check out our [complete changelog](https
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
- Join our community discussions on GitHub
|
||||
144
docs/md_v2/blog/index.md.bak
Normal file
144
docs/md_v2/blog/index.md.bak
Normal file
@@ -0,0 +1,144 @@
|
||||
# Crawl4AI Blog
|
||||
|
||||
Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place.
|
||||
|
||||
## Featured Articles
|
||||
|
||||
### [When to Stop Crawling: The Art of Knowing "Enough"](articles/adaptive-crawling-revolution.md)
|
||||
*January 29, 2025*
|
||||
|
||||
Traditional crawlers are like tourists with unlimited time—they'll visit every street, every alley, every dead end. But what if your crawler could think like a researcher with a deadline? Discover how Adaptive Crawling revolutionizes web scraping by knowing when to stop. Learn about the three-layer intelligence system that evaluates coverage, consistency, and saturation to build focused knowledge bases instead of endless page collections.
|
||||
|
||||
[Read the full article →](articles/adaptive-crawling-revolution.md)
|
||||
|
||||
### [The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples](articles/llm-context-revolution.md)
|
||||
*January 24, 2025*
|
||||
|
||||
Ever wondered why your AI coding assistant struggles with your library despite comprehensive documentation? This article introduces the three-dimensional context protocol that transforms how AI understands code. Learn why memory, reasoning, and examples together create wisdom—not just information.
|
||||
|
||||
[Read the full article →](articles/llm-context-revolution.md)
|
||||
|
||||
## Latest Release
|
||||
|
||||
Here’s the blog index entry for **v0.6.0**, written to match the exact tone and structure of your previous entries:
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
Other key changes:
|
||||
|
||||
* Native support for `result.media["tables"]` to export DataFrames
|
||||
* Full network + console logs and MHTML snapshot per crawl
|
||||
* Browser pooling and pre-warming for faster cold starts
|
||||
* New streaming endpoints via MCP API and Playground
|
||||
* Robots.txt support, proxy rotation, and improved session handling
|
||||
* Deprecated old markdown names, legacy modules cleaned up
|
||||
* Massive repo cleanup: ~36K insertions, ~5K deletions across 121 files
|
||||
|
||||
[Read full release notes →](releases/0.6.0.md)
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to auto-update the actual file or just paste this into the markdown.
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
|
||||
**Major New Features:**
|
||||
|
||||
* **Deep Crawling:** Explore entire websites with configurable strategies (BFS, DFS, Best-First). Define custom filters and URL scoring for targeted crawls.
|
||||
* **Memory-Adaptive Dispatcher:** Handle large-scale crawls with ease! Our new dispatcher dynamically adjusts concurrency based on available memory and includes built-in rate limiting.
|
||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
|
||||
**Minor Updates & Improvements:**
|
||||
|
||||
* **LXML Scraping Mode:** Faster HTML parsing with `LXMLWebScrapingStrategy`.
|
||||
* **Proxy Rotation:** Added `ProxyRotationStrategy` with a `RoundRobinProxyStrategy` implementation.
|
||||
* **PDF Processing:** Extract text, images, and metadata from PDF files.
|
||||
* **URL Redirection Tracking:** Automatically follows and records redirects.
|
||||
* **Robots.txt Compliance:** Optionally respect website crawling rules.
|
||||
* **LLM-Powered Schema Generation:** Automatically create extraction schemas using an LLM.
|
||||
* **`LLMContentFilter`:** Generate high-quality, focused markdown using an LLM.
|
||||
* **Improved Error Handling & Stability:** Numerous bug fixes and performance enhancements.
|
||||
* **Enhanced Documentation:** Updated guides and examples.
|
||||
|
||||
**Breaking Changes & Migration:**
|
||||
|
||||
This release includes several breaking changes to improve the library's structure and consistency. Here's what you need to know:
|
||||
|
||||
* **`arun_many()` Behavior:** Now uses the `MemoryAdaptiveDispatcher` by default. The return type depends on the `stream` parameter in `CrawlerRunConfig`. Adjust code that relied on unbounded concurrency.
|
||||
* **`max_depth` Location:** Moved to `CrawlerRunConfig` and now controls *crawl depth*.
|
||||
* **Deep Crawling Imports:** Import `DeepCrawlStrategy` and related classes from `crawl4ai.deep_crawling`.
|
||||
* **`BrowserContext` API:** Updated; the old `get_context` method is deprecated.
|
||||
* **Optional Model Fields:** Many data model fields are now optional. Handle potential `None` values.
|
||||
* **`ScrapingMode` Enum:** Replaced with strategy pattern (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
|
||||
* **`content_filter` Parameter:** Removed from `CrawlerRunConfig`. Use extraction strategies or markdown generators with filters.
|
||||
* **Removed Functionality:** The synchronous `WebCrawler`, the old CLI, and docs management tools have been removed.
|
||||
* **Docker:** Significant changes to deployment. See the [Docker documentation](../deploy/docker/README.md).
|
||||
* **`ssl_certificate.json`:** This file has been removed.
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
|
||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||
|
||||
## License Change
|
||||
|
||||
Crawl4AI v0.5.0 updates the license to Apache 2.0 *with a required attribution clause*. This means you are free to use, modify, and distribute Crawl4AI (even commercially), but you *must* clearly attribute the project in any public use or distribution. See the updated `LICENSE` file for the full legal text and specific requirements.
|
||||
|
||||
**Get Started:**
|
||||
|
||||
* **Installation:** `pip install "crawl4ai[all]"` (or use the Docker image)
|
||||
* **Documentation:** [https://docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||
* **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
|
||||
I'm very excited to see what you build with Crawl4AI v0.5.0!
|
||||
|
||||
---
|
||||
|
||||
### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md)
|
||||
*December 12, 2024*
|
||||
|
||||
The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. Check out all the new features to make your crawling experience even smoother.
|
||||
|
||||
[Read full release notes →](releases/0.4.2.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md)
|
||||
*December 8, 2024*
|
||||
|
||||
This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered.
|
||||
|
||||
[Read full release notes →](releases/0.4.1.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md)
|
||||
*December 1, 2024*
|
||||
|
||||
Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage.
|
||||
|
||||
[Read full release notes →](releases/0.4.0.md)
|
||||
|
||||
## Project History
|
||||
|
||||
Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates.
|
||||
|
||||
## Stay Updated
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# 🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update
|
||||
|
||||
*January 28, 2025 • 10 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This release introduces fundamental improvements in how Crawl4AI handles modern web complexity through adaptive learning, intelligent content discovery, and advanced extraction capabilities.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
|
||||
**The Problem:** Websites change. Class names shift. IDs disappear. Your carefully crafted selectors break at 3 AM, and you wake up to empty datasets and angry stakeholders.
|
||||
|
||||
**My Solution:** I implemented an adaptive learning system that observes patterns, builds confidence scores, and adjusts extraction strategies on the fly. It's like having a junior developer who gets better at their job with every page they scrape.
|
||||
|
||||
### Technical Deep-Dive
|
||||
|
||||
The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Pattern success rates
|
||||
- Selector stability over time
|
||||
- Content structure variations
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **News Aggregation**: Maintain 95%+ extraction accuracy even as news sites update their templates
|
||||
- **E-commerce Monitoring**: Track product changes across hundreds of stores without constant maintenance
|
||||
- **Research Data Collection**: Build robust academic datasets that survive website redesigns
|
||||
- **Reduced Maintenance**: Cut selector update time by 80% for frequently-changing sites
|
||||
|
||||
## 🌊 Virtual Scroll: Complete Content Capture
|
||||
|
||||
**The Problem:** Modern web apps only render what's visible. Scroll down, new content appears, old content vanishes into the void. Traditional crawlers capture that first viewport and miss 90% of the content. It's like reading only the first page of every book.
|
||||
|
||||
**My Solution:** I built Virtual Scroll support that mimics human browsing behavior, capturing content as it loads and preserving it before the browser's garbage collector strikes.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
```python
|
||||
from crawl4ai import VirtualScrollConfig
|
||||
|
||||
# For social media feeds (Twitter/X style)
|
||||
twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://twitter.com/trending",
|
||||
config=CrawlerRunConfig(
|
||||
virtual_scroll_config=twitter_config,
|
||||
# Combine with other features
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"tweets": {
|
||||
"selector": "[data-testid='tweet']",
|
||||
"fields": {
|
||||
"text": {"selector": "[data-testid='tweetText']", "type": "text"},
|
||||
"likes": {"selector": "[data-testid='like']", "type": "text"}
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Captured {len(result.extracted_content['tweets'])} tweets")
|
||||
```
|
||||
|
||||
**Key Capabilities:**
|
||||
- **DOM Recycling Awareness**: Detects and handles virtual DOM element recycling
|
||||
- **Smart Scroll Physics**: Three modes - container height, page height, or fixed pixels
|
||||
- **Content Preservation**: Captures content before it's destroyed
|
||||
- **Intelligent Stopping**: Stops when no new content appears
|
||||
- **Memory Efficient**: Streams content instead of holding everything in memory
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Social Media Analysis**: Capture entire Twitter threads with hundreds of replies, not just top 10
|
||||
- **E-commerce Scraping**: Extract 500+ products from infinite scroll catalogs vs. 20-50 with traditional methods
|
||||
- **News Aggregation**: Get all articles from modern news sites, not just above-the-fold content
|
||||
- **Research Applications**: Complete data extraction from academic databases using virtual pagination
|
||||
|
||||
## 🔗 Link Preview: Intelligent Link Analysis and Scoring
|
||||
|
||||
**The Problem:** You crawl a page and get 200 links. Which ones matter? Which lead to the content you actually want? Traditional crawlers force you to follow everything or build complex filters.
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
- **Competitive Analysis**: Automatically identify important pages on competitor sites
|
||||
- **Content Discovery**: Build topic-focused crawlers that stay on track
|
||||
- **SEO Audits**: Identify and prioritize high-value internal linking opportunities
|
||||
|
||||
## 🎣 Async URL Seeder: Automated URL Discovery at Scale
|
||||
|
||||
**The Problem:** You want to crawl an entire domain but only have the homepage. Or worse, you want specific content types across thousands of pages. Manual URL discovery? That's a job for machines, not humans.
|
||||
|
||||
**My Solution:** I built Async URL Seeder—a turbocharged URL discovery engine that combines multiple sources with intelligent filtering and relevance scoring.
|
||||
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
- **Sitemap Mining**: Parses robots.txt and all linked sitemaps
|
||||
- **Common Crawl**: Queries the Common Crawl index for historical URLs
|
||||
- **Intelligent Crawling**: Follows links with smart depth control
|
||||
- **Pattern Analysis**: Learns URL structures and generates variations
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Migration Projects**: Discover 10,000+ URLs from legacy sites in under 60 seconds
|
||||
- **Market Research**: Map entire competitor ecosystems automatically
|
||||
- **Academic Research**: Build comprehensive datasets without manual URL collection
|
||||
- **SEO Audits**: Find every indexable page with content scoring
|
||||
- **Content Archival**: Ensure no content is left behind during site migrations
|
||||
|
||||
## ⚡ Performance Optimizations
|
||||
|
||||
This release includes significant performance improvements through optimized resource handling, better concurrency management, and reduced memory footprint.
|
||||
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
- **Startup Time**: 70% faster browser initialization
|
||||
- **Page Loading**: 40% reduction with smart resource blocking
|
||||
- **Extraction**: 3x faster with compiled CSS selectors
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
### Breaking Changes
|
||||
- `link_extractor` renamed to `link_preview` (better reflects functionality)
|
||||
- Minimum Python version now 3.9
|
||||
- `CrawlerConfig` split into `CrawlerRunConfig` and `BrowserConfig`
|
||||
|
||||
### Migration Guide
|
||||
```python
|
||||
# Old (v0.6.x)
|
||||
from crawl4ai import CrawlerConfig
|
||||
config = CrawlerConfig(timeout=30000)
|
||||
|
||||
# New (v0.7.0)
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig
|
||||
browser_config = BrowserConfig(timeout=30000)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
```
|
||||
|
||||
## 🤖 Coming Soon: Intelligent Web Automation
|
||||
|
||||
I'm currently working on bringing advanced automation capabilities to Crawl4AI. This includes:
|
||||
|
||||
- **Crawl Agents**: Autonomous crawlers that understand your goals and adapt their strategies
|
||||
- **Auto JS Generation**: Automatic JavaScript code generation for complex interactions
|
||||
- **Smart Form Handling**: Intelligent form detection and filling
|
||||
- **Context-Aware Actions**: Crawlers that understand page context and make decisions
|
||||
|
||||
These features are under active development and will revolutionize how we approach web automation. Stay tuned!
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.0
|
||||
```
|
||||
|
||||
Check out the [updated documentation](https://docs.crawl4ai.com).
|
||||
|
||||
Questions? Issues? I'm always listening:
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
|
||||
---
|
||||
|
||||
*P.S. If you're using Crawl4AI in production, I'd love to hear about it. Your use cases inspire the next features.*
|
||||
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# 🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update
|
||||
|
||||
*July 25, 2025 • 3 min read*
|
||||
|
||||
---
|
||||
|
||||
This release introduces automated CI/CD pipelines for seamless releases and optimizes dependencies for a lighter, more efficient package.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### 🔄 Automated Release Pipeline
|
||||
- **GitHub Actions CI/CD**: Automated PyPI and Docker Hub releases on tag push
|
||||
- **Multi-platform Docker images**: Support for both AMD64 and ARM64 architectures
|
||||
- **Version consistency checks**: Ensures tag, package, and Docker versions align
|
||||
- **Automated release notes**: GitHub releases created automatically
|
||||
|
||||
### 📦 Dependency Optimization
|
||||
- **Moved sentence-transformers to optional dependencies**: Significantly reduces default installation size
|
||||
- **Lighter Docker images**: Optimized Dockerfile for faster builds and smaller images
|
||||
- **Better dependency management**: Core vs. optional dependencies clearly separated
|
||||
|
||||
## 🏗️ CI/CD Pipeline
|
||||
|
||||
The new automated release process ensures consistent, reliable releases:
|
||||
|
||||
```yaml
|
||||
# Trigger releases with a simple tag
|
||||
git tag v0.7.2
|
||||
git push origin v0.7.2
|
||||
|
||||
# Automatically:
|
||||
# ✅ Validates version consistency
|
||||
# ✅ Builds and publishes to PyPI
|
||||
# ✅ Builds multi-platform Docker images
|
||||
# ✅ Pushes to Docker Hub with proper tags
|
||||
# ✅ Creates GitHub release
|
||||
```
|
||||
|
||||
## 💾 Lighter Installation
|
||||
|
||||
Default installation is now significantly smaller:
|
||||
|
||||
```bash
|
||||
# Core installation (smaller, faster)
|
||||
pip install crawl4ai==0.7.2
|
||||
|
||||
# With ML features (includes sentence-transformers)
|
||||
pip install crawl4ai[transformer]==0.7.2
|
||||
|
||||
# Full installation
|
||||
pip install crawl4ai[all]==0.7.2
|
||||
```
|
||||
|
||||
## 🐳 Docker Improvements
|
||||
|
||||
Enhanced Docker support with multi-platform images:
|
||||
|
||||
```bash
|
||||
# Pull the latest version
|
||||
docker pull unclecode/crawl4ai:0.7.2
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Available tags:
|
||||
# - unclecode/crawl4ai:0.7.2 (specific version)
|
||||
# - unclecode/crawl4ai:0.7 (minor version)
|
||||
# - unclecode/crawl4ai:0 (major version)
|
||||
# - unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
## 🔧 Technical Details
|
||||
|
||||
### Dependency Changes
|
||||
- `sentence-transformers` moved from required to optional dependencies
|
||||
- Reduces default installation by ~500MB
|
||||
- No impact on functionality when transformer features aren't needed
|
||||
|
||||
### CI/CD Configuration
|
||||
- GitHub Actions workflows for automated releases
|
||||
- Version validation before publishing
|
||||
- Parallel PyPI and Docker Hub deployments
|
||||
- Automatic tagging strategy for Docker images
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.2
|
||||
```
|
||||
|
||||
No breaking changes - direct upgrade from v0.7.0 or v0.7.1.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
*P.S. The new CI/CD pipeline will make future releases faster and more reliable. Thanks for your patience as we improve our release process!*
|
||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Create an adaptive crawler
|
||||
# Create an adaptive crawler (config is optional)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start crawling with a query
|
||||
@@ -59,13 +59,13 @@ async def main():
|
||||
from crawl4ai import AdaptiveConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
||||
top_k_links=3, # Links to follow per page (default: 5)
|
||||
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||
top_k_links=5, # Links to follow per page (default: 3)
|
||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
```
|
||||
|
||||
## Crawling Strategies
|
||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||
- **0.3-0.6**: Partial information, may answer basic queries
|
||||
- **0.6-0.8**: Good coverage, can answer most queries
|
||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
||||
- **0.6-0.7**: Good coverage, can answer most queries
|
||||
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||
|
||||
### Statistics Display
|
||||
|
||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
- Avoid overly broad queries
|
||||
|
||||
### 2. Threshold Tuning
|
||||
- Start with default (0.8) for general use
|
||||
- Lower to 0.6-0.7 for exploratory crawling
|
||||
- Raise to 0.9+ for exhaustive coverage
|
||||
- Start with default (0.7) for general use
|
||||
- Lower to 0.5-0.6 for exploratory crawling
|
||||
- Raise to 0.8+ for exhaustive coverage
|
||||
|
||||
### 3. Performance Optimization
|
||||
- Use appropriate `max_pages` limits
|
||||
|
||||
@@ -29,6 +29,7 @@ class BrowserConfig:
|
||||
text_mode=False,
|
||||
light_mode=False,
|
||||
extra_args=None,
|
||||
enable_stealth=False,
|
||||
# ... other advanced parameters omitted here
|
||||
):
|
||||
...
|
||||
@@ -84,6 +85,11 @@ class BrowserConfig:
|
||||
- Additional flags for the underlying browser.
|
||||
- E.g. `["--disable-extensions"]`.
|
||||
|
||||
11. **`enable_stealth`**:
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
- Default is `False`. Recommended for sites with bot protection.
|
||||
|
||||
### Helper Methods
|
||||
|
||||
Both configuration classes provide a `clone()` method to create modified copies:
|
||||
@@ -209,7 +215,13 @@ class CrawlerRunConfig:
|
||||
- The maximum number of concurrent crawl sessions.
|
||||
- Helps prevent overwhelming the system.
|
||||
|
||||
14. **`display_mode`**:
|
||||
14. **`url_matcher`** & **`match_mode`**:
|
||||
- Enable URL-specific configurations when used with `arun_many()`.
|
||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||
|
||||
15. **`display_mode`**:
|
||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||
- Affects how much information is printed during the crawl.
|
||||
|
||||
@@ -252,7 +264,7 @@ The `clone()` method:
|
||||
### Key fields to note
|
||||
|
||||
1. **`provider`**:
|
||||
- Which LLM provoder to use.
|
||||
- Which LLM provider to use.
|
||||
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
|
||||
|
||||
2. **`api_token`**:
|
||||
@@ -273,7 +285,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
|
||||
from crawl4ai import JsonCssExtractionStrategy
|
||||
|
||||
async def main():
|
||||
@@ -298,7 +310,7 @@ async def main():
|
||||
# 3) Example LLM content filtering
|
||||
|
||||
gemini_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro"
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token = "env:GEMINI_API_TOKEN"
|
||||
)
|
||||
|
||||
@@ -322,8 +334,9 @@ async def main():
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
|
||||
# 4) Crawler run config: skip cache, use extraction
|
||||
run_conf = CrawlerRunConfig(
|
||||
|
||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
||||
|
||||
Want to learn by doing? We've got you covered:
|
||||
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||
|
||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
||||
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
### Running the Tutorial Locally
|
||||
|
||||
|
||||
@@ -17,6 +17,9 @@
|
||||
- [Configuration Reference](#configuration-reference)
|
||||
- [Best Practices & Tips](#best-practices--tips)
|
||||
|
||||
## Installation
|
||||
The Crawl4AI CLI will be installed automatically when you install the library.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
|
||||
|
||||
@@ -350,15 +350,22 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI uses `LXMLWebScrapingStrategy` (LXML-based) as the default scraping strategy for HTML content processing. This strategy offers excellent performance, especially for large HTML documents.
|
||||
|
||||
**Note:** For backward compatibility, `WebScrapingStrategy` is still available as an alias for `LXMLWebScrapingStrategy`.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
# Default configuration already uses LXMLWebScrapingStrategy
|
||||
config = CrawlerRunConfig()
|
||||
|
||||
# Or explicitly specify it if desired
|
||||
config_explicit = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
@@ -417,21 +424,20 @@ class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy provides excellent performance, particularly when processing large HTML documents, offering up to 10-20x faster processing compared to BeautifulSoup-based approaches.
|
||||
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
Benefits of LXML strategy:
|
||||
- Fast processing of large HTML documents (especially >100KB)
|
||||
- Efficient memory usage
|
||||
- Good handling of well-formed HTML
|
||||
- Robust table detection and extraction
|
||||
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
### Backward Compatibility
|
||||
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
For users upgrading from earlier versions:
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy`
|
||||
- Existing code using `WebScrapingStrategy` will continue to work without modification
|
||||
- No changes are required to your existing code
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -19,13 +19,15 @@ class MarkdownGenerationResult(BaseModel):
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
fit_html: Optional[str] = None
|
||||
success: bool
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
pdf: Optional[bytes] = None
|
||||
mhtml: Optional[str] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
extracted_content: Optional[str] = None
|
||||
@@ -35,6 +37,12 @@ class CrawlResult(BaseModel):
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
```
|
||||
@@ -45,11 +53,13 @@ class CrawlResult(BaseModel):
|
||||
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
||||
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
||||
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
||||
| **fit_html (`Optional[str]`)** | Preprocessed HTML optimized for extraction and content filtering. |
|
||||
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
||||
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
||||
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
||||
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||
| **js_execution_result (`Optional[Dict[str, Any]]`)** | Results from JavaScript execution during crawling. |
|
||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
||||
@@ -61,6 +71,11 @@ class CrawlResult(BaseModel):
|
||||
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
||||
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||
| **redirected_url (`Optional[str]`)** | The URL after any redirects (different from `url` which is the final URL). |
|
||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||
|
||||
---
|
||||
|
||||
@@ -172,7 +187,7 @@ Here:
|
||||
|
||||
---
|
||||
|
||||
## 5. More Fields: Links, Media, and More
|
||||
## 5. More Fields: Links, Media, Tables and More
|
||||
|
||||
### 5.1 `links`
|
||||
|
||||
@@ -192,7 +207,77 @@ for img in images:
|
||||
print("Image URL:", img["src"], "Alt:", img.get("alt"))
|
||||
```
|
||||
|
||||
### 5.3 `screenshot`, `pdf`, and `mhtml`
|
||||
### 5.3 `tables`
|
||||
|
||||
The `tables` field contains structured data extracted from HTML tables found on the crawled page. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
|
||||
|
||||
- Presence of thead and tbody sections
|
||||
- Use of th elements for headers
|
||||
- Column consistency
|
||||
- Text density
|
||||
- And other factors
|
||||
|
||||
Tables that score above the threshold (default: 7) are extracted and stored in result.tables.
|
||||
|
||||
### Accessing Table data:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.w3schools.com/html/html_tables.asp",
|
||||
config=CrawlerRunConfig(
|
||||
table_score_threshold=7 # Minimum score for table detection
|
||||
)
|
||||
)
|
||||
|
||||
if result.success and result.tables:
|
||||
print(f"Found {len(result.tables)} tables")
|
||||
|
||||
for i, table in enumerate(result.tables):
|
||||
print(f"\nTable {i+1}:")
|
||||
print(f"Caption: {table.get('caption', 'No caption')}")
|
||||
print(f"Headers: {table['headers']}")
|
||||
print(f"Rows: {len(table['rows'])}")
|
||||
|
||||
# Print first few rows as example
|
||||
for j, row in enumerate(table['rows'][:3]):
|
||||
print(f" Row {j+1}: {row}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
```
|
||||
|
||||
### Configuring Table Extraction:
|
||||
|
||||
You can adjust the sensitivity of the table detection algorithm with:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
table_score_threshold=5 # Lower value = more tables detected (default: 7)
|
||||
)
|
||||
```
|
||||
|
||||
Each extracted table contains:
|
||||
|
||||
- `headers`: Column header names
|
||||
- `rows`: List of rows, each containing cell values
|
||||
- `caption`: Table caption text (if available)
|
||||
- `summary`: Table summary attribute (if specified)
|
||||
|
||||
### Table Extraction Tips
|
||||
|
||||
- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
|
||||
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
|
||||
- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
|
||||
|
||||
The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.
|
||||
|
||||
|
||||
### 5.4 `screenshot`, `pdf`, and `mhtml`
|
||||
|
||||
If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then:
|
||||
|
||||
@@ -213,7 +298,7 @@ if result.mhtml:
|
||||
|
||||
The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing.
|
||||
|
||||
### 5.4 `ssl_certificate`
|
||||
### 5.5 `ssl_certificate`
|
||||
|
||||
If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site’s SSL cert, such as issuer, validity dates, etc.
|
||||
|
||||
|
||||
@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
Our latest release candidate is `0.6.0-r2`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
|
||||
|
||||
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
|
||||
|
||||
```bash
|
||||
# Pull the release candidate (recommended for latest features)
|
||||
docker pull unclecode/crawl4ai:0.6.0-r1
|
||||
# Pull the release candidate (for testing new features)
|
||||
docker pull unclecode/crawl4ai:0.7.0-r1
|
||||
|
||||
# Or pull the latest stable version
|
||||
# Or pull the current stable version (0.6.0)
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r2`)
|
||||
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
|
||||
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
|
||||
* `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
|
||||
* **`latest` Tag:** Points to the most recent stable version
|
||||
@@ -152,6 +154,30 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -666,7 +692,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -28,11 +28,8 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
<<<<<<< HEAD
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||
=======
|
||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
>>>>>>> feature/progressive-crawling
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
@@ -57,6 +54,16 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) |
|
||||
| SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) |
|
||||
|
||||
## Anti-Bot & Stealth Features
|
||||
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) |
|
||||
| Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) |
|
||||
| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) |
|
||||
| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) |
|
||||
| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) |
|
||||
|
||||
## Customization & Security
|
||||
|
||||
| Example | Description | Link |
|
||||
@@ -117,4 +124,4 @@ Some examples may require:
|
||||
|
||||
## Contributing New Examples
|
||||
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
|
||||
@@ -18,7 +18,7 @@ crawl4ai-setup
|
||||
```
|
||||
|
||||
**What does it do?**
|
||||
- Installs or updates required Playwright browsers (Chromium, Firefox, etc.)
|
||||
- Installs or updates required browser dependencies for both regular and undetected modes
|
||||
- Performs OS-level checks (e.g., missing libs on Linux)
|
||||
- Confirms your environment is ready to crawl
|
||||
|
||||
|
||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
||||
The `LinkPreviewConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
link_preview_config = LinkPreviewConfig(
|
||||
# BASIC SETTINGS
|
||||
@@ -520,7 +520,8 @@ This approach is handy when you still want external links but need to block cert
|
||||
|
||||
### 4.1 Accessing `result.media`
|
||||
|
||||
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
|
||||
By default, Crawl4AI collects images, audio and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`).
|
||||
**Note: Tables have been moved from `result.media["tables"]` to the new `result.tables` format for better organization and direct access.**
|
||||
|
||||
**Basic Example**:
|
||||
|
||||
@@ -534,14 +535,6 @@ if result.success:
|
||||
print(f" Alt text: {img.get('alt', '')}")
|
||||
print(f" Score: {img.get('score')}")
|
||||
print(f" Description: {img.get('desc', '')}\n")
|
||||
|
||||
# Get tables
|
||||
tables = result.media.get("tables", [])
|
||||
print(f"Found {len(tables)} data tables in total.")
|
||||
for i, table in enumerate(tables):
|
||||
print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
|
||||
print(f" Columns: {len(table.get('headers', []))}")
|
||||
print(f" Rows: {len(table.get('rows', []))}")
|
||||
```
|
||||
|
||||
**Structure Example**:
|
||||
@@ -568,19 +561,6 @@ result.media = {
|
||||
"audio": [
|
||||
# Similar structure but with audio-specific fields
|
||||
],
|
||||
"tables": [
|
||||
{
|
||||
"headers": ["Name", "Age", "Location"],
|
||||
"rows": [
|
||||
["John Doe", "34", "New York"],
|
||||
["Jane Smith", "28", "San Francisco"],
|
||||
["Alex Johnson", "42", "Chicago"]
|
||||
],
|
||||
"caption": "Employee Directory",
|
||||
"summary": "Directory of company employees"
|
||||
},
|
||||
# More tables if present
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
@@ -608,53 +588,7 @@ crawler_cfg = CrawlerRunConfig(
|
||||
|
||||
This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling.
|
||||
|
||||
### 3.3 Working with Tables
|
||||
|
||||
Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
|
||||
|
||||
- Presence of thead and tbody sections
|
||||
- Use of th elements for headers
|
||||
- Column consistency
|
||||
- Text density
|
||||
- And other factors
|
||||
|
||||
Tables that score above the threshold (default: 7) are extracted and stored in `result.media.tables`.
|
||||
|
||||
**Accessing Table Data**:
|
||||
|
||||
```python
|
||||
if result.success:
|
||||
tables = result.media.get("tables", [])
|
||||
print(f"Found {len(tables)} data tables on the page")
|
||||
|
||||
if tables:
|
||||
# Access the first table
|
||||
first_table = tables[0]
|
||||
print(f"Table caption: {first_table.get('caption', 'No caption')}")
|
||||
print(f"Headers: {first_table.get('headers', [])}")
|
||||
|
||||
# Print the first 3 rows
|
||||
for i, row in enumerate(first_table.get('rows', [])[:3]):
|
||||
print(f"Row {i+1}: {row}")
|
||||
```
|
||||
|
||||
**Configuring Table Extraction**:
|
||||
|
||||
You can adjust the sensitivity of the table detection algorithm with:
|
||||
|
||||
```python
|
||||
crawler_cfg = CrawlerRunConfig(
|
||||
table_score_threshold=5 # Lower value = more tables detected (default: 7)
|
||||
)
|
||||
```
|
||||
|
||||
Each extracted table contains:
|
||||
- `headers`: Column header names
|
||||
- `rows`: List of rows, each containing cell values
|
||||
- `caption`: Table caption text (if available)
|
||||
- `summary`: Table summary attribute (if specified)
|
||||
|
||||
### 3.4 Additional Media Config
|
||||
### 4.3 Additional Media Config
|
||||
|
||||
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
|
||||
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
|
||||
@@ -695,7 +629,7 @@ The MHTML format is particularly useful because:
|
||||
|
||||
---
|
||||
|
||||
## 4. Putting It All Together: Link & Media Filtering
|
||||
## 5. Putting It All Together: Link & Media Filtering
|
||||
|
||||
Here’s a combined example demonstrating how to filter out external links, skip certain domains, and exclude external images:
|
||||
|
||||
@@ -743,7 +677,7 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
## 5. Common Pitfalls & Tips
|
||||
## 6. Common Pitfalls & Tips
|
||||
|
||||
1. **Conflicting Flags**:
|
||||
- `exclude_external_links=True` but then also specifying `exclude_social_media_links=True` is typically fine, but understand that the first setting already discards *all* external links. The second becomes somewhat redundant.
|
||||
@@ -762,10 +696,3 @@ if __name__ == "__main__":
|
||||
---
|
||||
|
||||
**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
|
||||
### Table Extraction Tips
|
||||
|
||||
- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
|
||||
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
|
||||
- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
|
||||
|
||||
The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.
|
||||
|
||||
@@ -8,11 +8,10 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def crawl_web():
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/apple",
|
||||
@@ -33,13 +32,12 @@ To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def crawl_local_file():
|
||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||
file_url = f"file://{local_file_path}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url, config=config)
|
||||
@@ -93,8 +91,7 @@ import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
wikipedia_url = "https://en.wikipedia.org/wiki/apple"
|
||||
@@ -104,7 +101,7 @@ async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Step 1: Crawl the Web URL
|
||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
||||
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||
|
||||
if not result.success:
|
||||
@@ -119,7 +116,7 @@ async def main():
|
||||
# Step 2: Crawl from the Local HTML File
|
||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||
file_url = f"file://{html_file_path.resolve()}"
|
||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
||||
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||
|
||||
if not local_result.success:
|
||||
@@ -135,7 +132,7 @@ async def main():
|
||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||
raw_html_content = f.read()
|
||||
raw_html_url = f"raw:{raw_html_content}"
|
||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
||||
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||
|
||||
if not raw_result.success:
|
||||
|
||||
@@ -200,7 +200,8 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
|
||||
- **`user_query`**: The term you want to focus on. BM25 tries to keep only content blocks relevant to that query.
|
||||
- **`bm25_threshold`**: Raise it to keep fewer blocks; lower it to keep more.
|
||||
- **`use_stemming`** *(default `True`)*: If enabled, variations of words match (e.g., “learn,” “learning,” “learnt”).
|
||||
- **`use_stemming`** *(default `True`)*: Whether to apply stemming to the query and content.
|
||||
- **`language (str)`**: Language for stemming (default: 'english').
|
||||
|
||||
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
|
||||
|
||||
@@ -233,7 +234,7 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
@@ -255,9 +256,12 @@ async def main():
|
||||
chunk_token_threshold=4096, # Adjust based on your needs
|
||||
verbose=True
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=filter,
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=filter
|
||||
markdown_generator=md_generator,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
|
||||
@@ -31,9 +31,16 @@ if __name__ == "__main__":
|
||||
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.6),
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig(fit_markdown=True)
|
||||
config=config
|
||||
)
|
||||
|
||||
# Different content formats
|
||||
|
||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
||||
word_count_threshold=300 # Only substantial articles
|
||||
)
|
||||
|
||||
# Extract URLs and stream results as they come
|
||||
# Extract URLs and crawl them
|
||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||
|
||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
||||
|
||||
```python
|
||||
# Use both sources
|
||||
config = SeedingConfig(source="cc+sitemap")
|
||||
config = SeedingConfig(source="sitemap+cc")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
```
|
||||
|
||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
||||
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||
| `live_check` | bool | False | Verify URLs are accessible |
|
||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
||||
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||
| `verbose` | bool | False | Show detailed progress |
|
||||
| `query` | str | None | Search query for BM25 scoring |
|
||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
||||
```python
|
||||
# Find specific products
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Use both sources
|
||||
source="sitemap+cc", # Use both sources
|
||||
extract_head=True,
|
||||
query="wireless headphones noise canceling",
|
||||
scoring_method="bm25",
|
||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
||||
|
||||
# Step 1: Discover relevant URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Maximum coverage
|
||||
source="sitemap+cc", # Maximum coverage
|
||||
extract_head=True, # Get metadata
|
||||
query=topic, # Research topic
|
||||
scoring_method="bm25", # Smart scoring
|
||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
||||
# Extract URLs and crawl all articles
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
results = []
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||
async for result in crawl_results:
|
||||
if result.success:
|
||||
results.append({
|
||||
'url': result.url,
|
||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
||||
# When crawling many URLs
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Assuming urls is a list of URL strings
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
crawl_results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
# Process as they arrive
|
||||
async for result in results:
|
||||
async for result in crawl_results:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
||||
|
||||
# E-commerce product discovery
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
source="sitemap+cc",
|
||||
pattern="*/product/*",
|
||||
extract_head=True,
|
||||
live_check=True
|
||||
|
||||
@@ -218,7 +218,7 @@ import json
|
||||
import asyncio
|
||||
from typing import List
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
|
||||
class Entity(BaseModel):
|
||||
@@ -238,8 +238,8 @@ class KnowledgeGraph(BaseModel):
|
||||
async def main():
|
||||
# LLM extraction strategy
|
||||
llm_strat = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=KnowledgeGraph.schema_json(),
|
||||
llmConfig = LLMConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract entities and relationships from the content. Return valid JSON.",
|
||||
chunk_token_threshold=1400,
|
||||
@@ -258,6 +258,10 @@ async def main():
|
||||
url = "https://www.nbcnews.com/business"
|
||||
result = await crawler.arun(url=url, config=crawl_config)
|
||||
|
||||
print("--- LLM RAW RESPONSE ---")
|
||||
print(result.extracted_content)
|
||||
print("--- END LLM RAW RESPONSE ---")
|
||||
|
||||
if result.success:
|
||||
with open("kb_result.json", "w", encoding="utf-8") as f:
|
||||
f.write(result.extracted_content)
|
||||
|
||||
@@ -41,6 +41,17 @@
|
||||
alt="License"/>
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<a href="https://x.com/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20X-000000?style=for-the-badge&logo=x&logoColor=white" alt="Follow on X" />
|
||||
</a>
|
||||
<a href="https://www.linkedin.com/company/crawl4ai">
|
||||
<img src="https://img.shields.io/badge/Follow%20on%20LinkedIn-0077B5?style=for-the-badge&logo=linkedin&logoColor=white" alt="Follow on LinkedIn" />
|
||||
</a>
|
||||
<a href="https://discord.gg/jP8KfhDhyN">
|
||||
<img src="https://img.shields.io/badge/Join%20our%20Discord-5865F2?style=for-the-badge&logo=discord&logoColor=white" alt="Join our Discord" />
|
||||
</a>
|
||||
</p>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# WebScrapingStrategy Migration Guide
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI has simplified its content scraping architecture. The BeautifulSoup-based `WebScrapingStrategy` has been deprecated in favor of the faster LXML-based implementation. However, **no action is required** - your existing code will continue to work.
|
||||
|
||||
## What Changed?
|
||||
|
||||
1. **`WebScrapingStrategy` is now an alias** for `LXMLWebScrapingStrategy`
|
||||
2. **The BeautifulSoup implementation has been removed** (~1000 lines of redundant code)
|
||||
3. **`LXMLWebScrapingStrategy` inherits directly** from `ContentScrapingStrategy`
|
||||
4. **Performance remains optimal** with LXML as the sole implementation
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
**Your existing code continues to work without any changes:**
|
||||
|
||||
```python
|
||||
# This still works perfectly
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, WebScrapingStrategy
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=WebScrapingStrategy() # Works as before
|
||||
)
|
||||
```
|
||||
|
||||
## Migration Options
|
||||
|
||||
You have three options:
|
||||
|
||||
### Option 1: Do Nothing (Recommended)
|
||||
Your code will continue to work. `WebScrapingStrategy` is permanently aliased to `LXMLWebScrapingStrategy`.
|
||||
|
||||
### Option 2: Update Imports (Optional)
|
||||
For clarity, you can update your imports:
|
||||
|
||||
```python
|
||||
# Old (still works)
|
||||
from crawl4ai import WebScrapingStrategy
|
||||
strategy = WebScrapingStrategy()
|
||||
|
||||
# New (more explicit)
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
strategy = LXMLWebScrapingStrategy()
|
||||
```
|
||||
|
||||
### Option 3: Use Default Configuration
|
||||
Since `LXMLWebScrapingStrategy` is the default, you can omit the strategy parameter:
|
||||
|
||||
```python
|
||||
# Simplest approach - uses LXMLWebScrapingStrategy by default
|
||||
config = CrawlerRunConfig()
|
||||
```
|
||||
|
||||
## Type Hints
|
||||
|
||||
If you use type hints, both work:
|
||||
|
||||
```python
|
||||
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
|
||||
def process_with_strategy(strategy: WebScrapingStrategy) -> None:
|
||||
# Works with both WebScrapingStrategy and LXMLWebScrapingStrategy
|
||||
pass
|
||||
|
||||
# Both are valid
|
||||
process_with_strategy(WebScrapingStrategy())
|
||||
process_with_strategy(LXMLWebScrapingStrategy())
|
||||
```
|
||||
|
||||
## Subclassing
|
||||
|
||||
If you've subclassed `WebScrapingStrategy`, it continues to work:
|
||||
|
||||
```python
|
||||
class MyCustomStrategy(WebScrapingStrategy):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Your custom code
|
||||
```
|
||||
|
||||
## Performance Benefits
|
||||
|
||||
By consolidating to LXML:
|
||||
- **10-20x faster** HTML parsing for large documents
|
||||
- **Lower memory usage**
|
||||
- **Consistent behavior** across all use cases
|
||||
- **Simplified maintenance** and bug fixes
|
||||
|
||||
## Summary
|
||||
|
||||
This change simplifies Crawl4AI's internals while maintaining 100% backward compatibility. Your existing code continues to work, and you get better performance automatically.
|
||||
1584
docs/releases_review/crawl4ai_v0_7_0_showcase.py
Normal file
1584
docs/releases_review/crawl4ai_v0_7_0_showcase.py
Normal file
File diff suppressed because it is too large
Load Diff
408
docs/releases_review/demo_v0.7.0.py
Normal file
408
docs/releases_review/demo_v0.7.0.py
Normal file
@@ -0,0 +1,408 @@
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.0 Release Demo
|
||||
================================
|
||||
This demo showcases all major features introduced in v0.7.0 release.
|
||||
|
||||
Major Features:
|
||||
1. ✅ Adaptive Crawling - Intelligent crawling with confidence tracking
|
||||
2. ✅ Virtual Scroll Support - Handle infinite scroll pages
|
||||
3. ✅ Link Preview - Advanced link analysis with 3-layer scoring
|
||||
4. ✅ URL Seeder - Smart URL discovery and filtering
|
||||
5. ✅ C4A Script - Domain-specific language for web automation
|
||||
6. ✅ Chrome Extension Updates - Click2Crawl and instant schema extraction
|
||||
7. ✅ PDF Parsing Support - Extract content from PDF documents
|
||||
8. ✅ Nightly Builds - Automated nightly releases
|
||||
|
||||
Run this demo to see all features in action!
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import List, Dict
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich import box
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
CompilationResult
|
||||
)
|
||||
from crawl4ai.async_configs import VirtualScrollConfig, LinkPreviewConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
console = Console()
|
||||
|
||||
def print_section(title: str, description: str = ""):
|
||||
"""Print a section header"""
|
||||
console.print(f"\n[bold cyan]{'=' * 60}[/bold cyan]")
|
||||
console.print(f"[bold yellow]{title}[/bold yellow]")
|
||||
if description:
|
||||
console.print(f"[dim]{description}[/dim]")
|
||||
console.print(f"[bold cyan]{'=' * 60}[/bold cyan]\n")
|
||||
|
||||
|
||||
async def demo_1_adaptive_crawling():
|
||||
"""Demo 1: Adaptive Crawling - Intelligent content extraction"""
|
||||
print_section(
|
||||
"Demo 1: Adaptive Crawling",
|
||||
"Intelligently learns and adapts to website patterns"
|
||||
)
|
||||
|
||||
# Create adaptive crawler with custom configuration
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding"
|
||||
confidence_threshold=0.7,
|
||||
max_pages=10,
|
||||
top_k_links=3,
|
||||
min_gain_threshold=0.1
|
||||
)
|
||||
|
||||
# Example: Learn from a product page
|
||||
console.print("[cyan]Learning from product page patterns...[/cyan]")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Start adaptive crawl
|
||||
console.print("[cyan]Starting adaptive crawl...[/cyan]")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/",
|
||||
query="python decorators tutorial"
|
||||
)
|
||||
|
||||
console.print("[green]✓ Adaptive crawl completed[/green]")
|
||||
console.print(f" - Confidence Level: {adaptive.confidence:.0%}")
|
||||
console.print(f" - Pages Crawled: {len(result.crawled_urls)}")
|
||||
console.print(f" - Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
if relevant:
|
||||
console.print("\nMost relevant pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
console.print(f" {i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
|
||||
async def demo_2_virtual_scroll():
|
||||
"""Demo 2: Virtual Scroll - Handle infinite scroll pages"""
|
||||
print_section(
|
||||
"Demo 2: Virtual Scroll Support",
|
||||
"Capture content from modern infinite scroll pages"
|
||||
)
|
||||
|
||||
# Configure virtual scroll - using body as container for example.com
|
||||
scroll_config = VirtualScrollConfig(
|
||||
container_selector="body", # Using body since example.com has simple structure
|
||||
scroll_count=3, # Just 3 scrolls for demo
|
||||
scroll_by="container_height", # or "page_height" or pixel value
|
||||
wait_after_scroll=0.5 # Wait 500ms after each scroll
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=scroll_config,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
|
||||
console.print("[cyan]Virtual Scroll Configuration:[/cyan]")
|
||||
console.print(f" - Container: {scroll_config.container_selector}")
|
||||
console.print(f" - Scroll count: {scroll_config.scroll_count}")
|
||||
console.print(f" - Scroll by: {scroll_config.scroll_by}")
|
||||
console.print(f" - Wait after scroll: {scroll_config.wait_after_scroll}s")
|
||||
|
||||
console.print("\n[dim]Note: Using example.com for demo - in production, use this[/dim]")
|
||||
console.print("[dim]with actual infinite scroll pages like social media feeds.[/dim]\n")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
console.print("[green]✓ Virtual scroll executed successfully![/green]")
|
||||
console.print(f" - Content length: {len(result.markdown)} chars")
|
||||
|
||||
# Show example of how to use with real infinite scroll sites
|
||||
console.print("\n[yellow]Example for real infinite scroll sites:[/yellow]")
|
||||
console.print("""
|
||||
# For Twitter-like feeds:
|
||||
scroll_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20,
|
||||
scroll_by="container_height",
|
||||
wait_after_scroll=1.0
|
||||
)
|
||||
|
||||
# For Instagram-like grids:
|
||||
scroll_config = VirtualScrollConfig(
|
||||
container_selector="main article",
|
||||
scroll_count=15,
|
||||
scroll_by=1000, # Fixed pixel amount
|
||||
wait_after_scroll=1.5
|
||||
)""")
|
||||
|
||||
|
||||
async def demo_3_link_preview():
|
||||
"""Demo 3: Link Preview with 3-layer scoring"""
|
||||
print_section(
|
||||
"Demo 3: Link Preview & Scoring",
|
||||
"Advanced link analysis with intrinsic, contextual, and total scoring"
|
||||
)
|
||||
|
||||
# Configure link preview
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
console.print("[cyan]Analyzing links with 3-layer scoring system...[/cyan]")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://docs.python.org/3/", config=config)
|
||||
|
||||
if result.success and result.links:
|
||||
# Get scored links
|
||||
internal_links = result.links.get("internal", [])
|
||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
||||
|
||||
# Create a scoring table
|
||||
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
||||
table.add_column("Link Text", style="cyan", width=40)
|
||||
table.add_column("Intrinsic Score", justify="center")
|
||||
table.add_column("Contextual Score", justify="center")
|
||||
table.add_column("Total Score", justify="center", style="bold green")
|
||||
|
||||
for link in scored_links[:5]:
|
||||
text = link.get('text', 'No text')[:40]
|
||||
table.add_row(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1",
|
||||
f"{link.get('total_score', 0):.3f}"
|
||||
)
|
||||
|
||||
console.print(table)
|
||||
|
||||
|
||||
async def demo_4_url_seeder():
|
||||
"""Demo 4: URL Seeder - Smart URL discovery"""
|
||||
print_section(
|
||||
"Demo 4: URL Seeder",
|
||||
"Intelligent URL discovery and filtering"
|
||||
)
|
||||
|
||||
# Configure seeding
|
||||
seeding_config = SeedingConfig(
|
||||
source="cc+sitemap", # or "crawl"
|
||||
pattern="*tutorial*", # URL pattern filter
|
||||
max_urls=50,
|
||||
extract_head=True, # Get metadata
|
||||
query="python programming", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
force = True
|
||||
)
|
||||
|
||||
console.print("[cyan]URL Seeder Configuration:[/cyan]")
|
||||
console.print(f" - Source: {seeding_config.source}")
|
||||
console.print(f" - Pattern: {seeding_config.pattern}")
|
||||
console.print(f" - Max URLs: {seeding_config.max_urls}")
|
||||
console.print(f" - Query: {seeding_config.query}")
|
||||
console.print(f" - Scoring: {seeding_config.scoring_method}")
|
||||
|
||||
# Use URL seeder to discover URLs
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
console.print("\n[cyan]Discovering URLs from Python docs...[/cyan]")
|
||||
urls = await seeder.urls("docs.python.org", seeding_config)
|
||||
|
||||
console.print(f"\n[green]✓ Discovered {len(urls)} URLs[/green]")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
console.print(f" {i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
console.print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
|
||||
|
||||
async def demo_5_c4a_script():
|
||||
"""Demo 5: C4A Script - Domain-specific language"""
|
||||
print_section(
|
||||
"Demo 5: C4A Script Language",
|
||||
"Domain-specific language for web automation"
|
||||
)
|
||||
|
||||
# Example C4A script
|
||||
c4a_script = """
|
||||
# Simple C4A script example
|
||||
WAIT `body` 3
|
||||
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
|
||||
CLICK `.search-button`
|
||||
TYPE "python tutorial"
|
||||
PRESS Enter
|
||||
WAIT `.results` 5
|
||||
"""
|
||||
|
||||
console.print("[cyan]C4A Script Example:[/cyan]")
|
||||
console.print(Panel(c4a_script, title="script.c4a", border_style="blue"))
|
||||
|
||||
# Compile the script
|
||||
compilation_result = c4a_compile(c4a_script)
|
||||
|
||||
if compilation_result.success:
|
||||
console.print("[green]✓ Script compiled successfully![/green]")
|
||||
console.print(f" - Generated {len(compilation_result.js_code)} JavaScript statements")
|
||||
console.print("\nFirst 3 JS statements:")
|
||||
for stmt in compilation_result.js_code[:3]:
|
||||
console.print(f" • {stmt}")
|
||||
else:
|
||||
console.print("[red]✗ Script compilation failed[/red]")
|
||||
if compilation_result.first_error:
|
||||
error = compilation_result.first_error
|
||||
console.print(f" Error at line {error.line}: {error.message}")
|
||||
|
||||
|
||||
async def demo_6_css_extraction():
|
||||
"""Demo 6: Enhanced CSS/JSON extraction"""
|
||||
print_section(
|
||||
"Demo 6: Enhanced Extraction",
|
||||
"Improved CSS selector and JSON extraction"
|
||||
)
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Example Page Data",
|
||||
"baseSelector": "body",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h1",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "paragraphs",
|
||||
"selector": "p",
|
||||
"type": "list",
|
||||
"fields": [
|
||||
{"name": "text", "type": "text"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
|
||||
console.print("[cyan]Extraction Schema:[/cyan]")
|
||||
console.print(json.dumps(schema, indent=2))
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://example.com",
|
||||
config=CrawlerRunConfig(
|
||||
extraction_strategy=extraction_strategy,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
if result.success and result.extracted_content:
|
||||
console.print("\n[green]✓ Content extracted successfully![/green]")
|
||||
console.print(f"Extracted: {json.dumps(json.loads(result.extracted_content), indent=2)[:200]}...")
|
||||
|
||||
|
||||
async def demo_7_performance_improvements():
|
||||
"""Demo 7: Performance improvements"""
|
||||
print_section(
|
||||
"Demo 7: Performance Improvements",
|
||||
"Faster crawling with better resource management"
|
||||
)
|
||||
|
||||
# Performance-optimized configuration
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED, # Use caching
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
page_timeout=10000, # 10 second timeout
|
||||
exclude_external_links=True,
|
||||
exclude_social_media_links=True,
|
||||
exclude_external_images=True
|
||||
)
|
||||
|
||||
console.print("[cyan]Performance Configuration:[/cyan]")
|
||||
console.print(" - Cache: ENABLED")
|
||||
console.print(" - Wait: domcontentloaded (faster)")
|
||||
console.print(" - Timeout: 10s")
|
||||
console.print(" - Excluding: external links, images, social media")
|
||||
|
||||
# Measure performance
|
||||
import time
|
||||
start_time = time.time()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com", config=config)
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
if result.success:
|
||||
console.print(f"\n[green]✓ Page crawled in {elapsed:.2f} seconds[/green]")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos"""
|
||||
console.print(Panel(
|
||||
"[bold cyan]Crawl4AI v0.7.0 Release Demo[/bold cyan]\n\n"
|
||||
"This demo showcases all major features introduced in v0.7.0.\n"
|
||||
"Each demo is self-contained and demonstrates a specific feature.",
|
||||
title="Welcome",
|
||||
border_style="blue"
|
||||
))
|
||||
|
||||
demos = [
|
||||
demo_1_adaptive_crawling,
|
||||
demo_2_virtual_scroll,
|
||||
demo_3_link_preview,
|
||||
demo_4_url_seeder,
|
||||
demo_5_c4a_script,
|
||||
demo_6_css_extraction,
|
||||
demo_7_performance_improvements
|
||||
]
|
||||
|
||||
for i, demo in enumerate(demos, 1):
|
||||
try:
|
||||
await demo()
|
||||
if i < len(demos):
|
||||
console.print("\n[dim]Press Enter to continue to next demo...[/dim]")
|
||||
input()
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error in demo: {e}[/red]")
|
||||
continue
|
||||
|
||||
console.print(Panel(
|
||||
"[bold green]Demo Complete![/bold green]\n\n"
|
||||
"Thank you for trying Crawl4AI v0.7.0!\n"
|
||||
"For more examples and documentation, visit:\n"
|
||||
"https://github.com/unclecode/crawl4ai",
|
||||
title="Complete",
|
||||
border_style="green"
|
||||
))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
280
docs/releases_review/v0_7_0_features_demo.py
Normal file
280
docs/releases_review/v0_7_0_features_demo.py
Normal file
@@ -0,0 +1,280 @@
|
||||
"""
|
||||
🚀 Crawl4AI v0.7.0 Feature Demo
|
||||
================================
|
||||
This file demonstrates the major features introduced in v0.7.0 with practical examples.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from pathlib import Path
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
# New imports for v0.7.0
|
||||
VirtualScrollConfig,
|
||||
LinkPreviewConfig,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
)
|
||||
|
||||
|
||||
async def demo_link_preview():
|
||||
"""
|
||||
Demo 1: Link Preview with 3-Layer Scoring
|
||||
|
||||
Shows how to analyze links with intrinsic quality scores,
|
||||
contextual relevance, and combined total scores.
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🔗 DEMO 1: Link Preview & Intelligent Scoring")
|
||||
print("="*60)
|
||||
|
||||
# Configure link preview with contextual scoring
|
||||
config = CrawlerRunConfig(
|
||||
link_preview_config=LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="machine learning tutorials", # For contextual scoring
|
||||
score_threshold=0.3, # Minimum relevance
|
||||
verbose=True
|
||||
),
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://scikit-learn.org/stable/", config=config)
|
||||
|
||||
if result.success:
|
||||
# Get scored links
|
||||
internal_links = result.links.get("internal", [])
|
||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
||||
|
||||
print(f"\nTop 5 Most Relevant Links:")
|
||||
for i, link in enumerate(scored_links[:5], 1):
|
||||
print(f"\n{i}. {link.get('text', 'No text')[:50]}...")
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Intrinsic Score: {link.get('intrinsic_score', 0):.2f}/10")
|
||||
print(f" Contextual Score: {link.get('contextual_score', 0):.3f}")
|
||||
print(f" Total Score: {link.get('total_score', 0):.3f}")
|
||||
|
||||
# Show metadata if available
|
||||
if link.get('head_data'):
|
||||
title = link['head_data'].get('title', 'No title')
|
||||
print(f" Title: {title[:60]}...")
|
||||
|
||||
|
||||
async def demo_adaptive_crawling():
|
||||
"""
|
||||
Demo 2: Adaptive Crawling
|
||||
|
||||
Shows intelligent crawling that stops when enough information
|
||||
is gathered, with confidence tracking.
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🎯 DEMO 2: Adaptive Crawling with Confidence Tracking")
|
||||
print("="*60)
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
|
||||
async def demo_virtual_scroll():
|
||||
"""
|
||||
Demo 3: Virtual Scroll for Modern Web Pages
|
||||
|
||||
Shows how to capture content from pages with DOM recycling
|
||||
(Twitter, Instagram, infinite scroll).
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("📜 DEMO 3: Virtual Scroll Support")
|
||||
print("="*60)
|
||||
|
||||
# Configure virtual scroll for a news site
|
||||
virtual_config = VirtualScrollConfig(
|
||||
container_selector="main, article, .content", # Common containers
|
||||
scroll_count=20, # Scroll up to 20 times
|
||||
scroll_by="container_height", # Scroll by container height
|
||||
wait_after_scroll=0.5 # Wait 500ms after each scroll
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
wait_for="css:article" # Wait for articles to load
|
||||
)
|
||||
|
||||
# Example with a real news site
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.ycombinator.com/",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Count items captured
|
||||
import re
|
||||
items = len(re.findall(r'class="athing"', result.html))
|
||||
print(f"\n✅ Captured {items} news items")
|
||||
print(f"• HTML size: {len(result.html):,} bytes")
|
||||
print(f"• Without virtual scroll, would only capture ~30 items")
|
||||
|
||||
|
||||
async def demo_url_seeder():
|
||||
"""
|
||||
Demo 4: URL Seeder for Intelligent Discovery
|
||||
|
||||
Shows how to discover and filter URLs before crawling,
|
||||
with relevance scoring.
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🌱 DEMO 4: URL Seeder - Smart URL Discovery")
|
||||
print("="*60)
|
||||
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
|
||||
async def demo_c4a_script():
|
||||
"""
|
||||
Demo 5: C4A Script Language
|
||||
|
||||
Shows the domain-specific language for web automation
|
||||
with JavaScript transpilation.
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("🎭 DEMO 5: C4A Script - Web Automation Language")
|
||||
print("="*60)
|
||||
|
||||
# Example C4A script
|
||||
c4a_script = """
|
||||
# E-commerce automation script
|
||||
WAIT `body` 3
|
||||
|
||||
# Handle cookie banner
|
||||
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-cookies`
|
||||
|
||||
# Search for product
|
||||
CLICK `.search-box`
|
||||
TYPE "wireless headphones"
|
||||
PRESS Enter
|
||||
|
||||
# Wait for results
|
||||
WAIT `.product-grid` 10
|
||||
|
||||
# Load more products
|
||||
REPEAT (SCROLL DOWN 500, `document.querySelectorAll('.product').length < 50`)
|
||||
|
||||
# Apply filter
|
||||
IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
||||
"""
|
||||
|
||||
# Compile the script
|
||||
print("Compiling C4A script...")
|
||||
result = c4a_compile(c4a_script)
|
||||
|
||||
if result.success:
|
||||
print(f"✅ Successfully compiled to {len(result.js_code)} JavaScript statements!")
|
||||
print("\nFirst 3 JS statements:")
|
||||
for stmt in result.js_code[:3]:
|
||||
print(f" • {stmt}")
|
||||
|
||||
# Use with crawler
|
||||
config = CrawlerRunConfig(
|
||||
c4a_script=c4a_script, # Pass C4A script directly
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
print("\n✅ Script ready for use with AsyncWebCrawler!")
|
||||
else:
|
||||
print(f"❌ Compilation error: {result.first_error.message}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos"""
|
||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||
print("=" * 60)
|
||||
|
||||
demos = [
|
||||
("Link Preview & Scoring", demo_link_preview),
|
||||
("Adaptive Crawling", demo_adaptive_crawling),
|
||||
("Virtual Scroll", demo_virtual_scroll),
|
||||
("URL Seeder", demo_url_seeder),
|
||||
("C4A Script", demo_c4a_script),
|
||||
]
|
||||
|
||||
for name, demo_func in demos:
|
||||
try:
|
||||
await demo_func()
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error in {name} demo: {str(e)}")
|
||||
|
||||
# Pause between demos
|
||||
await asyncio.sleep(1)
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("✅ All demos completed!")
|
||||
print("\nKey Takeaways:")
|
||||
print("• Link Preview: 3-layer scoring for intelligent link analysis")
|
||||
print("• Adaptive Crawling: Stop when you have enough information")
|
||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||
print("• C4A Script: Simple language for complex automations")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,4 +1,4 @@
|
||||
site_name: Crawl4AI Documentation (v0.6.x)
|
||||
site_name: Crawl4AI Documentation (v0.7.x)
|
||||
site_favicon: docs/md_v2/favicon.ico
|
||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
@@ -25,6 +25,8 @@ nav:
|
||||
- "Command Line Interface": "core/cli.md"
|
||||
- "Simple Crawling": "core/simple-crawling.md"
|
||||
- "Deep Crawling": "core/deep-crawling.md"
|
||||
- "Adaptive Crawling": "core/adaptive-crawling.md"
|
||||
- "URL Seeding": "core/url-seeding.md"
|
||||
- "C4A-Script": "core/c4a-script.md"
|
||||
- "Crawler Result": "core/crawler-result.md"
|
||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||
@@ -37,17 +39,20 @@ nav:
|
||||
- "Link & Media": "core/link-media.md"
|
||||
- Advanced:
|
||||
- "Overview": "advanced/advanced-features.md"
|
||||
- "Adaptive Strategies": "advanced/adaptive-strategies.md"
|
||||
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
||||
- "File Downloading": "advanced/file-downloading.md"
|
||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||
- "Hooks & Auth": "advanced/hooks-auth.md"
|
||||
- "Proxy & Security": "advanced/proxy-security.md"
|
||||
- "Undetected Browser": "advanced/undetected-browser.md"
|
||||
- "Session Management": "advanced/session-management.md"
|
||||
- "Multi-URL Crawling": "advanced/multi-url-crawling.md"
|
||||
- "Crawl Dispatcher": "advanced/crawl-dispatcher.md"
|
||||
- "Identity Based Crawling": "advanced/identity-based-crawling.md"
|
||||
- "SSL Certificate": "advanced/ssl-certificate.md"
|
||||
- "Network & Console Capture": "advanced/network-console-capture.md"
|
||||
- "PDF Parsing": "advanced/pdf-parsing.md"
|
||||
- Extraction:
|
||||
- "LLM-Free Strategies": "extraction/no-llm-strategies.md"
|
||||
- "LLM Strategies": "extraction/llm-strategies.md"
|
||||
|
||||
@@ -13,39 +13,37 @@ authors = [
|
||||
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
||||
]
|
||||
dependencies = [
|
||||
"aiofiles>=24.1.0",
|
||||
"aiohttp>=3.11.11",
|
||||
"aiosqlite~=0.20",
|
||||
"anyio>=4.0.0",
|
||||
"lxml~=5.3",
|
||||
"litellm>=1.53.1",
|
||||
"numpy>=1.26.0,<3",
|
||||
"pillow~=10.4",
|
||||
"pillow>=10.4",
|
||||
"playwright>=1.49.0",
|
||||
"patchright>=1.49.0",
|
||||
"python-dotenv~=1.0",
|
||||
"requests~=2.26",
|
||||
"beautifulsoup4~=4.12",
|
||||
"tf-playwright-stealth>=1.1.0",
|
||||
"xxhash~=3.4",
|
||||
"rank-bm25~=0.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"snowballstemmer~=2.2",
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
"psutil>=6.1.1",
|
||||
"PyYAML>=6.0",
|
||||
"nltk>=3.9.1",
|
||||
"playwright",
|
||||
"aiofiles",
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx>=0.27.2",
|
||||
"httpx[http2]>=0.27.2",
|
||||
"fake-useragent>=2.0.3",
|
||||
"click>=8.1.7",
|
||||
"pyperclip>=1.8.2",
|
||||
"chardet>=5.2.0",
|
||||
"aiohttp>=3.11.11",
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
"lark>=1.2.2",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"alphashape>=1.3.1",
|
||||
"shapely>=2.0.0"
|
||||
]
|
||||
@@ -63,8 +61,8 @@ classifiers = [
|
||||
[project.optional-dependencies]
|
||||
pdf = ["PyPDF2"]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"PyPDF2",
|
||||
@@ -73,8 +71,8 @@ all = [
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium",
|
||||
"PyPDF2"
|
||||
"sentence-transformers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -1,29 +1,35 @@
|
||||
# Note: These requirements are also specified in pyproject.toml
|
||||
# This file is kept for development environment setup and compatibility
|
||||
aiofiles>=24.1.0
|
||||
aiohttp>=3.11.11
|
||||
aiosqlite~=0.20
|
||||
anyio>=4.0.0
|
||||
lxml~=5.3
|
||||
litellm>=1.53.1
|
||||
numpy>=1.26.0,<3
|
||||
pillow~=10.4
|
||||
pillow>=10.4
|
||||
playwright>=1.49.0
|
||||
patchright>=1.49.0
|
||||
python-dotenv~=1.0
|
||||
requests~=2.26
|
||||
beautifulsoup4~=4.12
|
||||
tf-playwright-stealth>=1.1.0
|
||||
xxhash~=3.4
|
||||
rank-bm25~=0.2
|
||||
aiofiles>=24.1.0
|
||||
colorama~=0.4
|
||||
snowballstemmer~=2.2
|
||||
pydantic>=2.10
|
||||
pyOpenSSL>=24.3.0
|
||||
psutil>=6.1.1
|
||||
PyYAML>=6.0
|
||||
nltk>=3.9.1
|
||||
rich>=13.9.4
|
||||
cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
sentence-transformers>=2.2.0
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
fake-useragent>=2.2.0
|
||||
pdf2image>=1.17.0
|
||||
PyPDF2>=3.0.1
|
||||
@@ -12,11 +12,8 @@ parent_dir = os.path.dirname(
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import (
|
||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
||||
)
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,8 +29,8 @@ class TestResult:
|
||||
|
||||
class StrategyTester:
|
||||
def __init__(self):
|
||||
self.new_scraper = WebScrapingStrategy()
|
||||
self.current_scraper = WebScrapingStrategyCurrent()
|
||||
self.new_scraper = LXMLWebScrapingStrategy()
|
||||
self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now
|
||||
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
||||
self.WIKI_HTML = f.read()
|
||||
self.results = {"new": [], "current": []}
|
||||
|
||||
344
tests/check_dependencies.py
Executable file
344
tests/check_dependencies.py
Executable file
@@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dependency checker for Crawl4AI
|
||||
Analyzes imports in the codebase and shows which files use them
|
||||
"""
|
||||
|
||||
import ast
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Set, Dict, List, Tuple
|
||||
from collections import defaultdict
|
||||
import re
|
||||
import toml
|
||||
|
||||
# Standard library modules to ignore
|
||||
STDLIB_MODULES = {
|
||||
'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib',
|
||||
'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 'glob', 'hashlib',
|
||||
'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes',
|
||||
'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random',
|
||||
're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys',
|
||||
'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib',
|
||||
'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets',
|
||||
'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser',
|
||||
'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc',
|
||||
'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource',
|
||||
'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2',
|
||||
'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit',
|
||||
'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib',
|
||||
'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache',
|
||||
'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter',
|
||||
'__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref',
|
||||
'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions',
|
||||
'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput',
|
||||
'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil',
|
||||
'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip',
|
||||
'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib',
|
||||
'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse',
|
||||
'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes',
|
||||
'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue',
|
||||
'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap',
|
||||
'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu',
|
||||
'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http',
|
||||
'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib',
|
||||
'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau',
|
||||
'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext',
|
||||
'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest',
|
||||
'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp',
|
||||
'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace',
|
||||
'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi',
|
||||
'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse',
|
||||
'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib',
|
||||
'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast'
|
||||
}
|
||||
|
||||
# Known package name mappings (import name -> package name)
|
||||
PACKAGE_MAPPINGS = {
|
||||
'bs4': 'beautifulsoup4',
|
||||
'PIL': 'pillow',
|
||||
'cv2': 'opencv-python',
|
||||
'sklearn': 'scikit-learn',
|
||||
'yaml': 'PyYAML',
|
||||
'OpenSSL': 'pyOpenSSL',
|
||||
'sqlalchemy': 'SQLAlchemy',
|
||||
'playwright': 'playwright',
|
||||
'patchright': 'patchright',
|
||||
'dotenv': 'python-dotenv',
|
||||
'fake_useragent': 'fake-useragent',
|
||||
'playwright_stealth': 'tf-playwright-stealth',
|
||||
'sentence_transformers': 'sentence-transformers',
|
||||
'rank_bm25': 'rank-bm25',
|
||||
'snowballstemmer': 'snowballstemmer',
|
||||
'PyPDF2': 'PyPDF2',
|
||||
'pdf2image': 'pdf2image',
|
||||
}
|
||||
|
||||
|
||||
class ImportVisitor(ast.NodeVisitor):
|
||||
"""AST visitor to extract imports from Python files"""
|
||||
|
||||
def __init__(self):
|
||||
self.imports = {} # Changed to dict to store line numbers
|
||||
self.from_imports = {}
|
||||
|
||||
def visit_Import(self, node):
|
||||
for alias in node.names:
|
||||
module_name = alias.name.split('.')[0]
|
||||
if module_name not in self.imports:
|
||||
self.imports[module_name] = []
|
||||
self.imports[module_name].append(node.lineno)
|
||||
|
||||
def visit_ImportFrom(self, node):
|
||||
if node.module and node.level == 0: # absolute imports only
|
||||
module_name = node.module.split('.')[0]
|
||||
if module_name not in self.from_imports:
|
||||
self.from_imports[module_name] = []
|
||||
self.from_imports[module_name].append(node.lineno)
|
||||
|
||||
|
||||
def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]:
|
||||
"""Extract all imports from a Python file with line numbers"""
|
||||
all_imports = {}
|
||||
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
tree = ast.parse(content)
|
||||
visitor = ImportVisitor()
|
||||
visitor.visit(tree)
|
||||
|
||||
# Merge imports and from_imports
|
||||
for module, lines in visitor.imports.items():
|
||||
if module not in all_imports:
|
||||
all_imports[module] = []
|
||||
all_imports[module].extend(lines)
|
||||
|
||||
for module, lines in visitor.from_imports.items():
|
||||
if module not in all_imports:
|
||||
all_imports[module] = []
|
||||
all_imports[module].extend(lines)
|
||||
|
||||
except Exception as e:
|
||||
# Silently skip files that can't be parsed
|
||||
pass
|
||||
|
||||
return all_imports
|
||||
|
||||
|
||||
def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]:
|
||||
"""Get all imports from the crawl4ai library and docs folders with file locations and line numbers"""
|
||||
import_to_files = defaultdict(list)
|
||||
|
||||
# Only scan crawl4ai library folder and docs folder
|
||||
target_dirs = [
|
||||
root_dir / 'crawl4ai',
|
||||
root_dir / 'docs'
|
||||
]
|
||||
|
||||
for target_dir in target_dirs:
|
||||
if not target_dir.exists():
|
||||
continue
|
||||
|
||||
for py_file in target_dir.rglob('*.py'):
|
||||
# Skip __pycache__ directories
|
||||
if '__pycache__' in py_file.parts:
|
||||
continue
|
||||
|
||||
# Skip setup.py and similar files
|
||||
if py_file.name in ['setup.py', 'setup.cfg', 'conf.py']:
|
||||
continue
|
||||
|
||||
imports = extract_imports_from_file(py_file)
|
||||
|
||||
# Map each import to the file and line numbers
|
||||
for imp, line_numbers in imports.items():
|
||||
relative_path = py_file.relative_to(root_dir)
|
||||
import_to_files[imp].append((str(relative_path), sorted(line_numbers)))
|
||||
|
||||
return dict(import_to_files)
|
||||
|
||||
|
||||
def get_declared_dependencies() -> Set[str]:
|
||||
"""Get declared dependencies from pyproject.toml and requirements.txt"""
|
||||
declared = set()
|
||||
|
||||
# Read from pyproject.toml
|
||||
if Path('pyproject.toml').exists():
|
||||
with open('pyproject.toml', 'r') as f:
|
||||
data = toml.load(f)
|
||||
|
||||
# Get main dependencies
|
||||
deps = data.get('project', {}).get('dependencies', [])
|
||||
for dep in deps:
|
||||
# Parse dependency string (e.g., "numpy>=1.26.0,<3")
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
# Get optional dependencies
|
||||
optional = data.get('project', {}).get('optional-dependencies', {})
|
||||
for group, deps in optional.items():
|
||||
for dep in deps:
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
# Also check requirements.txt as backup
|
||||
if Path('requirements.txt').exists():
|
||||
with open('requirements.txt', 'r') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', line)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
return declared
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""Normalize package name for comparison"""
|
||||
# Handle known mappings first
|
||||
if name in PACKAGE_MAPPINGS:
|
||||
return PACKAGE_MAPPINGS[name].lower()
|
||||
|
||||
# Basic normalization
|
||||
return name.lower().replace('_', '-')
|
||||
|
||||
|
||||
def check_missing_dependencies():
|
||||
"""Main function to check for missing dependencies"""
|
||||
print("🔍 Analyzing crawl4ai library and docs folders...\n")
|
||||
|
||||
# Get all imports with their file locations
|
||||
root_dir = Path('.')
|
||||
import_to_files = get_codebase_imports_with_files(root_dir)
|
||||
|
||||
# Get declared dependencies
|
||||
declared_deps = get_declared_dependencies()
|
||||
|
||||
# Normalize declared dependencies
|
||||
normalized_declared = {normalize_package_name(dep) for dep in declared_deps}
|
||||
|
||||
# Categorize imports
|
||||
external_imports = {}
|
||||
local_imports = {}
|
||||
|
||||
# Known local packages
|
||||
local_packages = {'crawl4ai'}
|
||||
|
||||
for imp, file_info in import_to_files.items():
|
||||
# Skip standard library
|
||||
if imp in STDLIB_MODULES:
|
||||
continue
|
||||
|
||||
# Check if it's a local import
|
||||
if any(imp.startswith(local) for local in local_packages):
|
||||
local_imports[imp] = file_info
|
||||
else:
|
||||
external_imports[imp] = file_info
|
||||
|
||||
# Check which external imports are not declared
|
||||
not_declared = {}
|
||||
declared_imports = {}
|
||||
|
||||
for imp, file_info in external_imports.items():
|
||||
normalized_imp = normalize_package_name(imp)
|
||||
|
||||
# Check if import is covered by declared dependencies
|
||||
found = False
|
||||
for declared in normalized_declared:
|
||||
if normalized_imp == declared or normalized_imp.startswith(declared + '.') or declared.startswith(normalized_imp):
|
||||
found = True
|
||||
break
|
||||
|
||||
if found:
|
||||
declared_imports[imp] = file_info
|
||||
else:
|
||||
not_declared[imp] = file_info
|
||||
|
||||
# Print results
|
||||
print(f"📊 Summary:")
|
||||
print(f" - Total unique imports: {len(import_to_files)}")
|
||||
print(f" - External imports: {len(external_imports)}")
|
||||
print(f" - Declared dependencies: {len(declared_deps)}")
|
||||
print(f" - External imports NOT in dependencies: {len(not_declared)}\n")
|
||||
|
||||
if not_declared:
|
||||
print("❌ External imports NOT declared in pyproject.toml or requirements.txt:\n")
|
||||
|
||||
# Sort by import name
|
||||
for imp in sorted(not_declared.keys()):
|
||||
file_info = not_declared[imp]
|
||||
print(f" 📦 {imp}")
|
||||
if imp in PACKAGE_MAPPINGS:
|
||||
print(f" → Package name: {PACKAGE_MAPPINGS[imp]}")
|
||||
|
||||
# Show up to 3 files that use this import
|
||||
for i, (file_path, line_numbers) in enumerate(file_info[:3]):
|
||||
# Format line numbers for clickable output
|
||||
if len(line_numbers) == 1:
|
||||
print(f" - {file_path}:{line_numbers[0]}")
|
||||
else:
|
||||
# Show first few line numbers
|
||||
line_str = ','.join(str(ln) for ln in line_numbers[:3])
|
||||
if len(line_numbers) > 3:
|
||||
line_str += f"... ({len(line_numbers)} imports)"
|
||||
print(f" - {file_path}: lines {line_str}")
|
||||
|
||||
if len(file_info) > 3:
|
||||
print(f" ... and {len(file_info) - 3} more files")
|
||||
print()
|
||||
|
||||
# Check for potentially unused dependencies
|
||||
print("\n🔎 Checking declared dependencies usage...\n")
|
||||
|
||||
# Get all used external packages
|
||||
used_packages = set()
|
||||
for imp in external_imports.keys():
|
||||
normalized = normalize_package_name(imp)
|
||||
used_packages.add(normalized)
|
||||
|
||||
# Find unused
|
||||
unused = []
|
||||
for dep in declared_deps:
|
||||
normalized_dep = normalize_package_name(dep)
|
||||
|
||||
# Check if any import uses this dependency
|
||||
found_usage = False
|
||||
for used in used_packages:
|
||||
if used == normalized_dep or used.startswith(normalized_dep) or normalized_dep.startswith(used):
|
||||
found_usage = True
|
||||
break
|
||||
|
||||
if not found_usage:
|
||||
# Some packages are commonly unused directly
|
||||
indirect_deps = {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'}
|
||||
if normalized_dep not in indirect_deps:
|
||||
unused.append(dep)
|
||||
|
||||
if unused:
|
||||
print("⚠️ Declared dependencies with NO imports found:")
|
||||
for dep in sorted(unused):
|
||||
print(f" - {dep}")
|
||||
print("\n Note: These might be used indirectly or by other dependencies")
|
||||
else:
|
||||
print("✅ All declared dependencies have corresponding imports")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("💡 How to use this report:")
|
||||
print(" 1. Check each ❌ import to see if it's legitimate")
|
||||
print(" 2. If legitimate, add the package to pyproject.toml")
|
||||
print(" 3. If it's an internal module or typo, fix the import")
|
||||
print(" 4. Review unused dependencies - remove if truly not needed")
|
||||
print("="*60)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
check_missing_dependencies()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user