Compare commits
42 Commits
fix/releas
...
fix/exit_w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0541b61405 | ||
|
|
6735c68288 | ||
|
|
ff6ea41ac3 | ||
|
|
31a435fb0e | ||
|
|
5de6a28055 | ||
|
|
de1561ad14 | ||
|
|
337b588732 | ||
|
|
7a6ad547f0 | ||
|
|
e6692b987d | ||
|
|
307fe28b32 | ||
|
|
438a103b17 | ||
|
|
a03e68fa2f | ||
|
|
864d87afb2 | ||
|
|
508b6fc233 | ||
|
|
e3281935bc | ||
|
|
48647300b4 | ||
|
|
9f9ea3bb3b | ||
|
|
d58b93c207 | ||
|
|
e2b4705010 | ||
|
|
4a1abd5086 | ||
|
|
04258cd4f2 | ||
|
|
84e462d9f8 | ||
|
|
9546773a07 | ||
|
|
66a979ad11 | ||
|
|
0c31e91b53 | ||
|
|
1b6a31f88f | ||
|
|
b8c261780f | ||
|
|
db6ad7a79d | ||
|
|
004d514f33 | ||
|
|
3a9e2c716e | ||
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
bde1bba6a2 | ||
|
|
ee25c771d8 | ||
|
|
c4d625fb3c | ||
|
|
ef722766f0 | ||
|
|
4bcb7171a3 |
142
.github/workflows/release.yml
vendored
Normal file
142
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Release pipeline: triggered by pushing a `v*` tag. Verifies that the tag
# matches the package version, publishes the package to PyPI, pushes
# multi-arch Docker images, and creates the GitHub Release.
#
# Values derived from the tag name are handed to shell steps through `env:`
# instead of being interpolated with `${{ }}` inside `run:` scripts, so a
# crafted tag name cannot inject shell commands.
name: Release Pipeline

on:
  push:
    tags:
      - 'v*'
      - '!test-v*'  # Exclude test tags

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # Required for creating releases

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      # Strip the `refs/tags/v` prefix so VERSION is the bare version string.
      - name: Extract version from tag
        id: get_version
        run: |
          TAG_VERSION="${GITHUB_REF#refs/tags/v}"
          echo "VERSION=$TAG_VERSION" >> "$GITHUB_OUTPUT"
          echo "Releasing version: $TAG_VERSION"

      # The package must be importable for the version check below.
      - name: Install package dependencies
        run: |
          pip install -e .

      # Fail early, before anything is published, if the tag and
      # crawl4ai/__version__.py disagree.
      - name: Check version consistency
        env:
          TAG_VERSION: ${{ steps.get_version.outputs.VERSION }}
        run: |
          PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")

          echo "Tag version: $TAG_VERSION"
          echo "Package version: $PACKAGE_VERSION"

          if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
            echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
            echo "Please update crawl4ai/__version__.py to match the tag version"
            exit 1
          fi
          echo "✅ Version check passed: $TAG_VERSION"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: python -m build

      - name: Check package
        run: twine check dist/*

      - name: Upload to PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          echo "📦 Uploading to PyPI..."
          twine upload dist/*
          echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      # MAJOR is the first dot-separated field; MINOR is the first two
      # (e.g. 1.2.3 -> MAJOR=1, MINOR=1.2). Used as floating Docker tags.
      - name: Extract major and minor versions
        id: versions
        env:
          VERSION: ${{ steps.get_version.outputs.VERSION }}
        run: |
          MAJOR=$(echo "$VERSION" | cut -d. -f1)
          MINOR=$(echo "$VERSION" | cut -d. -f1-2)
          echo "MAJOR=$MAJOR" >> "$GITHUB_OUTPUT"
          echo "MINOR=$MINOR" >> "$GITHUB_OUTPUT"

      - name: Build and push Docker images
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: |
            unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
            unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
            unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
            unclecode/crawl4ai:latest
          platforms: linux/amd64,linux/arm64

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: v${{ steps.get_version.outputs.VERSION }}
          name: Release v${{ steps.get_version.outputs.VERSION }}
          body: |
            ## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!

            ### 📦 Installation

            **PyPI:**
            ```bash
            pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
            ```

            **Docker:**
            ```bash
            docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
            docker pull unclecode/crawl4ai:latest
            ```

            ### 📝 What's Changed
            See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
          draft: false
          prerelease: false
          token: ${{ secrets.GITHUB_TOKEN }}

      # Writes a Markdown recap to the job summary. GITHUB_REPOSITORY is the
      # runner-provided equivalent of `github.repository`.
      - name: Summary
        env:
          VERSION: ${{ steps.get_version.outputs.VERSION }}
          MINOR: ${{ steps.versions.outputs.MINOR }}
          MAJOR: ${{ steps.versions.outputs.MAJOR }}
        run: |
          {
            echo "## 🚀 Release Complete!"
            echo ""
            echo "### 📦 PyPI Package"
            echo "- Version: ${VERSION}"
            echo "- URL: https://pypi.org/project/crawl4ai/"
            echo "- Install: \`pip install crawl4ai==${VERSION}\`"
            echo ""
            echo "### 🐳 Docker Images"
            echo "- \`unclecode/crawl4ai:${VERSION}\`"
            echo "- \`unclecode/crawl4ai:${MINOR}\`"
            echo "- \`unclecode/crawl4ai:${MAJOR}\`"
            echo "- \`unclecode/crawl4ai:latest\`"
            echo ""
            echo "### 📋 GitHub Release"
            echo "https://github.com/${GITHUB_REPOSITORY}/releases/tag/v${VERSION}"
          } >> "$GITHUB_STEP_SUMMARY"
|
||||||
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# Test release pipeline: triggered by pushing a `test-v*` tag. Mirrors the
# real release flow but publishes to Test PyPI and to `test-` prefixed Docker
# tags, then prints cleanup commands in the job summary.
#
# Values derived from the tag name are handed to shell steps through `env:`
# instead of being interpolated with `${{ }}` inside `run:` scripts, so a
# crafted tag name cannot inject shell commands.
name: Test Release Pipeline

on:
  push:
    tags:
      - 'test-v*'

jobs:
  test-release:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      # Strip the `refs/tags/test-v` prefix so VERSION is the bare version.
      - name: Extract version from tag
        id: get_version
        run: |
          TAG_VERSION="${GITHUB_REF#refs/tags/test-v}"
          echo "VERSION=$TAG_VERSION" >> "$GITHUB_OUTPUT"
          echo "Testing with version: $TAG_VERSION"

      # The package must be importable for the version check below.
      - name: Install package dependencies
        run: |
          pip install -e .

      # Fail early if the tag and crawl4ai/__version__.py disagree.
      - name: Check version consistency
        env:
          TAG_VERSION: ${{ steps.get_version.outputs.VERSION }}
        run: |
          PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")

          echo "Tag version: $TAG_VERSION"
          echo "Package version: $PACKAGE_VERSION"

          if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
            echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
            echo "Please update crawl4ai/__version__.py to match the tag version"
            exit 1
          fi
          echo "✅ Version check passed: $TAG_VERSION"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: python -m build

      - name: Check package
        run: twine check dist/*

      # Best-effort upload: an exit status of 1 from twine is tolerated so a
      # re-run with an already-published test version does not fail the job.
      # NOTE(review): any other upload failure that also exits with status 1
      # is tolerated too -- confirm that is acceptable for test runs.
      - name: Upload to Test PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
        run: |
          echo "📦 Uploading to Test PyPI..."
          twine upload --repository testpypi dist/* || {
            if [ $? -eq 1 ]; then
              echo "⚠️ Upload failed - likely version already exists on Test PyPI"
              echo "Continuing anyway for test purposes..."
            else
              exit 1
            fi
          }
          echo "✅ Test PyPI step complete"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Docker test images
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: |
            unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
            unclecode/crawl4ai:test-latest
          platforms: linux/amd64,linux/arm64
          cache-from: type=gha
          cache-to: type=gha,mode=max

      # Writes a Markdown recap, including cleanup commands, to the job summary.
      - name: Summary
        env:
          VERSION: ${{ steps.get_version.outputs.VERSION }}
        run: |
          {
            echo "## 🎉 Test Release Complete!"
            echo ""
            echo "### 📦 Test PyPI Package"
            echo "- Version: ${VERSION}"
            echo "- URL: https://test.pypi.org/project/crawl4ai/"
            echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${VERSION}\`"
            echo ""
            echo "### 🐳 Docker Test Images"
            echo "- \`unclecode/crawl4ai:test-${VERSION}\`"
            echo "- \`unclecode/crawl4ai:test-latest\`"
            echo ""
            echo "### 🧹 Cleanup Commands"
            echo "\`\`\`bash"
            echo "# Remove test tag"
            echo "git tag -d test-v${VERSION}"
            echo "git push origin :test-v${VERSION}"
            echo ""
            echo "# Remove Docker test images"
            echo "docker rmi unclecode/crawl4ai:test-${VERSION}"
            echo "docker rmi unclecode/crawl4ai:test-latest"
            echo "\`\`\`"
          } >> "$GITHUB_STEP_SUMMARY"
|
||||||
15
CHANGELOG.md
15
CHANGELOG.md
@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **Flexible LLM Provider Configuration** (Docker):
|
||||||
|
- Support for `LLM_PROVIDER` environment variable to override default provider
|
||||||
|
- Per-request provider override via optional `provider` parameter in API endpoints
|
||||||
|
- Automatic provider validation with clear error messages
|
||||||
|
- Updated Docker documentation and examples
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
|
||||||
|
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
|
||||||
|
- Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
|
||||||
|
- `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
|
||||||
|
- All existing code using `WebScrapingStrategy` continues to work without modification
|
||||||
|
- Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
|
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
|
||||||
- Discover URLs from sitemaps and Common Crawl index
|
- Discover URLs from sitemaps and Common Crawl index
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
|||||||
|
|
||||||
[✨ Check out latest update v0.7.0](#-recent-updates)
|
[✨ Check out latest update v0.7.0](#-recent-updates)
|
||||||
|
|
||||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
|
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||||
|
|||||||
@@ -3,12 +3,12 @@ import warnings
|
|||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
WebScrapingStrategy,
|
|
||||||
LXMLWebScrapingStrategy,
|
LXMLWebScrapingStrategy,
|
||||||
|
WebScrapingStrategy, # Backward compatibility alias
|
||||||
)
|
)
|
||||||
from .async_logger import (
|
from .async_logger import (
|
||||||
AsyncLoggerBase,
|
AsyncLoggerBase,
|
||||||
@@ -132,6 +132,7 @@ __all__ = [
|
|||||||
"CrawlResult",
|
"CrawlResult",
|
||||||
"CrawlerHub",
|
"CrawlerHub",
|
||||||
"CacheMode",
|
"CacheMode",
|
||||||
|
"MatchMode",
|
||||||
"ContentScrapingStrategy",
|
"ContentScrapingStrategy",
|
||||||
"WebScrapingStrategy",
|
"WebScrapingStrategy",
|
||||||
"LXMLWebScrapingStrategy",
|
"LXMLWebScrapingStrategy",
|
||||||
@@ -173,6 +174,7 @@ __all__ = [
|
|||||||
"CompilationResult",
|
"CompilationResult",
|
||||||
"ValidationResult",
|
"ValidationResult",
|
||||||
"ErrorDetail",
|
"ErrorDetail",
|
||||||
|
"LinkPreviewConfig"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.0"
|
__version__ = "0.7.2"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -18,17 +18,24 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
|||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
|
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
|
||||||
from .deep_crawling import DeepCrawlStrategy
|
from .deep_crawling import DeepCrawlStrategy
|
||||||
|
|
||||||
from .cache_context import CacheMode
|
from .cache_context import CacheMode
|
||||||
from .proxy_strategy import ProxyRotationStrategy
|
from .proxy_strategy import ProxyRotationStrategy
|
||||||
|
|
||||||
from typing import Union, List
|
from typing import Union, List, Callable
|
||||||
import inspect
|
import inspect
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
|
# Type alias for URL matching
|
||||||
|
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
|
||||||
|
|
||||||
|
class MatchMode(Enum):
    """Selects how the results of a list of URL matchers are combined."""

    # A URL matches when at least one matcher in the list accepts it.
    OR = "or"

    # A URL matches only when every matcher in the list accepts it.
    AND = "and"
|
||||||
|
|
||||||
# from .proxy_strategy import ProxyConfig
|
# from .proxy_strategy import ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -862,7 +869,7 @@ class CrawlerRunConfig():
|
|||||||
parser_type (str): Type of parser to use for HTML parsing.
|
parser_type (str): Type of parser to use for HTML parsing.
|
||||||
Default: "lxml".
|
Default: "lxml".
|
||||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||||
Default: WebScrapingStrategy.
|
Default: LXMLWebScrapingStrategy.
|
||||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
If None, no additional proxy config. Default: None.
|
If None, no additional proxy config. Default: None.
|
||||||
|
|
||||||
@@ -1113,6 +1120,9 @@ class CrawlerRunConfig():
|
|||||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||||
# Virtual Scroll Parameters
|
# Virtual Scroll Parameters
|
||||||
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
||||||
|
# URL Matching Parameters
|
||||||
|
url_matcher: Optional[UrlMatcher] = None,
|
||||||
|
match_mode: MatchMode = MatchMode.OR,
|
||||||
# Experimental Parameters
|
# Experimental Parameters
|
||||||
experimental: Dict[str, Any] = None,
|
experimental: Dict[str, Any] = None,
|
||||||
):
|
):
|
||||||
@@ -1266,6 +1276,10 @@ class CrawlerRunConfig():
|
|||||||
else:
|
else:
|
||||||
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
||||||
|
|
||||||
|
# URL Matching Parameters
|
||||||
|
self.url_matcher = url_matcher
|
||||||
|
self.match_mode = match_mode
|
||||||
|
|
||||||
# Experimental Parameters
|
# Experimental Parameters
|
||||||
self.experimental = experimental or {}
|
self.experimental = experimental or {}
|
||||||
|
|
||||||
@@ -1322,6 +1336,51 @@ class CrawlerRunConfig():
|
|||||||
raise ValueError(f"Failed to compile C4A script: {str(e)}")
|
raise ValueError(f"Failed to compile C4A script: {str(e)}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
def is_match(self, url: str) -> bool:
    """Decide whether this config applies to ``url``.

    ``self.url_matcher`` may be ``None``, a callable, a glob pattern string,
    or a list mixing callables and pattern strings.

    Args:
        url: The URL to test against this config's matcher.

    Returns:
        ``True`` when no matcher is set. For a single callable, whatever the
        callable returns. For a single string, the ``fnmatch`` result. For a
        list, the per-entry results combined according to ``self.match_mode``
        (``any`` for ``MatchMode.OR``, ``all`` otherwise). ``False`` for an
        empty list, for a list with no usable entries, and for any other
        matcher type.
    """
    from fnmatch import fnmatch

    matcher_spec = self.url_matcher

    # No matcher configured: the config applies to every URL.
    if matcher_spec is None:
        return True

    # A lone predicate: its return value is passed through unchanged.
    if callable(matcher_spec):
        return matcher_spec(url)

    # A lone glob pattern.
    if isinstance(matcher_spec, str):
        return fnmatch(url, matcher_spec)

    # Anything that is not a list at this point is unsupported.
    if not isinstance(matcher_spec, list):
        return False

    # An empty list never matches.
    if not matcher_spec:
        return False

    # Evaluate every entry (no short-circuiting); entries that are neither
    # callables nor strings are ignored.
    outcomes = []
    for candidate in matcher_spec:
        if callable(candidate):
            outcomes.append(candidate(url))
        elif isinstance(candidate, str):
            outcomes.append(fnmatch(url, candidate))

    combine = any if self.match_mode == MatchMode.OR else all
    if not outcomes:
        return False
    return combine(outcomes)
|
||||||
|
|
||||||
|
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
"""Handle attribute access."""
|
"""Handle attribute access."""
|
||||||
@@ -1443,6 +1502,9 @@ class CrawlerRunConfig():
|
|||||||
# Link Extraction Parameters
|
# Link Extraction Parameters
|
||||||
link_preview_config=kwargs.get("link_preview_config"),
|
link_preview_config=kwargs.get("link_preview_config"),
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
|
# URL Matching Parameters
|
||||||
|
url_matcher=kwargs.get("url_matcher"),
|
||||||
|
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||||
# Experimental Parameters
|
# Experimental Parameters
|
||||||
experimental=kwargs.get("experimental"),
|
experimental=kwargs.get("experimental"),
|
||||||
)
|
)
|
||||||
@@ -1540,6 +1602,8 @@ class CrawlerRunConfig():
|
|||||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||||
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
||||||
"url": self.url,
|
"url": self.url,
|
||||||
|
"url_matcher": self.url_matcher,
|
||||||
|
"match_mode": self.match_mode,
|
||||||
"experimental": self.experimental,
|
"experimental": self.experimental,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
except Error:
|
except Error:
|
||||||
visibility_info = await self.check_visibility(page)
|
visibility_info = await self.check_visibility(page)
|
||||||
|
|
||||||
if self.browser_config.config.verbose:
|
if self.browser_config.verbose:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
message="Body visibility info: {info}",
|
message="Body visibility info: {info}",
|
||||||
tag="DEBUG",
|
tag="DEBUG",
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from typing import Dict, Optional, List, Tuple
|
from typing import Dict, Optional, List, Tuple, Union
|
||||||
from .async_configs import CrawlerRunConfig
|
from .async_configs import CrawlerRunConfig
|
||||||
from .models import (
|
from .models import (
|
||||||
CrawlResult,
|
CrawlResult,
|
||||||
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
|
|||||||
import random
|
import random
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
|
|
||||||
|
from .memory_utils import get_true_memory_usage_percent
|
||||||
|
|
||||||
|
|
||||||
class RateLimiter:
|
class RateLimiter:
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -96,11 +98,37 @@ class BaseDispatcher(ABC):
|
|||||||
self.rate_limiter = rate_limiter
|
self.rate_limiter = rate_limiter
|
||||||
self.monitor = monitor
|
self.monitor = monitor
|
||||||
|
|
||||||
|
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
    """Pick the config that should be used to crawl ``url``.

    Args:
        url: The URL to match against.
        configs: Either one config or a list of candidate configs.

    Returns:
        The single config unchanged when ``configs`` is a ``CrawlerRunConfig``
        (its matcher is not consulted). Otherwise the first list entry whose
        ``is_match(url)`` is truthy, or ``None`` when the list is empty or
        nothing matches, which signals that the URL should be skipped.
    """
    # A lone config is used as is.
    if isinstance(configs, CrawlerRunConfig):
        return configs

    # First match wins; an empty or falsy list yields None.
    return next(
        (candidate for candidate in (configs or ()) if candidate.is_match(url)),
        None,
    )
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def crawl_url(
|
async def crawl_url(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
task_id: str,
|
task_id: str,
|
||||||
monitor: Optional[CrawlerMonitor] = None,
|
monitor: Optional[CrawlerMonitor] = None,
|
||||||
) -> CrawlerTaskResult:
|
) -> CrawlerTaskResult:
|
||||||
@@ -111,7 +139,7 @@ class BaseDispatcher(ABC):
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
crawler: AsyncWebCrawler, # noqa: F821
|
crawler: AsyncWebCrawler, # noqa: F821
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
monitor: Optional[CrawlerMonitor] = None,
|
monitor: Optional[CrawlerMonitor] = None,
|
||||||
) -> List[CrawlerTaskResult]:
|
) -> List[CrawlerTaskResult]:
|
||||||
pass
|
pass
|
||||||
@@ -147,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
async def _memory_monitor_task(self):
|
async def _memory_monitor_task(self):
|
||||||
"""Background task to continuously monitor memory usage and update state"""
|
"""Background task to continuously monitor memory usage and update state"""
|
||||||
while True:
|
while True:
|
||||||
self.current_memory_percent = psutil.virtual_memory().percent
|
self.current_memory_percent = get_true_memory_usage_percent()
|
||||||
|
|
||||||
# Enter memory pressure mode if we cross the threshold
|
# Enter memory pressure mode if we cross the threshold
|
||||||
if self.current_memory_percent >= self.memory_threshold_percent:
|
if self.current_memory_percent >= self.memory_threshold_percent:
|
||||||
@@ -200,7 +228,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
async def crawl_url(
|
async def crawl_url(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
task_id: str,
|
task_id: str,
|
||||||
retry_count: int = 0,
|
retry_count: int = 0,
|
||||||
) -> CrawlerTaskResult:
|
) -> CrawlerTaskResult:
|
||||||
@@ -208,6 +236,37 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
error_message = ""
|
error_message = ""
|
||||||
memory_usage = peak_memory = 0.0
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
# Select appropriate config for this URL
|
||||||
|
selected_config = self.select_config(url, config)
|
||||||
|
|
||||||
|
# If no config matches, return failed result
|
||||||
|
if selected_config is None:
|
||||||
|
error_message = f"No matching configuration found for URL: {url}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
status=CrawlStatus.FAILED,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
metadata={"status": "no_config_match"},
|
||||||
|
success=False,
|
||||||
|
error_message=error_message
|
||||||
|
),
|
||||||
|
memory_usage=0,
|
||||||
|
peak_memory=0,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=time.time(),
|
||||||
|
error_message=error_message,
|
||||||
|
retry_count=retry_count
|
||||||
|
)
|
||||||
|
|
||||||
# Get starting memory for accurate measurement
|
# Get starting memory for accurate measurement
|
||||||
process = psutil.Process()
|
process = psutil.Process()
|
||||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
@@ -257,8 +316,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
retry_count=retry_count + 1
|
retry_count=retry_count + 1
|
||||||
)
|
)
|
||||||
|
|
||||||
# Execute the crawl
|
# Execute the crawl with selected config
|
||||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||||
|
|
||||||
# Measure memory usage
|
# Measure memory usage
|
||||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
@@ -316,7 +375,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
crawler: AsyncWebCrawler,
|
crawler: AsyncWebCrawler,
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
) -> List[CrawlerTaskResult]:
|
) -> List[CrawlerTaskResult]:
|
||||||
self.crawler = crawler
|
self.crawler = crawler
|
||||||
|
|
||||||
@@ -470,7 +529,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
|||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
crawler: AsyncWebCrawler,
|
crawler: AsyncWebCrawler,
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
||||||
self.crawler = crawler
|
self.crawler = crawler
|
||||||
|
|
||||||
@@ -572,7 +631,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
|||||||
async def crawl_url(
|
async def crawl_url(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
task_id: str,
|
task_id: str,
|
||||||
semaphore: asyncio.Semaphore = None,
|
semaphore: asyncio.Semaphore = None,
|
||||||
) -> CrawlerTaskResult:
|
) -> CrawlerTaskResult:
|
||||||
@@ -580,6 +639,36 @@ class SemaphoreDispatcher(BaseDispatcher):
|
|||||||
error_message = ""
|
error_message = ""
|
||||||
memory_usage = peak_memory = 0.0
|
memory_usage = peak_memory = 0.0
|
||||||
|
|
||||||
|
# Select appropriate config for this URL
|
||||||
|
selected_config = self.select_config(url, config)
|
||||||
|
|
||||||
|
# If no config matches, return failed result
|
||||||
|
if selected_config is None:
|
||||||
|
error_message = f"No matching configuration found for URL: {url}"
|
||||||
|
if self.monitor:
|
||||||
|
self.monitor.update_task(
|
||||||
|
task_id,
|
||||||
|
status=CrawlStatus.FAILED,
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
|
return CrawlerTaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
url=url,
|
||||||
|
result=CrawlResult(
|
||||||
|
url=url,
|
||||||
|
html="",
|
||||||
|
metadata={"status": "no_config_match"},
|
||||||
|
success=False,
|
||||||
|
error_message=error_message
|
||||||
|
),
|
||||||
|
memory_usage=0,
|
||||||
|
peak_memory=0,
|
||||||
|
start_time=start_time,
|
||||||
|
end_time=time.time(),
|
||||||
|
error_message=error_message
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if self.monitor:
|
if self.monitor:
|
||||||
self.monitor.update_task(
|
self.monitor.update_task(
|
||||||
@@ -592,7 +681,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
|||||||
async with semaphore:
|
async with semaphore:
|
||||||
process = psutil.Process()
|
process = psutil.Process()
|
||||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||||
|
|
||||||
memory_usage = peak_memory = end_memory - start_memory
|
memory_usage = peak_memory = end_memory - start_memory
|
||||||
@@ -654,7 +743,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
|||||||
self,
|
self,
|
||||||
crawler: AsyncWebCrawler, # noqa: F821
|
crawler: AsyncWebCrawler, # noqa: F821
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: CrawlerRunConfig,
|
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||||
) -> List[CrawlerTaskResult]:
|
) -> List[CrawlerTaskResult]:
|
||||||
self.crawler = crawler
|
self.crawler = crawler
|
||||||
if self.monitor:
|
if self.monitor:
|
||||||
|
|||||||
@@ -829,7 +829,7 @@ class AsyncUrlSeeder:
|
|||||||
|
|
||||||
async def _iter_sitemap(self, url: str):
|
async def _iter_sitemap(self, url: str):
|
||||||
try:
|
try:
|
||||||
r = await self.client.get(url, timeout=15)
|
r = await self.client.get(url, timeout=15, follow_redirects=True)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
except httpx.HTTPStatusError as e:
|
except httpx.HTTPStatusError as e:
|
||||||
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
||||||
|
|||||||
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
|
|||||||
metadata = result.get("metadata", {})
|
metadata = result.get("metadata", {})
|
||||||
else:
|
else:
|
||||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||||
media = result.media.model_dump()
|
# media = result.media.model_dump()
|
||||||
tables = media.pop("tables", [])
|
# tables = media.pop("tables", [])
|
||||||
links = result.links.model_dump()
|
# links = result.links.model_dump()
|
||||||
|
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
|
||||||
|
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||||
|
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
|
||||||
metadata = result.metadata
|
metadata = result.metadata
|
||||||
|
|
||||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||||
@@ -650,7 +653,7 @@ class AsyncWebCrawler:
|
|||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
@@ -671,7 +674,9 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls: List of URLs to crawl
|
urls: List of URLs to crawl
|
||||||
config: Configuration object controlling crawl behavior for all URLs
|
config: Configuration object(s) controlling crawl behavior. Can be:
|
||||||
|
- Single CrawlerRunConfig: Used for all URLs
|
||||||
|
- List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
|
||||||
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
|
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
|
||||||
[other parameters maintained for backwards compatibility]
|
[other parameters maintained for backwards compatibility]
|
||||||
|
|
||||||
@@ -736,7 +741,11 @@ class AsyncWebCrawler:
|
|||||||
or task_result.result
|
or task_result.result
|
||||||
)
|
)
|
||||||
|
|
||||||
stream = config.stream
|
# Handle stream setting - use first config's stream setting if config is a list
|
||||||
|
if isinstance(config, list):
|
||||||
|
stream = config[0].stream if config else False
|
||||||
|
else:
|
||||||
|
stream = config.stream
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
|
||||||
|
|||||||
@@ -14,23 +14,8 @@ import hashlib
|
|||||||
from .js_snippet import load_js_script
|
from .js_snippet import load_js_script
|
||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from playwright_stealth import StealthConfig
|
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
|
||||||
webdriver=True,
|
|
||||||
chrome_app=True,
|
|
||||||
chrome_csi=True,
|
|
||||||
chrome_load_times=True,
|
|
||||||
chrome_runtime=True,
|
|
||||||
navigator_languages=True,
|
|
||||||
navigator_plugins=True,
|
|
||||||
navigator_permissions=True,
|
|
||||||
webgl_vendor=True,
|
|
||||||
outerdimensions=True,
|
|
||||||
navigator_hardware_concurrency=True,
|
|
||||||
media_codecs=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
"--disable-background-networking",
|
"--disable-background-networking",
|
||||||
|
|||||||
@@ -65,6 +65,213 @@ class BrowserProfiler:
|
|||||||
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
|
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
|
||||||
os.makedirs(self.builtin_browser_dir, exist_ok=True)
|
os.makedirs(self.builtin_browser_dir, exist_ok=True)
|
||||||
|
|
||||||
|
def _is_windows(self) -> bool:
|
||||||
|
"""Check if running on Windows platform."""
|
||||||
|
return sys.platform.startswith('win') or sys.platform == 'cygwin'
|
||||||
|
|
||||||
|
def _is_macos(self) -> bool:
|
||||||
|
"""Check if running on macOS platform."""
|
||||||
|
return sys.platform == 'darwin'
|
||||||
|
|
||||||
|
def _is_linux(self) -> bool:
|
||||||
|
"""Check if running on Linux platform."""
|
||||||
|
return sys.platform.startswith('linux')
|
||||||
|
|
||||||
|
def _get_quit_message(self, tag: str) -> str:
|
||||||
|
"""Get appropriate quit message based on context."""
|
||||||
|
if tag == "PROFILE":
|
||||||
|
return "Closing browser and saving profile..."
|
||||||
|
elif tag == "CDP":
|
||||||
|
return "Closing browser..."
|
||||||
|
else:
|
||||||
|
return "Closing browser..."
|
||||||
|
|
||||||
|
async def _listen_windows(self, user_done_event, check_browser_process, tag: str):
|
||||||
|
"""Windows-specific keyboard listener using msvcrt."""
|
||||||
|
try:
|
||||||
|
import msvcrt
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError("msvcrt module not available on this platform")
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
# Check for keyboard input
|
||||||
|
if msvcrt.kbhit():
|
||||||
|
raw = msvcrt.getch()
|
||||||
|
|
||||||
|
# Handle Unicode decoding more robustly
|
||||||
|
key = None
|
||||||
|
try:
|
||||||
|
key = raw.decode("utf-8")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
try:
|
||||||
|
# Try different encodings
|
||||||
|
key = raw.decode("latin1")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Skip if we can't decode
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Validate key
|
||||||
|
if not key or len(key) != 1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check for printable characters only
|
||||||
|
if not key.isprintable():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check for quit command
|
||||||
|
if key.lower() == "q":
|
||||||
|
self.logger.info(
|
||||||
|
self._get_quit_message(tag),
|
||||||
|
tag=tag,
|
||||||
|
base_color=LogColor.GREEN
|
||||||
|
)
|
||||||
|
user_done_event.set()
|
||||||
|
return
|
||||||
|
|
||||||
|
# Check if browser process ended
|
||||||
|
if await check_browser_process():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Small delay to prevent busy waiting
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error in Windows keyboard listener: {e}", tag=tag)
|
||||||
|
# Continue trying instead of failing completely
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
async def _listen_unix(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
|
||||||
|
"""Unix/Linux/macOS keyboard listener using termios and select."""
|
||||||
|
try:
|
||||||
|
import termios
|
||||||
|
import tty
|
||||||
|
import select
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError("termios/tty/select modules not available on this platform")
|
||||||
|
|
||||||
|
# Get stdin file descriptor
|
||||||
|
try:
|
||||||
|
fd = sys.stdin.fileno()
|
||||||
|
except (AttributeError, OSError):
|
||||||
|
raise ImportError("stdin is not a terminal")
|
||||||
|
|
||||||
|
# Save original terminal settings
|
||||||
|
old_settings = None
|
||||||
|
try:
|
||||||
|
old_settings = termios.tcgetattr(fd)
|
||||||
|
except termios.error as e:
|
||||||
|
raise ImportError(f"Cannot get terminal attributes: {e}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Switch to non-canonical mode (cbreak mode)
|
||||||
|
tty.setcbreak(fd)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
# Use select to check if input is available (non-blocking)
|
||||||
|
# Timeout of 0.5 seconds to periodically check browser process
|
||||||
|
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||||
|
|
||||||
|
if readable:
|
||||||
|
# Read one character
|
||||||
|
key = sys.stdin.read(1)
|
||||||
|
|
||||||
|
if key and key.lower() == "q":
|
||||||
|
self.logger.info(
|
||||||
|
self._get_quit_message(tag),
|
||||||
|
tag=tag,
|
||||||
|
base_color=LogColor.GREEN
|
||||||
|
)
|
||||||
|
user_done_event.set()
|
||||||
|
return
|
||||||
|
|
||||||
|
# Check if browser process ended
|
||||||
|
if await check_browser_process():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Small delay to prevent busy waiting
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
except (KeyboardInterrupt, EOFError):
|
||||||
|
# Handle Ctrl+C or EOF gracefully
|
||||||
|
self.logger.info("Keyboard interrupt received", tag=tag)
|
||||||
|
user_done_event.set()
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error in Unix keyboard listener: {e}", tag=tag)
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
continue
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# Always restore terminal settings
|
||||||
|
if old_settings is not None:
|
||||||
|
try:
|
||||||
|
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Failed to restore terminal settings: {e}", tag=tag)
|
||||||
|
|
||||||
|
async def _listen_fallback(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
|
||||||
|
"""Fallback keyboard listener using simple input() method."""
|
||||||
|
self.logger.info("Using fallback input mode. Type 'q' and press Enter to quit.", tag=tag)
|
||||||
|
|
||||||
|
# Run input in a separate thread to avoid blocking
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
|
||||||
|
input_queue = queue.Queue()
|
||||||
|
|
||||||
|
def input_thread():
|
||||||
|
"""Thread function to handle input."""
|
||||||
|
try:
|
||||||
|
while not user_done_event.is_set():
|
||||||
|
try:
|
||||||
|
# Use input() with a prompt
|
||||||
|
user_input = input("Press 'q' + Enter to quit: ").strip().lower()
|
||||||
|
input_queue.put(user_input)
|
||||||
|
if user_input == 'q':
|
||||||
|
break
|
||||||
|
except (EOFError, KeyboardInterrupt):
|
||||||
|
input_queue.put('q')
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warning(f"Error in input thread: {e}", tag=tag)
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Input thread failed: {e}", tag=tag)
|
||||||
|
|
||||||
|
# Start input thread
|
||||||
|
thread = threading.Thread(target=input_thread, daemon=True)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
|
try:
|
||||||
|
while not user_done_event.is_set():
|
||||||
|
# Check for user input
|
||||||
|
try:
|
||||||
|
user_input = input_queue.get_nowait()
|
||||||
|
if user_input == 'q':
|
||||||
|
self.logger.info(
|
||||||
|
self._get_quit_message(tag),
|
||||||
|
tag=tag,
|
||||||
|
base_color=LogColor.GREEN
|
||||||
|
)
|
||||||
|
user_done_event.set()
|
||||||
|
return
|
||||||
|
except queue.Empty:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check if browser process ended
|
||||||
|
if await check_browser_process():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Small delay
|
||||||
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(f"Fallback listener failed: {e}", tag=tag)
|
||||||
|
user_done_event.set()
|
||||||
|
|
||||||
async def create_profile(self,
|
async def create_profile(self,
|
||||||
profile_name: Optional[str] = None,
|
profile_name: Optional[str] = None,
|
||||||
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
||||||
@@ -180,42 +387,38 @@ class BrowserProfiler:
|
|||||||
|
|
||||||
# Run keyboard input loop in a separate task
|
# Run keyboard input loop in a separate task
|
||||||
async def listen_for_quit_command():
|
async def listen_for_quit_command():
|
||||||
import termios
|
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||||
import tty
|
|
||||||
import select
|
|
||||||
|
|
||||||
# First output the prompt
|
# First output the prompt
|
||||||
self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE")
|
self.logger.info(
|
||||||
|
"Press {segment} when you've finished using the browser...",
|
||||||
|
tag="PROFILE",
|
||||||
|
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||||
|
base_color=LogColor.CYAN
|
||||||
|
)
|
||||||
|
|
||||||
# Save original terminal settings
|
async def check_browser_process():
|
||||||
fd = sys.stdin.fileno()
|
"""Check if browser process is still running."""
|
||||||
old_settings = termios.tcgetattr(fd)
|
if (
|
||||||
|
managed_browser.browser_process
|
||||||
|
and managed_browser.browser_process.poll() is not None
|
||||||
|
):
|
||||||
|
self.logger.info(
|
||||||
|
"Browser already closed. Ending input listener.", tag="PROFILE"
|
||||||
|
)
|
||||||
|
user_done_event.set()
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Try platform-specific implementations with fallback
|
||||||
try:
|
try:
|
||||||
# Switch to non-canonical mode (no line buffering)
|
if self._is_windows():
|
||||||
tty.setcbreak(fd)
|
await self._listen_windows(user_done_event, check_browser_process, "PROFILE")
|
||||||
|
else:
|
||||||
while True:
|
await self._listen_unix(user_done_event, check_browser_process, "PROFILE")
|
||||||
# Check if input is available (non-blocking)
|
except Exception as e:
|
||||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="PROFILE")
|
||||||
if readable:
|
self.logger.info("Falling back to simple input mode...", tag="PROFILE")
|
||||||
key = sys.stdin.read(1)
|
await self._listen_fallback(user_done_event, check_browser_process, "PROFILE")
|
||||||
if key.lower() == 'q':
|
|
||||||
self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN)
|
|
||||||
user_done_event.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Check if the browser process has already exited
|
|
||||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
|
||||||
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
|
|
||||||
user_done_event.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
await asyncio.sleep(0.1)
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Restore terminal settings
|
|
||||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
@@ -682,42 +885,33 @@ class BrowserProfiler:
|
|||||||
|
|
||||||
# Run keyboard input loop in a separate task
|
# Run keyboard input loop in a separate task
|
||||||
async def listen_for_quit_command():
|
async def listen_for_quit_command():
|
||||||
import termios
|
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||||
import tty
|
|
||||||
import select
|
|
||||||
|
|
||||||
# First output the prompt
|
# First output the prompt
|
||||||
self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP")
|
self.logger.info(
|
||||||
|
"Press {segment} to stop the browser and exit...",
|
||||||
|
tag="CDP",
|
||||||
|
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||||
|
base_color=LogColor.CYAN
|
||||||
|
)
|
||||||
|
|
||||||
# Save original terminal settings
|
async def check_browser_process():
|
||||||
fd = sys.stdin.fileno()
|
"""Check if browser process is still running."""
|
||||||
old_settings = termios.tcgetattr(fd)
|
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||||
|
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
||||||
|
user_done_event.set()
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Try platform-specific implementations with fallback
|
||||||
try:
|
try:
|
||||||
# Switch to non-canonical mode (no line buffering)
|
if self._is_windows():
|
||||||
tty.setcbreak(fd)
|
await self._listen_windows(user_done_event, check_browser_process, "CDP")
|
||||||
|
else:
|
||||||
while True:
|
await self._listen_unix(user_done_event, check_browser_process, "CDP")
|
||||||
# Check if input is available (non-blocking)
|
except Exception as e:
|
||||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="CDP")
|
||||||
if readable:
|
self.logger.info("Falling back to simple input mode...", tag="CDP")
|
||||||
key = sys.stdin.read(1)
|
await self._listen_fallback(user_done_event, check_browser_process, "CDP")
|
||||||
if key.lower() == 'q':
|
|
||||||
self.logger.info("Closing browser...", tag="CDP")
|
|
||||||
user_done_event.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Check if the browser process has already exited
|
|
||||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
|
||||||
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
|
||||||
user_done_event.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
await asyncio.sleep(0.1)
|
|
||||||
|
|
||||||
finally:
|
|
||||||
# Restore terminal settings
|
|
||||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
|
||||||
|
|
||||||
# Function to retrieve and display CDP JSON config
|
# Function to retrieve and display CDP JSON config
|
||||||
async def get_cdp_json(port):
|
async def get_cdp_json(port):
|
||||||
|
|||||||
@@ -27,7 +27,10 @@ from crawl4ai import (
|
|||||||
PruningContentFilter,
|
PruningContentFilter,
|
||||||
BrowserProfiler,
|
BrowserProfiler,
|
||||||
DefaultMarkdownGenerator,
|
DefaultMarkdownGenerator,
|
||||||
LLMConfig
|
LLMConfig,
|
||||||
|
BFSDeepCrawlStrategy,
|
||||||
|
DFSDeepCrawlStrategy,
|
||||||
|
BestFirstCrawlingStrategy,
|
||||||
)
|
)
|
||||||
from crawl4ai.config import USER_SETTINGS
|
from crawl4ai.config import USER_SETTINGS
|
||||||
from litellm import completion
|
from litellm import completion
|
||||||
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
|
|||||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||||
@click.option("--verbose", "-v", is_flag=True)
|
@click.option("--verbose", "-v", is_flag=True)
|
||||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||||
|
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
|
||||||
|
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||||
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
||||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||||
"""Crawl a website and extract content
|
"""Crawl a website and extract content
|
||||||
|
|
||||||
Simple Usage:
|
Simple Usage:
|
||||||
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
|
|||||||
|
|
||||||
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
||||||
|
|
||||||
|
# Handle deep crawling configuration
|
||||||
|
if deep_crawl:
|
||||||
|
if deep_crawl == "bfs":
|
||||||
|
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
max_pages=max_pages
|
||||||
|
)
|
||||||
|
elif deep_crawl == "dfs":
|
||||||
|
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
max_pages=max_pages
|
||||||
|
)
|
||||||
|
elif deep_crawl == "best-first":
|
||||||
|
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
|
||||||
|
max_depth=3,
|
||||||
|
max_pages=max_pages
|
||||||
|
)
|
||||||
|
|
||||||
|
if verbose:
|
||||||
|
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
|
||||||
|
|
||||||
config = get_global_config()
|
config = get_global_config()
|
||||||
|
|
||||||
browser_cfg.verbose = config.get("VERBOSE", False)
|
browser_cfg.verbose = config.get("VERBOSE", False)
|
||||||
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
|
|||||||
verbose
|
verbose
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Handle deep crawl results (list) vs single result
|
||||||
|
if isinstance(result, list):
|
||||||
|
if len(result) == 0:
|
||||||
|
click.echo("No results found during deep crawling")
|
||||||
|
return
|
||||||
|
# Use the first result for question answering and output
|
||||||
|
main_result = result[0]
|
||||||
|
all_results = result
|
||||||
|
else:
|
||||||
|
# Single result from regular crawling
|
||||||
|
main_result = result
|
||||||
|
all_results = [result]
|
||||||
|
|
||||||
# Handle question
|
# Handle question
|
||||||
if question:
|
if question:
|
||||||
provider, token = setup_llm_config()
|
provider, token = setup_llm_config()
|
||||||
markdown = result.markdown.raw_markdown
|
markdown = main_result.markdown.raw_markdown
|
||||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Handle output
|
# Handle output
|
||||||
if not output_file:
|
if not output_file:
|
||||||
if output == "all":
|
if output == "all":
|
||||||
click.echo(json.dumps(result.model_dump(), indent=2))
|
if isinstance(result, list):
|
||||||
|
output_data = [r.model_dump() for r in all_results]
|
||||||
|
click.echo(json.dumps(output_data, indent=2))
|
||||||
|
else:
|
||||||
|
click.echo(json.dumps(main_result.model_dump(), indent=2))
|
||||||
elif output == "json":
|
elif output == "json":
|
||||||
print(result.extracted_content)
|
print(main_result.extracted_content)
|
||||||
extracted_items = json.loads(result.extracted_content)
|
extracted_items = json.loads(main_result.extracted_content)
|
||||||
click.echo(json.dumps(extracted_items, indent=2))
|
click.echo(json.dumps(extracted_items, indent=2))
|
||||||
|
|
||||||
elif output in ["markdown", "md"]:
|
elif output in ["markdown", "md"]:
|
||||||
click.echo(result.markdown.raw_markdown)
|
click.echo(main_result.markdown.raw_markdown)
|
||||||
elif output in ["markdown-fit", "md-fit"]:
|
elif output in ["markdown-fit", "md-fit"]:
|
||||||
click.echo(result.markdown.fit_markdown)
|
click.echo(main_result.markdown.fit_markdown)
|
||||||
else:
|
else:
|
||||||
if output == "all":
|
if output == "all":
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(json.dumps(result.model_dump(), indent=2))
|
if isinstance(result, list):
|
||||||
|
output_data = [r.model_dump() for r in all_results]
|
||||||
|
f.write(json.dumps(output_data, indent=2))
|
||||||
|
else:
|
||||||
|
f.write(json.dumps(main_result.model_dump(), indent=2))
|
||||||
elif output == "json":
|
elif output == "json":
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(main_result.extracted_content)
|
||||||
elif output in ["markdown", "md"]:
|
elif output in ["markdown", "md"]:
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(result.markdown.raw_markdown)
|
f.write(main_result.markdown.raw_markdown)
|
||||||
elif output in ["markdown-fit", "md-fit"]:
|
elif output in ["markdown-fit", "md-fit"]:
|
||||||
with open(output_file, "w") as f:
|
with open(output_file, "w") as f:
|
||||||
f.write(result.markdown.fit_markdown)
|
f.write(main_result.markdown.fit_markdown)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise click.ClickException(str(e))
|
raise click.ClickException(str(e))
|
||||||
@@ -1354,9 +1401,11 @@ def profiles_cmd():
|
|||||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||||
@click.option("--verbose", "-v", is_flag=True)
|
@click.option("--verbose", "-v", is_flag=True)
|
||||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||||
|
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
|
||||||
|
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||||
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||||
"""Crawl4AI CLI - Web content extraction tool
|
"""Crawl4AI CLI - Web content extraction tool
|
||||||
|
|
||||||
Simple Usage:
|
Simple Usage:
|
||||||
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
|||||||
bypass_cache=bypass_cache,
|
bypass_cache=bypass_cache,
|
||||||
question=question,
|
question=question,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
profile=profile
|
profile=profile,
|
||||||
|
deep_crawl=deep_crawl,
|
||||||
|
max_pages=max_pages
|
||||||
)
|
)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
@@ -98,20 +98,20 @@ class ContentScrapingStrategy(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||||
"""
|
"""
|
||||||
Class for web content scraping. Perhaps the most important class.
|
LXML-based implementation for fast web content scraping.
|
||||||
|
|
||||||
How it works:
|
This is the primary scraping strategy in Crawl4AI, providing high-performance
|
||||||
1. Extract content from HTML using BeautifulSoup.
|
HTML parsing and content extraction using the lxml library.
|
||||||
2. Clean the extracted content using a content cleaning strategy.
|
|
||||||
3. Filter the cleaned content using a content filtering strategy.
|
Note: WebScrapingStrategy is now an alias for this class to maintain
|
||||||
4. Generate markdown content from the filtered content.
|
backward compatibility.
|
||||||
5. Return the markdown content.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, logger=None):
|
def __init__(self, logger=None):
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||||
|
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||||
|
|
||||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||||
"""Helper method to safely use logger."""
|
"""Helper method to safely use logger."""
|
||||||
@@ -132,7 +132,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
ScrapingResult: A structured result containing the scraped content.
|
ScrapingResult: A structured result containing the scraped content.
|
||||||
"""
|
"""
|
||||||
actual_url = kwargs.get("redirected_url", url)
|
actual_url = kwargs.get("redirected_url", url)
|
||||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
raw_result = self._scrap(actual_url, html, **kwargs)
|
||||||
if raw_result is None:
|
if raw_result is None:
|
||||||
return ScrapingResult(
|
return ScrapingResult(
|
||||||
cleaned_html="",
|
cleaned_html="",
|
||||||
@@ -196,376 +196,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
ScrapingResult: A structured result containing the scraped content.
|
ScrapingResult: A structured result containing the scraped content.
|
||||||
"""
|
"""
|
||||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||||
|
|
||||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
|
||||||
"""
|
|
||||||
Determine if a table element is a data table (not a layout table).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
table (Tag): BeautifulSoup Tag representing a table element
|
|
||||||
**kwargs: Additional keyword arguments including table_score_threshold
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
bool: True if the table is a data table, False otherwise
|
|
||||||
"""
|
|
||||||
score = 0
|
|
||||||
|
|
||||||
# Check for thead and tbody
|
|
||||||
has_thead = len(table.select('thead')) > 0
|
|
||||||
has_tbody = len(table.select('tbody')) > 0
|
|
||||||
if has_thead:
|
|
||||||
score += 2
|
|
||||||
if has_tbody:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
# Check for th elements
|
|
||||||
th_count = len(table.select('th'))
|
|
||||||
if th_count > 0:
|
|
||||||
score += 2
|
|
||||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
# Check for nested tables
|
|
||||||
if len(table.select('table')) > 0:
|
|
||||||
score -= 3
|
|
||||||
|
|
||||||
# Role attribute check
|
|
||||||
role = table.get('role', '').lower()
|
|
||||||
if role in {'presentation', 'none'}:
|
|
||||||
score -= 3
|
|
||||||
|
|
||||||
# Column consistency
|
|
||||||
rows = table.select('tr')
|
|
||||||
if not rows:
|
|
||||||
return False
|
|
||||||
|
|
||||||
col_counts = [len(row.select('td, th')) for row in rows]
|
|
||||||
avg_cols = sum(col_counts) / len(col_counts)
|
|
||||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
|
||||||
if variance < 1:
|
|
||||||
score += 2
|
|
||||||
|
|
||||||
# Caption and summary
|
|
||||||
if table.select('caption'):
|
|
||||||
score += 2
|
|
||||||
if table.has_attr('summary') and table['summary']:
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
# Text density
|
|
||||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
|
||||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
|
||||||
text_ratio = total_text / (total_tags + 1e-5)
|
|
||||||
if text_ratio > 20:
|
|
||||||
score += 3
|
|
||||||
elif text_ratio > 10:
|
|
||||||
score += 2
|
|
||||||
|
|
||||||
# Data attributes
|
|
||||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
|
||||||
score += data_attrs * 0.5
|
|
||||||
|
|
||||||
# Size check
|
|
||||||
if avg_cols >= 2 and len(rows) >= 2:
|
|
||||||
score += 2
|
|
||||||
|
|
||||||
threshold = kwargs.get('table_score_threshold', 7)
|
|
||||||
return score >= threshold
|
|
||||||
|
|
||||||
def extract_table_data(self, table: Tag) -> dict:
|
|
||||||
"""
|
|
||||||
Extract structured data from a table element.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
table (Tag): BeautifulSoup Tag representing a table element
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: Dictionary containing table data (headers, rows, caption, summary)
|
|
||||||
"""
|
|
||||||
caption_elem = table.select_one('caption')
|
|
||||||
caption = caption_elem.get_text().strip() if caption_elem else ""
|
|
||||||
summary = table.get('summary', '').strip()
|
|
||||||
|
|
||||||
# Extract headers with colspan handling
|
|
||||||
headers = []
|
|
||||||
thead_rows = table.select('thead tr')
|
|
||||||
if thead_rows:
|
|
||||||
header_cells = thead_rows[0].select('th')
|
|
||||||
for cell in header_cells:
|
|
||||||
text = cell.get_text().strip()
|
|
||||||
colspan = int(cell.get('colspan', 1))
|
|
||||||
headers.extend([text] * colspan)
|
|
||||||
else:
|
|
||||||
first_row = table.select('tr:first-child')
|
|
||||||
if first_row:
|
|
||||||
for cell in first_row[0].select('th, td'):
|
|
||||||
text = cell.get_text().strip()
|
|
||||||
colspan = int(cell.get('colspan', 1))
|
|
||||||
headers.extend([text] * colspan)
|
|
||||||
|
|
||||||
# Extract rows with colspan handling
|
|
||||||
rows = []
|
|
||||||
all_rows = table.select('tr')
|
|
||||||
thead = table.select_one('thead')
|
|
||||||
tbody_rows = []
|
|
||||||
|
|
||||||
if thead:
|
|
||||||
thead_rows = thead.select('tr')
|
|
||||||
tbody_rows = [row for row in all_rows if row not in thead_rows]
|
|
||||||
else:
|
|
||||||
if all_rows and all_rows[0].select('th'):
|
|
||||||
tbody_rows = all_rows[1:]
|
|
||||||
else:
|
|
||||||
tbody_rows = all_rows
|
|
||||||
|
|
||||||
for row in tbody_rows:
|
|
||||||
# for row in table.select('tr:not(:has(ancestor::thead))'):
|
|
||||||
row_data = []
|
|
||||||
for cell in row.select('td'):
|
|
||||||
text = cell.get_text().strip()
|
|
||||||
colspan = int(cell.get('colspan', 1))
|
|
||||||
row_data.extend([text] * colspan)
|
|
||||||
if row_data:
|
|
||||||
rows.append(row_data)
|
|
||||||
|
|
||||||
# Align rows with headers
|
|
||||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
|
||||||
aligned_rows = []
|
|
||||||
for row in rows:
|
|
||||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
|
||||||
aligned_rows.append(aligned)
|
|
||||||
|
|
||||||
if not headers:
|
|
||||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"headers": headers,
|
|
||||||
"rows": aligned_rows,
|
|
||||||
"caption": caption,
|
|
||||||
"summary": summary,
|
|
||||||
}
|
|
||||||
|
|
||||||
def flatten_nested_elements(self, node):
|
|
||||||
"""
|
|
||||||
Flatten nested elements in a HTML tree.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
node (Tag): The root node of the HTML tree.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tag: The flattened HTML tree.
|
|
||||||
"""
|
|
||||||
if isinstance(node, NavigableString):
|
|
||||||
return node
|
|
||||||
if (
|
|
||||||
len(node.contents) == 1
|
|
||||||
and isinstance(node.contents[0], Tag)
|
|
||||||
and node.contents[0].name == node.name
|
|
||||||
):
|
|
||||||
return self.flatten_nested_elements(node.contents[0])
|
|
||||||
node.contents = [self.flatten_nested_elements(child) for child in node.contents]
|
|
||||||
return node
|
|
||||||
|
|
||||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
|
||||||
"""
|
|
||||||
Find the closest parent with useful text.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
tag (Tag): The starting tag to search from.
|
|
||||||
**kwargs: Additional keyword arguments.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tag: The closest parent with useful text, or None if not found.
|
|
||||||
"""
|
|
||||||
image_description_min_word_threshold = kwargs.get(
|
|
||||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
|
||||||
)
|
|
||||||
current_tag = tag
|
|
||||||
while current_tag:
|
|
||||||
current_tag = current_tag.parent
|
|
||||||
# Get the text content of the parent tag
|
|
||||||
if current_tag:
|
|
||||||
text_content = current_tag.get_text(separator=" ", strip=True)
|
|
||||||
# Check if the text content has at least word_count_threshold
|
|
||||||
if len(text_content.split()) >= image_description_min_word_threshold:
|
|
||||||
return text_content
|
|
||||||
return None
|
|
||||||
|
|
||||||
def remove_unwanted_attributes(
|
|
||||||
self, element, important_attrs, keep_data_attributes=False
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Remove unwanted attributes from an HTML element.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
element (Tag): The HTML element to remove attributes from.
|
|
||||||
important_attrs (list): List of important attributes to keep.
|
|
||||||
keep_data_attributes (bool): Whether to keep data attributes.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
None
|
|
||||||
"""
|
|
||||||
attrs_to_remove = []
|
|
||||||
for attr in element.attrs:
|
|
||||||
if attr not in important_attrs:
|
|
||||||
if keep_data_attributes:
|
|
||||||
if not attr.startswith("data-"):
|
|
||||||
attrs_to_remove.append(attr)
|
|
||||||
else:
|
|
||||||
attrs_to_remove.append(attr)
|
|
||||||
|
|
||||||
for attr in attrs_to_remove:
|
|
||||||
del element[attr]
|
|
||||||
|
|
||||||
def process_image(self, img, url, index, total_images, **kwargs):
|
|
||||||
"""
|
|
||||||
Process an image element.
|
|
||||||
|
|
||||||
How it works:
|
|
||||||
1. Check if the image has valid display and inside undesired html elements.
|
|
||||||
2. Score an image for it's usefulness.
|
|
||||||
3. Extract image file metadata to extract size and extension.
|
|
||||||
4. Generate a dictionary with the processed image information.
|
|
||||||
5. Return the processed image information.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
img (Tag): The image element to process.
|
|
||||||
url (str): The URL of the page containing the image.
|
|
||||||
index (int): The index of the image in the list of images.
|
|
||||||
total_images (int): The total number of images in the list.
|
|
||||||
**kwargs: Additional keyword arguments.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: A dictionary containing the processed image information.
|
|
||||||
"""
|
|
||||||
# parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
|
||||||
# if ' ' in u else None}
|
|
||||||
# for u in [f"http{p}" for p in s.split("http") if p]]
|
|
||||||
|
|
||||||
# Constants for checks
|
|
||||||
classes_to_check = frozenset(["button", "icon", "logo"])
|
|
||||||
tags_to_check = frozenset(["button", "input"])
|
|
||||||
image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
|
|
||||||
|
|
||||||
# Pre-fetch commonly used attributes
|
|
||||||
style = img.get("style", "")
|
|
||||||
alt = img.get("alt", "")
|
|
||||||
src = img.get("src", "")
|
|
||||||
data_src = img.get("data-src", "")
|
|
||||||
srcset = img.get("srcset", "")
|
|
||||||
data_srcset = img.get("data-srcset", "")
|
|
||||||
width = img.get("width")
|
|
||||||
height = img.get("height")
|
|
||||||
parent = img.parent
|
|
||||||
parent_classes = parent.get("class", [])
|
|
||||||
|
|
||||||
# Quick validation checks
|
|
||||||
if (
|
|
||||||
"display:none" in style
|
|
||||||
or parent.name in tags_to_check
|
|
||||||
or any(c in cls for c in parent_classes for cls in classes_to_check)
|
|
||||||
or any(c in src for c in classes_to_check)
|
|
||||||
or any(c in alt for c in classes_to_check)
|
|
||||||
):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Quick score calculation
|
|
||||||
score = 0
|
|
||||||
if width and width.isdigit():
|
|
||||||
width_val = int(width)
|
|
||||||
score += 1 if width_val > 150 else 0
|
|
||||||
if height and height.isdigit():
|
|
||||||
height_val = int(height)
|
|
||||||
score += 1 if height_val > 150 else 0
|
|
||||||
if alt:
|
|
||||||
score += 1
|
|
||||||
score += index / total_images < 0.5
|
|
||||||
|
|
||||||
# image_format = ''
|
|
||||||
# if "data:image/" in src:
|
|
||||||
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
|
|
||||||
# else:
|
|
||||||
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
|
|
||||||
|
|
||||||
# if image_format in ('jpg', 'png', 'webp', 'avif'):
|
|
||||||
# score += 1
|
|
||||||
|
|
||||||
# Check for image format in all possible sources
|
|
||||||
def has_image_format(url):
|
|
||||||
return any(fmt in url.lower() for fmt in image_formats)
|
|
||||||
|
|
||||||
# Score for having proper image sources
|
|
||||||
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
|
|
||||||
score += 1
|
|
||||||
if srcset or data_srcset:
|
|
||||||
score += 1
|
|
||||||
if img.find_parent("picture"):
|
|
||||||
score += 1
|
|
||||||
|
|
||||||
# Detect format from any available source
|
|
||||||
detected_format = None
|
|
||||||
for url in [src, data_src, srcset, data_srcset]:
|
|
||||||
if url:
|
|
||||||
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
|
|
||||||
if format_matches:
|
|
||||||
detected_format = format_matches[0]
|
|
||||||
break
|
|
||||||
|
|
||||||
if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Use set for deduplication
|
|
||||||
unique_urls = set()
|
|
||||||
image_variants = []
|
|
||||||
|
|
||||||
# Generate a unique group ID for this set of variants
|
|
||||||
group_id = index
|
|
||||||
|
|
||||||
# Base image info template
|
|
||||||
base_info = {
|
|
||||||
"alt": alt,
|
|
||||||
"desc": self.find_closest_parent_with_useful_text(img, **kwargs),
|
|
||||||
"score": score,
|
|
||||||
"type": "image",
|
|
||||||
"group_id": group_id, # Group ID for this set of variants
|
|
||||||
"format": detected_format,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Inline function for adding variants
|
|
||||||
def add_variant(src, width=None):
|
|
||||||
if src and not src.startswith("data:") and src not in unique_urls:
|
|
||||||
unique_urls.add(src)
|
|
||||||
image_variants.append({**base_info, "src": src, "width": width})
|
|
||||||
|
|
||||||
# Process all sources
|
|
||||||
add_variant(src)
|
|
||||||
add_variant(data_src)
|
|
||||||
|
|
||||||
# Handle srcset and data-srcset in one pass
|
|
||||||
for attr in ("srcset", "data-srcset"):
|
|
||||||
if value := img.get(attr):
|
|
||||||
for source in parse_srcset(value):
|
|
||||||
add_variant(source["url"], source["width"])
|
|
||||||
|
|
||||||
# Quick picture element check
|
|
||||||
if picture := img.find_parent("picture"):
|
|
||||||
for source in picture.find_all("source"):
|
|
||||||
if srcset := source.get("srcset"):
|
|
||||||
for src in parse_srcset(srcset):
|
|
||||||
add_variant(src["url"], src["width"])
|
|
||||||
|
|
||||||
# Framework-specific attributes in one pass
|
|
||||||
for attr, value in img.attrs.items():
|
|
||||||
if (
|
|
||||||
attr.startswith("data-")
|
|
||||||
and ("src" in attr or "srcset" in attr)
|
|
||||||
and "http" in value
|
|
||||||
):
|
|
||||||
add_variant(value)
|
|
||||||
|
|
||||||
return image_variants if image_variants else None
|
|
||||||
|
|
||||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
|
||||||
"""
|
"""
|
||||||
Process an HTML element.
|
Process an HTML element.
|
||||||
|
|
||||||
@@ -577,7 +210,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
url (str): The URL of the page containing the element.
|
url (str): The URL of the page containing the element.
|
||||||
element (Tag): The HTML element to process.
|
element (lhtml.HtmlElement): The HTML element to process.
|
||||||
**kwargs: Additional keyword arguments.
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -595,514 +228,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
"external_links_dict": external_links_dict,
|
"external_links_dict": external_links_dict,
|
||||||
}
|
}
|
||||||
|
|
||||||
def _process_element(
|
|
||||||
self,
|
|
||||||
url,
|
|
||||||
element: PageElement,
|
|
||||||
media: Dict[str, Any],
|
|
||||||
internal_links_dict: Dict[str, Any],
|
|
||||||
external_links_dict: Dict[str, Any],
|
|
||||||
**kwargs,
|
|
||||||
) -> bool:
|
|
||||||
"""
|
|
||||||
Process an HTML element.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
if isinstance(element, NavigableString):
|
|
||||||
if isinstance(element, Comment):
|
|
||||||
element.extract()
|
|
||||||
return False
|
|
||||||
|
|
||||||
# if element.name == 'img':
|
|
||||||
# process_image(element, url, 0, 1)
|
|
||||||
# return True
|
|
||||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
|
||||||
|
|
||||||
if element.name in ["script", "style", "link", "meta", "noscript"]:
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
|
|
||||||
keep_element = False
|
|
||||||
# Special case for table elements - always preserve structure
|
|
||||||
if element.name in ["tr", "td", "th"]:
|
|
||||||
keep_element = True
|
|
||||||
|
|
||||||
exclude_domains = kwargs.get("exclude_domains", [])
|
|
||||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
|
||||||
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
|
||||||
# exclude_social_media_domains = list(set(exclude_social_media_domains))
|
|
||||||
|
|
||||||
try:
|
|
||||||
if element.name == "a" and element.get("href"):
|
|
||||||
href = element.get("href", "").strip()
|
|
||||||
if not href: # Skip empty hrefs
|
|
||||||
return False
|
|
||||||
|
|
||||||
# url_base = url.split("/")[2]
|
|
||||||
|
|
||||||
# Normalize the URL
|
|
||||||
try:
|
|
||||||
normalized_href = normalize_url(href, url)
|
|
||||||
except ValueError:
|
|
||||||
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
link_data = {
|
|
||||||
"href": normalized_href,
|
|
||||||
"text": element.get_text().strip(),
|
|
||||||
"title": element.get("title", "").strip(),
|
|
||||||
"base_domain": base_domain,
|
|
||||||
}
|
|
||||||
|
|
||||||
is_external = is_external_url(normalized_href, base_domain)
|
|
||||||
|
|
||||||
keep_element = True
|
|
||||||
|
|
||||||
# Handle external link exclusions
|
|
||||||
if is_external:
|
|
||||||
link_base_domain = get_base_domain(normalized_href)
|
|
||||||
link_data["base_domain"] = link_base_domain
|
|
||||||
if kwargs.get("exclude_external_links", False):
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
# elif kwargs.get('exclude_social_media_links', False):
|
|
||||||
# if link_base_domain in exclude_social_media_domains:
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
# if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
elif exclude_domains:
|
|
||||||
if link_base_domain in exclude_domains:
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
# if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
|
|
||||||
if is_external:
|
|
||||||
if normalized_href not in external_links_dict:
|
|
||||||
external_links_dict[normalized_href] = link_data
|
|
||||||
else:
|
|
||||||
if kwargs.get("exclude_internal_links", False):
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
if normalized_href not in internal_links_dict:
|
|
||||||
internal_links_dict[normalized_href] = link_data
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
raise Exception(f"Error processing links: {str(e)}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
if element.name == "img":
|
|
||||||
potential_sources = [
|
|
||||||
"src",
|
|
||||||
"data-src",
|
|
||||||
"srcset" "data-lazy-src",
|
|
||||||
"data-original",
|
|
||||||
]
|
|
||||||
src = element.get("src", "")
|
|
||||||
while not src and potential_sources:
|
|
||||||
src = element.get(potential_sources.pop(0), "")
|
|
||||||
if not src:
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
|
|
||||||
# If it is srcset pick up the first image
|
|
||||||
if "srcset" in element.attrs:
|
|
||||||
src = element.attrs["srcset"].split(",")[0].split(" ")[0]
|
|
||||||
|
|
||||||
# If image src is internal, then skip
|
|
||||||
if not is_external_url(src, base_domain):
|
|
||||||
return True
|
|
||||||
|
|
||||||
image_src_base_domain = get_base_domain(src)
|
|
||||||
|
|
||||||
# Check flag if we should remove external images
|
|
||||||
if kwargs.get("exclude_external_images", False):
|
|
||||||
# Handle relative URLs (which are always from the same domain)
|
|
||||||
if not src.startswith('http') and not src.startswith('//'):
|
|
||||||
return True # Keep relative URLs
|
|
||||||
|
|
||||||
# For absolute URLs, compare the base domains using the existing function
|
|
||||||
src_base_domain = get_base_domain(src)
|
|
||||||
url_base_domain = get_base_domain(url)
|
|
||||||
|
|
||||||
# If the domains don't match and both are valid, the image is external
|
|
||||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
|
|
||||||
# if kwargs.get('exclude_social_media_links', False):
|
|
||||||
# if image_src_base_domain in exclude_social_media_domains:
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
# src_url_base = src.split('/')[2]
|
|
||||||
# url_base = url.split('/')[2]
|
|
||||||
# if any(domain in src for domain in exclude_social_media_domains):
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
|
|
||||||
# Handle exclude domains
|
|
||||||
if exclude_domains:
|
|
||||||
if image_src_base_domain in exclude_domains:
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
|
|
||||||
# element.decompose()
|
|
||||||
# return False
|
|
||||||
|
|
||||||
return True # Always keep image elements
|
|
||||||
except Exception:
|
|
||||||
raise "Error processing images"
|
|
||||||
|
|
||||||
# Check if flag to remove all forms is set
|
|
||||||
if kwargs.get("remove_forms", False) and element.name == "form":
|
|
||||||
element.decompose()
|
|
||||||
return False
|
|
||||||
|
|
||||||
if element.name in ["video", "audio"]:
|
|
||||||
media[f"{element.name}s"].append(
|
|
||||||
{
|
|
||||||
"src": element.get("src"),
|
|
||||||
"alt": element.get("alt"),
|
|
||||||
"type": element.name,
|
|
||||||
"description": self.find_closest_parent_with_useful_text(
|
|
||||||
element, **kwargs
|
|
||||||
),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
source_tags = element.find_all("source")
|
|
||||||
for source_tag in source_tags:
|
|
||||||
media[f"{element.name}s"].append(
|
|
||||||
{
|
|
||||||
"src": source_tag.get("src"),
|
|
||||||
"alt": element.get("alt"),
|
|
||||||
"type": element.name,
|
|
||||||
"description": self.find_closest_parent_with_useful_text(
|
|
||||||
element, **kwargs
|
|
||||||
),
|
|
||||||
}
|
|
||||||
)
|
|
||||||
return True # Always keep video and audio elements
|
|
||||||
|
|
||||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
|
||||||
if kwargs.get("only_text", False):
|
|
||||||
element.replace_with(element.get_text())
|
|
||||||
|
|
||||||
try:
|
|
||||||
self.remove_unwanted_attributes(
|
|
||||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
|
||||||
)
|
|
||||||
except Exception as e:
|
|
||||||
# print('Error removing unwanted attributes:', str(e))
|
|
||||||
self._log(
|
|
||||||
"error",
|
|
||||||
message="Error removing unwanted attributes: {error}",
|
|
||||||
tag="SCRAPE",
|
|
||||||
params={"error": str(e)},
|
|
||||||
)
|
|
||||||
# Process children
|
|
||||||
for child in list(element.children):
|
|
||||||
if isinstance(child, NavigableString) and not isinstance(
|
|
||||||
child, Comment
|
|
||||||
):
|
|
||||||
if len(child.strip()) > 0:
|
|
||||||
keep_element = True
|
|
||||||
else:
|
|
||||||
if self._process_element(
|
|
||||||
url,
|
|
||||||
child,
|
|
||||||
media,
|
|
||||||
internal_links_dict,
|
|
||||||
external_links_dict,
|
|
||||||
**kwargs,
|
|
||||||
):
|
|
||||||
keep_element = True
|
|
||||||
|
|
||||||
# Check word count
|
|
||||||
word_count_threshold = kwargs.get(
|
|
||||||
"word_count_threshold", MIN_WORD_THRESHOLD
|
|
||||||
)
|
|
||||||
if not keep_element:
|
|
||||||
word_count = len(element.get_text(strip=True).split())
|
|
||||||
keep_element = word_count >= word_count_threshold
|
|
||||||
|
|
||||||
if not keep_element:
|
|
||||||
element.decompose()
|
|
||||||
|
|
||||||
return keep_element
|
|
||||||
except Exception as e:
|
|
||||||
# print('Error processing element:', str(e))
|
|
||||||
self._log(
|
|
||||||
"error",
|
|
||||||
message="Error processing element: {error}",
|
|
||||||
tag="SCRAPE",
|
|
||||||
params={"error": str(e)},
|
|
||||||
)
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _scrap(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
html: str,
|
|
||||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
|
||||||
css_selector: str = None,
|
|
||||||
target_elements: List[str] = None,
|
|
||||||
**kwargs,
|
|
||||||
) -> Dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Extract content from HTML using BeautifulSoup.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url (str): The URL of the page to scrape.
|
|
||||||
html (str): The HTML content of the page to scrape.
|
|
||||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
|
||||||
css_selector (str): The CSS selector to use for content extraction.
|
|
||||||
**kwargs: Additional keyword arguments.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
dict: A dictionary containing the extracted content.
|
|
||||||
"""
|
|
||||||
success = True
|
|
||||||
if not html:
|
|
||||||
return None
|
|
||||||
|
|
||||||
parser_type = kwargs.get("parser", "lxml")
|
|
||||||
soup = BeautifulSoup(html, parser_type)
|
|
||||||
body = soup.body
|
|
||||||
if body is None:
|
|
||||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
|
||||||
base_domain = get_base_domain(url)
|
|
||||||
|
|
||||||
# Early removal of all images if exclude_all_images is set
|
|
||||||
# This happens before any processing to minimize memory usage
|
|
||||||
if kwargs.get("exclude_all_images", False):
|
|
||||||
for img in body.find_all('img'):
|
|
||||||
img.decompose()
|
|
||||||
|
|
||||||
try:
|
|
||||||
meta = extract_metadata("", soup)
|
|
||||||
except Exception as e:
|
|
||||||
self._log(
|
|
||||||
"error",
|
|
||||||
message="Error extracting metadata: {error}",
|
|
||||||
tag="SCRAPE",
|
|
||||||
params={"error": str(e)},
|
|
||||||
)
|
|
||||||
meta = {}
|
|
||||||
|
|
||||||
# Handle tag-based removal first - faster than CSS selection
|
|
||||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
|
||||||
if excluded_tags:
|
|
||||||
for element in body.find_all(lambda tag: tag.name in excluded_tags):
|
|
||||||
element.extract()
|
|
||||||
|
|
||||||
# Handle CSS selector-based removal
|
|
||||||
excluded_selector = kwargs.get("excluded_selector", "")
|
|
||||||
if excluded_selector:
|
|
||||||
is_single_selector = (
|
|
||||||
"," not in excluded_selector and " " not in excluded_selector
|
|
||||||
)
|
|
||||||
if is_single_selector:
|
|
||||||
while element := body.select_one(excluded_selector):
|
|
||||||
element.extract()
|
|
||||||
else:
|
|
||||||
for element in body.select(excluded_selector):
|
|
||||||
element.extract()
|
|
||||||
|
|
||||||
content_element = None
|
|
||||||
if target_elements:
|
|
||||||
try:
|
|
||||||
for_content_targeted_element = []
|
|
||||||
for target_element in target_elements:
|
|
||||||
for_content_targeted_element.extend(body.select(target_element))
|
|
||||||
content_element = soup.new_tag("div")
|
|
||||||
for el in for_content_targeted_element:
|
|
||||||
content_element.append(copy.deepcopy(el))
|
|
||||||
except Exception as e:
|
|
||||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
content_element = body
|
|
||||||
|
|
||||||
kwargs["exclude_social_media_domains"] = set(
|
|
||||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
|
||||||
)
|
|
||||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
|
||||||
if kwargs.get("exclude_social_media_links", False):
|
|
||||||
kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
|
|
||||||
kwargs["exclude_social_media_domains"]
|
|
||||||
)
|
|
||||||
|
|
||||||
result_obj = self.process_element(
|
|
||||||
url,
|
|
||||||
body,
|
|
||||||
word_count_threshold=word_count_threshold,
|
|
||||||
base_domain=base_domain,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
links = {"internal": [], "external": []}
|
|
||||||
media = result_obj["media"]
|
|
||||||
internal_links_dict = result_obj["internal_links_dict"]
|
|
||||||
external_links_dict = result_obj["external_links_dict"]
|
|
||||||
|
|
||||||
# Update the links dictionary with unique links
|
|
||||||
links["internal"] = list(internal_links_dict.values())
|
|
||||||
links["external"] = list(external_links_dict.values())
|
|
||||||
|
|
||||||
# Extract head content for links if configured
|
|
||||||
link_preview_config = kwargs.get("link_preview_config")
|
|
||||||
if link_preview_config is not None:
|
|
||||||
try:
|
|
||||||
import asyncio
|
|
||||||
from .link_preview import LinkPreview
|
|
||||||
from .models import Links, Link
|
|
||||||
|
|
||||||
verbose = link_preview_config.verbose
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
|
||||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
|
||||||
|
|
||||||
# Convert dict links to Link objects
|
|
||||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
|
||||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
|
||||||
links_obj = Links(internal=internal_links, external=external_links)
|
|
||||||
|
|
||||||
# Create a config object for LinkPreview
|
|
||||||
class TempCrawlerRunConfig:
|
|
||||||
def __init__(self, link_config, score_links):
|
|
||||||
self.link_preview_config = link_config
|
|
||||||
self.score_links = score_links
|
|
||||||
|
|
||||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
|
||||||
|
|
||||||
# Extract head content (run async operation in sync context)
|
|
||||||
async def extract_links():
|
|
||||||
async with LinkPreview(self.logger) as extractor:
|
|
||||||
return await extractor.extract_link_heads(links_obj, config)
|
|
||||||
|
|
||||||
# Run the async operation
|
|
||||||
try:
|
|
||||||
# Check if we're already in an async context
|
|
||||||
loop = asyncio.get_running_loop()
|
|
||||||
# If we're in an async context, we need to run in a thread
|
|
||||||
import concurrent.futures
|
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
||||||
future = executor.submit(asyncio.run, extract_links())
|
|
||||||
updated_links = future.result()
|
|
||||||
except RuntimeError:
|
|
||||||
# No running loop, we can use asyncio.run directly
|
|
||||||
updated_links = asyncio.run(extract_links())
|
|
||||||
|
|
||||||
# Convert back to dict format
|
|
||||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
|
||||||
links["external"] = [link.dict() for link in updated_links.external]
|
|
||||||
|
|
||||||
if verbose:
|
|
||||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
|
||||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
|
||||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
|
||||||
params={
|
|
||||||
"internal_success": successful_internal,
|
|
||||||
"internal_total": len(updated_links.internal),
|
|
||||||
"external_success": successful_external,
|
|
||||||
"external_total": len(updated_links.external)
|
|
||||||
}, tag="LINK_EXTRACT")
|
|
||||||
else:
|
|
||||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
|
||||||
# Continue with original links if extraction fails
|
|
||||||
|
|
||||||
# # Process images using ThreadPoolExecutor
|
|
||||||
imgs = body.find_all("img")
|
|
||||||
|
|
||||||
media["images"] = [
|
|
||||||
img
|
|
||||||
for result in (
|
|
||||||
self.process_image(img, url, i, len(imgs), **kwargs)
|
|
||||||
for i, img in enumerate(imgs)
|
|
||||||
)
|
|
||||||
if result is not None
|
|
||||||
for img in result
|
|
||||||
]
|
|
||||||
|
|
||||||
# Process tables if not excluded
|
|
||||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
|
||||||
if 'table' not in excluded_tags:
|
|
||||||
tables = body.find_all('table')
|
|
||||||
for table in tables:
|
|
||||||
if self.is_data_table(table, **kwargs):
|
|
||||||
table_data = self.extract_table_data(table)
|
|
||||||
media["tables"].append(table_data)
|
|
||||||
|
|
||||||
body = self.flatten_nested_elements(body)
|
|
||||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
|
||||||
for img in imgs:
|
|
||||||
src = img.get("src", "")
|
|
||||||
if base64_pattern.match(src):
|
|
||||||
# Replace base64 data with empty string
|
|
||||||
img["src"] = base64_pattern.sub("", src)
|
|
||||||
|
|
||||||
str_body = ""
|
|
||||||
try:
|
|
||||||
str_body = content_element.encode_contents().decode("utf-8")
|
|
||||||
except Exception:
|
|
||||||
# Reset body to the original HTML
|
|
||||||
success = False
|
|
||||||
body = BeautifulSoup(html, "html.parser")
|
|
||||||
|
|
||||||
# Create a new div with a special ID
|
|
||||||
error_div = body.new_tag("div", id="crawl4ai_error_message")
|
|
||||||
error_div.string = """
|
|
||||||
Crawl4AI Error: This page is not fully supported.
|
|
||||||
|
|
||||||
Possible reasons:
|
|
||||||
1. The page may have restrictions that prevent crawling.
|
|
||||||
2. The page might not be fully loaded.
|
|
||||||
|
|
||||||
Suggestions:
|
|
||||||
- Try calling the crawl function with these parameters:
|
|
||||||
magic=True,
|
|
||||||
- Set headless=False to visualize what's happening on the page.
|
|
||||||
|
|
||||||
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Append the error div to the body
|
|
||||||
body.append(error_div)
|
|
||||||
str_body = body.encode_contents().decode("utf-8")
|
|
||||||
|
|
||||||
print(
|
|
||||||
"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
|
|
||||||
)
|
|
||||||
self._log(
|
|
||||||
"error",
|
|
||||||
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
|
|
||||||
tag="SCRAPE",
|
|
||||||
)
|
|
||||||
|
|
||||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
|
||||||
|
|
||||||
return {
|
|
||||||
"cleaned_html": cleaned_html,
|
|
||||||
"success": success,
|
|
||||||
"media": media,
|
|
||||||
"links": links,
|
|
||||||
"metadata": meta,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|
||||||
def __init__(self, logger=None):
|
|
||||||
super().__init__(logger)
|
|
||||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
|
||||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
|
||||||
|
|
||||||
def _process_element(
|
def _process_element(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
@@ -1145,10 +270,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
link_data["intrinsic_score"] = intrinsic_score
|
link_data["intrinsic_score"] = intrinsic_score
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fail gracefully - assign default score
|
# Fail gracefully - assign default score
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
else:
|
else:
|
||||||
# No scoring enabled - assign infinity (all links equal priority)
|
# No scoring enabled - assign 0 (all links equal priority)
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
|
|
||||||
is_external = is_external_url(normalized_href, base_domain)
|
is_external = is_external_url(normalized_href, base_domain)
|
||||||
if is_external:
|
if is_external:
|
||||||
@@ -1862,3 +987,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
"links": {"internal": [], "external": []},
|
"links": {"internal": [], "external": []},
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Backward compatibility alias
|
||||||
|
WebScrapingStrategy = LXMLWebScrapingStrategy
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from .extraction_strategy import *
|
|||||||
from .crawler_strategy import *
|
from .crawler_strategy import *
|
||||||
from typing import List
|
from typing import List
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from .content_scraping_strategy import WebScrapingStrategy
|
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
|
||||||
from .config import *
|
from .config import *
|
||||||
import warnings
|
import warnings
|
||||||
import json
|
import json
|
||||||
|
|||||||
79
crawl4ai/memory_utils.py
Normal file
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
import psutil
|
||||||
|
import platform
|
||||||
|
import subprocess
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def get_true_available_memory_gb() -> float:
    """Return the amount of memory available for new work, in GiB.

    On macOS (``platform.system() == 'Darwin'``) the value is computed from
    ``vm_stat`` as free + inactive + speculative + purgeable pages, multiplied
    by the page size that ``vm_stat`` prints in its header line. On every other
    platform, and whenever ``vm_stat`` cannot be run or yields nothing usable,
    ``psutil.virtual_memory().available`` is used instead.

    Returns:
        float: Available memory in GiB (bytes / 1024**3).
    """
    import re  # local import: only the macOS branch needs it

    vm = psutil.virtual_memory()
    # psutil's own estimate: the answer off macOS, and the fallback on macOS.
    fallback_gb = vm.available / (1024**3)

    if platform.system() != 'Darwin':
        # For Windows and Linux, psutil.available is accurate
        return fallback_gb

    # vm_stat row label -> counter name; these page classes are summed as "available".
    counters = {
        'Pages free:': 'free',
        'Pages inactive:': 'inactive',
        'Pages speculative:': 'speculative',
        'Pages purgeable:': 'purgeable',
    }

    try:
        # check=True turns a failing vm_stat into an exception so we fall back
        # to psutil instead of silently computing 0 pages.
        result = subprocess.run(['vm_stat'], capture_output=True, text=True, check=True)

        # vm_stat states the page size in its header ("... page size of N bytes").
        # Prefer that over a constant; 16384 (the value the previous
        # implementation always assumed) is kept only as a last resort.
        page_size = 16384
        pages = {}

        for line in result.stdout.split('\n'):
            header = re.search(r'page size of (\d+) bytes', line)
            if header:
                page_size = int(header.group(1))
                continue
            for label, key in counters.items():
                if label in line:
                    # Values are printed with a trailing '.', e.g. "Pages free:   12345."
                    pages[key] = int(line.split()[-1].rstrip('.'))
                    break

        if not pages:
            # Nothing recognisable was parsed; reporting 0 GiB would be
            # misleading, so defer to psutil.
            return fallback_gb

        return (sum(pages.values()) * page_size) / (1024**3)
    except Exception:
        # Best-effort: any failure running or parsing vm_stat falls back to psutil.
        return fallback_gb
|
||||||
|
|
||||||
|
|
||||||
|
def get_true_memory_usage_percent() -> float:
    """Compute the share of total memory in use, as a percentage.

    "In use" is total memory (from ``psutil.virtual_memory().total``) minus
    the value returned by ``get_true_available_memory_gb()``.

    Returns:
        float: Memory usage percentage, clamped to the range 0-100.
    """
    gib = 1024**3
    total_gb = psutil.virtual_memory().total / gib
    used_gb = total_gb - get_true_available_memory_gb()

    # Percentage of memory that is not available.
    used_percent = 100.0 * used_gb / total_gb

    # Clamp: the available figure can exceed or undershoot psutil's total.
    return max(0.0, min(100.0, used_percent))
|
||||||
|
|
||||||
|
|
||||||
|
def get_memory_stats() -> Tuple[float, float, float]:
    """Collect the three headline memory figures in one call.

    Returns:
        Tuple[float, float, float]: ``(used_percent, available_gb, total_gb)``
        where ``total_gb`` comes from ``psutil.virtual_memory().total``,
        ``available_gb`` from ``get_true_available_memory_gb()`` and
        ``used_percent`` from ``get_true_memory_usage_percent()``.
    """
    total_gb = psutil.virtual_memory().total / (1024**3)
    available_gb = get_true_available_memory_gb()
    return get_true_memory_usage_percent(), available_gb, total_gb
|
||||||
@@ -23,8 +23,9 @@ SeedingConfig = Union['SeedingConfigType']
|
|||||||
|
|
||||||
# Content scraping types
|
# Content scraping types
|
||||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
|
||||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||||
|
# Backward compatibility alias
|
||||||
|
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||||
|
|
||||||
# Proxy types
|
# Proxy types
|
||||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||||
@@ -114,7 +115,6 @@ if TYPE_CHECKING:
|
|||||||
# Content scraping imports
|
# Content scraping imports
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||||
WebScrapingStrategy as WebScrapingStrategyType,
|
|
||||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1517,8 +1517,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
|||||||
head = head[0]
|
head = head[0]
|
||||||
|
|
||||||
# Title - using XPath
|
# Title - using XPath
|
||||||
|
# title = head.xpath(".//title/text()")
|
||||||
|
# metadata["title"] = title[0].strip() if title else None
|
||||||
|
|
||||||
|
# === Title Extraction - New Approach ===
|
||||||
|
# Attempt to extract <title> using XPath
|
||||||
title = head.xpath(".//title/text()")
|
title = head.xpath(".//title/text()")
|
||||||
metadata["title"] = title[0].strip() if title else None
|
title = title[0] if title else None
|
||||||
|
|
||||||
|
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||||
|
if not title:
|
||||||
|
title_el = doc.find(".//title")
|
||||||
|
title = title_el.text if title_el is not None else None
|
||||||
|
|
||||||
|
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||||
|
if not title:
|
||||||
|
title_candidates = (
|
||||||
|
doc.xpath("//meta[@property='og:title']/@content") or
|
||||||
|
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||||
|
)
|
||||||
|
title = title_candidates[0] if title_candidates else None
|
||||||
|
|
||||||
|
# Strip and assign title
|
||||||
|
metadata["title"] = title.strip() if title else None
|
||||||
|
|
||||||
# Meta description - using XPath with multiple attribute conditions
|
# Meta description - using XPath with multiple attribute conditions
|
||||||
description = head.xpath('.//meta[@name="description"]/@content')
|
description = head.xpath('.//meta[@name="description"]/@content')
|
||||||
@@ -3342,7 +3363,13 @@ async def get_text_embeddings(
|
|||||||
# Default: use sentence-transformers
|
# Default: use sentence-transformers
|
||||||
else:
|
else:
|
||||||
# Lazy load to avoid importing heavy libraries unless needed
|
# Lazy load to avoid importing heavy libraries unless needed
|
||||||
from sentence_transformers import SentenceTransformer
|
try:
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"sentence-transformers is required for local embeddings. "
|
||||||
|
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||||
|
)
|
||||||
|
|
||||||
# Cache the model in function attribute to avoid reloading
|
# Cache the model in function attribute to avoid reloading
|
||||||
if not hasattr(get_text_embeddings, '_models'):
|
if not hasattr(get_text_embeddings, '_models'):
|
||||||
|
|||||||
@@ -6,3 +6,8 @@ GROQ_API_KEY=your_groq_key_here
|
|||||||
TOGETHER_API_KEY=your_together_key_here
|
TOGETHER_API_KEY=your_together_key_here
|
||||||
MISTRAL_API_KEY=your_mistral_key_here
|
MISTRAL_API_KEY=your_mistral_key_here
|
||||||
GEMINI_API_TOKEN=your_gemini_key_here
|
GEMINI_API_TOKEN=your_gemini_key_here
|
||||||
|
|
||||||
|
# Optional: Override the default LLM provider
|
||||||
|
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||||
|
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||||
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
@@ -154,6 +154,29 @@ cp deploy/docker/.llm.env.example .llm.env
|
|||||||
# Now edit .llm.env and add your API keys
|
# Now edit .llm.env and add your API keys
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Flexible LLM Provider Configuration:**
|
||||||
|
|
||||||
|
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||||
|
|
||||||
|
1. **Environment Variable**: Set `LLM_PROVIDER` to override the `config.yml` default (a per-request `provider` takes precedence over it)
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||||
|
# Or in your .llm.env file:
|
||||||
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **API Request Parameter**: Specify provider per request
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"provider": "groq/mixtral-8x7b"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||||
|
|
||||||
|
The system automatically selects the appropriate API key based on the provider.
|
||||||
|
|
||||||
#### 3. Build and Run with Compose
|
#### 3. Build and Run with Compose
|
||||||
|
|
||||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||||
@@ -668,7 +691,7 @@ app:
|
|||||||
|
|
||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini"
|
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||||
api_key_env: "OPENAI_API_KEY"
|
api_key_env: "OPENAI_API_KEY"
|
||||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
from uuid import uuid4
|
from uuid import uuid4
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from base64 import b64encode
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Optional, AsyncGenerator
|
from typing import Optional, AsyncGenerator
|
||||||
@@ -39,7 +40,9 @@ from utils import (
|
|||||||
get_base_url,
|
get_base_url,
|
||||||
is_task_id,
|
is_task_id,
|
||||||
should_cleanup_task,
|
should_cleanup_task,
|
||||||
decode_redis_hash
|
decode_redis_hash,
|
||||||
|
get_llm_api_key,
|
||||||
|
validate_llm_provider
|
||||||
)
|
)
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
@@ -88,10 +91,12 @@ async def handle_llm_qa(
|
|||||||
|
|
||||||
Answer:"""
|
Answer:"""
|
||||||
|
|
||||||
|
# api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||||
|
|
||||||
response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
provider=config["llm"]["provider"],
|
provider=config["llm"]["provider"],
|
||||||
prompt_with_variables=prompt,
|
prompt_with_variables=prompt,
|
||||||
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
api_token=get_llm_api_key(config)
|
||||||
)
|
)
|
||||||
|
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
@@ -109,19 +114,23 @@ async def process_llm_extraction(
|
|||||||
url: str,
|
url: str,
|
||||||
instruction: str,
|
instruction: str,
|
||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0"
|
cache: str = "0",
|
||||||
|
provider: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
try:
|
try:
|
||||||
# If config['llm'] has api_key then ignore the api_key_env
|
# Validate provider
|
||||||
api_key = ""
|
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||||
if "api_key" in config["llm"]:
|
if not is_valid:
|
||||||
api_key = config["llm"]["api_key"]
|
await redis.hset(f"task:{task_id}", mapping={
|
||||||
else:
|
"status": TaskStatus.FAILED,
|
||||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
"error": error_msg
|
||||||
|
})
|
||||||
|
return
|
||||||
|
api_key = get_llm_api_key(config, provider)
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=api_key
|
api_token=api_key
|
||||||
),
|
),
|
||||||
instruction=instruction,
|
instruction=instruction,
|
||||||
@@ -168,10 +177,19 @@ async def handle_markdown_request(
|
|||||||
filter_type: FilterType,
|
filter_type: FilterType,
|
||||||
query: Optional[str] = None,
|
query: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None
|
config: Optional[dict] = None,
|
||||||
|
provider: Optional[str] = None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Handle markdown generation requests."""
|
"""Handle markdown generation requests."""
|
||||||
try:
|
try:
|
||||||
|
# Validate provider if using LLM filter
|
||||||
|
if filter_type == FilterType.LLM:
|
||||||
|
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||||
|
if not is_valid:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=status.HTTP_400_BAD_REQUEST,
|
||||||
|
detail=error_msg
|
||||||
|
)
|
||||||
decoded_url = unquote(url)
|
decoded_url = unquote(url)
|
||||||
if not decoded_url.startswith(('http://', 'https://')):
|
if not decoded_url.startswith(('http://', 'https://')):
|
||||||
decoded_url = 'https://' + decoded_url
|
decoded_url = 'https://' + decoded_url
|
||||||
@@ -184,8 +202,8 @@ async def handle_markdown_request(
|
|||||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||||
FilterType.LLM: LLMContentFilter(
|
FilterType.LLM: LLMContentFilter(
|
||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
api_token=get_llm_api_key(config, provider),
|
||||||
),
|
),
|
||||||
instruction=query or "Extract main content"
|
instruction=query or "Extract main content"
|
||||||
)
|
)
|
||||||
@@ -229,7 +247,8 @@ async def handle_llm_request(
|
|||||||
query: Optional[str] = None,
|
query: Optional[str] = None,
|
||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None
|
config: Optional[dict] = None,
|
||||||
|
provider: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Handle LLM extraction requests."""
|
"""Handle LLM extraction requests."""
|
||||||
base_url = get_base_url(request)
|
base_url = get_base_url(request)
|
||||||
@@ -259,7 +278,8 @@ async def handle_llm_request(
|
|||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
base_url,
|
base_url,
|
||||||
config
|
config,
|
||||||
|
provider
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -303,7 +323,8 @@ async def create_new_task(
|
|||||||
schema: Optional[str],
|
schema: Optional[str],
|
||||||
cache: str,
|
cache: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict
|
config: dict,
|
||||||
|
provider: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Create and initialize a new task."""
|
"""Create and initialize a new task."""
|
||||||
decoded_url = unquote(input_path)
|
decoded_url = unquote(input_path)
|
||||||
@@ -327,7 +348,8 @@ async def create_new_task(
|
|||||||
decoded_url,
|
decoded_url,
|
||||||
query,
|
query,
|
||||||
schema,
|
schema,
|
||||||
cache
|
cache,
|
||||||
|
provider
|
||||||
)
|
)
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
@@ -371,6 +393,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
|||||||
server_memory_mb = _get_memory_mb()
|
server_memory_mb = _get_memory_mb()
|
||||||
result_dict = result.model_dump()
|
result_dict = result.model_dump()
|
||||||
result_dict['server_memory_mb'] = server_memory_mb
|
result_dict['server_memory_mb'] = server_memory_mb
|
||||||
|
# If PDF exists, encode it to base64
|
||||||
|
if result_dict.get('pdf') is not None:
|
||||||
|
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||||
yield data.encode('utf-8')
|
yield data.encode('utf-8')
|
||||||
@@ -444,9 +469,18 @@ async def handle_crawl_request(
|
|||||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||||
|
|
||||||
|
# Process results to handle PDF bytes
|
||||||
|
processed_results = []
|
||||||
|
for result in results:
|
||||||
|
result_dict = result.model_dump()
|
||||||
|
# If PDF exists, encode it to base64
|
||||||
|
if result_dict.get('pdf') is not None:
|
||||||
|
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||||
|
processed_results.append(result_dict)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"results": [result.model_dump() for result in results],
|
"results": processed_results,
|
||||||
"server_processing_time_s": end_time - start_time,
|
"server_processing_time_s": end_time - start_time,
|
||||||
"server_memory_delta_mb": mem_delta_mb,
|
"server_memory_delta_mb": mem_delta_mb,
|
||||||
"server_peak_memory_mb": peak_mem_mb
|
"server_peak_memory_mb": peak_mem_mb
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ class LlmJobPayload(BaseModel):
|
|||||||
q: str
|
q: str
|
||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
|
provider: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class CrawlJobPayload(BaseModel):
|
class CrawlJobPayload(BaseModel):
|
||||||
@@ -61,6 +62,7 @@ async def llm_job_enqueue(
|
|||||||
schema=payload.schema,
|
schema=payload.schema,
|
||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
|
provider=payload.provider,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ class MarkdownRequest(BaseModel):
|
|||||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||||
|
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||||
|
|
||||||
|
|
||||||
class RawCode(BaseModel):
|
class RawCode(BaseModel):
|
||||||
|
|||||||
@@ -241,7 +241,7 @@ async def get_markdown(
|
|||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
400, "URL must be absolute and start with http/https")
|
400, "URL must be absolute and start with http/https")
|
||||||
markdown = await handle_markdown_request(
|
markdown = await handle_markdown_request(
|
||||||
body.url, body.f, body.q, body.c, config
|
body.url, body.f, body.q, body.c, config, body.provider
|
||||||
)
|
)
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"url": body.url,
|
"url": body.url,
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
import dns.resolver
|
import dns.resolver
|
||||||
import logging
|
import logging
|
||||||
import yaml
|
import yaml
|
||||||
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -19,10 +20,24 @@ class FilterType(str, Enum):
|
|||||||
LLM = "llm"
|
LLM = "llm"
|
||||||
|
|
||||||
def load_config() -> Dict:
|
def load_config() -> Dict:
|
||||||
"""Load and return application configuration."""
|
"""Load and return application configuration with environment variable overrides."""
|
||||||
config_path = Path(__file__).parent / "config.yml"
|
config_path = Path(__file__).parent / "config.yml"
|
||||||
with open(config_path, "r") as config_file:
|
with open(config_path, "r") as config_file:
|
||||||
return yaml.safe_load(config_file)
|
config = yaml.safe_load(config_file)
|
||||||
|
|
||||||
|
# Override LLM provider from environment if set
|
||||||
|
llm_provider = os.environ.get("LLM_PROVIDER")
|
||||||
|
if llm_provider:
|
||||||
|
config["llm"]["provider"] = llm_provider
|
||||||
|
logging.info(f"LLM provider overridden from environment: {llm_provider}")
|
||||||
|
|
||||||
|
# Also support a direct API key from LLM_API_KEY when config.yml does not define llm.api_key
|
||||||
|
llm_api_key = os.environ.get("LLM_API_KEY")
|
||||||
|
if llm_api_key and "api_key" not in config["llm"]:
|
||||||
|
config["llm"]["api_key"] = llm_api_key
|
||||||
|
logging.info("LLM API key loaded from LLM_API_KEY environment variable")
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
def setup_logging(config: Dict) -> None:
|
def setup_logging(config: Dict) -> None:
|
||||||
"""Configure application logging."""
|
"""Configure application logging."""
|
||||||
@@ -56,6 +71,52 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
    """Resolve the API key to use for LLM calls.

    Resolution order:
      1. ``config["llm"]["api_key"]`` when it holds a non-empty value.
      2. The environment variable named by ``config["llm"]["api_key_env"]``.

    Args:
        config: The application configuration dictionary; must contain an
            ``"llm"`` mapping.
        provider: Optional provider override (e.g., "openai/gpt-4"). Accepted
            so callers can pass it through, but the lookup does not vary by
            provider.

    Returns:
        The resolved API key, or an empty string if none is found.
    """
    llm_config = config["llm"]

    # A directly configured key wins, but only if it is actually set: an empty
    # or null `api_key` entry must not mask the environment variable.
    direct_key = llm_config.get("api_key")
    if direct_key:
        return direct_key

    # `api_key_env` may be missing or null; os.environ.get() needs a str key.
    env_name = llm_config.get("api_key_env") or ""
    return os.environ.get(env_name, "")
|
||||||
|
|
||||||
|
|
||||||
|
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
    """Check that an API key can be resolved for the LLM provider in use.

    Args:
        config: The application configuration dictionary
        provider: Optional provider override (e.g., "openai/gpt-4"); when
            empty, ``config["llm"]["provider"]`` is used.

    Returns:
        Tuple of (is_valid, error_message); the message is empty on success.
    """
    # Request-level override first, configured default otherwise.
    effective_provider = provider or config["llm"]["provider"]

    if get_llm_api_key(config, effective_provider):
        return True, ""

    return False, f"No API key found for provider '{effective_provider}'. Please set the appropriate environment variable."
|
||||||
|
|
||||||
|
|
||||||
def verify_email_domain(email: str) -> bool:
|
def verify_email_domain(email: str) -> bool:
|
||||||
try:
|
try:
|
||||||
domain = email.split('@')[1]
|
domain = email.split('@')[1]
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ x-base-config: &base-config
|
|||||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||||
|
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||||
volumes:
|
volumes:
|
||||||
- /dev/shm:/dev/shm # Chromium performance
|
- /dev/shm:/dev/shm # Chromium performance
|
||||||
deploy:
|
deploy:
|
||||||
|
|||||||
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom adaptive parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
|
||||||
max_depth=5, # Maximum crawl depth
|
|
||||||
max_pages=20, # Maximum number of pages to crawl
|
|
||||||
top_k_links=3, # Number of top links to follow per page
|
|
||||||
strategy="statistical", # 'statistical' or 'embedding'
|
|
||||||
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
|
||||||
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
|
||||||
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize adaptive crawler with web crawler
|
# Configure adaptive crawler
|
||||||
async with AsyncWebCrawler() as crawler:
|
config = AdaptiveConfig(
|
||||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
|
max_pages=10,
|
||||||
# Crawl and learn patterns
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
state = await adaptive_crawler.digest(
|
top_k_links=3, # Follow top 3 links per page
|
||||||
start_url="https://news.example.com/article/12345",
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
query="latest news articles and content"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access results and confidence
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
|
||||||
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
include_internal=True,
|
link_config = LinkPreviewConfig(
|
||||||
include_external=False,
|
include_internal=True,
|
||||||
max_links=10,
|
include_external=False,
|
||||||
concurrency=5,
|
max_links=10,
|
||||||
query="python tutorial", # For contextual scoring
|
concurrency=5,
|
||||||
score_threshold=0.3,
|
query="python tutorial", # For contextual scoring
|
||||||
verbose=True
|
score_threshold=0.3,
|
||||||
)
|
verbose=True
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True, # Enable intrinsic scoring
|
|
||||||
cache_mode=CacheMode.BYPASS
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
if result.success and result.links:
|
if result.success and result.links:
|
||||||
# Get scored links
|
for link in result.links.get("internal", []):
|
||||||
internal_links = result.links.get("internal", [])
|
text = link.get('text', 'No text')[:40]
|
||||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
print(
|
||||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
# Create a scoring table
|
asyncio.run(main())
|
||||||
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
|
||||||
table.add_column("Link Text", style="cyan", width=40)
|
|
||||||
table.add_column("Intrinsic Score", justify="center")
|
|
||||||
table.add_column("Contextual Score", justify="center")
|
|
||||||
table.add_column("Total Score", justify="center", style="bold green")
|
|
||||||
|
|
||||||
for link in scored_links[:5]:
|
|
||||||
text = link.get('text', 'No text')[:40]
|
|
||||||
table.add_row(
|
|
||||||
text,
|
|
||||||
f"{link.get('intrinsic_score', 0):.1f}/10",
|
|
||||||
f"{link.get('contextual_score', 0):.2f}/1",
|
|
||||||
f"{link.get('total_score', 0):.3f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
console.print(table)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
@@ -223,58 +221,34 @@ console.print(table)
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="cc+sitemap", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
|
pattern="*python*", # URL pattern filter
|
||||||
|
extract_head=True, # Get metadata
|
||||||
|
query="python tutorial", # For relevance scoring
|
||||||
|
scoring_method="bm25",
|
||||||
|
score_threshold=0.2,
|
||||||
|
max_urls=10
|
||||||
|
)
|
||||||
|
|
||||||
# Filtering
|
print("Discovering Python async tutorial URLs...")
|
||||||
pattern="*/product/*", # URL pattern matching
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
|
|
||||||
# Validation
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
live_check=True, # Verify URLs are alive
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
max_urls=50, # Stop at 50 URLs
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
# Performance
|
asyncio.run(main())
|
||||||
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
|
||||||
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
console.print("Discovering URLs from Python docs...")
|
|
||||||
urls = await seeder.urls("docs.python.org", seeding_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # BM25 scoring method
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['relevance_score']:.3f}")
|
|
||||||
print(f"Title: {url_data['head_data']['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
|
|||||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||||
|
|
||||||
|
*July 17, 2025 • 2 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
A small maintenance release that removes unused code and improves documentation.
|
||||||
|
|
||||||
|
## 🎯 What's Changed
|
||||||
|
|
||||||
|
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||||
|
- **Updated documentation** with better examples and parameter explanations
|
||||||
|
- **Fixed virtual scroll configuration** examples in docs
|
||||||
|
|
||||||
|
## 🧹 Code Cleanup
|
||||||
|
|
||||||
|
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Removed unused code:
|
||||||
|
from playwright_stealth import StealthConfig
|
||||||
|
stealth_config = StealthConfig(...) # This was never used
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📖 Documentation Updates
|
||||||
|
|
||||||
|
- Fixed adaptive crawling parameter examples
|
||||||
|
- Updated session management documentation
|
||||||
|
- Corrected virtual scroll configuration examples
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==0.7.1
|
||||||
|
```
|
||||||
|
|
||||||
|
No breaking changes - upgrade directly from v0.7.0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Questions? Issues?
|
||||||
|
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
303
docs/examples/demo_multi_config_clean.py
Normal file
303
docs/examples/demo_multi_config_clean.py
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
"""
|
||||||
|
🎯 Multi-Config URL Matching Demo
|
||||||
|
=================================
|
||||||
|
Learn how to use different crawler configurations for different URL patterns
|
||||||
|
in a single crawl batch with Crawl4AI's multi-config feature.
|
||||||
|
|
||||||
|
Part 1: Understanding URL Matching (Pattern Testing)
|
||||||
|
Part 2: Practical Example with Real Crawling
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
MatchMode
|
||||||
|
)
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
|
||||||
|
def print_section(title):
    """Print ``title`` framed above and below by a 60-character '=' rule.

    A blank line is emitted before the top rule and after the bottom rule.
    """
    rule = "=" * 60
    # One write produces the same text as three consecutive print() calls.
    print(f"\n{rule}\n{title}\n{rule}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def test_url_matching(config, test_urls, config_name):
|
||||||
|
"""Test URL matching for a config and show results"""
|
||||||
|
print(f"Config: {config_name}")
|
||||||
|
print(f"Matcher: {config.url_matcher}")
|
||||||
|
if hasattr(config, 'match_mode'):
|
||||||
|
print(f"Mode: {config.match_mode.value}")
|
||||||
|
print("-" * 40)
|
||||||
|
|
||||||
|
for url in test_urls:
|
||||||
|
matches = config.is_match(url)
|
||||||
|
symbol = "✓" if matches else "✗"
|
||||||
|
print(f"{symbol} {url}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# PART 1: Understanding URL Matching
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
def demo_part1_pattern_matching():
    """Part 1: show how ``url_matcher`` rules behave, without crawling.

    Builds six ``CrawlerRunConfig`` objects (string globs, functions, mixed
    lists, OR/AND modes) and reports, via ``test_url_matching``, which of a
    fixed set of sample URLs each one matches.
    """

    def announce(heading):
        # Sub-section heading followed by a 40-character rule.
        print(heading)
        print("-" * 40)

    print_section("PART 1: Understanding URL Matching")
    print("Let's explore different ways to match URLs with configs.\n")

    # The same sample URLs are checked against every config below.
    sample_urls = [
        "https://example.com/report.pdf",
        "https://example.com/data.json",
        "https://example.com/blog/post-1",
        "https://example.com/article/news",
        "https://api.example.com/v1/users",
        "https://example.com/about"
    ]

    # 1.1 A single glob string
    announce("1.1 Simple String Pattern Matching")
    only_pdf = CrawlerRunConfig(url_matcher="*.pdf")
    test_url_matching(only_pdf, sample_urls, "PDF Config")

    # 1.2 Several glob strings combined with OR
    announce("1.2 Multiple String Patterns (OR logic)")
    blog_like = CrawlerRunConfig(
        url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
        match_mode=MatchMode.OR,  # passed explicitly for clarity
    )
    test_url_matching(blog_like, sample_urls, "Blog/Article Config")

    # 1.3 A single predicate function
    announce("1.3 Function-based Matching")
    api_like = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or url.endswith('.json')
    )
    test_url_matching(api_like, sample_urls, "API Config")

    # 1.4 Several predicates combined with AND:
    #     HTTPS, contains 'api', and carries a version marker
    announce("1.4 Multiple Functions with AND Logic")
    secure_api = CrawlerRunConfig(
        url_matcher=[
            lambda url: url.startswith('https://'),
            lambda url: 'api' in url,
            lambda url: '/v' in url,  # version indicator
        ],
        match_mode=MatchMode.AND,
    )
    test_url_matching(secure_api, sample_urls, "Secure API Config")

    # 1.5 Glob string and predicate in one list: JSON files OR any API endpoint
    announce("1.5 Mixed Patterns: String + Function")
    json_or_api = CrawlerRunConfig(
        url_matcher=[
            "*.json",                  # string pattern
            lambda url: 'api' in url,  # function
        ],
        match_mode=MatchMode.OR,
    )
    test_url_matching(json_or_api, sample_urls, "JSON or API Config")

    # 1.6 Mixed list under AND:
    #     HTTPS AND .com domain AND (blog OR article) AND NOT a PDF
    announce("1.6 Complex Matcher: Mixed Types with AND Logic")
    complex_rules = CrawlerRunConfig(
        url_matcher=[
            lambda url: url.startswith('https://'),  # HTTPS check
            "*.com/*",                               # .com domain
            lambda url: any(pattern in url for pattern in ['/blog/', '/article/']),  # blog OR article
            lambda url: not url.endswith('.pdf'),    # not a PDF
        ],
        match_mode=MatchMode.AND,
    )
    test_url_matching(complex_rules, sample_urls, "Complex Mixed Config")

    print("\n✅ Key Takeaway: First matching config wins when passed to arun_many()!")
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================================================================
|
||||||
|
# PART 2: Practical Multi-URL Crawling
|
||||||
|
# ==============================================================================
|
||||||
|
|
||||||
|
async def demo_part2_practical_crawling():
    """Part 2: crawl a mixed batch of URLs, each picking its own config.

    Builds five ``CrawlerRunConfig`` objects with different ``url_matcher``
    rules, passes them together with six URLs to ``arun_many()``, then prints
    a per-URL summary followed by a list of the benefits demonstrated.
    """

    def describe_config(url):
        # Display-only guess, derived from the URL text, of which config applied.
        if url.endswith('.pdf'):
            return "PDF Strategy"
        if any(pattern in url for pattern in ['blog', 'python.org']) and 'docs' not in url:
            return "Blog + Content Filter"
        if 'github.com' in url:
            return "JavaScript Enabled"
        if 'httpbin.org' in url or url.endswith('.json'):
            return "Mixed Matcher (API)"
        if 'docs.python.org' in url:
            return "Complex Matcher (Secure Docs)"
        return "Default"

    print_section("PART 2: Practical Multi-URL Crawling")
    print("Now let's see multi-config in action with real URLs.\n")

    # One specialised config per content type.
    pdf_rule = CrawlerRunConfig(
        # Config 1: PDF documents - only URLs ending with .pdf
        url_matcher="*.pdf",
        scraping_strategy=PDFContentScrapingStrategy()
    )
    blog_rule = CrawlerRunConfig(
        # Config 2: blog/article pages with content filtering
        url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.48)
        )
    )
    js_rule = CrawlerRunConfig(
        # Config 3: dynamic pages requiring JavaScript
        url_matcher=lambda url: 'github.com' in url,
        js_code="window.scrollTo(0, 500);"  # scroll to load content
    )
    api_rule = CrawlerRunConfig(
        # Config 4: mixed matcher - API endpoints (string OR function)
        url_matcher=[
            "*.json",  # string pattern for JSON files
            lambda url: 'api' in url or 'httpbin.org' in url  # function for API endpoints
        ],
        match_mode=MatchMode.OR,
    )
    docs_rule = CrawlerRunConfig(
        # Config 5: complex matcher - secure documentation sites
        url_matcher=[
            lambda url: url.startswith('https://'),  # must be HTTPS
            "*.org/*",                               # .org domain
            lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']),  # has docs
            lambda url: not url.endswith(('.pdf', '.json'))  # not PDF or JSON
        ],
        match_mode=MatchMode.AND,
        # wait_for="css:.content, css:article"  # wait for content to load
    )
    configs = [
        pdf_rule,
        blog_rule,
        js_rule,
        api_rule,
        docs_rule,
        # Default config for everything else:
        # CrawlerRunConfig()  # no url_matcher means it matches everything (use it as fallback)
    ]

    # URLs to crawl, annotated with the config each is expected to select.
    urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # → PDF config
        "https://blog.python.org/",                 # → blog config with content filter
        "https://github.com/microsoft/playwright",  # → JS config
        "https://httpbin.org/json",                 # → mixed matcher config (API)
        "https://docs.python.org/3/reference/",     # → complex matcher config
        # → default config, but only if the fallback entry in `configs` is
        #   uncommented; otherwise expect `Error: No matching configuration`
        "https://www.w3schools.com/",
    ]

    print("URLs to crawl:")
    for position, target in enumerate(urls, 1):
        print(f"{position}. {target}")

    print("\nCrawling with appropriate config for each URL...\n")

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=urls, config=configs)

        # Display results
        print("Results:")
        print("-" * 60)

        for outcome in results:
            if not outcome.success:
                print(f"\n✗ {outcome.url}")
                print(f" Error: {outcome.error_message}")
                continue

            label = describe_config(outcome.url)

            print(f"\n✓ {outcome.url}")
            print(f" Config used: {label}")
            print(f" Content size: {len(outcome.markdown)} chars")

            # fit_markdown is only reported when present and non-empty
            # (it is what the content filter produces).
            fit = getattr(outcome.markdown, 'fit_markdown', None)
            if fit:
                print(f" Fit markdown size: {len(fit)} chars")
                reduction = (1 - len(fit) / len(outcome.markdown)) * 100
                print(f" Content reduced by: {reduction:.1f}%")

            # Extracted data is shown (truncated to 100 chars) when available.
            extracted = getattr(outcome, 'extracted_content', None)
            if extracted:
                print(f" Extracted data: {str(extracted)[:100]}...")

    print("\n" + "=" * 60)
    print("✅ Multi-config crawling complete!")
    print("\nBenefits demonstrated:")
    for benefit in (
        "- PDFs handled with specialized scraper",
        "- Blog content filtered for relevance",
        "- JavaScript executed only where needed",
        "- Mixed matchers (string + function) for flexible matching",
        "- Complex matchers for precise URL targeting",
        "- Each URL got optimal configuration automatically!",
    ):
        print(benefit)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run the demo: Part 1 (pattern matching), a pause, then Part 2 (crawling)."""

    banner = "\n".join((
        "",
        "🎯 Multi-Config URL Matching Demo",
        "=================================",
        "Learn how Crawl4AI can use different configurations",
        "for different URLs in a single batch.",
        "",
    ))
    print(banner)

    # Part 1: pattern matching (synchronous, no network access)
    demo_part1_pattern_matching()

    print("\nPress Enter to continue to Part 2...")
    try:
        input()
    except EOFError:
        # stdin is closed (non-interactive run): carry on without waiting
        pass

    # Part 2: practical crawling
    await demo_part2_practical_crawling()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
@@ -18,7 +18,7 @@ Usage:
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
|
|
||||||
async def basic_link_head_extraction():
|
async def basic_link_head_extraction():
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import time, re
|
import time, re
|
||||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||||
|
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
||||||
import time
|
import time
|
||||||
import functools
|
import functools
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
@@ -57,7 +58,7 @@ methods_to_profile = [
|
|||||||
|
|
||||||
|
|
||||||
# Apply decorators to both strategies
|
# Apply decorators to both strategies
|
||||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
for strategy, name in [(LXMLWebScrapingStrategy, "LXML")]:
|
||||||
for method in methods_to_profile:
|
for method in methods_to_profile:
|
||||||
apply_decorators(strategy, method, name)
|
apply_decorators(strategy, method, name)
|
||||||
|
|
||||||
@@ -85,7 +86,7 @@ def generate_large_html(n_elements=1000):
|
|||||||
|
|
||||||
def test_scraping():
|
def test_scraping():
|
||||||
# Initialize both scrapers
|
# Initialize both scrapers
|
||||||
original_scraper = WebScrapingStrategy()
|
original_scraper = LXMLWebScrapingStrategy()
|
||||||
selected_scraper = LXMLWebScrapingStrategy()
|
selected_scraper = LXMLWebScrapingStrategy()
|
||||||
|
|
||||||
# Generate test HTML
|
# Generate test HTML
|
||||||
|
|||||||
@@ -404,7 +404,182 @@ for result in results:
|
|||||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||||
```
|
```
|
||||||
|
|
||||||
## 6. Summary
|
## 6. URL-Specific Configurations
|
||||||
|
|
||||||
|
When crawling diverse content types, you often need different configurations for different URLs. For example:
|
||||||
|
- PDFs need specialized extraction
|
||||||
|
- Blog pages benefit from content filtering
|
||||||
|
- Dynamic sites need JavaScript execution
|
||||||
|
- API endpoints need JSON parsing
|
||||||
|
|
||||||
|
### 6.1 Basic URL Pattern Matching
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
async def crawl_mixed_content():
|
||||||
|
# Configure different strategies for different content
|
||||||
|
configs = [
|
||||||
|
# PDF files - specialized extraction
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher="*.pdf",
|
||||||
|
scraping_strategy=PDFContentScrapingStrategy()
|
||||||
|
),
|
||||||
|
|
||||||
|
# Blog/article pages - content filtering
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher=["*/blog/*", "*/article/*"],
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter(threshold=0.48)
|
||||||
|
)
|
||||||
|
),
|
||||||
|
|
||||||
|
# Dynamic pages - JavaScript execution
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: 'github.com' in url,
|
||||||
|
js_code="window.scrollTo(0, 500);"
|
||||||
|
),
|
||||||
|
|
||||||
|
# API endpoints - JSON extraction
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
|
# Custom settings for JSON extraction
|
||||||
|
),
|
||||||
|
|
||||||
|
# Default config for everything else
|
||||||
|
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||||
|
]
|
||||||
|
|
||||||
|
# Mixed URLs
|
||||||
|
urls = [
|
||||||
|
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||||
|
"https://blog.python.org/",
|
||||||
|
"https://github.com/microsoft/playwright",
|
||||||
|
"https://httpbin.org/json",
|
||||||
|
"https://example.com/"
|
||||||
|
]
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=urls,
|
||||||
|
config=configs # Pass list of configs
|
||||||
|
)
|
||||||
|
|
||||||
|
for result in results:
|
||||||
|
print(f"{result.url}: {len(result.markdown)} chars")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.2 Advanced Pattern Matching
|
||||||
|
|
||||||
|
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||||
|
|
||||||
|
The `url_matcher` parameter supports three types of patterns:
|
||||||
|
|
||||||
|
#### Glob Patterns (Strings)
|
||||||
|
```python
|
||||||
|
# Simple patterns
|
||||||
|
"*.pdf" # Any PDF file
|
||||||
|
"*/api/*" # Any URL with /api/ in path
|
||||||
|
"https://*.example.com/*" # Subdomain matching
|
||||||
|
"*://example.com/blog/*" # Any protocol
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Custom Functions
|
||||||
|
```python
|
||||||
|
# Complex logic with lambdas
|
||||||
|
lambda url: url.startswith('https://') and 'secure' in url
|
||||||
|
lambda url: len(url) > 50 and url.count('/') > 5
|
||||||
|
lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.'])
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Mixed Lists with AND/OR Logic
|
||||||
|
```python
|
||||||
|
# Combine multiple conditions
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher=[
|
||||||
|
"https://*", # Must be HTTPS
|
||||||
|
lambda url: 'internal' in url, # Must contain 'internal'
|
||||||
|
lambda url: not url.endswith('.pdf') # Must not be PDF
|
||||||
|
],
|
||||||
|
match_mode=MatchMode.AND # ALL conditions must match
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.3 Practical Example: News Site Crawler
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def crawl_news_site():
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
rate_limiter=RateLimiter(base_delay=(1.0, 2.0))
|
||||||
|
)
|
||||||
|
|
||||||
|
configs = [
|
||||||
|
# Homepage - light extraction
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com',
|
||||||
|
css_selector="nav, .headline",
|
||||||
|
extraction_strategy=None
|
||||||
|
),
|
||||||
|
|
||||||
|
# Article pages - full extraction
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher="*/article/*",
|
||||||
|
extraction_strategy=CosineStrategy(
|
||||||
|
semantic_filter="article content",
|
||||||
|
word_count_threshold=100
|
||||||
|
),
|
||||||
|
screenshot=True,
|
||||||
|
excluded_tags=["nav", "aside", "footer"]
|
||||||
|
),
|
||||||
|
|
||||||
|
# Author pages - metadata focus
|
||||||
|
CrawlerRunConfig(
|
||||||
|
url_matcher="*/author/*",
|
||||||
|
extraction_strategy=JsonCssExtractionStrategy({
|
||||||
|
"name": "h1.author-name",
|
||||||
|
"bio": ".author-bio",
|
||||||
|
"articles": "article.post-card h2"
|
||||||
|
})
|
||||||
|
),
|
||||||
|
|
||||||
|
# Everything else
|
||||||
|
CrawlerRunConfig()
|
||||||
|
]
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=news_urls,
|
||||||
|
config=configs,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6.4 Best Practices
|
||||||
|
|
||||||
|
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||||
|
2. **Default Config Behavior**:
|
||||||
|
- A config without `url_matcher` matches ALL URLs
|
||||||
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||||
|
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||||
|
```python
|
||||||
|
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||||
|
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||||
|
|
||||||
|
default_config = CrawlerRunConfig() # No url_matcher
|
||||||
|
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||||
|
```
|
||||||
|
4. **Optimize for Performance**:
|
||||||
|
- Disable JS for static content
|
||||||
|
- Skip screenshots for data APIs
|
||||||
|
- Use appropriate extraction strategies
|
||||||
|
|
||||||
|
## 7. Summary
|
||||||
|
|
||||||
1. **Two Dispatcher Types**:
|
1. **Two Dispatcher Types**:
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
```python
|
```python
|
||||||
async def arun_many(
|
async def arun_many(
|
||||||
urls: Union[List[str], List[Any]],
|
urls: Union[List[str], List[Any]],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
...
|
...
|
||||||
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
@@ -15,7 +15,9 @@ async def arun_many(
|
|||||||
Crawl multiple URLs concurrently or in batches.
|
Crawl multiple URLs concurrently or in batches.
|
||||||
|
|
||||||
:param urls: A list of URLs (or tasks) to crawl.
|
:param urls: A list of URLs (or tasks) to crawl.
|
||||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
:param config: (Optional) Either:
|
||||||
|
- A single `CrawlerRunConfig` applying to all URLs
|
||||||
|
- A list of `CrawlerRunConfig` objects with url_matcher patterns
|
||||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||||
...
|
...
|
||||||
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||||
@@ -95,10 +97,70 @@ results = await crawler.arun_many(
|
|||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### URL-Specific Configurations
|
||||||
|
|
||||||
|
Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||||
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||||
|
|
||||||
|
# PDF files - specialized extraction
|
||||||
|
pdf_config = CrawlerRunConfig(
|
||||||
|
url_matcher="*.pdf",
|
||||||
|
scraping_strategy=PDFContentScrapingStrategy()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Blog/article pages - content filtering
|
||||||
|
blog_config = CrawlerRunConfig(
|
||||||
|
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
content_filter=PruningContentFilter(threshold=0.48)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Dynamic pages - JavaScript execution
|
||||||
|
github_config = CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: 'github.com' in url,
|
||||||
|
js_code="window.scrollTo(0, 500);"
|
||||||
|
)
|
||||||
|
|
||||||
|
# API endpoints - JSON extraction
|
||||||
|
api_config = CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
|
# Custom settings for JSON extraction
|
||||||
|
)
|
||||||
|
|
||||||
|
# Default fallback config
|
||||||
|
default_config = CrawlerRunConfig() # No url_matcher means it matches ALL URLs, so it acts as the fallback when listed last
|
||||||
|
|
||||||
|
# Pass the list of configs - first match wins!
|
||||||
|
results = await crawler.arun_many(
|
||||||
|
urls=[
|
||||||
|
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config
|
||||||
|
"https://blog.python.org/", # → blog_config
|
||||||
|
"https://github.com/microsoft/playwright", # → github_config
|
||||||
|
"https://httpbin.org/json", # → api_config
|
||||||
|
"https://example.com/" # → default_config
|
||||||
|
],
|
||||||
|
config=[pdf_config, blog_config, github_config, api_config, default_config]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**URL Matching Features**:
|
||||||
|
- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
|
||||||
|
- **Function matchers**: `lambda url: 'api' in url`
|
||||||
|
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
|
||||||
|
- **First match wins**: Configs are evaluated in order
|
||||||
|
|
||||||
**Key Points**:
|
**Key Points**:
|
||||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||||
|
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
|
||||||
|
|
||||||
### Return Value
|
### Return Value
|
||||||
|
|
||||||
|
|||||||
@@ -208,6 +208,71 @@ config = CrawlerRunConfig(
|
|||||||
|
|
||||||
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### I) **URL Matching Configuration**
|
||||||
|
|
||||||
|
| **Parameter** | **Type / Default** | **What It Does** |
|
||||||
|
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||||
|
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||||
|
|
||||||
|
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
|
||||||
|
# Simple string pattern (glob-style)
|
||||||
|
pdf_config = CrawlerRunConfig(
|
||||||
|
url_matcher="*.pdf",
|
||||||
|
scraping_strategy=PDFContentScrapingStrategy()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Multiple patterns with OR logic (default)
|
||||||
|
blog_config = CrawlerRunConfig(
|
||||||
|
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||||
|
match_mode=MatchMode.OR # Any pattern matches
|
||||||
|
)
|
||||||
|
|
||||||
|
# Function matcher
|
||||||
|
api_config = CrawlerRunConfig(
|
||||||
|
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||||
|
# Other settings like extraction_strategy
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mixed: String + Function with AND logic
|
||||||
|
complex_config = CrawlerRunConfig(
|
||||||
|
url_matcher=[
|
||||||
|
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||||
|
"*.org/*", # Must be .org domain
|
||||||
|
lambda url: 'docs' in url # Must contain 'docs'
|
||||||
|
],
|
||||||
|
match_mode=MatchMode.AND # ALL conditions must match
|
||||||
|
)
|
||||||
|
|
||||||
|
# Combined patterns and functions with AND logic
|
||||||
|
secure_docs = CrawlerRunConfig(
|
||||||
|
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||||
|
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||||
|
)
|
||||||
|
|
||||||
|
# Default config - matches ALL URLs
|
||||||
|
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||||
|
```
|
||||||
|
|
||||||
|
**UrlMatcher Types:**
|
||||||
|
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||||
|
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||||
|
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||||
|
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
||||||
|
|
||||||
|
**Important Behavior:**
|
||||||
|
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||||
|
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||||
|
- Always include a default config as the last item if you want to handle all URLs
|
||||||
|
|
||||||
---## 2.2 Helper Methods
|
---## 2.2 Helper Methods
|
||||||
|
|
||||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||||
|
|||||||
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom adaptive parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
|
||||||
max_depth=5, # Maximum crawl depth
|
|
||||||
max_pages=20, # Maximum number of pages to crawl
|
|
||||||
top_k_links=3, # Number of top links to follow per page
|
|
||||||
strategy="statistical", # 'statistical' or 'embedding'
|
|
||||||
coverage_weight=0.4, # Weight for coverage in confidence calculation
|
|
||||||
consistency_weight=0.3, # Weight for consistency in confidence calculation
|
|
||||||
saturation_weight=0.3 # Weight for saturation in confidence calculation
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize adaptive crawler with web crawler
|
# Configure adaptive crawler
|
||||||
async with AsyncWebCrawler() as crawler:
|
config = AdaptiveConfig(
|
||||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
|
max_pages=10,
|
||||||
# Crawl and learn patterns
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
state = await adaptive_crawler.digest(
|
top_k_links=3, # Follow top 3 links per page
|
||||||
start_url="https://news.example.com/article/12345",
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
query="latest news articles and content"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Access results and confidence
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
print(f"Pages Crawled: {len(state.crawled_urls)}")
|
|
||||||
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
include_internal=True,
|
link_config = LinkPreviewConfig(
|
||||||
include_external=False,
|
include_internal=True,
|
||||||
max_links=10,
|
include_external=False,
|
||||||
concurrency=5,
|
max_links=10,
|
||||||
query="python tutorial", # For contextual scoring
|
concurrency=5,
|
||||||
score_threshold=0.3,
|
query="python tutorial", # For contextual scoring
|
||||||
verbose=True
|
score_threshold=0.3,
|
||||||
)
|
verbose=True
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True, # Enable intrinsic scoring
|
|
||||||
cache_mode=CacheMode.BYPASS
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
if result.success and result.links:
|
if result.success and result.links:
|
||||||
# Get scored links
|
for link in result.links.get("internal", []):
|
||||||
internal_links = result.links.get("internal", [])
|
text = link.get('text', 'No text')[:40]
|
||||||
scored_links = [l for l in internal_links if l.get("total_score")]
|
print(
|
||||||
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
|
text,
|
||||||
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
# Create a scoring table
|
asyncio.run(main())
|
||||||
table = Table(title="Link Scoring Results", box=box.ROUNDED)
|
|
||||||
table.add_column("Link Text", style="cyan", width=40)
|
|
||||||
table.add_column("Intrinsic Score", justify="center")
|
|
||||||
table.add_column("Contextual Score", justify="center")
|
|
||||||
table.add_column("Total Score", justify="center", style="bold green")
|
|
||||||
|
|
||||||
for link in scored_links[:5]:
|
|
||||||
text = link.get('text', 'No text')[:40]
|
|
||||||
table.add_row(
|
|
||||||
text,
|
|
||||||
f"{link.get('intrinsic_score', 0):.1f}/10",
|
|
||||||
f"{link.get('contextual_score', 0):.2f}/1",
|
|
||||||
f"{link.get('total_score', 0):.3f}"
|
|
||||||
)
|
|
||||||
|
|
||||||
console.print(table)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
@@ -223,58 +221,34 @@ console.print(table)
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="cc+sitemap", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
|
pattern="*python*", # URL pattern filter
|
||||||
|
extract_head=True, # Get metadata
|
||||||
|
query="python tutorial", # For relevance scoring
|
||||||
|
scoring_method="bm25",
|
||||||
|
score_threshold=0.2,
|
||||||
|
max_urls=10
|
||||||
|
)
|
||||||
|
|
||||||
# Filtering
|
print("Discovering Python tutorial URLs...")
|
||||||
pattern="*/product/*", # URL pattern matching
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
|
|
||||||
# Validation
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
live_check=True, # Verify URLs are alive
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
max_urls=50, # Stop at 50 URLs
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
# Performance
|
asyncio.run(main())
|
||||||
concurrency=100, # Maximum concurrent requests for live checks/head extraction
|
|
||||||
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
console.print("Discovering URLs from Python docs...")
|
|
||||||
urls = await seeder.urls("docs.python.org", seeding_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(urls)} URLs")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # BM25 scoring method
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async with AsyncUrlSeeder() as seeder:
|
|
||||||
discovered = await seeder.urls("https://physics-blog.com", research_config)
|
|
||||||
console.print(f"\n✓ Discovered {len(discovered)} URLs")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['relevance_score']:.3f}")
|
|
||||||
print(f"Title: {url_data['head_data']['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
|
|||||||
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||||
|
|
||||||
|
*July 17, 2025 • 2 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
A small maintenance release that removes unused code and improves documentation.
|
||||||
|
|
||||||
|
## 🎯 What's Changed
|
||||||
|
|
||||||
|
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||||
|
- **Updated documentation** with better examples and parameter explanations
|
||||||
|
- **Fixed virtual scroll configuration** examples in docs
|
||||||
|
|
||||||
|
## 🧹 Code Cleanup
|
||||||
|
|
||||||
|
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Removed unused code:
|
||||||
|
from playwright_stealth import StealthConfig
|
||||||
|
stealth_config = StealthConfig(...) # This was never used
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📖 Documentation Updates
|
||||||
|
|
||||||
|
- Fixed adaptive crawling parameter examples
|
||||||
|
- Updated session management documentation
|
||||||
|
- Corrected virtual scroll configuration examples
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==0.7.1
|
||||||
|
```
|
||||||
|
|
||||||
|
No breaking changes - upgrade directly from v0.7.0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Questions? Issues?
|
||||||
|
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
# 🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update
|
||||||
|
|
||||||
|
*July 25, 2025 • 3 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
This release introduces automated CI/CD pipelines for seamless releases and optimizes dependencies for a lighter, more efficient package.
|
||||||
|
|
||||||
|
## 🎯 What's New
|
||||||
|
|
||||||
|
### 🔄 Automated Release Pipeline
|
||||||
|
- **GitHub Actions CI/CD**: Automated PyPI and Docker Hub releases on tag push
|
||||||
|
- **Multi-platform Docker images**: Support for both AMD64 and ARM64 architectures
|
||||||
|
- **Version consistency checks**: Ensures tag, package, and Docker versions align
|
||||||
|
- **Automated release notes**: GitHub releases created automatically
|
||||||
|
|
||||||
|
### 📦 Dependency Optimization
|
||||||
|
- **Moved sentence-transformers to optional dependencies**: Significantly reduces default installation size
|
||||||
|
- **Lighter Docker images**: Optimized Dockerfile for faster builds and smaller images
|
||||||
|
- **Better dependency management**: Core vs. optional dependencies clearly separated
|
||||||
|
|
||||||
|
## 🏗️ CI/CD Pipeline
|
||||||
|
|
||||||
|
The new automated release process ensures consistent, reliable releases:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Trigger releases with a simple tag
|
||||||
|
git tag v0.7.2
|
||||||
|
git push origin v0.7.2
|
||||||
|
|
||||||
|
# Automatically:
|
||||||
|
# ✅ Validates version consistency
|
||||||
|
# ✅ Builds and publishes to PyPI
|
||||||
|
# ✅ Builds multi-platform Docker images
|
||||||
|
# ✅ Pushes to Docker Hub with proper tags
|
||||||
|
# ✅ Creates GitHub release
|
||||||
|
```
|
||||||
|
|
||||||
|
## 💾 Lighter Installation
|
||||||
|
|
||||||
|
Default installation is now significantly smaller:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Core installation (smaller, faster)
|
||||||
|
pip install crawl4ai==0.7.2
|
||||||
|
|
||||||
|
# With ML features (includes sentence-transformers)
|
||||||
|
pip install crawl4ai[transformer]==0.7.2
|
||||||
|
|
||||||
|
# Full installation
|
||||||
|
pip install crawl4ai[all]==0.7.2
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🐳 Docker Improvements
|
||||||
|
|
||||||
|
Enhanced Docker support with multi-platform images:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull the latest version
|
||||||
|
docker pull unclecode/crawl4ai:0.7.2
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
|
||||||
|
# Available tags:
|
||||||
|
# - unclecode/crawl4ai:0.7.2 (specific version)
|
||||||
|
# - unclecode/crawl4ai:0.7 (minor version)
|
||||||
|
# - unclecode/crawl4ai:0 (major version)
|
||||||
|
# - unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
## 🔧 Technical Details
|
||||||
|
|
||||||
|
### Dependency Changes
|
||||||
|
- `sentence-transformers` moved from required to optional dependencies
|
||||||
|
- Reduces default installation by ~500MB
|
||||||
|
- No impact on functionality when transformer features aren't needed
|
||||||
|
|
||||||
|
### CI/CD Configuration
|
||||||
|
- GitHub Actions workflows for automated releases
|
||||||
|
- Version validation before publishing
|
||||||
|
- Parallel PyPI and Docker Hub deployments
|
||||||
|
- Automatic tagging strategy for Docker images
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==0.7.2
|
||||||
|
```
|
||||||
|
|
||||||
|
No breaking changes - direct upgrade from v0.7.0 or v0.7.1.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Questions? Issues?
|
||||||
|
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
|
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||||
|
|
||||||
|
*P.S. The new CI/CD pipeline will make future releases faster and more reliable. Thanks for your patience as we improve our release process!*
|
||||||
@@ -209,7 +209,13 @@ class CrawlerRunConfig:
|
|||||||
- The maximum number of concurrent crawl sessions.
|
- The maximum number of concurrent crawl sessions.
|
||||||
- Helps prevent overwhelming the system.
|
- Helps prevent overwhelming the system.
|
||||||
|
|
||||||
14. **`display_mode`**:
|
14. **`url_matcher`** & **`match_mode`**:
|
||||||
|
- Enable URL-specific configurations when used with `arun_many()`.
|
||||||
|
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||||
|
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||||
|
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||||
|
|
||||||
|
15. **`display_mode`**:
|
||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||||
- Affects how much information is printed during the crawl.
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
|
|||||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
|||||||
|
|
||||||
Want to learn by doing? We've got you covered:
|
Want to learn by doing? We've got you covered:
|
||||||
|
|
||||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||||
|
|
||||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||||
|
|
||||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
|
||||||
|
|
||||||
### Running the Tutorial Locally
|
### Running the Tutorial Locally
|
||||||
|
|
||||||
|
|||||||
@@ -350,15 +350,22 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
## 6. Scraping Modes
|
## 6. Scraping Modes
|
||||||
|
|
||||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
Crawl4AI uses `LXMLWebScrapingStrategy` (LXML-based) as the default scraping strategy for HTML content processing. This strategy offers excellent performance, especially for large HTML documents.
|
||||||
|
|
||||||
|
**Note:** For backward compatibility, `WebScrapingStrategy` is still available as an alias for `LXMLWebScrapingStrategy`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
config = CrawlerRunConfig(
|
# Default configuration already uses LXMLWebScrapingStrategy
|
||||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
config = CrawlerRunConfig()
|
||||||
|
|
||||||
|
# Or explicitly specify it if desired
|
||||||
|
config_explicit = CrawlerRunConfig(
|
||||||
|
scraping_strategy=LXMLWebScrapingStrategy()
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://example.com",
|
url="https://example.com",
|
||||||
@@ -417,21 +424,20 @@ class CustomScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
### Performance Considerations
|
### Performance Considerations
|
||||||
|
|
||||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
The LXML strategy provides excellent performance, particularly when processing large HTML documents, offering up to 10-20x faster processing compared to BeautifulSoup-based approaches.
|
||||||
|
|
||||||
1. LXML strategy is currently experimental
|
Benefits of LXML strategy:
|
||||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
- Fast processing of large HTML documents (especially >100KB)
|
||||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
- Efficient memory usage
|
||||||
|
- Good handling of well-formed HTML
|
||||||
|
- Robust table detection and extraction
|
||||||
|
|
||||||
Choose LXML strategy when:
|
### Backward Compatibility
|
||||||
- Processing large HTML documents (recommended for >100KB)
|
|
||||||
- Performance is critical
|
|
||||||
- Working with well-formed HTML
|
|
||||||
|
|
||||||
Stick to BeautifulSoup strategy (default) when:
|
For users upgrading from earlier versions:
|
||||||
- Maximum compatibility is needed
|
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy`
|
||||||
- Working with malformed HTML
|
- Existing code using `WebScrapingStrategy` will continue to work without modification
|
||||||
- Exact parsing behavior is critical
|
- No changes are required to your existing code
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -19,13 +19,15 @@ class MarkdownGenerationResult(BaseModel):
|
|||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
|
fit_html: Optional[str] = None
|
||||||
success: bool
|
success: bool
|
||||||
cleaned_html: Optional[str] = None
|
cleaned_html: Optional[str] = None
|
||||||
media: Dict[str, List[Dict]] = {}
|
media: Dict[str, List[Dict]] = {}
|
||||||
links: Dict[str, List[Dict]] = {}
|
links: Dict[str, List[Dict]] = {}
|
||||||
downloaded_files: Optional[List[str]] = None
|
downloaded_files: Optional[List[str]] = None
|
||||||
|
js_execution_result: Optional[Dict[str, Any]] = None
|
||||||
screenshot: Optional[str] = None
|
screenshot: Optional[str] = None
|
||||||
pdf : Optional[bytes] = None
|
pdf: Optional[bytes] = None
|
||||||
mhtml: Optional[str] = None
|
mhtml: Optional[str] = None
|
||||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||||
extracted_content: Optional[str] = None
|
extracted_content: Optional[str] = None
|
||||||
@@ -35,6 +37,12 @@ class CrawlResult(BaseModel):
|
|||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
|
redirected_url: Optional[str] = None
|
||||||
|
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||||
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
|
tables: List[Dict] = Field(default_factory=list)
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
```
|
```
|
||||||
@@ -45,11 +53,13 @@ class CrawlResult(BaseModel):
|
|||||||
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
||||||
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
||||||
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
||||||
|
| **fit_html (`Optional[str]`)** | Preprocessed HTML optimized for extraction and content filtering. |
|
||||||
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
||||||
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
||||||
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
||||||
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
||||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||||
|
| **js_execution_result (`Optional[Dict[str, Any]]`)** | Results from JavaScript execution during crawling. |
|
||||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||||
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
||||||
@@ -61,6 +71,11 @@ class CrawlResult(BaseModel):
|
|||||||
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
||||||
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
||||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||||
|
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||||
|
| **redirected_url (`Optional[str]`)** | The URL reached after following any redirects; it may differ from `url`. |
|
||||||
|
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||||
|
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||||
|
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -154,6 +154,30 @@ cp deploy/docker/.llm.env.example .llm.env
|
|||||||
# Now edit .llm.env and add your API keys
|
# Now edit .llm.env and add your API keys
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Flexible LLM Provider Configuration:**
|
||||||
|
|
||||||
|
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||||
|
|
||||||
|
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||||
|
# Or in your .llm.env file:
|
||||||
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **API Request Parameter**: Specify provider per request
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"provider": "groq/mixtral-8x7b"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||||
|
|
||||||
|
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||||
|
|
||||||
#### 3. Build and Run with Compose
|
#### 3. Build and Run with Compose
|
||||||
|
|
||||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||||
@@ -668,7 +692,7 @@ app:
|
|||||||
|
|
||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini"
|
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||||
api_key_env: "OPENAI_API_KEY"
|
api_key_env: "OPENAI_API_KEY"
|
||||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
|
|||||||
@@ -28,11 +28,8 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
|||||||
| Example | Description | Link |
|
| Example | Description | Link |
|
||||||
|---------|-------------|------|
|
|---------|-------------|------|
|
||||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||||
<<<<<<< HEAD
|
|
||||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||||
=======
|
|
||||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||||
>>>>>>> feature/progressive-crawling
|
|
||||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
async def extract_link_heads_example():
|
async def extract_link_heads_example():
|
||||||
"""
|
"""
|
||||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
|||||||
The `LinkPreviewConfig` class supports these options:
|
The `LinkPreviewConfig` class supports these options:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
link_preview_config = LinkPreviewConfig(
|
link_preview_config = LinkPreviewConfig(
|
||||||
# BASIC SETTINGS
|
# BASIC SETTINGS
|
||||||
|
|||||||
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
# WebScrapingStrategy Migration Guide
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Crawl4AI has simplified its content scraping architecture. The BeautifulSoup-based implementation behind `WebScrapingStrategy` has been removed in favor of the faster LXML-based implementation, and the `WebScrapingStrategy` name is kept as an alias for it. As a result, **no action is required** - your existing code will continue to work.
|
||||||
|
|
||||||
|
## What Changed?
|
||||||
|
|
||||||
|
1. **`WebScrapingStrategy` is now an alias** for `LXMLWebScrapingStrategy`
|
||||||
|
2. **The BeautifulSoup implementation has been removed** (~1000 lines of redundant code)
|
||||||
|
3. **`LXMLWebScrapingStrategy` inherits directly** from `ContentScrapingStrategy`
|
||||||
|
4. **Performance remains optimal** with LXML as the sole implementation
|
||||||
|
|
||||||
|
## Backward Compatibility
|
||||||
|
|
||||||
|
**Your existing code continues to work without any changes:**
|
||||||
|
|
||||||
|
```python
|
||||||
|
# This still works perfectly
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, WebScrapingStrategy
|
||||||
|
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
scraping_strategy=WebScrapingStrategy() # Works as before
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Migration Options
|
||||||
|
|
||||||
|
You have three options:
|
||||||
|
|
||||||
|
### Option 1: Do Nothing (Recommended)
|
||||||
|
Your code will continue to work. `WebScrapingStrategy` is permanently aliased to `LXMLWebScrapingStrategy`.
|
||||||
|
|
||||||
|
### Option 2: Update Imports (Optional)
|
||||||
|
For clarity, you can update your imports:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Old (still works)
|
||||||
|
from crawl4ai import WebScrapingStrategy
|
||||||
|
strategy = WebScrapingStrategy()
|
||||||
|
|
||||||
|
# New (more explicit)
|
||||||
|
from crawl4ai import LXMLWebScrapingStrategy
|
||||||
|
strategy = LXMLWebScrapingStrategy()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 3: Use Default Configuration
|
||||||
|
Since `LXMLWebScrapingStrategy` is the default, you can omit the strategy parameter:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Simplest approach - uses LXMLWebScrapingStrategy by default
|
||||||
|
config = CrawlerRunConfig()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Type Hints
|
||||||
|
|
||||||
|
If you use type hints, both work:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||||
|
|
||||||
|
def process_with_strategy(strategy: WebScrapingStrategy) -> None:
|
||||||
|
# Works with both WebScrapingStrategy and LXMLWebScrapingStrategy
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Both are valid
|
||||||
|
process_with_strategy(WebScrapingStrategy())
|
||||||
|
process_with_strategy(LXMLWebScrapingStrategy())
|
||||||
|
```
|
||||||
|
|
||||||
|
## Subclassing
|
||||||
|
|
||||||
|
If you've subclassed `WebScrapingStrategy`, it continues to work:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class MyCustomStrategy(WebScrapingStrategy):
|
||||||
|
def __init__(self):
|
||||||
|
super().__init__()
|
||||||
|
# Your custom code
|
||||||
|
```
|
||||||
|
|
||||||
|
## Performance Benefits
|
||||||
|
|
||||||
|
By consolidating to LXML:
|
||||||
|
- **10-20x faster** HTML parsing for large documents
|
||||||
|
- **Lower memory usage**
|
||||||
|
- **Consistent behavior** across all use cases
|
||||||
|
- **Simplified maintenance** and bug fixes
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
This change simplifies Crawl4AI's internals while maintaining 100% backward compatibility. Your existing code continues to work, and you get better performance automatically.
|
||||||
@@ -28,7 +28,7 @@ from rich import box
|
|||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||||
from crawl4ai import c4a_compile, CompilationResult
|
from crawl4ai import c4a_compile, CompilationResult
|
||||||
|
|
||||||
# Initialize Rich console for beautiful output
|
# Initialize Rich console for beautiful output
|
||||||
|
|||||||
@@ -13,14 +13,13 @@ from crawl4ai import (
|
|||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
CacheMode,
|
CacheMode,
|
||||||
# New imports for v0.7.0
|
# New imports for v0.7.0
|
||||||
LinkPreviewConfig,
|
|
||||||
VirtualScrollConfig,
|
VirtualScrollConfig,
|
||||||
|
LinkPreviewConfig,
|
||||||
AdaptiveCrawler,
|
AdaptiveCrawler,
|
||||||
AdaptiveConfig,
|
AdaptiveConfig,
|
||||||
AsyncUrlSeeder,
|
AsyncUrlSeeder,
|
||||||
SeedingConfig,
|
SeedingConfig,
|
||||||
c4a_compile,
|
c4a_compile,
|
||||||
CompilationResult
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -170,16 +169,16 @@ async def demo_url_seeder():
|
|||||||
# Discover Python tutorial URLs
|
# Discover Python tutorial URLs
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="sitemap", # Use sitemap
|
source="sitemap", # Use sitemap
|
||||||
pattern="*tutorial*", # URL pattern filter
|
pattern="*python*", # URL pattern filter
|
||||||
extract_head=True, # Get metadata
|
extract_head=True, # Get metadata
|
||||||
query="python async programming", # For relevance scoring
|
query="python tutorial", # For relevance scoring
|
||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
score_threshold=0.2,
|
score_threshold=0.2,
|
||||||
max_urls=10
|
max_urls=10
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Discovering Python async tutorial URLs...")
|
print("Discovering Python async tutorial URLs...")
|
||||||
urls = await seeder.urls("docs.python.org", config)
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
|
|
||||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
for i, url_info in enumerate(urls[:5], 1):
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
|||||||
print(f"❌ Compilation error: {result.first_error.message}")
|
print(f"❌ Compilation error: {result.first_error.message}")
|
||||||
|
|
||||||
|
|
||||||
async def demo_pdf_support():
|
|
||||||
"""
|
|
||||||
Demo 6: PDF Parsing Support
|
|
||||||
|
|
||||||
Shows how to extract content from PDF files.
|
|
||||||
Note: Requires 'pip install crawl4ai[pdf]'
|
|
||||||
"""
|
|
||||||
print("\n" + "="*60)
|
|
||||||
print("📄 DEMO 6: PDF Parsing Support")
|
|
||||||
print("="*60)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if PDF support is installed
|
|
||||||
import PyPDF2
|
|
||||||
|
|
||||||
# Example: Process a PDF URL
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
pdf=True, # Enable PDF generation
|
|
||||||
extract_text_from_pdf=True # Extract text content
|
|
||||||
)
|
|
||||||
|
|
||||||
print("PDF parsing is available!")
|
|
||||||
print("You can now crawl PDF URLs and extract their content.")
|
|
||||||
print("\nExample usage:")
|
|
||||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
|
||||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
print("⚠️ PDF support not installed.")
|
|
||||||
print("Install with: pip install crawl4ai[pdf]")
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Run all demos"""
|
"""Run all demos"""
|
||||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||||
@@ -289,7 +255,6 @@ async def main():
|
|||||||
("Virtual Scroll", demo_virtual_scroll),
|
("Virtual Scroll", demo_virtual_scroll),
|
||||||
("URL Seeder", demo_url_seeder),
|
("URL Seeder", demo_url_seeder),
|
||||||
("C4A Script", demo_c4a_script),
|
("C4A Script", demo_c4a_script),
|
||||||
("PDF Support", demo_pdf_support)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for name, demo_func in demos:
|
for name, demo_func in demos:
|
||||||
@@ -309,7 +274,6 @@ async def main():
|
|||||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||||
print("• C4A Script: Simple language for complex automations")
|
print("• C4A Script: Simple language for complex automations")
|
||||||
print("• PDF Support: Extract content from PDF documents")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ dependencies = [
|
|||||||
"brotli>=1.1.0",
|
"brotli>=1.1.0",
|
||||||
"humanize>=4.10.0",
|
"humanize>=4.10.0",
|
||||||
"lark>=1.2.2",
|
"lark>=1.2.2",
|
||||||
"sentence-transformers>=2.2.0",
|
|
||||||
"alphashape>=1.3.1",
|
"alphashape>=1.3.1",
|
||||||
"shapely>=2.0.0"
|
"shapely>=2.0.0"
|
||||||
]
|
]
|
||||||
@@ -62,8 +61,8 @@ classifiers = [
|
|||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
pdf = ["PyPDF2"]
|
pdf = ["PyPDF2"]
|
||||||
torch = ["torch", "nltk", "scikit-learn"]
|
torch = ["torch", "nltk", "scikit-learn"]
|
||||||
transformer = ["transformers", "tokenizers"]
|
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||||
cosine = ["torch", "transformers", "nltk"]
|
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||||
sync = ["selenium"]
|
sync = ["selenium"]
|
||||||
all = [
|
all = [
|
||||||
"PyPDF2",
|
"PyPDF2",
|
||||||
@@ -72,8 +71,8 @@ all = [
|
|||||||
"scikit-learn",
|
"scikit-learn",
|
||||||
"transformers",
|
"transformers",
|
||||||
"tokenizers",
|
"tokenizers",
|
||||||
"selenium",
|
"sentence-transformers",
|
||||||
"PyPDF2"
|
"selenium"
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|||||||
@@ -24,7 +24,6 @@ cssselect>=1.2.0
|
|||||||
chardet>=5.2.0
|
chardet>=5.2.0
|
||||||
brotli>=1.1.0
|
brotli>=1.1.0
|
||||||
httpx[http2]>=0.27.2
|
httpx[http2]>=0.27.2
|
||||||
sentence-transformers>=2.2.0
|
|
||||||
alphashape>=1.3.1
|
alphashape>=1.3.1
|
||||||
shapely>=2.0.0
|
shapely>=2.0.0
|
||||||
|
|
||||||
|
|||||||
@@ -12,11 +12,8 @@ parent_dir = os.path.dirname(
|
|||||||
sys.path.append(parent_dir)
|
sys.path.append(parent_dir)
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||||
from crawl4ai.content_scraping_strategy import (
|
# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated
|
||||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
|
||||||
)
|
|
||||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -32,8 +29,8 @@ class TestResult:
|
|||||||
|
|
||||||
class StrategyTester:
|
class StrategyTester:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.new_scraper = WebScrapingStrategy()
|
self.new_scraper = LXMLWebScrapingStrategy()
|
||||||
self.current_scraper = WebScrapingStrategyCurrent()
|
self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now
|
||||||
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
||||||
self.WIKI_HTML = f.read()
|
self.WIKI_HTML = f.read()
|
||||||
self.results = {"new": [], "current": []}
|
self.results = {"new": [], "current": []}
|
||||||
|
|||||||
@@ -10,11 +10,13 @@ import sys
|
|||||||
import uuid
|
import uuid
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
from crawl4ai import BrowserProfiler
|
||||||
|
from crawl4ai.browser_manager import BrowserManager
|
||||||
|
|
||||||
# Add the project root to Python path if running directly
|
# Add the project root to Python path if running directly
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||||
|
|
||||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
|
||||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from crawl4ai.async_logger import AsyncLogger
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
|
||||||
@@ -25,7 +27,7 @@ async def test_profile_creation():
|
|||||||
"""Test creating and managing browser profiles."""
|
"""Test creating and managing browser profiles."""
|
||||||
logger.info("Testing profile creation and management", tag="TEST")
|
logger.info("Testing profile creation and management", tag="TEST")
|
||||||
|
|
||||||
profile_manager = BrowserProfileManager(logger=logger)
|
profile_manager = BrowserProfiler(logger=logger)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# List existing profiles
|
# List existing profiles
|
||||||
@@ -83,7 +85,7 @@ async def test_profile_with_browser():
|
|||||||
"""Test using a profile with a browser."""
|
"""Test using a profile with a browser."""
|
||||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||||
|
|
||||||
profile_manager = BrowserProfileManager(logger=logger)
|
profile_manager = BrowserProfiler(logger=logger)
|
||||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||||
profile_path = None
|
profile_path = None
|
||||||
|
|
||||||
@@ -101,6 +103,8 @@ async def test_profile_with_browser():
|
|||||||
# Now use this profile with a browser
|
# Now use this profile with a browser
|
||||||
browser_config = BrowserConfig(
|
browser_config = BrowserConfig(
|
||||||
user_data_dir=profile_path,
|
user_data_dir=profile_path,
|
||||||
|
use_managed_browser=True,
|
||||||
|
use_persistent_context=True,
|
||||||
headless=True
|
headless=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
345
tests/docker/simple_api_test.py
Normal file
345
tests/docker/simple_api_test.py
Normal file
@@ -0,0 +1,345 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||||
|
Uses only built-in Python modules to test all endpoints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import urllib.request
|
||||||
|
import urllib.parse
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||||
|
TEST_TIMEOUT = 30
|
||||||
|
|
||||||
|
class SimpleApiTester:
|
||||||
|
def __init__(self, base_url: str = BASE_URL):
    """Create a tester bound to a single server.

    Args:
        base_url: Root URL of the server under test. A trailing slash is
            removed, because the request helpers build URLs by plain
            concatenation with endpoints that already start with "/".
    """
    # Normalise so "http://host:1234/" + "/md" does not become "...//md".
    self.base_url = base_url.rstrip("/")
    # Bearer token; the request helpers only send it when it is set.
    self.token = None
    # Result dicts appended by print_result().
    self.results = []
|
||||||
|
|
||||||
|
def log(self, message: str):
    """Write ``message`` to stdout on one line, prefixed with "[INFO]"."""
    print("[INFO]", message)
|
||||||
|
|
||||||
|
def test_get_endpoint(self, endpoint: str) -> Dict:
|
||||||
|
"""Test a GET endpoint"""
|
||||||
|
url = f"{self.base_url}{endpoint}"
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
try:
|
||||||
|
req = urllib.request.Request(url)
|
||||||
|
if self.token:
|
||||||
|
req.add_header('Authorization', f'Bearer {self.token}')
|
||||||
|
|
||||||
|
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||||
|
response_time = time.time() - start_time
|
||||||
|
status_code = response.getcode()
|
||||||
|
content = response.read().decode('utf-8')
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
try:
|
||||||
|
data = json.loads(content)
|
||||||
|
except:
|
||||||
|
data = {"raw_response": content[:200]}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"endpoint": endpoint,
|
||||||
|
"method": "GET",
|
||||||
|
"status": "PASS" if status_code < 400 else "FAIL",
|
||||||
|
"status_code": status_code,
|
||||||
|
"response_time": response_time,
|
||||||
|
"data": data
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
response_time = time.time() - start_time
|
||||||
|
return {
|
||||||
|
"endpoint": endpoint,
|
||||||
|
"method": "GET",
|
||||||
|
"status": "FAIL",
|
||||||
|
"status_code": None,
|
||||||
|
"response_time": response_time,
|
||||||
|
"error": str(e)
|
||||||
|
}
|
||||||
|
|
||||||
|
def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
|
||||||
|
"""Test a POST endpoint"""
|
||||||
|
url = f"{self.base_url}{endpoint}"
|
||||||
|
start_time = time.time()
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.dumps(payload).encode('utf-8')
|
||||||
|
req = urllib.request.Request(url, data=data, method='POST')
|
||||||
|
req.add_header('Content-Type', 'application/json')
|
||||||
|
|
||||||
|
if self.token:
|
||||||
|
req.add_header('Authorization', f'Bearer {self.token}')
|
||||||
|
|
||||||
|
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||||
|
response_time = time.time() - start_time
|
||||||
|
status_code = response.getcode()
|
||||||
|
content = response.read().decode('utf-8')
|
||||||
|
|
||||||
|
# Try to parse JSON
|
||||||
|
try:
|
||||||
|
data = json.loads(content)
|
||||||
|
except:
|
||||||
|
data = {"raw_response": content[:200]}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"endpoint": endpoint,
|
||||||
|
"method": "POST",
|
||||||
|
"status": "PASS" if status_code < 400 else "FAIL",
|
||||||
|
"status_code": status_code,
|
||||||
|
"response_time": response_time,
|
||||||
|
"data": data
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
response_time = time.time() - start_time
|
||||||
|
return {
|
||||||
|
"endpoint": endpoint,
|
||||||
|
"method": "POST",
|
||||||
|
"status": "FAIL",
|
||||||
|
"status_code": None,
|
||||||
|
"response_time": response_time,
|
||||||
|
"error": str(e)
|
||||||
|
}
|
||||||
|
|
||||||
|
def print_result(self, result: Dict):
    """Print a one-line summary of ``result`` and record it.

    The summary shows a status icon, the HTTP method, the endpoint, the
    response time to three decimals and the status code ("N/A" when the code
    is falsy). A failed result that has an ``error`` entry gets a second line
    with that message. The result is then appended to ``self.results``.
    """
    icons = {
        "PASS": "✅",
        "FAIL": "❌",
        "SKIP": "⏭️"
    }

    outcome = result['status']
    icon = icons[outcome]
    verb = result['method']
    path = result['endpoint']
    elapsed = f"{result['response_time']:.3f}s"
    code = result['status_code'] or 'N/A'
    print(f"{icon} {verb} {path} | {elapsed} | Status: {code}")

    has_reason = outcome == 'FAIL' and 'error' in result
    if has_reason:
        print(f" Error: {result['error']}")

    self.results.append(result)
|
||||||
|
|
||||||
|
def run_all_tests(self):
    """Run all API tests

    Prints a banner, then walks the enabled sections (core, job, MCP and
    error-handling APIs) in order. Every case is sent through
    ``self.test_post_endpoint`` or ``self.test_get_endpoint`` and its result
    is handed to ``self.print_result``. Finishes with ``self.print_summary()``.
    """
    print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
    print(f"📡 Testing server at: {self.base_url}")
    print("=" * 60)

    # The basic-endpoint and authentication sections are currently disabled;
    # the original calls are kept below for reference.
    # # Test basic endpoints
    # print("\n=== BASIC ENDPOINTS ===")
    #
    # # Health check
    # result = self.test_get_endpoint("/health")
    # self.print_result(result)
    #
    # # Schema endpoint
    # result = self.test_get_endpoint("/schema")
    # self.print_result(result)
    #
    # # Metrics endpoint
    # result = self.test_get_endpoint("/metrics")
    # self.print_result(result)
    #
    # # Root redirect
    # result = self.test_get_endpoint("/")
    # self.print_result(result)
    #
    # # Test authentication
    # print("\n=== AUTHENTICATION ===")
    #
    # # Get token
    # token_payload = {"email": "test@example.com"}
    # result = self.test_post_endpoint("/token", token_payload)
    # self.print_result(result)
    #
    # # Extract token if successful
    # if result['status'] == 'PASS' and 'data' in result:
    #     token = result['data'].get('access_token')
    #     if token:
    #         self.token = token
    #         self.log(f"Successfully obtained auth token: {token[:20]}...")

    test_url = "https://example.com"

    # Each section is (title, cases); each case is (method, endpoint, payload).
    # GET cases carry no payload.
    sections = [
        ("CORE APIs", [
            # Markdown endpoint
            ("POST", "/md", {
                "url": test_url,
                "f": "fit",
                "q": "test query",
                "c": "0",
            }),
            # HTML endpoint
            ("POST", "/html", {"url": test_url}),
            # Screenshot endpoint
            ("POST", "/screenshot", {
                "url": test_url,
                "screenshot_wait_for": 2,
            }),
            # PDF endpoint
            ("POST", "/pdf", {"url": test_url}),
            # JavaScript execution
            ("POST", "/execute_js", {
                "url": test_url,
                "scripts": ["(() => document.title)()"],
            }),
            # Crawl endpoint
            ("POST", "/crawl", {
                "urls": [test_url],
                "browser_config": {},
                "crawler_config": {},
            }),
            # Config dump
            ("POST", "/config/dump", {"code": "CrawlerRunConfig()"}),
            # LLM endpoint
            ("GET", f"/llm/{test_url}?q=Extract%20main%20content", None),
            # Ask endpoint (the raw result dict is no longer dumped to stdout)
            ("GET", "/ask?context_type=all&query=crawl4ai&max_results=5", None),
        ]),
        ("JOB APIs", [
            # LLM job
            ("POST", "/llm/job", {
                "url": test_url,
                "q": "Extract main content",
                "cache": False,
            }),
            # Crawl job
            ("POST", "/crawl/job", {
                "urls": [test_url],
                "browser_config": {},
                "crawler_config": {},
            }),
        ]),
        ("MCP APIs", [
            # MCP schema
            ("GET", "/mcp/schema", None),
        ]),
        ("ERROR HANDLING", [
            # Invalid URL
            ("POST", "/md", {"url": "invalid-url", "f": "fit"}),
            # Invalid endpoint
            ("GET", "/nonexistent", None),
        ]),
    ]

    for title, cases in sections:
        print(f"\n=== {title} ===")
        for method, endpoint, payload in cases:
            if method == "POST":
                result = self.test_post_endpoint(endpoint, payload)
            else:
                result = self.test_get_endpoint(endpoint)
            self.print_result(result)

    # Print summary
    self.print_summary()
|
||||||
|
|
||||||
|
def print_summary(self):
    """Print test results summary

    Reports totals, the pass rate, the list of failed tests and
    response-time statistics for ``self.results``, then writes a JSON report
    named ``crawl4ai_test_report_<unix-time>.json`` to the current working
    directory.
    """
    print("\n" + "=" * 60)
    print("📊 TEST RESULTS SUMMARY")
    print("=" * 60)

    total = len(self.results)
    passed = sum(1 for r in self.results if r['status'] == 'PASS')
    failed = sum(1 for r in self.results if r['status'] == 'FAIL')
    # With no recorded results the ratio is undefined; report 0% instead of
    # raising ZeroDivisionError.
    success_rate = (passed / total) * 100 if total else 0.0

    print(f"Total Tests: {total}")
    print(f"✅ Passed: {passed}")
    print(f"❌ Failed: {failed}")
    print(f"📈 Success Rate: {success_rate:.1f}%")

    if failed > 0:
        print("\n❌ FAILED TESTS:")
        for result in self.results:
            if result['status'] == 'FAIL':
                print(f" • {result['method']} {result['endpoint']}")
                if 'error' in result:
                    print(f" Error: {result['error']}")

    # Performance statistics (only results with a positive response time count)
    response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
    if response_times:
        avg_time = sum(response_times) / len(response_times)
        max_time = max(response_times)
        print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
        print(f"⏱️ Max Response Time: {max_time:.3f}s")

    # Save detailed report
    report_file = f"crawl4ai_test_report_{int(time.time())}.json"
    with open(report_file, 'w') as f:
        json.dump({
            "timestamp": time.time(),
            "server_url": self.base_url,
            "version": "0.7.0",
            "summary": {
                "total": total,
                "passed": passed,
                "failed": failed
            },
            "results": self.results
        }, f, indent=2)

    print(f"\n📄 Detailed report saved to: {report_file}")
|
||||||
|
|
||||||
|
def main():
    """Main test runner

    Reads the optional ``--url`` argument (defaulting to ``BASE_URL``), runs
    the whole suite against that server, and exits with status 1 if the
    suite raises an unexpected exception. A keyboard interrupt is reported
    without changing the exit status.
    """
    import argparse

    cli = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
    cli.add_argument('--url', default=BASE_URL, help='Base URL of the server')
    options = cli.parse_args()

    suite = SimpleApiTester(options.url)

    try:
        suite.run_all_tests()
    except KeyboardInterrupt:
        print("\n🛑 Test suite interrupted by user")
    except Exception as exc:
        print(f"\n💥 Test suite failed with error: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
55
tests/profiler/test_keyboard_handle.py
Normal file
55
tests/profiler/test_keyboard_handle.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import sys
|
||||||
|
import pytest
|
||||||
|
import asyncio
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from crawl4ai.browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
@pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
async def test_keyboard_input_handling():
    """Check that a simulated 'q' keypress sets the done event within one second.

    The keystrokes come from two mocks: ``mock_kbhit`` reports pending input
    and ``mock_getch`` returns the raw bytes. The first keystroke is a
    two-byte arrow-key sequence that must be ignored; the second is ``q``.

    NOTE(review): the listener below is a local re-implementation that calls
    the mocks directly; BrowserProfiler is never instantiated here. Confirm
    that the loop still mirrors the production listener.
    NOTE(review): nesting was reconstructed from a flattened listing. Confirm
    that ``await asyncio.sleep(0.1)`` sits at the ``try`` level rather than
    inside the ``if mock_kbhit()`` branch.
    """
    # Mock sequence of keystrokes: arrow key followed by 'q'
    mock_keys = [b'\x00K', b'q']
    mock_kbhit = MagicMock(side_effect=[True, True, False])
    mock_getch = MagicMock(side_effect=mock_keys)

    with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
        # profiler = BrowserProfiler()
        user_done_event = asyncio.Event()

        # Create a local async function to simulate the keyboard input handling
        async def test_listen_for_quit_command():
            if sys.platform == "win32":
                while True:
                    try:
                        if mock_kbhit():
                            raw = mock_getch()
                            try:
                                key = raw.decode("utf-8")
                            except UnicodeDecodeError:
                                # Undecodable bytes are skipped.
                                continue

                            # Multi-character sequences (such as the mocked
                            # arrow key) and non-printable keys are skipped.
                            if len(key) != 1 or not key.isprintable():
                                continue

                            if key.lower() == "q":
                                user_done_event.set()
                                return

                        await asyncio.sleep(0.1)
                    except Exception as e:
                        # Any error from the mocks is swallowed and the loop retries.
                        continue

        # Run the listener
        listener_task = asyncio.create_task(test_listen_for_quit_command())

        # Wait for the event to be set
        try:
            await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
            assert user_done_event.is_set()
        finally:
            # Cancel the listener if it is still running so no task outlives the test.
            if not listener_task.done():
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass
|
||||||
42
tests/test_arun_many.py
Normal file
42
tests/test_arun_many.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
"""
|
||||||
|
Test example for multiple crawler configs feature
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||||
|
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||||
|
|
||||||
|
|
||||||
|
async def test_run_many():
    """Crawl the first two sample URLs with one shared config and print each status."""
    shared_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        # scraping_strategy=PDFContentScrapingStrategy()
    )

    candidate_urls = [
        # "https://blog.python.org/",  # Blog URL
        "https://www.python.org/",  # Generic HTTPS page
        "https://www.kidocode.com/",  # Generic HTTPS page
        "https://www.example.com/",  # Generic HTTPS page
        # "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
    ]
    selected_urls = candidate_urls[:2]

    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        crawl_results = await crawler.arun_many(urls=selected_urls, config=shared_config)
        print(f"Crawled {len(crawl_results)} URLs with single config\n")
        for entry in crawl_results:
            print(f" {entry.url} -> {entry.status_code}")


if __name__ == "__main__":
    asyncio.run(test_run_many())
|
||||||
131
tests/test_config_matching_only.py
Normal file
131
tests/test_config_matching_only.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
"""
|
||||||
|
Test only the config matching logic without running crawler
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||||
|
|
||||||
|
def test_all_matching_scenarios():
    """Print a ✓/✗ report of ``CrawlerRunConfig.is_match()`` for sample URLs.

    Covers a single glob pattern, OR-combined pattern lists, a custom
    function, AND-combined mixed matchers, a more complex OR combination and
    a few edge cases. The outcome is only printed; nothing is asserted.
    """

    def report(config, cases):
        # One line per (url, expected) pair: ✓ when is_match() agrees, ✗ otherwise.
        for url, expected in cases:
            result = config.is_match(url)
            status = "✓" if result == expected else "✗"
            print(f" {status} {url} -> {result}")

    rule = "=" * 50
    print("Testing CrawlerRunConfig.is_match() method")
    print(rule)

    # Test 1: Single string pattern
    print("\n1. Single string pattern (glob style)")
    report(
        CrawlerRunConfig(
            url_matcher="*.pdf",
            # For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
        ),
        [
            ("https://example.com/file.pdf", True),
            ("https://example.com/doc.PDF", False),  # Case sensitive
            ("https://example.com/file.txt", False),
            ("file.pdf", True),
        ],
    )

    # Test 2: List of patterns with OR
    print("\n2. List of patterns with OR (default)")
    report(
        CrawlerRunConfig(
            url_matcher=["*/article/*", "*/blog/*", "*.html"],
            match_mode=MatchMode.OR
        ),
        [
            ("https://example.com/article/news", True),
            ("https://example.com/blog/post", True),
            ("https://example.com/page.html", True),
            ("https://example.com/page.php", False),
        ],
    )

    # Test 3: Custom function
    print("\n3. Custom function matcher")
    report(
        CrawlerRunConfig(
            url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
        ),
        [
            ("https://api.example.com/data.json", True),
            ("https://api.example.com/data.xml", True),
            ("https://api.example.com/data.html", False),
            ("https://example.com/data.json", False),  # No 'api'
        ],
    )

    # Test 4: Mixed list with AND
    print("\n4. Mixed patterns and functions with AND")
    report(
        CrawlerRunConfig(
            url_matcher=[
                "https://*",  # Must be HTTPS
                lambda url: '.com' in url,  # Must have .com
                lambda url: len(url) < 50  # Must be short
            ],
            match_mode=MatchMode.AND
        ),
        [
            ("https://example.com/page", True),
            ("http://example.com/page", False),  # Not HTTPS
            ("https://example.org/page", False),  # No .com
            ("https://example.com/" + "x" * 50, False),  # Too long
        ],
    )

    # Test 5: Complex real-world scenario
    print("\n5. Complex pattern combinations")
    report(
        CrawlerRunConfig(
            url_matcher=[
                "*/api/v[0-9]/*",  # API versioned endpoints
                lambda url: 'graphql' in url,  # GraphQL endpoints
                "*.json"  # JSON files
            ],
            match_mode=MatchMode.OR
        ),
        [
            ("https://example.com/api/v1/users", True),
            ("https://example.com/api/v2/posts", True),
            ("https://example.com/graphql", True),
            ("https://example.com/data.json", True),
            ("https://example.com/api/users", False),  # No version
        ],
    )

    # Test 6: Edge cases
    print("\n6. Edge cases")

    # No matcher
    outcome = CrawlerRunConfig().is_match("https://example.com")
    print(f" {'✓' if not outcome else '✗'} No matcher -> {outcome}")

    # Empty list
    outcome = CrawlerRunConfig(url_matcher=[]).is_match("https://example.com")
    print(f" {'✓' if not outcome else '✗'} Empty list -> {outcome}")

    # None in list (should be skipped)
    outcome = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"]).is_match("test.pdf")
    print(f" {'✓' if outcome else '✗'} List with None -> {outcome}")

    print("\n" + rule)
    print("All matching tests completed!")


if __name__ == "__main__":
    test_all_matching_scenarios()
|
||||||
87
tests/test_config_selection.py
Normal file
87
tests/test_config_selection.py
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
"""
|
||||||
|
Test config selection logic in dispatchers
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||||
|
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
|
||||||
|
|
||||||
|
class TestDispatcher(BaseDispatcher):
    """Simple test dispatcher to verify config selection"""

    # The class name starts with "Test", so pytest would try to collect this
    # helper as a test class; opt out explicitly.
    __test__ = False

    async def crawl_url(self, url, config, task_id, **kwargs):
        """Return the URL together with the ``id()`` of the config chosen for it.

        ``config`` is passed straight to ``self.select_config``; ``task_id``
        and any extra keyword arguments are accepted but not used.
        """
        # Just return which config was selected
        selected = self.select_config(url, config)
        return {"url": url, "config_id": id(selected)}

    async def run_urls(self, urls, crawler, config):
        """Run ``crawl_url`` for each URL in order and return the list of results.

        ``crawler`` is accepted but not used.
        """
        return [await self.crawl_url(url, config, "test") for url in urls]
|
||||||
|
|
||||||
|
async def test_dispatcher_config_selection():
    """Verify which config the dispatchers pick for a URL.

    Checks a single config, selection from a list (falling back to the first
    entry when nothing matches), ``MemoryAdaptiveDispatcher.select_config``
    called directly, and the empty-list / ``None`` edge cases.
    """
    rule = "=" * 50
    print("Testing dispatcher config selection")
    print(rule)

    # Create test configs with different matchers
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
    api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
    default_config = CrawlerRunConfig()  # No matcher
    configs = [pdf_config, api_config, default_config]

    # Create test dispatcher
    dispatcher = TestDispatcher()

    # Test single config
    print("\nTest 1: Single config")
    outcome = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1")
    assert outcome["config_id"] == id(pdf_config)
    print("✓ Single config works")

    # Test config list selection
    print("\nTest 2: Config list selection")
    expectations = (
        ("https://example.com/file.pdf", id(pdf_config)),
        ("https://api.example.com/data", id(api_config)),
        ("https://example.com/page", id(configs[0])),  # No match, uses first
    )
    for url, expected_id in expectations:
        outcome = await dispatcher.crawl_url(url, configs, "test")
        assert outcome["config_id"] == expected_id, f"URL {url} got wrong config"
        print(f"✓ {url} -> correct config selected")

    # Test with MemoryAdaptiveDispatcher
    print("\nTest 3: MemoryAdaptiveDispatcher config selection")
    mem_dispatcher = MemoryAdaptiveDispatcher()

    # Test select_config method directly
    chosen = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
    assert chosen == pdf_config
    print("✓ MemoryAdaptiveDispatcher.select_config works")

    # Test empty config list
    print("\nTest 4: Edge cases")
    chosen = mem_dispatcher.select_config("https://example.com", [])
    assert isinstance(chosen, CrawlerRunConfig)  # Should return default
    print("✓ Empty config list returns default config")

    # Test None config
    chosen = mem_dispatcher.select_config("https://example.com", None)
    assert isinstance(chosen, CrawlerRunConfig)  # Should return default
    print("✓ None config returns default config")

    print("\n" + rule)
    print("All dispatcher tests passed! ✓")


if __name__ == "__main__":
    asyncio.run(test_dispatcher_config_selection())
|
||||||
122
tests/test_docker_api_with_llm_provider.py
Normal file
122
tests/test_docker_api_with_llm_provider.py
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test script to verify Docker API with LLM provider configuration."""
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:11235"
|
||||||
|
|
||||||
|
def test_health():
    """Test health endpoint."""
    print("1. Testing health endpoint...")
    # Bound the wait: without a timeout, requests can block forever on an
    # unresponsive server.
    response = requests.get(f"{BASE_URL}/health", timeout=30)
    print(f" Status: {response.status_code}")
    print(f" Response: {response.json()}")
    print()
|
||||||
|
|
||||||
|
def test_schema():
    """Test schema endpoint to see configuration."""
    print("2. Testing schema endpoint...")
    # Bound the wait: without a timeout, requests can block forever on an
    # unresponsive server.
    response = requests.get(f"{BASE_URL}/schema", timeout=30)
    print(f" Status: {response.status_code}")
    # Print only browser config to keep output concise
    print(f" Browser config keys: {list(response.json().get('browser', {}).keys())[:5]}...")
    print()
|
||||||
|
|
||||||
|
def test_markdown_with_llm_filter():
    """Test markdown endpoint with LLM filter (should use configured provider)."""
    print("3. Testing markdown endpoint with LLM filter...")
    print(" This should use the Groq provider from LLM_PROVIDER env var")

    # Note: This will fail with dummy API keys, but we can see if it tries to use Groq
    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content"
    }

    # Crawling plus an LLM call can be slow, so allow a generous but finite
    # wait instead of blocking indefinitely.
    response = requests.post(f"{BASE_URL}/md", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()
|
||||||
|
|
||||||
|
def test_markdown_with_provider_override():
    """Test markdown endpoint with provider override in request."""
    print("4. Testing markdown endpoint with provider override...")
    print(" This should use OpenAI provider from request parameter")

    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content",
        "provider": "openai/gpt-4"  # Override to use OpenAI
    }

    # Crawling plus an LLM call can be slow, so allow a generous but finite
    # wait instead of blocking indefinitely.
    response = requests.post(f"{BASE_URL}/md", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()
|
||||||
|
|
||||||
|
def test_simple_crawl():
    """Test simple crawl without LLM."""
    print("5. Testing simple crawl (no LLM required)...")

    payload = {
        "urls": ["https://httpbin.org/html"],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"}
        }
    }

    # A browser-backed crawl can take a while, so allow a generous but finite
    # wait instead of blocking indefinitely.
    response = requests.post(f"{BASE_URL}/crawl", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code == 200:
        result = response.json()
        print(f" Success: {result.get('success')}")
        print(f" Results count: {len(result.get('results', []))}")
        if result.get('results'):
            print(f" First result success: {result['results'][0].get('success')}")
    else:
        print(f" Error: {response.text[:200]}...")
    print()
|
||||||
|
|
||||||
|
def test_playground():
    """Test if playground is accessible."""
    print("6. Testing playground interface...")
    # Bound the wait: without a timeout, requests can block forever on an
    # unresponsive server.
    response = requests.get(f"{BASE_URL}/playground", timeout=30)
    print(f" Status: {response.status_code}")
    print(f" Content-Type: {response.headers.get('content-type')}")
    print()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    print("=== Crawl4AI Docker API Tests ===\n")
    print(f"Testing API at {BASE_URL}\n")

    # Wait a bit for server to be fully ready
    time.sleep(2)

    # Checks that do not need a working LLM provider run first.
    for basic_check in (test_health, test_schema, test_simple_crawl, test_playground):
        basic_check()

    print("\nTesting LLM functionality (these may fail with dummy API keys):\n")
    for llm_check in (test_markdown_with_llm_filter, test_markdown_with_provider_override):
        llm_check()

    print("\nTests completed!")
|
||||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
|||||||
|
|
||||||
from crawl4ai.models import Link
|
from crawl4ai.models import Link
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
|||||||
print(f" {key}: {value}")
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
print(" Usage:")
|
print(" Usage:")
|
||||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
print(" from crawl4ai import LinkPreviewConfig")
|
||||||
print(" config = CrawlerRunConfig(")
|
print(" config = CrawlerRunConfig(")
|
||||||
print(" link_preview_config=LinkPreviewConfig(")
|
print(" link_preview_config=LinkPreviewConfig(")
|
||||||
for key, value in config_dict.items():
|
for key, value in config_dict.items():
|
||||||
|
|||||||
71
tests/test_memory_macos.py
Executable file
71
tests/test_memory_macos.py
Executable file
@@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Test script to verify macOS memory calculation accuracy."""
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
import platform
|
||||||
|
import time
|
||||||
|
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
|
||||||
|
|
||||||
|
|
||||||
|
def test_memory_calculation():
    """Test and compare memory calculations.

    Prints psutil's figures next to the values returned by the
    ``crawl4ai.memory_utils`` helpers, shows which of three fixed percentage
    thresholds each figure would cross, then samples both percentages once
    per second for ten seconds.
    """
    print(f"Platform: {platform.system()}")
    print(f"Python version: {platform.python_version()}")
    print("-" * 60)

    bytes_per_gb = 1024**3

    # Get psutil's view
    snapshot = psutil.virtual_memory()
    psutil_percent = snapshot.percent
    psutil_available_gb = snapshot.available / bytes_per_gb
    total_gb = snapshot.total / bytes_per_gb

    # Get our corrected view
    true_percent = get_true_memory_usage_percent()
    true_available_gb = get_true_available_memory_gb()
    # The values are not used further; the call and 3-way unpacking are kept.
    _percent_calc, _available_calc, _total_calc = get_memory_stats()

    print("Memory Statistics Comparison:")
    print(f"Total Memory: {total_gb:.2f} GB")
    print()

    print("PSUtil (Standard) Calculation:")
    print(f" - Memory Used: {psutil_percent:.1f}%")
    print(f" - Available: {psutil_available_gb:.2f} GB")
    print()

    print("Platform-Aware Calculation:")
    print(f" - Memory Used: {true_percent:.1f}%")
    print(f" - Available: {true_available_gb:.2f} GB")
    print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
    print()

    # Show the impact on dispatcher behavior
    print("Impact on MemoryAdaptiveDispatcher:")
    threshold_table = (
        ("Normal", 90.0),
        ("Critical", 95.0),
        ("Recovery", 85.0),
    )
    for name, threshold in threshold_table:
        psutil_triggered = psutil_percent >= threshold
        true_triggered = true_percent >= threshold
        print(f" - {name} Threshold ({threshold}%):")
        print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
        print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
        if psutil_triggered != true_triggered:
            print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
    print()

    # Monitor for a few seconds
    print("Monitoring memory for 10 seconds...")
    for second in range(1, 11):
        sample = psutil.virtual_memory()
        true_pct = get_true_memory_usage_percent()
        # The carriage return keeps the readout on a single console line.
        print(f" {second}s - PSUtil: {sample.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
        time.sleep(1)
    print("\n")


if __name__ == "__main__":
    test_memory_calculation()
|
||||||
117
tests/test_multi_config.py
Normal file
117
tests/test_multi_config.py
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
"""
|
||||||
|
Test example for multiple crawler configs feature
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
||||||
|
|
||||||
|
async def test_multi_config():
    """Demonstrate per-URL config selection with several ``CrawlerRunConfig`` objects.

    The demo has two phases:

    1. Matching only: for each sample URL, print the index of the first config
       whose ``is_match(url)`` is true (or a "no match" message).
    2. Crawling: call ``AsyncWebCrawler.arun_many`` with a single config, with
       a list of configs, and with an inline lambda matcher.

    Progress is printed to stdout; nothing is asserted or returned. The second
    phase crawls live URLs.
    """
    # Create different configs for different URL patterns

    # Config for PDF files
    pdf_config = CrawlerRunConfig(
        url_matcher="*.pdf",
    )

    # Config for articles (using multiple patterns with OR logic)
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # Config using custom matcher function
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Config combining patterns and functions with AND logic
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",  # Matches .doc, .docx
            lambda url: url.startswith('https://')  # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # Default config (no url_matcher means it won't match anything unless it's the fallback)
    default_config = CrawlerRunConfig(
        # cache_mode=CacheMode.BYPASS,
    )

    # List of configs - order matters! First match wins
    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config  # Fallback
    ]

    # Test URLs - using real URLs that exist
    test_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        "https://blog.python.org/",  # Blog URL
        "https://api.github.com/users/github",  # GitHub API (returns JSON)
        "https://httpbin.org/json",  # API endpoint that returns JSON
        "https://www.python.org/",  # Generic HTTPS page
        "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # → Default config
    ]

    # Test the matching logic
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        for i, config in enumerate(configs):
            if config.is_match(url):
                print(f"{url} -> Config {i} matches")
                break
        else:
            # for/else: reached only when no config matched (the loop never hit `break`).
            # NOTE(review): this message names the first config as the fallback,
            # while default_config is listed last and labelled "Fallback" above —
            # confirm which one arun_many actually uses.
            print(f"{url} -> No match, will use fallback (first config)")

    print("\n" + "=" * 50 + "\n")

    # Now test with actual crawler
    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        result = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config
        )
        print(f"Crawled {len(result)} URLs with single config\n")

        # Multiple configs - new feature
        print("Test 2: Multiple configs")
        # Just test with 2 URLs to avoid timeout
        results = await crawler.arun_many(
            urls=test_urls[:2],  # Just test first 2 URLs
            config=configs  # Pass list of configs
        )
        print(f"Crawled {len(results)} URLs with multiple configs")

        # Using custom matcher inline
        print("\nTest 3: Inline custom matcher")
        # The cut-off has to sit between the short python.org URL (19 characters)
        # and the docs URL below (46 characters). The previous cut-off of 50
        # excluded every URL in the list, so the custom matcher never fired.
        min_url_length = 40
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > min_url_length and 'python' in url.lower(),
            verbose=False
        )
        results = await crawler.arun_many(
            urls=[
                "https://docs.python.org/3/library/asyncio.html",  # Long URL with 'python'
                "https://python.org/",  # Short URL with 'python' - won't match
                "https://www.google.com/"  # No 'python' - won't match
            ],
            config=[custom_config, default_config]
        )
        print(f"Crawled {len(results)} URLs with custom matcher")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Executed directly as a script: drive the async demo to completion.
    demo_coroutine = test_multi_config()
    asyncio.run(demo_coroutine)
|
||||||
Reference in New Issue
Block a user