Compare commits
48 Commits
v0.7.0
...
fix/exit_w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0541b61405 | ||
|
|
6735c68288 | ||
|
|
ff6ea41ac3 | ||
|
|
31a435fb0e | ||
|
|
5de6a28055 | ||
|
|
de1561ad14 | ||
|
|
337b588732 | ||
|
|
7a6ad547f0 | ||
|
|
e6692b987d | ||
|
|
307fe28b32 | ||
|
|
438a103b17 | ||
|
|
a03e68fa2f | ||
|
|
864d87afb2 | ||
|
|
508b6fc233 | ||
|
|
e3281935bc | ||
|
|
48647300b4 | ||
|
|
9f9ea3bb3b | ||
|
|
d58b93c207 | ||
|
|
e2b4705010 | ||
|
|
4a1abd5086 | ||
|
|
04258cd4f2 | ||
|
|
84e462d9f8 | ||
|
|
9546773a07 | ||
|
|
66a979ad11 | ||
|
|
0c31e91b53 | ||
|
|
1b6a31f88f | ||
|
|
b8c261780f | ||
|
|
db6ad7a79d | ||
|
|
004d514f33 | ||
|
|
3a9e2c716e | ||
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
0eaa9f9895 | ||
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
dd5ee752cf | ||
|
|
bde1bba6a2 | ||
|
|
14f690d751 | ||
|
|
ee25c771d8 | ||
|
|
c4d625fb3c | ||
|
|
ef722766f0 | ||
|
|
4bcb7171a3 |
142
.github/workflows/release.yml
vendored
Normal file
142
.github/workflows/release.yml
vendored
Normal file
@@ -0,0 +1,142 @@
|
||||
name: Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
- '!test-v*' # Exclude test tags
|
||||
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # Required for creating releases
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Releasing version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to PyPI..."
|
||||
twine upload dist/*
|
||||
echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Extract major and minor versions
|
||||
id: versions
|
||||
run: |
|
||||
VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
MAJOR=$(echo $VERSION | cut -d. -f1)
|
||||
MINOR=$(echo $VERSION | cut -d. -f1-2)
|
||||
echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
|
||||
echo "MINOR=$MINOR" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Build and push Docker images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
|
||||
unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
|
||||
unclecode/crawl4ai:latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
### 📦 Installation
|
||||
|
||||
**PyPI:**
|
||||
```bash
|
||||
pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
|
||||
```
|
||||
|
||||
**Docker:**
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
### 📝 What's Changed
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
|
||||
echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
116
.github/workflows/test-release.yml.disabled
vendored
Normal file
@@ -0,0 +1,116 @@
|
||||
name: Test Release Pipeline
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'test-v*'
|
||||
|
||||
jobs:
|
||||
test-release:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.12'
|
||||
|
||||
- name: Extract version from tag
|
||||
id: get_version
|
||||
run: |
|
||||
TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
|
||||
echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
|
||||
echo "Testing with version: $TAG_VERSION"
|
||||
|
||||
- name: Install package dependencies
|
||||
run: |
|
||||
pip install -e .
|
||||
|
||||
- name: Check version consistency
|
||||
run: |
|
||||
TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
|
||||
PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")
|
||||
|
||||
echo "Tag version: $TAG_VERSION"
|
||||
echo "Package version: $PACKAGE_VERSION"
|
||||
|
||||
if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
|
||||
echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
|
||||
echo "Please update crawl4ai/__version__.py to match the tag version"
|
||||
exit 1
|
||||
fi
|
||||
echo "✅ Version check passed: $TAG_VERSION"
|
||||
|
||||
- name: Install build dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build twine
|
||||
|
||||
- name: Build package
|
||||
run: python -m build
|
||||
|
||||
- name: Check package
|
||||
run: twine check dist/*
|
||||
|
||||
- name: Upload to Test PyPI
|
||||
env:
|
||||
TWINE_USERNAME: __token__
|
||||
TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
|
||||
run: |
|
||||
echo "📦 Uploading to Test PyPI..."
|
||||
twine upload --repository testpypi dist/* || {
|
||||
if [ $? -eq 1 ]; then
|
||||
echo "⚠️ Upload failed - likely version already exists on Test PyPI"
|
||||
echo "Continuing anyway for test purposes..."
|
||||
else
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
echo "✅ Test PyPI step complete"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Docker test images
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
tags: |
|
||||
unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
|
||||
unclecode/crawl4ai:test-latest
|
||||
platforms: linux/amd64,linux/arm64
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "" >> $GITHUB_STEP_SUMMARY
|
||||
echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
|
||||
echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
|
||||
echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
|
||||
15
CHANGELOG.md
15
CHANGELOG.md
@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **Flexible LLM Provider Configuration** (Docker):
|
||||
- Support for `LLM_PROVIDER` environment variable to override default provider
|
||||
- Per-request provider override via optional `provider` parameter in API endpoints
|
||||
- Automatic provider validation with clear error messages
|
||||
- Updated Docker documentation and examples
|
||||
|
||||
### Changed
|
||||
- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
|
||||
- Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
|
||||
- `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
|
||||
- All existing code using `WebScrapingStrategy` continues to work without modification
|
||||
- Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance
|
||||
|
||||
### Added
|
||||
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
|
||||
- Discover URLs from sitemaps and Common Crawl index
|
||||
|
||||
19
README.md
19
README.md
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
|
||||
|
||||
[✨ Check out latest update v0.7.0](#-recent-updates)
|
||||
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
|
||||
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
|
||||
|
||||
<details>
|
||||
<summary>🤓 <strong>My Personal Story</strong></summary>
|
||||
@@ -523,15 +523,18 @@ async def test_news_crawl():
|
||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7,
|
||||
max_history=100,
|
||||
learning_rate=0.2
|
||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||
max_depth=5, # Maximum crawl depth
|
||||
max_pages=20, # Maximum number of pages to crawl
|
||||
strategy="statistical"
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||
state = await adaptive_crawler.digest(
|
||||
start_url="https://news.example.com",
|
||||
query="latest news content"
|
||||
)
|
||||
# Crawler learns patterns and improves extraction over time
|
||||
```
|
||||
|
||||
|
||||
@@ -3,12 +3,12 @@ import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
WebScrapingStrategy,
|
||||
LXMLWebScrapingStrategy,
|
||||
WebScrapingStrategy, # Backward compatibility alias
|
||||
)
|
||||
from .async_logger import (
|
||||
AsyncLoggerBase,
|
||||
@@ -132,6 +132,7 @@ __all__ = [
|
||||
"CrawlResult",
|
||||
"CrawlerHub",
|
||||
"CacheMode",
|
||||
"MatchMode",
|
||||
"ContentScrapingStrategy",
|
||||
"WebScrapingStrategy",
|
||||
"LXMLWebScrapingStrategy",
|
||||
@@ -173,6 +174,7 @@ __all__ = [
|
||||
"CompilationResult",
|
||||
"ValidationResult",
|
||||
"ErrorDetail",
|
||||
"LinkPreviewConfig"
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# crawl4ai/__version__.py
|
||||
|
||||
# This is the version that will be used for stable releases
|
||||
__version__ = "0.7.0"
|
||||
__version__ = "0.7.2"
|
||||
|
||||
# For nightly builds, this gets set during build process
|
||||
__nightly_version__ = None
|
||||
|
||||
@@ -18,17 +18,24 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
from typing import Union, List
|
||||
from typing import Union, List, Callable
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
# Type alias for URL matching
|
||||
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
|
||||
|
||||
class MatchMode(Enum):
|
||||
OR = "or"
|
||||
AND = "and"
|
||||
|
||||
# from .proxy_strategy import ProxyConfig
|
||||
|
||||
|
||||
@@ -862,7 +869,7 @@ class CrawlerRunConfig():
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
Default: LXMLWebScrapingStrategy.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
@@ -1113,6 +1120,9 @@ class CrawlerRunConfig():
|
||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||
# Virtual Scroll Parameters
|
||||
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
||||
# URL Matching Parameters
|
||||
url_matcher: Optional[UrlMatcher] = None,
|
||||
match_mode: MatchMode = MatchMode.OR,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -1266,6 +1276,10 @@ class CrawlerRunConfig():
|
||||
else:
|
||||
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
||||
|
||||
# URL Matching Parameters
|
||||
self.url_matcher = url_matcher
|
||||
self.match_mode = match_mode
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
@@ -1321,6 +1335,51 @@ class CrawlerRunConfig():
|
||||
if "compilation error" not in str(e).lower():
|
||||
raise ValueError(f"Failed to compile C4A script: {str(e)}")
|
||||
raise
|
||||
|
||||
def is_match(self, url: str) -> bool:
|
||||
"""Check if this config matches the given URL.
|
||||
|
||||
Args:
|
||||
url: The URL to check against this config's matcher
|
||||
|
||||
Returns:
|
||||
bool: True if this config should be used for the URL or if no matcher is set.
|
||||
"""
|
||||
if self.url_matcher is None:
|
||||
return True
|
||||
|
||||
if callable(self.url_matcher):
|
||||
# Single function matcher
|
||||
return self.url_matcher(url)
|
||||
|
||||
elif isinstance(self.url_matcher, str):
|
||||
# Single pattern string
|
||||
from fnmatch import fnmatch
|
||||
return fnmatch(url, self.url_matcher)
|
||||
|
||||
elif isinstance(self.url_matcher, list):
|
||||
# List of mixed matchers
|
||||
if not self.url_matcher: # Empty list
|
||||
return False
|
||||
|
||||
results = []
|
||||
for matcher in self.url_matcher:
|
||||
if callable(matcher):
|
||||
results.append(matcher(url))
|
||||
elif isinstance(matcher, str):
|
||||
from fnmatch import fnmatch
|
||||
results.append(fnmatch(url, matcher))
|
||||
else:
|
||||
# Skip invalid matchers
|
||||
continue
|
||||
|
||||
# Apply match mode logic
|
||||
if self.match_mode == MatchMode.OR:
|
||||
return any(results) if results else False
|
||||
else: # AND mode
|
||||
return all(results) if results else False
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def __getattr__(self, name):
|
||||
@@ -1443,6 +1502,9 @@ class CrawlerRunConfig():
|
||||
# Link Extraction Parameters
|
||||
link_preview_config=kwargs.get("link_preview_config"),
|
||||
url=kwargs.get("url"),
|
||||
# URL Matching Parameters
|
||||
url_matcher=kwargs.get("url_matcher"),
|
||||
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
)
|
||||
@@ -1540,6 +1602,8 @@ class CrawlerRunConfig():
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
||||
"url": self.url,
|
||||
"url_matcher": self.url_matcher,
|
||||
"match_mode": self.match_mode,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
|
||||
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
except Error:
|
||||
visibility_info = await self.check_visibility(page)
|
||||
|
||||
if self.browser_config.config.verbose:
|
||||
if self.browser_config.verbose:
|
||||
self.logger.debug(
|
||||
message="Body visibility info: {info}",
|
||||
tag="DEBUG",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Dict, Optional, List, Tuple
|
||||
from typing import Dict, Optional, List, Tuple, Union
|
||||
from .async_configs import CrawlerRunConfig
|
||||
from .models import (
|
||||
CrawlResult,
|
||||
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .memory_utils import get_true_memory_usage_percent
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
def __init__(
|
||||
@@ -96,11 +98,37 @@ class BaseDispatcher(ABC):
|
||||
self.rate_limiter = rate_limiter
|
||||
self.monitor = monitor
|
||||
|
||||
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
|
||||
"""Select the appropriate config for a given URL.
|
||||
|
||||
Args:
|
||||
url: The URL to match against
|
||||
configs: Single config or list of configs to choose from
|
||||
|
||||
Returns:
|
||||
The matching config, or None if no match found
|
||||
"""
|
||||
# Single config - return as is
|
||||
if isinstance(configs, CrawlerRunConfig):
|
||||
return configs
|
||||
|
||||
# Empty list - return None
|
||||
if not configs:
|
||||
return None
|
||||
|
||||
# Find first matching config
|
||||
for config in configs:
|
||||
if config.is_match(url):
|
||||
return config
|
||||
|
||||
# No match found - return None to indicate URL should be skipped
|
||||
return None
|
||||
|
||||
@abstractmethod
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -111,7 +139,7 @@ class BaseDispatcher(ABC):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
pass
|
||||
@@ -147,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def _memory_monitor_task(self):
|
||||
"""Background task to continuously monitor memory usage and update state"""
|
||||
while True:
|
||||
self.current_memory_percent = psutil.virtual_memory().percent
|
||||
self.current_memory_percent = get_true_memory_usage_percent()
|
||||
|
||||
# Enter memory pressure mode if we cross the threshold
|
||||
if self.current_memory_percent >= self.memory_threshold_percent:
|
||||
@@ -200,7 +228,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
retry_count: int = 0,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -208,6 +236,37 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message,
|
||||
retry_count=retry_count
|
||||
)
|
||||
|
||||
# Get starting memory for accurate measurement
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -257,8 +316,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
retry_count=retry_count + 1
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
# Execute the crawl with selected config
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
|
||||
# Measure memory usage
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -316,7 +375,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -470,7 +529,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -572,7 +631,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
semaphore: asyncio.Semaphore = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -580,6 +639,36 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
try:
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
@@ -592,7 +681,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async with semaphore:
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
memory_usage = peak_memory = end_memory - start_memory
|
||||
@@ -654,7 +743,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
self,
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
urls: List[str],
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
if self.monitor:
|
||||
|
||||
@@ -829,7 +829,7 @@ class AsyncUrlSeeder:
|
||||
|
||||
async def _iter_sitemap(self, url: str):
|
||||
try:
|
||||
r = await self.client.get(url, timeout=15)
|
||||
r = await self.client.get(url, timeout=15, follow_redirects=True)
|
||||
r.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
||||
|
||||
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
tables = media.pop("tables", [])
|
||||
links = result.links.model_dump()
|
||||
# media = result.media.model_dump()
|
||||
# tables = media.pop("tables", [])
|
||||
# links = result.links.model_dump()
|
||||
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
|
||||
tables = media.pop("tables", []) if isinstance(media, dict) else []
|
||||
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
|
||||
metadata = result.metadata
|
||||
|
||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||
@@ -650,7 +653,7 @@ class AsyncWebCrawler:
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
@@ -671,7 +674,9 @@ class AsyncWebCrawler:
|
||||
|
||||
Args:
|
||||
urls: List of URLs to crawl
|
||||
config: Configuration object controlling crawl behavior for all URLs
|
||||
config: Configuration object(s) controlling crawl behavior. Can be:
|
||||
- Single CrawlerRunConfig: Used for all URLs
|
||||
- List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
|
||||
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
|
||||
[other parameters maintained for backwards compatibility]
|
||||
|
||||
@@ -736,7 +741,11 @@ class AsyncWebCrawler:
|
||||
or task_result.result
|
||||
)
|
||||
|
||||
stream = config.stream
|
||||
# Handle stream setting - use first config's stream setting if config is a list
|
||||
if isinstance(config, list):
|
||||
stream = config[0].stream if config else False
|
||||
else:
|
||||
stream = config.stream
|
||||
|
||||
if stream:
|
||||
|
||||
|
||||
@@ -14,23 +14,8 @@ import hashlib
|
||||
from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from playwright_stealth import StealthConfig
|
||||
from .utils import get_chromium_path
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
chrome_csi=True,
|
||||
chrome_load_times=True,
|
||||
chrome_runtime=True,
|
||||
navigator_languages=True,
|
||||
navigator_plugins=True,
|
||||
navigator_permissions=True,
|
||||
webgl_vendor=True,
|
||||
outerdimensions=True,
|
||||
navigator_hardware_concurrency=True,
|
||||
media_codecs=True,
|
||||
)
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
"--disable-background-networking",
|
||||
|
||||
@@ -65,6 +65,213 @@ class BrowserProfiler:
|
||||
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
|
||||
os.makedirs(self.builtin_browser_dir, exist_ok=True)
|
||||
|
||||
def _is_windows(self) -> bool:
|
||||
"""Check if running on Windows platform."""
|
||||
return sys.platform.startswith('win') or sys.platform == 'cygwin'
|
||||
|
||||
def _is_macos(self) -> bool:
|
||||
"""Check if running on macOS platform."""
|
||||
return sys.platform == 'darwin'
|
||||
|
||||
def _is_linux(self) -> bool:
|
||||
"""Check if running on Linux platform."""
|
||||
return sys.platform.startswith('linux')
|
||||
|
||||
def _get_quit_message(self, tag: str) -> str:
|
||||
"""Get appropriate quit message based on context."""
|
||||
if tag == "PROFILE":
|
||||
return "Closing browser and saving profile..."
|
||||
elif tag == "CDP":
|
||||
return "Closing browser..."
|
||||
else:
|
||||
return "Closing browser..."
|
||||
|
||||
async def _listen_windows(self, user_done_event, check_browser_process, tag: str):
|
||||
"""Windows-specific keyboard listener using msvcrt."""
|
||||
try:
|
||||
import msvcrt
|
||||
except ImportError:
|
||||
raise ImportError("msvcrt module not available on this platform")
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Check for keyboard input
|
||||
if msvcrt.kbhit():
|
||||
raw = msvcrt.getch()
|
||||
|
||||
# Handle Unicode decoding more robustly
|
||||
key = None
|
||||
try:
|
||||
key = raw.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
# Try different encodings
|
||||
key = raw.decode("latin1")
|
||||
except UnicodeDecodeError:
|
||||
# Skip if we can't decode
|
||||
continue
|
||||
|
||||
# Validate key
|
||||
if not key or len(key) != 1:
|
||||
continue
|
||||
|
||||
# Check for printable characters only
|
||||
if not key.isprintable():
|
||||
continue
|
||||
|
||||
# Check for quit command
|
||||
if key.lower() == "q":
|
||||
self.logger.info(
|
||||
self._get_quit_message(tag),
|
||||
tag=tag,
|
||||
base_color=LogColor.GREEN
|
||||
)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if browser process ended
|
||||
if await check_browser_process():
|
||||
return
|
||||
|
||||
# Small delay to prevent busy waiting
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error in Windows keyboard listener: {e}", tag=tag)
|
||||
# Continue trying instead of failing completely
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
|
||||
async def _listen_unix(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
|
||||
"""Unix/Linux/macOS keyboard listener using termios and select."""
|
||||
try:
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
except ImportError:
|
||||
raise ImportError("termios/tty/select modules not available on this platform")
|
||||
|
||||
# Get stdin file descriptor
|
||||
try:
|
||||
fd = sys.stdin.fileno()
|
||||
except (AttributeError, OSError):
|
||||
raise ImportError("stdin is not a terminal")
|
||||
|
||||
# Save original terminal settings
|
||||
old_settings = None
|
||||
try:
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
except termios.error as e:
|
||||
raise ImportError(f"Cannot get terminal attributes: {e}")
|
||||
|
||||
try:
|
||||
# Switch to non-canonical mode (cbreak mode)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Use select to check if input is available (non-blocking)
|
||||
# Timeout of 0.5 seconds to periodically check browser process
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
|
||||
if readable:
|
||||
# Read one character
|
||||
key = sys.stdin.read(1)
|
||||
|
||||
if key and key.lower() == "q":
|
||||
self.logger.info(
|
||||
self._get_quit_message(tag),
|
||||
tag=tag,
|
||||
base_color=LogColor.GREEN
|
||||
)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if browser process ended
|
||||
if await check_browser_process():
|
||||
return
|
||||
|
||||
# Small delay to prevent busy waiting
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
# Handle Ctrl+C or EOF gracefully
|
||||
self.logger.info("Keyboard interrupt received", tag=tag)
|
||||
user_done_event.set()
|
||||
return
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error in Unix keyboard listener: {e}", tag=tag)
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
|
||||
finally:
|
||||
# Always restore terminal settings
|
||||
if old_settings is not None:
|
||||
try:
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to restore terminal settings: {e}", tag=tag)
|
||||
|
||||
async def _listen_fallback(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
|
||||
"""Fallback keyboard listener using simple input() method."""
|
||||
self.logger.info("Using fallback input mode. Type 'q' and press Enter to quit.", tag=tag)
|
||||
|
||||
# Run input in a separate thread to avoid blocking
|
||||
import threading
|
||||
import queue
|
||||
|
||||
input_queue = queue.Queue()
|
||||
|
||||
def input_thread():
|
||||
"""Thread function to handle input."""
|
||||
try:
|
||||
while not user_done_event.is_set():
|
||||
try:
|
||||
# Use input() with a prompt
|
||||
user_input = input("Press 'q' + Enter to quit: ").strip().lower()
|
||||
input_queue.put(user_input)
|
||||
if user_input == 'q':
|
||||
break
|
||||
except (EOFError, KeyboardInterrupt):
|
||||
input_queue.put('q')
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error in input thread: {e}", tag=tag)
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.error(f"Input thread failed: {e}", tag=tag)
|
||||
|
||||
# Start input thread
|
||||
thread = threading.Thread(target=input_thread, daemon=True)
|
||||
thread.start()
|
||||
|
||||
try:
|
||||
while not user_done_event.is_set():
|
||||
# Check for user input
|
||||
try:
|
||||
user_input = input_queue.get_nowait()
|
||||
if user_input == 'q':
|
||||
self.logger.info(
|
||||
self._get_quit_message(tag),
|
||||
tag=tag,
|
||||
base_color=LogColor.GREEN
|
||||
)
|
||||
user_done_event.set()
|
||||
return
|
||||
except queue.Empty:
|
||||
pass
|
||||
|
||||
# Check if browser process ended
|
||||
if await check_browser_process():
|
||||
return
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Fallback listener failed: {e}", tag=tag)
|
||||
user_done_event.set()
|
||||
|
||||
async def create_profile(self,
|
||||
profile_name: Optional[str] = None,
|
||||
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
||||
@@ -180,42 +387,38 @@ class BrowserProfiler:
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||
# First output the prompt
|
||||
self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
self.logger.info(
|
||||
"Press {segment} when you've finished using the browser...",
|
||||
tag="PROFILE",
|
||||
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||
base_color=LogColor.CYAN
|
||||
)
|
||||
|
||||
async def check_browser_process():
|
||||
"""Check if browser process is still running."""
|
||||
if (
|
||||
managed_browser.browser_process
|
||||
and managed_browser.browser_process.poll() is not None
|
||||
):
|
||||
self.logger.info(
|
||||
"Browser already closed. Ending input listener.", tag="PROFILE"
|
||||
)
|
||||
user_done_event.set()
|
||||
return True
|
||||
return False
|
||||
|
||||
# Try platform-specific implementations with fallback
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
if self._is_windows():
|
||||
await self._listen_windows(user_done_event, check_browser_process, "PROFILE")
|
||||
else:
|
||||
await self._listen_unix(user_done_event, check_browser_process, "PROFILE")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="PROFILE")
|
||||
self.logger.info("Falling back to simple input mode...", tag="PROFILE")
|
||||
await self._listen_fallback(user_done_event, check_browser_process, "PROFILE")
|
||||
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
@@ -682,42 +885,33 @@ class BrowserProfiler:
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||
# First output the prompt
|
||||
self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
self.logger.info(
|
||||
"Press {segment} to stop the browser and exit...",
|
||||
tag="CDP",
|
||||
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||
base_color=LogColor.CYAN
|
||||
)
|
||||
|
||||
async def check_browser_process():
|
||||
"""Check if browser process is still running."""
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
||||
user_done_event.set()
|
||||
return True
|
||||
return False
|
||||
|
||||
# Try platform-specific implementations with fallback
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info("Closing browser...", tag="CDP")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
if self._is_windows():
|
||||
await self._listen_windows(user_done_event, check_browser_process, "CDP")
|
||||
else:
|
||||
await self._listen_unix(user_done_event, check_browser_process, "CDP")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="CDP")
|
||||
self.logger.info("Falling back to simple input mode...", tag="CDP")
|
||||
await self._listen_fallback(user_done_event, check_browser_process, "CDP")
|
||||
|
||||
# Function to retrieve and display CDP JSON config
|
||||
async def get_cdp_json(port):
|
||||
|
||||
@@ -27,7 +27,10 @@ from crawl4ai import (
|
||||
PruningContentFilter,
|
||||
BrowserProfiler,
|
||||
DefaultMarkdownGenerator,
|
||||
LLMConfig
|
||||
LLMConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
DFSDeepCrawlStrategy,
|
||||
BestFirstCrawlingStrategy,
|
||||
)
|
||||
from crawl4ai.config import USER_SETTINGS
|
||||
from litellm import completion
|
||||
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl a website and extract content
|
||||
|
||||
Simple Usage:
|
||||
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
|
||||
|
||||
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
|
||||
|
||||
# Handle deep crawling configuration
|
||||
if deep_crawl:
|
||||
if deep_crawl == "bfs":
|
||||
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "dfs":
|
||||
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
elif deep_crawl == "best-first":
|
||||
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=3,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
if verbose:
|
||||
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
|
||||
|
||||
config = get_global_config()
|
||||
|
||||
browser_cfg.verbose = config.get("VERBOSE", False)
|
||||
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
|
||||
verbose
|
||||
)
|
||||
|
||||
# Handle deep crawl results (list) vs single result
|
||||
if isinstance(result, list):
|
||||
if len(result) == 0:
|
||||
click.echo("No results found during deep crawling")
|
||||
return
|
||||
# Use the first result for question answering and output
|
||||
main_result = result[0]
|
||||
all_results = result
|
||||
else:
|
||||
# Single result from regular crawling
|
||||
main_result = result
|
||||
all_results = [result]
|
||||
|
||||
# Handle question
|
||||
if question:
|
||||
provider, token = setup_llm_config()
|
||||
markdown = result.markdown.raw_markdown
|
||||
markdown = main_result.markdown.raw_markdown
|
||||
anyio.run(stream_llm_response, url, markdown, question, provider, token)
|
||||
return
|
||||
|
||||
# Handle output
|
||||
if not output_file:
|
||||
if output == "all":
|
||||
click.echo(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
click.echo(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
click.echo(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
print(result.extracted_content)
|
||||
extracted_items = json.loads(result.extracted_content)
|
||||
print(main_result.extracted_content)
|
||||
extracted_items = json.loads(main_result.extracted_content)
|
||||
click.echo(json.dumps(extracted_items, indent=2))
|
||||
|
||||
elif output in ["markdown", "md"]:
|
||||
click.echo(result.markdown.raw_markdown)
|
||||
click.echo(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
click.echo(result.markdown.fit_markdown)
|
||||
click.echo(main_result.markdown.fit_markdown)
|
||||
else:
|
||||
if output == "all":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(json.dumps(result.model_dump(), indent=2))
|
||||
if isinstance(result, list):
|
||||
output_data = [r.model_dump() for r in all_results]
|
||||
f.write(json.dumps(output_data, indent=2))
|
||||
else:
|
||||
f.write(json.dumps(main_result.model_dump(), indent=2))
|
||||
elif output == "json":
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.extracted_content)
|
||||
f.write(main_result.extracted_content)
|
||||
elif output in ["markdown", "md"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.raw_markdown)
|
||||
f.write(main_result.markdown.raw_markdown)
|
||||
elif output in ["markdown-fit", "md-fit"]:
|
||||
with open(output_file, "w") as f:
|
||||
f.write(result.markdown.fit_markdown)
|
||||
f.write(main_result.markdown.fit_markdown)
|
||||
|
||||
except Exception as e:
|
||||
raise click.ClickException(str(e))
|
||||
@@ -1354,9 +1401,11 @@ def profiles_cmd():
|
||||
@click.option("--question", "-q", help="Ask a question about the crawled content")
|
||||
@click.option("--verbose", "-v", is_flag=True)
|
||||
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
|
||||
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
|
||||
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
|
||||
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
|
||||
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
|
||||
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
|
||||
"""Crawl4AI CLI - Web content extraction tool
|
||||
|
||||
Simple Usage:
|
||||
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
||||
bypass_cache=bypass_cache,
|
||||
question=question,
|
||||
verbose=verbose,
|
||||
profile=profile
|
||||
profile=profile,
|
||||
deep_crawl=deep_crawl,
|
||||
max_pages=max_pages
|
||||
)
|
||||
|
||||
def main():
|
||||
|
||||
@@ -98,20 +98,20 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
LXML-based implementation for fast web content scraping.
|
||||
|
||||
This is the primary scraping strategy in Crawl4AI, providing high-performance
|
||||
HTML parsing and content extraction using the lxml library.
|
||||
|
||||
Note: WebScrapingStrategy is now an alias for this class to maintain
|
||||
backward compatibility.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||
"""Helper method to safely use logger."""
|
||||
@@ -132,7 +132,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(actual_url, html, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -196,376 +196,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
|
||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
||||
"""
|
||||
Determine if a table element is a data table (not a layout table).
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
**kwargs: Additional keyword arguments including table_score_threshold
|
||||
|
||||
Returns:
|
||||
bool: True if the table is a data table, False otherwise
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.select('thead')) > 0
|
||||
has_tbody = len(table.select('tbody')) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.select('th'))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.select('table')) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get('role', '').lower()
|
||||
if role in {'presentation', 'none'}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.select('tr')
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
col_counts = [len(row.select('td, th')) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.select('caption'):
|
||||
score += 2
|
||||
if table.has_attr('summary') and table['summary']:
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get('table_score_threshold', 7)
|
||||
return score >= threshold
|
||||
|
||||
def extract_table_data(self, table: Tag) -> dict:
|
||||
"""
|
||||
Extract structured data from a table element.
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
|
||||
Returns:
|
||||
dict: Dictionary containing table data (headers, rows, caption, summary)
|
||||
"""
|
||||
caption_elem = table.select_one('caption')
|
||||
caption = caption_elem.get_text().strip() if caption_elem else ""
|
||||
summary = table.get('summary', '').strip()
|
||||
|
||||
# Extract headers with colspan handling
|
||||
headers = []
|
||||
thead_rows = table.select('thead tr')
|
||||
if thead_rows:
|
||||
header_cells = thead_rows[0].select('th')
|
||||
for cell in header_cells:
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
else:
|
||||
first_row = table.select('tr:first-child')
|
||||
if first_row:
|
||||
for cell in first_row[0].select('th, td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
|
||||
# Extract rows with colspan handling
|
||||
rows = []
|
||||
all_rows = table.select('tr')
|
||||
thead = table.select_one('thead')
|
||||
tbody_rows = []
|
||||
|
||||
if thead:
|
||||
thead_rows = thead.select('tr')
|
||||
tbody_rows = [row for row in all_rows if row not in thead_rows]
|
||||
else:
|
||||
if all_rows and all_rows[0].select('th'):
|
||||
tbody_rows = all_rows[1:]
|
||||
else:
|
||||
tbody_rows = all_rows
|
||||
|
||||
for row in tbody_rows:
|
||||
# for row in table.select('tr:not(:has(ancestor::thead))'):
|
||||
row_data = []
|
||||
for cell in row.select('td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
row_data.extend([text] * colspan)
|
||||
if row_data:
|
||||
rows.append(row_data)
|
||||
|
||||
# Align rows with headers
|
||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
||||
aligned_rows = []
|
||||
for row in rows:
|
||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
||||
aligned_rows.append(aligned)
|
||||
|
||||
if not headers:
|
||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
||||
|
||||
return {
|
||||
"headers": headers,
|
||||
"rows": aligned_rows,
|
||||
"caption": caption,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in a HTML tree.
|
||||
|
||||
Args:
|
||||
node (Tag): The root node of the HTML tree.
|
||||
|
||||
Returns:
|
||||
Tag: The flattened HTML tree.
|
||||
"""
|
||||
if isinstance(node, NavigableString):
|
||||
return node
|
||||
if (
|
||||
len(node.contents) == 1
|
||||
and isinstance(node.contents[0], Tag)
|
||||
and node.contents[0].name == node.name
|
||||
):
|
||||
return self.flatten_nested_elements(node.contents[0])
|
||||
node.contents = [self.flatten_nested_elements(child) for child in node.contents]
|
||||
return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
||||
"""
|
||||
Find the closest parent with useful text.
|
||||
|
||||
Args:
|
||||
tag (Tag): The starting tag to search from.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Tag: The closest parent with useful text, or None if not found.
|
||||
"""
|
||||
image_description_min_word_threshold = kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
)
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
current_tag = current_tag.parent
|
||||
# Get the text content of the parent tag
|
||||
if current_tag:
|
||||
text_content = current_tag.get_text(separator=" ", strip=True)
|
||||
# Check if the text content has at least word_count_threshold
|
||||
if len(text_content.split()) >= image_description_min_word_threshold:
|
||||
return text_content
|
||||
return None
|
||||
|
||||
def remove_unwanted_attributes(
|
||||
self, element, important_attrs, keep_data_attributes=False
|
||||
):
|
||||
"""
|
||||
Remove unwanted attributes from an HTML element.
|
||||
|
||||
Args:
|
||||
element (Tag): The HTML element to remove attributes from.
|
||||
important_attrs (list): List of important attributes to keep.
|
||||
keep_data_attributes (bool): Whether to keep data attributes.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
if attr not in important_attrs:
|
||||
if keep_data_attributes:
|
||||
if not attr.startswith("data-"):
|
||||
attrs_to_remove.append(attr)
|
||||
else:
|
||||
attrs_to_remove.append(attr)
|
||||
|
||||
for attr in attrs_to_remove:
|
||||
del element[attr]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
|
||||
"""
|
||||
Process an image element.
|
||||
|
||||
How it works:
|
||||
1. Check if the image has valid display and inside undesired html elements.
|
||||
2. Score an image for it's usefulness.
|
||||
3. Extract image file metadata to extract size and extension.
|
||||
4. Generate a dictionary with the processed image information.
|
||||
5. Return the processed image information.
|
||||
|
||||
Args:
|
||||
img (Tag): The image element to process.
|
||||
url (str): The URL of the page containing the image.
|
||||
index (int): The index of the image in the list of images.
|
||||
total_images (int): The total number of images in the list.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed image information.
|
||||
"""
|
||||
# parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
# if ' ' in u else None}
|
||||
# for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
|
||||
# Constants for checks
|
||||
classes_to_check = frozenset(["button", "icon", "logo"])
|
||||
tags_to_check = frozenset(["button", "input"])
|
||||
image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
|
||||
|
||||
# Pre-fetch commonly used attributes
|
||||
style = img.get("style", "")
|
||||
alt = img.get("alt", "")
|
||||
src = img.get("src", "")
|
||||
data_src = img.get("data-src", "")
|
||||
srcset = img.get("srcset", "")
|
||||
data_srcset = img.get("data-srcset", "")
|
||||
width = img.get("width")
|
||||
height = img.get("height")
|
||||
parent = img.parent
|
||||
parent_classes = parent.get("class", [])
|
||||
|
||||
# Quick validation checks
|
||||
if (
|
||||
"display:none" in style
|
||||
or parent.name in tags_to_check
|
||||
or any(c in cls for c in parent_classes for cls in classes_to_check)
|
||||
or any(c in src for c in classes_to_check)
|
||||
or any(c in alt for c in classes_to_check)
|
||||
):
|
||||
return None
|
||||
|
||||
# Quick score calculation
|
||||
score = 0
|
||||
if width and width.isdigit():
|
||||
width_val = int(width)
|
||||
score += 1 if width_val > 150 else 0
|
||||
if height and height.isdigit():
|
||||
height_val = int(height)
|
||||
score += 1 if height_val > 150 else 0
|
||||
if alt:
|
||||
score += 1
|
||||
score += index / total_images < 0.5
|
||||
|
||||
# image_format = ''
|
||||
# if "data:image/" in src:
|
||||
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
|
||||
# else:
|
||||
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
|
||||
|
||||
# if image_format in ('jpg', 'png', 'webp', 'avif'):
|
||||
# score += 1
|
||||
|
||||
# Check for image format in all possible sources
|
||||
def has_image_format(url):
|
||||
return any(fmt in url.lower() for fmt in image_formats)
|
||||
|
||||
# Score for having proper image sources
|
||||
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
|
||||
score += 1
|
||||
if srcset or data_srcset:
|
||||
score += 1
|
||||
if img.find_parent("picture"):
|
||||
score += 1
|
||||
|
||||
# Detect format from any available source
|
||||
detected_format = None
|
||||
for url in [src, data_src, srcset, data_srcset]:
|
||||
if url:
|
||||
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
|
||||
if format_matches:
|
||||
detected_format = format_matches[0]
|
||||
break
|
||||
|
||||
if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
|
||||
return None
|
||||
|
||||
# Use set for deduplication
|
||||
unique_urls = set()
|
||||
image_variants = []
|
||||
|
||||
# Generate a unique group ID for this set of variants
|
||||
group_id = index
|
||||
|
||||
# Base image info template
|
||||
base_info = {
|
||||
"alt": alt,
|
||||
"desc": self.find_closest_parent_with_useful_text(img, **kwargs),
|
||||
"score": score,
|
||||
"type": "image",
|
||||
"group_id": group_id, # Group ID for this set of variants
|
||||
"format": detected_format,
|
||||
}
|
||||
|
||||
# Inline function for adding variants
|
||||
def add_variant(src, width=None):
|
||||
if src and not src.startswith("data:") and src not in unique_urls:
|
||||
unique_urls.add(src)
|
||||
image_variants.append({**base_info, "src": src, "width": width})
|
||||
|
||||
# Process all sources
|
||||
add_variant(src)
|
||||
add_variant(data_src)
|
||||
|
||||
# Handle srcset and data-srcset in one pass
|
||||
for attr in ("srcset", "data-srcset"):
|
||||
if value := img.get(attr):
|
||||
for source in parse_srcset(value):
|
||||
add_variant(source["url"], source["width"])
|
||||
|
||||
# Quick picture element check
|
||||
if picture := img.find_parent("picture"):
|
||||
for source in picture.find_all("source"):
|
||||
if srcset := source.get("srcset"):
|
||||
for src in parse_srcset(srcset):
|
||||
add_variant(src["url"], src["width"])
|
||||
|
||||
# Framework-specific attributes in one pass
|
||||
for attr, value in img.attrs.items():
|
||||
if (
|
||||
attr.startswith("data-")
|
||||
and ("src" in attr or "srcset" in attr)
|
||||
and "http" in value
|
||||
):
|
||||
add_variant(value)
|
||||
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
@@ -577,7 +210,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
element (lhtml.HtmlElement): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
@@ -595,514 +228,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"external_links_dict": external_links_dict,
|
||||
}
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url,
|
||||
element: PageElement,
|
||||
media: Dict[str, Any],
|
||||
internal_links_dict: Dict[str, Any],
|
||||
external_links_dict: Dict[str, Any],
|
||||
**kwargs,
|
||||
) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
element.extract()
|
||||
return False
|
||||
|
||||
# if element.name == 'img':
|
||||
# process_image(element, url, 0, 1)
|
||||
# return True
|
||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
||||
|
||||
if element.name in ["script", "style", "link", "meta", "noscript"]:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
# Special case for table elements - always preserve structure
|
||||
if element.name in ["tr", "td", "th"]:
|
||||
keep_element = True
|
||||
|
||||
exclude_domains = kwargs.get("exclude_domains", [])
|
||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
# exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
try:
|
||||
if element.name == "a" and element.get("href"):
|
||||
href = element.get("href", "").strip()
|
||||
if not href: # Skip empty hrefs
|
||||
return False
|
||||
|
||||
# url_base = url.split("/")[2]
|
||||
|
||||
# Normalize the URL
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
except ValueError:
|
||||
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
|
||||
return False
|
||||
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": element.get_text().strip(),
|
||||
"title": element.get("title", "").strip(),
|
||||
"base_domain": base_domain,
|
||||
}
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
|
||||
keep_element = True
|
||||
|
||||
# Handle external link exclusions
|
||||
if is_external:
|
||||
link_base_domain = get_base_domain(normalized_href)
|
||||
link_data["base_domain"] = link_base_domain
|
||||
if kwargs.get("exclude_external_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# elif kwargs.get('exclude_social_media_links', False):
|
||||
# if link_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
elif exclude_domains:
|
||||
if link_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
if is_external:
|
||||
if normalized_href not in external_links_dict:
|
||||
external_links_dict[normalized_href] = link_data
|
||||
else:
|
||||
if kwargs.get("exclude_internal_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
if normalized_href not in internal_links_dict:
|
||||
internal_links_dict[normalized_href] = link_data
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Error processing links: {str(e)}")
|
||||
|
||||
try:
|
||||
if element.name == "img":
|
||||
potential_sources = [
|
||||
"src",
|
||||
"data-src",
|
||||
"srcset" "data-lazy-src",
|
||||
"data-original",
|
||||
]
|
||||
src = element.get("src", "")
|
||||
while not src and potential_sources:
|
||||
src = element.get(potential_sources.pop(0), "")
|
||||
if not src:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# If it is srcset pick up the first image
|
||||
if "srcset" in element.attrs:
|
||||
src = element.attrs["srcset"].split(",")[0].split(" ")[0]
|
||||
|
||||
# If image src is internal, then skip
|
||||
if not is_external_url(src, base_domain):
|
||||
return True
|
||||
|
||||
image_src_base_domain = get_base_domain(src)
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
# Handle relative URLs (which are always from the same domain)
|
||||
if not src.startswith('http') and not src.startswith('//'):
|
||||
return True # Keep relative URLs
|
||||
|
||||
# For absolute URLs, compare the base domains using the existing function
|
||||
src_base_domain = get_base_domain(src)
|
||||
url_base_domain = get_base_domain(url)
|
||||
|
||||
# If the domains don't match and both are valid, the image is external
|
||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if any(domain in src for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
# Handle exclude domains
|
||||
if exclude_domains:
|
||||
if image_src_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
return True # Always keep image elements
|
||||
except Exception:
|
||||
raise "Error processing images"
|
||||
|
||||
# Check if flag to remove all forms is set
|
||||
if kwargs.get("remove_forms", False) and element.name == "form":
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
if element.name in ["video", "audio"]:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": element.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
source_tags = element.find_all("source")
|
||||
for source_tag in source_tags:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": source_tag.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
if kwargs.get("only_text", False):
|
||||
element.replace_with(element.get_text())
|
||||
|
||||
try:
|
||||
self.remove_unwanted_attributes(
|
||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
except Exception as e:
|
||||
# print('Error removing unwanted attributes:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error removing unwanted attributes: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
if isinstance(child, NavigableString) and not isinstance(
|
||||
child, Comment
|
||||
):
|
||||
if len(child.strip()) > 0:
|
||||
keep_element = True
|
||||
else:
|
||||
if self._process_element(
|
||||
url,
|
||||
child,
|
||||
media,
|
||||
internal_links_dict,
|
||||
external_links_dict,
|
||||
**kwargs,
|
||||
):
|
||||
keep_element = True
|
||||
|
||||
# Check word count
|
||||
word_count_threshold = kwargs.get(
|
||||
"word_count_threshold", MIN_WORD_THRESHOLD
|
||||
)
|
||||
if not keep_element:
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
keep_element = word_count >= word_count_threshold
|
||||
|
||||
if not keep_element:
|
||||
element.decompose()
|
||||
|
||||
return keep_element
|
||||
except Exception as e:
|
||||
# print('Error processing element:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error processing element: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
return False
|
||||
|
||||
def _scrap(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
parser_type = kwargs.get("parser", "lxml")
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
if body is None:
|
||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This happens before any processing to minimize memory usage
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.find_all('img'):
|
||||
img.decompose()
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"error",
|
||||
message="Error extracting metadata: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
meta = {}
|
||||
|
||||
# Handle tag-based removal first - faster than CSS selection
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if excluded_tags:
|
||||
for element in body.find_all(lambda tag: tag.name in excluded_tags):
|
||||
element.extract()
|
||||
|
||||
# Handle CSS selector-based removal
|
||||
excluded_selector = kwargs.get("excluded_selector", "")
|
||||
if excluded_selector:
|
||||
is_single_selector = (
|
||||
"," not in excluded_selector and " " not in excluded_selector
|
||||
)
|
||||
if is_single_selector:
|
||||
while element := body.select_one(excluded_selector):
|
||||
element.extract()
|
||||
else:
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(copy.deepcopy(el))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
)
|
||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||
if kwargs.get("exclude_social_media_links", False):
|
||||
kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
|
||||
kwargs["exclude_social_media_domains"]
|
||||
)
|
||||
|
||||
result_obj = self.process_element(
|
||||
url,
|
||||
body,
|
||||
word_count_threshold=word_count_threshold,
|
||||
base_domain=base_domain,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
links = {"internal": [], "external": []}
|
||||
media = result_obj["media"]
|
||||
internal_links_dict = result_obj["internal_links_dict"]
|
||||
external_links_dict = result_obj["external_links_dict"]
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
|
||||
media["images"] = [
|
||||
img
|
||||
for result in (
|
||||
self.process_image(img, url, i, len(imgs), **kwargs)
|
||||
for i, img in enumerate(imgs)
|
||||
)
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
# Process tables if not excluded
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.find_all('table')
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
body = self.flatten_nested_elements(body)
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
for img in imgs:
|
||||
src = img.get("src", "")
|
||||
if base64_pattern.match(src):
|
||||
# Replace base64 data with empty string
|
||||
img["src"] = base64_pattern.sub("", src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str_body = content_element.encode_contents().decode("utf-8")
|
||||
except Exception:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
body = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Create a new div with a special ID
|
||||
error_div = body.new_tag("div", id="crawl4ai_error_message")
|
||||
error_div.string = """
|
||||
Crawl4AI Error: This page is not fully supported.
|
||||
|
||||
Possible reasons:
|
||||
1. The page may have restrictions that prevent crawling.
|
||||
2. The page might not be fully loaded.
|
||||
|
||||
Suggestions:
|
||||
- Try calling the crawl function with these parameters:
|
||||
magic=True,
|
||||
- Set headless=False to visualize what's happening on the page.
|
||||
|
||||
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||
"""
|
||||
|
||||
# Append the error div to the body
|
||||
body.append(error_div)
|
||||
str_body = body.encode_contents().decode("utf-8")
|
||||
|
||||
print(
|
||||
"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
|
||||
)
|
||||
self._log(
|
||||
"error",
|
||||
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
|
||||
tag="SCRAPE",
|
||||
)
|
||||
|
||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
def __init__(self, logger=None):
|
||||
super().__init__(logger)
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url: str,
|
||||
@@ -1145,10 +270,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
@@ -1862,3 +987,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
}
|
||||
|
||||
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = LXMLWebScrapingStrategy
|
||||
|
||||
@@ -11,7 +11,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
|
||||
79
crawl4ai/memory_utils.py
Normal file
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import psutil
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
def get_true_available_memory_gb() -> float:
|
||||
"""Get truly available memory including inactive pages (cross-platform)"""
|
||||
vm = psutil.virtual_memory()
|
||||
|
||||
if platform.system() == 'Darwin': # macOS
|
||||
# On macOS, we need to include inactive memory too
|
||||
try:
|
||||
# Use vm_stat to get accurate values
|
||||
result = subprocess.run(['vm_stat'], capture_output=True, text=True)
|
||||
lines = result.stdout.split('\n')
|
||||
|
||||
page_size = 16384 # macOS page size
|
||||
pages = {}
|
||||
|
||||
for line in lines:
|
||||
if 'Pages free:' in line:
|
||||
pages['free'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages inactive:' in line:
|
||||
pages['inactive'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages speculative:' in line:
|
||||
pages['speculative'] = int(line.split()[-1].rstrip('.'))
|
||||
elif 'Pages purgeable:' in line:
|
||||
pages['purgeable'] = int(line.split()[-1].rstrip('.'))
|
||||
|
||||
# Calculate total available (free + inactive + speculative + purgeable)
|
||||
total_available_pages = (
|
||||
pages.get('free', 0) +
|
||||
pages.get('inactive', 0) +
|
||||
pages.get('speculative', 0) +
|
||||
pages.get('purgeable', 0)
|
||||
)
|
||||
available_gb = (total_available_pages * page_size) / (1024**3)
|
||||
|
||||
return available_gb
|
||||
except:
|
||||
# Fallback to psutil
|
||||
return vm.available / (1024**3)
|
||||
else:
|
||||
# For Windows and Linux, psutil.available is accurate
|
||||
return vm.available / (1024**3)
|
||||
|
||||
|
||||
def get_true_memory_usage_percent() -> float:
|
||||
"""
|
||||
Get memory usage percentage that accounts for platform differences.
|
||||
|
||||
Returns:
|
||||
float: Memory usage percentage (0-100)
|
||||
"""
|
||||
vm = psutil.virtual_memory()
|
||||
total_gb = vm.total / (1024**3)
|
||||
available_gb = get_true_available_memory_gb()
|
||||
|
||||
# Calculate used percentage based on truly available memory
|
||||
used_percent = 100.0 * (total_gb - available_gb) / total_gb
|
||||
|
||||
# Ensure it's within valid range
|
||||
return max(0.0, min(100.0, used_percent))
|
||||
|
||||
|
||||
def get_memory_stats() -> Tuple[float, float, float]:
|
||||
"""
|
||||
Get comprehensive memory statistics.
|
||||
|
||||
Returns:
|
||||
Tuple[float, float, float]: (used_percent, available_gb, total_gb)
|
||||
"""
|
||||
vm = psutil.virtual_memory()
|
||||
total_gb = vm.total / (1024**3)
|
||||
available_gb = get_true_available_memory_gb()
|
||||
used_percent = get_true_memory_usage_percent()
|
||||
|
||||
return used_percent, available_gb, total_gb
|
||||
@@ -23,8 +23,9 @@ SeedingConfig = Union['SeedingConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
|
||||
# Proxy types
|
||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||
@@ -114,7 +115,6 @@ if TYPE_CHECKING:
|
||||
# Content scraping imports
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||
WebScrapingStrategy as WebScrapingStrategyType,
|
||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||
)
|
||||
|
||||
|
||||
@@ -1517,8 +1517,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
head = head[0]
|
||||
|
||||
# Title - using XPath
|
||||
# title = head.xpath(".//title/text()")
|
||||
# metadata["title"] = title[0].strip() if title else None
|
||||
|
||||
# === Title Extraction - New Approach ===
|
||||
# Attempt to extract <title> using XPath
|
||||
title = head.xpath(".//title/text()")
|
||||
metadata["title"] = title[0].strip() if title else None
|
||||
title = title[0] if title else None
|
||||
|
||||
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||
if not title:
|
||||
title_el = doc.find(".//title")
|
||||
title = title_el.text if title_el is not None else None
|
||||
|
||||
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||
if not title:
|
||||
title_candidates = (
|
||||
doc.xpath("//meta[@property='og:title']/@content") or
|
||||
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||
)
|
||||
title = title_candidates[0] if title_candidates else None
|
||||
|
||||
# Strip and assign title
|
||||
metadata["title"] = title.strip() if title else None
|
||||
|
||||
# Meta description - using XPath with multiple attribute conditions
|
||||
description = head.xpath('.//meta[@name="description"]/@content')
|
||||
@@ -3342,7 +3363,13 @@ async def get_text_embeddings(
|
||||
# Default: use sentence-transformers
|
||||
else:
|
||||
# Lazy load to avoid importing heavy libraries unless needed
|
||||
from sentence_transformers import SentenceTransformer
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for local embeddings. "
|
||||
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||
)
|
||||
|
||||
# Cache the model in function attribute to avoid reloading
|
||||
if not hasattr(get_text_embeddings, '_models'):
|
||||
|
||||
@@ -5,4 +5,9 @@ ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
|
||||
# Optional: Override the default LLM provider
|
||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
@@ -154,6 +154,29 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the provider.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +691,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -39,7 +40,9 @@ from utils import (
|
||||
get_base_url,
|
||||
is_task_id,
|
||||
should_cleanup_task,
|
||||
decode_redis_hash
|
||||
decode_redis_hash,
|
||||
get_llm_api_key,
|
||||
validate_llm_provider
|
||||
)
|
||||
|
||||
import psutil, time
|
||||
@@ -88,10 +91,12 @@ async def handle_llm_qa(
|
||||
|
||||
Answer:"""
|
||||
|
||||
# api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
|
||||
response = perform_completion_with_backoff(
|
||||
provider=config["llm"]["provider"],
|
||||
prompt_with_variables=prompt,
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
api_token=get_llm_api_key(config)
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
@@ -109,19 +114,23 @@ async def process_llm_extraction(
|
||||
url: str,
|
||||
instruction: str,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0"
|
||||
cache: str = "0",
|
||||
provider: Optional[str] = None
|
||||
) -> None:
|
||||
"""Process LLM extraction in background."""
|
||||
try:
|
||||
# If config['llm'] has api_key then ignore the api_key_env
|
||||
api_key = ""
|
||||
if "api_key" in config["llm"]:
|
||||
api_key = config["llm"]["api_key"]
|
||||
else:
|
||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||
# Validate provider
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": error_msg
|
||||
})
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=api_key
|
||||
),
|
||||
instruction=instruction,
|
||||
@@ -168,10 +177,19 @@ async def handle_markdown_request(
|
||||
filter_type: FilterType,
|
||||
query: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
try:
|
||||
# Validate provider if using LLM filter
|
||||
if filter_type == FilterType.LLM:
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=error_msg
|
||||
)
|
||||
decoded_url = unquote(url)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
@@ -184,8 +202,8 @@ async def handle_markdown_request(
|
||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=get_llm_api_key(config, provider),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
@@ -229,7 +247,8 @@ async def handle_llm_request(
|
||||
query: Optional[str] = None,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Handle LLM extraction requests."""
|
||||
base_url = get_base_url(request)
|
||||
@@ -259,7 +278,8 @@ async def handle_llm_request(
|
||||
schema,
|
||||
cache,
|
||||
base_url,
|
||||
config
|
||||
config,
|
||||
provider
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -303,7 +323,8 @@ async def create_new_task(
|
||||
schema: Optional[str],
|
||||
cache: str,
|
||||
base_url: str,
|
||||
config: dict
|
||||
config: dict,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Create and initialize a new task."""
|
||||
decoded_url = unquote(input_path)
|
||||
@@ -327,7 +348,8 @@ async def create_new_task(
|
||||
decoded_url,
|
||||
query,
|
||||
schema,
|
||||
cache
|
||||
cache,
|
||||
provider
|
||||
)
|
||||
|
||||
return JSONResponse({
|
||||
@@ -371,6 +393,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
server_memory_mb = _get_memory_mb()
|
||||
result_dict = result.model_dump()
|
||||
result_dict['server_memory_mb'] = server_memory_mb
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||
yield data.encode('utf-8')
|
||||
@@ -443,10 +468,19 @@ async def handle_crawl_request(
|
||||
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
|
||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||
|
||||
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results],
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
|
||||
@@ -36,6 +36,7 @@ class LlmJobPayload(BaseModel):
|
||||
q: str
|
||||
schema: Optional[str] = None
|
||||
cache: bool = False
|
||||
provider: Optional[str] = None
|
||||
|
||||
|
||||
class CrawlJobPayload(BaseModel):
|
||||
@@ -61,6 +62,7 @@ async def llm_job_enqueue(
|
||||
schema=payload.schema,
|
||||
cache=payload.cache,
|
||||
config=_config,
|
||||
provider=payload.provider,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ class MarkdownRequest(BaseModel):
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
|
||||
|
||||
class RawCode(BaseModel):
|
||||
|
||||
@@ -241,7 +241,7 @@ async def get_markdown(
|
||||
raise HTTPException(
|
||||
400, "URL must be absolute and start with http/https")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config
|
||||
body.url, body.f, body.q, body.c, config, body.provider
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import dns.resolver
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@@ -19,10 +20,24 @@ class FilterType(str, Enum):
|
||||
LLM = "llm"
|
||||
|
||||
def load_config() -> Dict:
|
||||
"""Load and return application configuration."""
|
||||
"""Load and return application configuration with environment variable overrides."""
|
||||
config_path = Path(__file__).parent / "config.yml"
|
||||
with open(config_path, "r") as config_file:
|
||||
return yaml.safe_load(config_file)
|
||||
config = yaml.safe_load(config_file)
|
||||
|
||||
# Override LLM provider from environment if set
|
||||
llm_provider = os.environ.get("LLM_PROVIDER")
|
||||
if llm_provider:
|
||||
config["llm"]["provider"] = llm_provider
|
||||
logging.info(f"LLM provider overridden from environment: {llm_provider}")
|
||||
|
||||
# Also support direct API key from environment if the provider-specific key isn't set
|
||||
llm_api_key = os.environ.get("LLM_API_KEY")
|
||||
if llm_api_key and "api_key" not in config["llm"]:
|
||||
config["llm"]["api_key"] = llm_api_key
|
||||
logging.info("LLM API key loaded from LLM_API_KEY environment variable")
|
||||
|
||||
return config
|
||||
|
||||
def setup_logging(config: Dict) -> None:
|
||||
"""Configure application logging."""
|
||||
@@ -56,6 +71,52 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
||||
|
||||
|
||||
|
||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
|
||||
"""Get the appropriate API key based on the LLM provider.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
The API key for the provider, or empty string if not found
|
||||
"""
|
||||
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Check if direct API key is configured
|
||||
if "api_key" in config["llm"]:
|
||||
return config["llm"]["api_key"]
|
||||
|
||||
# Fall back to the configured api_key_env if no match
|
||||
return os.environ.get(config["llm"].get("api_key_env", ""), "")
|
||||
|
||||
|
||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
||||
"""Validate that the LLM provider has an associated API key.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Get the API key for this provider
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
|
||||
if not api_key:
|
||||
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
|
||||
|
||||
return True, ""
|
||||
|
||||
|
||||
def verify_email_domain(email: str) -> bool:
|
||||
try:
|
||||
domain = email.split('@')[1]
|
||||
|
||||
@@ -14,6 +14,7 @@ x-base-config: &base-config
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm # Chromium performance
|
||||
deploy:
|
||||
|
||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **PDF Parsing**: Extract data from PDF documents
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
# Initialize with custom learning parameters
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
||||
max_history=100, # Remember last 100 crawls per domain
|
||||
learning_rate=0.2, # How quickly to adapt to changes
|
||||
patterns_per_page=3, # Patterns to learn per page type
|
||||
extraction_strategy='css' # 'css' or 'xpath'
|
||||
)
|
||||
|
||||
adaptive_crawler = AdaptiveCrawler(config)
|
||||
|
||||
# First crawl - crawler learns the structure
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com/article/12345",
|
||||
config=CrawlerRunConfig(
|
||||
adaptive_config=config,
|
||||
extraction_hints={ # Optional hints to speed up learning
|
||||
"title": "article h1",
|
||||
"content": "article .body-content"
|
||||
}
|
||||
)
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
# Crawler identifies and stores patterns
|
||||
if result.success:
|
||||
state = adaptive_crawler.get_state("news.example.com")
|
||||
print(f"Learned {len(state.patterns)} patterns")
|
||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
# Subsequent crawls - uses learned patterns
|
||||
result2 = await crawler.arun(
|
||||
"https://news.example.com/article/67890",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Automatically extracts using learned patterns!
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0, # Let content load
|
||||
capture_method="incremental", # Capture new content on each scroll
|
||||
deduplicate=True # Remove duplicate elements
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5, # Images need time
|
||||
stop_on_no_change=True # Smart stopping
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5,
|
||||
wait_for_selector=".article-card", # Wait for specific elements
|
||||
timeout=30000 # Max 30 seconds total
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### The Three-Layer Scoring System
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
# What to analyze
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
max_links=100, # Analyze top 100 links
|
||||
|
||||
# Relevance scoring
|
||||
query="machine learning tutorials", # Your interest
|
||||
score_threshold=0.3, # Minimum relevance score
|
||||
|
||||
# Performance
|
||||
concurrent_requests=10, # Parallel processing
|
||||
timeout_per_link=5000, # 5s per link
|
||||
|
||||
# Advanced scoring weights
|
||||
scoring_weights={
|
||||
"intrinsic": 0.3, # Link quality indicators
|
||||
"contextual": 0.5, # Relevance to query
|
||||
"popularity": 0.2 # Link prominence
|
||||
}
|
||||
)
|
||||
|
||||
# Use in your crawl
|
||||
result = await crawler.arun(
|
||||
"https://tech-blog.example.com",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
print(f"Score: {link['total_score']:.3f}")
|
||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Title: {link['head_data']['title']}")
|
||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score (0-1)**: Relevance to your query
|
||||
- Semantic similarity using embeddings
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Weighted combination for final ranking
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
# Basic discovery - find all product pages
|
||||
seeder_config = SeedingConfig(
|
||||
# Discovery sources
|
||||
source="sitemap+cc", # Sitemap + Common Crawl
|
||||
|
||||
# Filtering
|
||||
pattern="*/product/*", # URL pattern matching
|
||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
||||
|
||||
# Validation
|
||||
live_check=True, # Verify URLs are alive
|
||||
max_urls=5000, # Stop at 5000 URLs
|
||||
|
||||
# Performance
|
||||
concurrency=100, # Parallel requests
|
||||
hits_per_sec=10 # Rate limiting
|
||||
)
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
seeder = AsyncUrlSeeder(seeder_config)
|
||||
urls = await seeder.discover("https://shop.example.com")
|
||||
|
||||
# Advanced: Relevance-based discovery
|
||||
research_config = SeedingConfig(
|
||||
source="crawl+sitemap", # Deep crawl + sitemap
|
||||
pattern="*/blog/*", # Blog posts only
|
||||
|
||||
# Content relevance
|
||||
extract_head=True, # Get meta tags
|
||||
query="quantum computing tutorials",
|
||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
||||
score_threshold=0.4, # High relevance only
|
||||
|
||||
# Smart filtering
|
||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||
min_content_length=500, # Skip thin content
|
||||
|
||||
force=True # Bypass cache
|
||||
)
|
||||
|
||||
# Discover with progress tracking
|
||||
discovered = []
|
||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
||||
discovered.extend(batch)
|
||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
||||
|
||||
# Results include scores and metadata
|
||||
for url_data in discovered[:5]:
|
||||
print(f"URL: {url_data['url']}")
|
||||
print(f"Score: {url_data['score']:.3f}")
|
||||
print(f"Title: {url_data['title']}")
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Before v0.7.0 (slow)
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(url)
|
||||
results.append(result)
|
||||
|
||||
# After v0.7.0 (fast)
|
||||
# Automatic batching and connection pooling
|
||||
results = await crawler.arun_batch(
|
||||
urls,
|
||||
config=CrawlerRunConfig(
|
||||
# New performance options
|
||||
batch_size=10, # Process 10 URLs concurrently
|
||||
reuse_browser=True, # Keep browser warm
|
||||
eager_loading=False, # Load only what's needed
|
||||
streaming_extraction=True, # Stream large extractions
|
||||
|
||||
# Optimized defaults
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
exclude_external_resources=True, # Skip third-party assets
|
||||
block_ads=True # Ad blocking built-in
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Memory-efficient streaming for large crawls
|
||||
async for result in crawler.arun_stream(large_url_list):
|
||||
# Process results as they complete
|
||||
await process_result(result)
|
||||
# Memory is freed after each iteration
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
## 📄 PDF Support
|
||||
|
||||
PDF extraction is now natively supported in Crawl4AI.
|
||||
|
||||
```python
|
||||
# Extract data from PDF documents
|
||||
result = await crawler.arun(
|
||||
"https://example.com/report.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
pdf_extraction=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
# Works on converted PDF structure
|
||||
"title": {"selector": "h1", "type": "text"},
|
||||
"sections": {"selector": "h2", "type": "list"}
|
||||
})
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
|
||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
303
docs/examples/demo_multi_config_clean.py
Normal file
303
docs/examples/demo_multi_config_clean.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how to use different crawler configurations for different URL patterns
|
||||
in a single crawl batch with Crawl4AI's multi-config feature.
|
||||
|
||||
Part 1: Understanding URL Matching (Pattern Testing)
|
||||
Part 2: Practical Example with Real Crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
MatchMode
|
||||
)
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
def print_section(title):
|
||||
"""Print a formatted section header"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"{title}")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
|
||||
def test_url_matching(config, test_urls, config_name):
|
||||
"""Test URL matching for a config and show results"""
|
||||
print(f"Config: {config_name}")
|
||||
print(f"Matcher: {config.url_matcher}")
|
||||
if hasattr(config, 'match_mode'):
|
||||
print(f"Mode: {config.match_mode.value}")
|
||||
print("-" * 40)
|
||||
|
||||
for url in test_urls:
|
||||
matches = config.is_match(url)
|
||||
symbol = "✓" if matches else "✗"
|
||||
print(f"{symbol} {url}")
|
||||
print()
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 1: Understanding URL Matching
|
||||
# ==============================================================================
|
||||
|
||||
def demo_part1_pattern_matching():
|
||||
"""Part 1: Learn how URL matching works without crawling"""
|
||||
|
||||
print_section("PART 1: Understanding URL Matching")
|
||||
print("Let's explore different ways to match URLs with configs.\n")
|
||||
|
||||
# Test URLs we'll use throughout
|
||||
test_urls = [
|
||||
"https://example.com/report.pdf",
|
||||
"https://example.com/data.json",
|
||||
"https://example.com/blog/post-1",
|
||||
"https://example.com/article/news",
|
||||
"https://api.example.com/v1/users",
|
||||
"https://example.com/about"
|
||||
]
|
||||
|
||||
# 1.1 Simple String Pattern
|
||||
print("1.1 Simple String Pattern Matching")
|
||||
print("-" * 40)
|
||||
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf"
|
||||
)
|
||||
|
||||
test_url_matching(pdf_config, test_urls, "PDF Config")
|
||||
|
||||
|
||||
# 1.2 Multiple String Patterns
|
||||
print("1.2 Multiple String Patterns (OR logic)")
|
||||
print("-" * 40)
|
||||
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # This is default, shown for clarity
|
||||
)
|
||||
|
||||
test_url_matching(blog_config, test_urls, "Blog/Article Config")
|
||||
|
||||
|
||||
# 1.3 Single Function Matcher
|
||||
print("1.3 Function-based Matching")
|
||||
print("-" * 40)
|
||||
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json')
|
||||
)
|
||||
|
||||
test_url_matching(api_config, test_urls, "API Config")
|
||||
|
||||
|
||||
# 1.4 List of Functions
|
||||
print("1.4 Multiple Functions with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be HTTPS AND contain 'api' AND have version number
|
||||
secure_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'),
|
||||
lambda url: 'api' in url,
|
||||
lambda url: '/v' in url # Version indicator
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(secure_api_config, test_urls, "Secure API Config")
|
||||
|
||||
|
||||
# 1.5 Mixed: String and Function Together
|
||||
print("1.5 Mixed Patterns: String + Function")
|
||||
print("-" * 40)
|
||||
|
||||
# Match JSON files OR any API endpoint
|
||||
json_or_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern
|
||||
lambda url: 'api' in url # Function
|
||||
],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
|
||||
test_url_matching(json_or_api_config, test_urls, "JSON or API Config")
|
||||
|
||||
|
||||
# 1.6 Complex: Multiple Strings + Multiple Functions
|
||||
print("1.6 Complex Matcher: Mixed Types with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be: HTTPS AND (.com domain) AND (blog OR article) AND NOT a PDF
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Function: HTTPS check
|
||||
"*.com/*", # String: .com domain
|
||||
lambda url: any(pattern in url for pattern in ['/blog/', '/article/']), # Function: Blog OR article
|
||||
lambda url: not url.endswith('.pdf') # Function: Not PDF
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(complex_config, test_urls, "Complex Mixed Config")
|
||||
|
||||
print("\n✅ Key Takeaway: First matching config wins when passed to arun_many()!")
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 2: Practical Multi-URL Crawling
|
||||
# ==============================================================================
|
||||
|
||||
async def demo_part2_practical_crawling():
|
||||
"""Part 2: Real-world example with different content types"""
|
||||
|
||||
print_section("PART 2: Practical Multi-URL Crawling")
|
||||
print("Now let's see multi-config in action with real URLs.\n")
|
||||
|
||||
# Create specialized configs for different content types
|
||||
configs = [
|
||||
# Config 1: PDF documents - only match files ending with .pdf
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Config 2: Blog/article pages with content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Config 3: Dynamic pages requiring JavaScript
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);" # Scroll to load content
|
||||
),
|
||||
|
||||
# Config 4: Mixed matcher - API endpoints (string OR function)
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern for JSON files
|
||||
lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints
|
||||
],
|
||||
match_mode=MatchMode.OR,
|
||||
),
|
||||
|
||||
# Config 5: Complex matcher - Secure documentation sites
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # String: .org domain
|
||||
lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']), # Has docs
|
||||
lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON
|
||||
],
|
||||
match_mode=MatchMode.AND,
|
||||
# wait_for="css:.content, css:article" # Wait for content to load
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
# CrawlerRunConfig() # No url_matcher means it matches everything (use it as fallback)
|
||||
]
|
||||
|
||||
# URLs to crawl - each will use a different config
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → PDF config
|
||||
"https://blog.python.org/", # → Blog config with content filter
|
||||
"https://github.com/microsoft/playwright", # → JS config
|
||||
"https://httpbin.org/json", # → Mixed matcher config (API)
|
||||
"https://docs.python.org/3/reference/", # → Complex matcher config
|
||||
"https://www.w3schools.com/", # → Default config, if you uncomment the default config line above, if not you will see `Error: No matching configuration`
|
||||
]
|
||||
|
||||
print("URLs to crawl:")
|
||||
for i, url in enumerate(urls, 1):
|
||||
print(f"{i}. {url}")
|
||||
|
||||
print("\nCrawling with appropriate config for each URL...\n")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs
|
||||
)
|
||||
|
||||
# Display results
|
||||
print("Results:")
|
||||
print("-" * 60)
|
||||
|
||||
for result in results:
|
||||
if result.success:
|
||||
# Determine which config was used
|
||||
config_type = "Default"
|
||||
if result.url.endswith('.pdf'):
|
||||
config_type = "PDF Strategy"
|
||||
elif any(pattern in result.url for pattern in ['blog', 'python.org']) and 'docs' not in result.url:
|
||||
config_type = "Blog + Content Filter"
|
||||
elif 'github.com' in result.url:
|
||||
config_type = "JavaScript Enabled"
|
||||
elif 'httpbin.org' in result.url or result.url.endswith('.json'):
|
||||
config_type = "Mixed Matcher (API)"
|
||||
elif 'docs.python.org' in result.url:
|
||||
config_type = "Complex Matcher (Secure Docs)"
|
||||
|
||||
print(f"\n✓ {result.url}")
|
||||
print(f" Config used: {config_type}")
|
||||
print(f" Content size: {len(result.markdown)} chars")
|
||||
|
||||
# Show if we have fit_markdown (from content filter)
|
||||
if hasattr(result.markdown, 'fit_markdown') and result.markdown.fit_markdown:
|
||||
print(f" Fit markdown size: {len(result.markdown.fit_markdown)} chars")
|
||||
reduction = (1 - len(result.markdown.fit_markdown) / len(result.markdown)) * 100
|
||||
print(f" Content reduced by: {reduction:.1f}%")
|
||||
|
||||
# Show extracted data if using extraction strategy
|
||||
if hasattr(result, 'extracted_content') and result.extracted_content:
|
||||
print(f" Extracted data: {str(result.extracted_content)[:100]}...")
|
||||
else:
|
||||
print(f"\n✗ {result.url}")
|
||||
print(f" Error: {result.error_message}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ Multi-config crawling complete!")
|
||||
print("\nBenefits demonstrated:")
|
||||
print("- PDFs handled with specialized scraper")
|
||||
print("- Blog content filtered for relevance")
|
||||
print("- JavaScript executed only where needed")
|
||||
print("- Mixed matchers (string + function) for flexible matching")
|
||||
print("- Complex matchers for precise URL targeting")
|
||||
print("- Each URL got optimal configuration automatically!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run both parts of the demo"""
|
||||
|
||||
print("""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how Crawl4AI can use different configurations
|
||||
for different URLs in a single batch.
|
||||
""")
|
||||
|
||||
# Part 1: Pattern matching
|
||||
demo_part1_pattern_matching()
|
||||
|
||||
print("\nPress Enter to continue to Part 2...")
|
||||
try:
|
||||
input()
|
||||
except EOFError:
|
||||
# Running in non-interactive mode, skip input
|
||||
pass
|
||||
|
||||
# Part 2: Practical crawling
|
||||
await demo_part2_practical_crawling()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -18,7 +18,7 @@ Usage:
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
@@ -57,7 +58,7 @@ methods_to_profile = [
|
||||
|
||||
|
||||
# Apply decorators to both strategies
|
||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
||||
for strategy, name in [(LXMLWebScrapingStrategy, "LXML")]:
|
||||
for method in methods_to_profile:
|
||||
apply_decorators(strategy, method, name)
|
||||
|
||||
@@ -85,7 +86,7 @@ def generate_large_html(n_elements=1000):
|
||||
|
||||
def test_scraping():
|
||||
# Initialize both scrapers
|
||||
original_scraper = WebScrapingStrategy()
|
||||
original_scraper = LXMLWebScrapingStrategy()
|
||||
selected_scraper = LXMLWebScrapingStrategy()
|
||||
|
||||
# Generate test HTML
|
||||
|
||||
@@ -404,7 +404,182 @@ for result in results:
|
||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||
```
|
||||
|
||||
## 6. Summary
|
||||
## 6. URL-Specific Configurations
|
||||
|
||||
When crawling diverse content types, you often need different configurations for different URLs. For example:
|
||||
- PDFs need specialized extraction
|
||||
- Blog pages benefit from content filtering
|
||||
- Dynamic sites need JavaScript execution
|
||||
- API endpoints need JSON parsing
|
||||
|
||||
### 6.1 Basic URL Pattern Matching
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def crawl_mixed_content():
|
||||
# Configure different strategies for different content
|
||||
configs = [
|
||||
# PDF files - specialized extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
),
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custome settings for JSON extraction
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||
]
|
||||
|
||||
# Mixed URLs
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
"https://blog.python.org/",
|
||||
"https://github.com/microsoft/playwright",
|
||||
"https://httpbin.org/json",
|
||||
"https://example.com/"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs # Pass list of configs
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"{result.url}: {len(result.markdown)} chars")
|
||||
```
|
||||
|
||||
### 6.2 Advanced Pattern Matching
|
||||
|
||||
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||
|
||||
The `url_matcher` parameter supports three types of patterns:
|
||||
|
||||
#### Glob Patterns (Strings)
|
||||
```python
|
||||
# Simple patterns
|
||||
"*.pdf" # Any PDF file
|
||||
"*/api/*" # Any URL with /api/ in path
|
||||
"https://*.example.com/*" # Subdomain matching
|
||||
"*://example.com/blog/*" # Any protocol
|
||||
```
|
||||
|
||||
#### Custom Functions
|
||||
```python
|
||||
# Complex logic with lambdas
|
||||
lambda url: url.startswith('https://') and 'secure' in url
|
||||
lambda url: len(url) > 50 and url.count('/') > 5
|
||||
lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.'])
|
||||
```
|
||||
|
||||
#### Mixed Lists with AND/OR Logic
|
||||
```python
|
||||
# Combine multiple conditions
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: 'internal' in url, # Must contain 'internal'
|
||||
lambda url: not url.endswith('.pdf') # Must not be PDF
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
```
|
||||
|
||||
### 6.3 Practical Example: News Site Crawler
|
||||
|
||||
```python
|
||||
async def crawl_news_site():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0,
|
||||
rate_limiter=RateLimiter(base_delay=(1.0, 2.0))
|
||||
)
|
||||
|
||||
configs = [
|
||||
# Homepage - light extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com',
|
||||
css_selector="nav, .headline",
|
||||
extraction_strategy=None
|
||||
),
|
||||
|
||||
# Article pages - full extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/article/*",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="article content",
|
||||
word_count_threshold=100
|
||||
),
|
||||
screenshot=True,
|
||||
excluded_tags=["nav", "aside", "footer"]
|
||||
),
|
||||
|
||||
# Author pages - metadata focus
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/author/*",
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"name": "h1.author-name",
|
||||
"bio": ".author-bio",
|
||||
"articles": "article.post-card h2"
|
||||
})
|
||||
),
|
||||
|
||||
# Everything else
|
||||
CrawlerRunConfig()
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=news_urls,
|
||||
config=configs,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### 6.4 Best Practices
|
||||
|
||||
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||
2. **Default Config Behavior**:
|
||||
- A config without `url_matcher` matches ALL URLs
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||
```python
|
||||
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||
|
||||
default_config = CrawlerRunConfig() # No url_matcher
|
||||
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||
```
|
||||
4. **Optimize for Performance**:
|
||||
- Disable JS for static content
|
||||
- Skip screenshots for data APIs
|
||||
- Use appropriate extraction strategies
|
||||
|
||||
## 7. Summary
|
||||
|
||||
1. **Two Dispatcher Types**:
|
||||
|
||||
|
||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
session_id = "github_commits_session"
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
all_commits = []
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
session_id = "wait_for_session"
|
||||
all_commits = []
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
||||
"fields": [{
|
||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
||||
}],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
js_next_page = """
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length > 0) {
|
||||
window.lastCommit = commits[0].textContent.trim();
|
||||
}
|
||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||
if (button) {button.click(); console.log('button clicked') }
|
||||
"""
|
||||
|
||||
# JavaScript and wait configurations
|
||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
||||
|
||||
# Crawl multiple pages
|
||||
wait_for = """() => {
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length === 0) return false;
|
||||
const firstCommit = commits[0].textContent.trim();
|
||||
return firstCommit !== window.lastCommit;
|
||||
}"""
|
||||
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li[data-testid='commit-row-item']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h4 a",
|
||||
"type": "text",
|
||||
"transform": "strip",
|
||||
},
|
||||
],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
for page in range(3):
|
||||
config = CrawlerRunConfig(
|
||||
url=url,
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
css_selector="li[data-testid='commit-row-item']",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
wait_for=wait_for if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
capture_console_messages=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(config=config)
|
||||
if result.success:
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.console_messages:
|
||||
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||
|
||||
if result.extracted_content:
|
||||
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||
commits = json.loads(result.extracted_content)
|
||||
all_commits.extend(commits)
|
||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||
else:
|
||||
print(f"Page {page + 1}: No content extracted")
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
return all_commits
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
||||
wait_after_scroll=1.0 # Twitter needs time to load
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config,
|
||||
# Optional: Set headless=False to watch it work
|
||||
# browser_config=BrowserConfig(headless=False)
|
||||
virtual_scroll_config=virtual_config
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://twitter.com/search?q=AI",
|
||||
config=config
|
||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
||||
Virtual Scroll works seamlessly with extraction strategies:
|
||||
|
||||
```python
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
||||
scroll_count=20
|
||||
),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||
schema=schema
|
||||
)
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
```python
|
||||
async def arun_many(
|
||||
urls: Union[List[str], List[Any]],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
...
|
||||
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
@@ -15,7 +15,9 @@ async def arun_many(
|
||||
Crawl multiple URLs concurrently or in batches.
|
||||
|
||||
:param urls: A list of URLs (or tasks) to crawl.
|
||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||
:param config: (Optional) Either:
|
||||
- A single `CrawlerRunConfig` applying to all URLs
|
||||
- A list of `CrawlerRunConfig` objects with url_matcher patterns
|
||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||
...
|
||||
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||
@@ -95,10 +97,70 @@ results = await crawler.arun_many(
|
||||
)
|
||||
```
|
||||
|
||||
### URL-Specific Configurations
|
||||
|
||||
Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
# PDF files - specialized extraction
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
)
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
github_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
)
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custome settings for JSON extraction
|
||||
)
|
||||
|
||||
# Default fallback config
|
||||
default_config = CrawlerRunConfig() # No url_matcher means it never matches except as fallback
|
||||
|
||||
# Pass the list of configs - first match wins!
|
||||
results = await crawler.arun_many(
|
||||
urls=[
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config
|
||||
"https://blog.python.org/", # → blog_config
|
||||
"https://github.com/microsoft/playwright", # → github_config
|
||||
"https://httpbin.org/json", # → api_config
|
||||
"https://example.com/" # → default_config
|
||||
],
|
||||
config=[pdf_config, blog_config, github_config, api_config, default_config]
|
||||
)
|
||||
```
|
||||
|
||||
**URL Matching Features**:
|
||||
- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
|
||||
- **Function matchers**: `lambda url: 'api' in url`
|
||||
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
|
||||
- **First match wins**: Configs are evaluated in order
|
||||
|
||||
**Key Points**:
|
||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
|
||||
|
||||
### Return Value
|
||||
|
||||
|
||||
@@ -208,6 +208,71 @@ config = CrawlerRunConfig(
|
||||
|
||||
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
### I) **URL Matching Configuration**
|
||||
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||
|
||||
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Simple string pattern (glob-style)
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Multiple patterns with OR logic (default)
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # Any pattern matches
|
||||
)
|
||||
|
||||
# Function matcher
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Other settings like extraction_strategy
|
||||
)
|
||||
|
||||
# Mixed: String + Function with AND logic
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # Must be .org domain
|
||||
lambda url: 'docs' in url # Must contain 'docs'
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
|
||||
# Combined patterns and functions with AND logic
|
||||
secure_docs = CrawlerRunConfig(
|
||||
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||
)
|
||||
|
||||
# Default config - matches ALL URLs
|
||||
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||
```
|
||||
|
||||
**UrlMatcher Types:**
|
||||
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
||||
|
||||
**Important Behavior:**
|
||||
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
|
||||
---## 2.2 Helper Methods
|
||||
|
||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||
|
||||
@@ -20,14 +20,28 @@ Ever wondered why your AI coding assistant struggles with your library despite c
|
||||
|
||||
## Latest Release
|
||||
|
||||
Here’s the blog index entry for **v0.6.0**, written to match the exact tone and structure of your previous entries:
|
||||
### [Crawl4AI v0.7.0 – The Adaptive Intelligence Update](releases/0.7.0.md)
|
||||
*January 28, 2025*
|
||||
|
||||
Crawl4AI v0.7.0 introduces groundbreaking intelligence features that transform how crawlers understand and adapt to websites. This release brings Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, and the powerful Async URL Seeder for massive URL discovery.
|
||||
|
||||
Key highlights:
|
||||
- **Adaptive Crawling**: Crawlers that learn and adapt to website structures automatically
|
||||
- **Virtual Scroll Support**: Complete content extraction from modern infinite scroll pages
|
||||
- **Link Preview**: 3-layer scoring system for intelligent link prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with smart filtering
|
||||
- **Performance Boost**: Up to 3x faster with optimized resource handling
|
||||
|
||||
[Read full release notes →](releases/0.7.0.md)
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
## Previous Releases
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*December 23, 2024*
|
||||
|
||||
Crawl4AI v0.6.0 brought major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
@@ -45,8 +59,6 @@ Other key changes:
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to auto-update the actual file or just paste this into the markdown.
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
@@ -140,5 +152,4 @@ Curious about how Crawl4AI has evolved? Check out our [complete changelog](https
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
- Join our community discussions on GitHub
|
||||
144
docs/md_v2/blog/index.md.bak
Normal file
144
docs/md_v2/blog/index.md.bak
Normal file
@@ -0,0 +1,144 @@
|
||||
# Crawl4AI Blog
|
||||
|
||||
Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place.
|
||||
|
||||
## Featured Articles
|
||||
|
||||
### [When to Stop Crawling: The Art of Knowing "Enough"](articles/adaptive-crawling-revolution.md)
|
||||
*January 29, 2025*
|
||||
|
||||
Traditional crawlers are like tourists with unlimited time—they'll visit every street, every alley, every dead end. But what if your crawler could think like a researcher with a deadline? Discover how Adaptive Crawling revolutionizes web scraping by knowing when to stop. Learn about the three-layer intelligence system that evaluates coverage, consistency, and saturation to build focused knowledge bases instead of endless page collections.
|
||||
|
||||
[Read the full article →](articles/adaptive-crawling-revolution.md)
|
||||
|
||||
### [The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples](articles/llm-context-revolution.md)
|
||||
*January 24, 2025*
|
||||
|
||||
Ever wondered why your AI coding assistant struggles with your library despite comprehensive documentation? This article introduces the three-dimensional context protocol that transforms how AI understands code. Learn why memory, reasoning, and examples together create wisdom—not just information.
|
||||
|
||||
[Read the full article →](articles/llm-context-revolution.md)
|
||||
|
||||
## Latest Release
|
||||
|
||||
Here’s the blog index entry for **v0.6.0**, written to match the exact tone and structure of your previous entries:
|
||||
|
||||
---
|
||||
|
||||
### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
|
||||
*April 23, 2025*
|
||||
|
||||
Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
|
||||
|
||||
The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
|
||||
|
||||
Other key changes:
|
||||
|
||||
* Native support for `result.media["tables"]` to export DataFrames
|
||||
* Full network + console logs and MHTML snapshot per crawl
|
||||
* Browser pooling and pre-warming for faster cold starts
|
||||
* New streaming endpoints via MCP API and Playground
|
||||
* Robots.txt support, proxy rotation, and improved session handling
|
||||
* Deprecated old markdown names, legacy modules cleaned up
|
||||
* Massive repo cleanup: ~36K insertions, ~5K deletions across 121 files
|
||||
|
||||
[Read full release notes →](releases/0.6.0.md)
|
||||
|
||||
---
|
||||
|
||||
Let me know if you want me to auto-update the actual file or just paste this into the markdown.
|
||||
|
||||
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
|
||||
|
||||
My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5.0! This release brings a wealth of new features, performance improvements, and a more streamlined developer experience. Here's a breakdown of what's new:
|
||||
|
||||
**Major New Features:**
|
||||
|
||||
* **Deep Crawling:** Explore entire websites with configurable strategies (BFS, DFS, Best-First). Define custom filters and URL scoring for targeted crawls.
|
||||
* **Memory-Adaptive Dispatcher:** Handle large-scale crawls with ease! Our new dispatcher dynamically adjusts concurrency based on available memory and includes built-in rate limiting.
|
||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
|
||||
**Minor Updates & Improvements:**
|
||||
|
||||
* **LXML Scraping Mode:** Faster HTML parsing with `LXMLWebScrapingStrategy`.
|
||||
* **Proxy Rotation:** Added `ProxyRotationStrategy` with a `RoundRobinProxyStrategy` implementation.
|
||||
* **PDF Processing:** Extract text, images, and metadata from PDF files.
|
||||
* **URL Redirection Tracking:** Automatically follows and records redirects.
|
||||
* **Robots.txt Compliance:** Optionally respect website crawling rules.
|
||||
* **LLM-Powered Schema Generation:** Automatically create extraction schemas using an LLM.
|
||||
* **`LLMContentFilter`:** Generate high-quality, focused markdown using an LLM.
|
||||
* **Improved Error Handling & Stability:** Numerous bug fixes and performance enhancements.
|
||||
* **Enhanced Documentation:** Updated guides and examples.
|
||||
|
||||
**Breaking Changes & Migration:**
|
||||
|
||||
This release includes several breaking changes to improve the library's structure and consistency. Here's what you need to know:
|
||||
|
||||
* **`arun_many()` Behavior:** Now uses the `MemoryAdaptiveDispatcher` by default. The return type depends on the `stream` parameter in `CrawlerRunConfig`. Adjust code that relied on unbounded concurrency.
|
||||
* **`max_depth` Location:** Moved to `CrawlerRunConfig` and now controls *crawl depth*.
|
||||
* **Deep Crawling Imports:** Import `DeepCrawlStrategy` and related classes from `crawl4ai.deep_crawling`.
|
||||
* **`BrowserContext` API:** Updated; the old `get_context` method is deprecated.
|
||||
* **Optional Model Fields:** Many data model fields are now optional. Handle potential `None` values.
|
||||
* **`ScrapingMode` Enum:** Replaced with strategy pattern (`WebScrapingStrategy`, `LXMLWebScrapingStrategy`).
|
||||
* **`content_filter` Parameter:** Removed from `CrawlerRunConfig`. Use extraction strategies or markdown generators with filters.
|
||||
* **Removed Functionality:** The synchronous `WebCrawler`, the old CLI, and docs management tools have been removed.
|
||||
* **Docker:** Significant changes to deployment. See the [Docker documentation](../deploy/docker/README.md).
|
||||
* **`ssl_certificate.json`:** This file has been removed.
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
|
||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||
|
||||
## License Change
|
||||
|
||||
Crawl4AI v0.5.0 updates the license to Apache 2.0 *with a required attribution clause*. This means you are free to use, modify, and distribute Crawl4AI (even commercially), but you *must* clearly attribute the project in any public use or distribution. See the updated `LICENSE` file for the full legal text and specific requirements.
|
||||
|
||||
**Get Started:**
|
||||
|
||||
* **Installation:** `pip install "crawl4ai[all]"` (or use the Docker image)
|
||||
* **Documentation:** [https://docs.crawl4ai.com](https://docs.crawl4ai.com)
|
||||
* **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
|
||||
I'm very excited to see what you build with Crawl4AI v0.5.0!
|
||||
|
||||
---
|
||||
|
||||
### [0.4.2 - Configurable Crawlers, Session Management, and Smarter Screenshots](releases/0.4.2.md)
|
||||
*December 12, 2024*
|
||||
|
||||
The 0.4.2 update brings massive improvements to configuration, making crawlers and browsers easier to manage with dedicated objects. You can now import/export local storage for seamless session management. Plus, long-page screenshots are faster and cleaner, and full-page PDF exports are now possible. Check out all the new features to make your crawling experience even smoother.
|
||||
|
||||
[Read full release notes →](releases/0.4.2.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.1 - Smarter Crawling with Lazy-Load Handling, Text-Only Mode, and More](releases/0.4.1.md)
|
||||
*December 8, 2024*
|
||||
|
||||
This release brings major improvements to handling lazy-loaded images, a blazing-fast Text-Only Mode, full-page scanning for infinite scrolls, dynamic viewport adjustments, and session reuse for efficient crawling. If you're looking to improve speed, reliability, or handle dynamic content with ease, this update has you covered.
|
||||
|
||||
[Read full release notes →](releases/0.4.1.md)
|
||||
|
||||
---
|
||||
|
||||
### [0.4.0 - Major Content Filtering Update](releases/0.4.0.md)
|
||||
*December 1, 2024*
|
||||
|
||||
Introduced significant improvements to content filtering, multi-threaded environment handling, and user-agent generation. This release features the new PruningContentFilter, enhanced thread safety, and improved test coverage.
|
||||
|
||||
[Read full release notes →](releases/0.4.0.md)
|
||||
|
||||
## Project History
|
||||
|
||||
Curious about how Crawl4AI has evolved? Check out our [complete changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for a detailed history of all versions and updates.
|
||||
|
||||
## Stay Updated
|
||||
|
||||
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||
- Follow [@unclecode](https://twitter.com/unclecode) on Twitter
|
||||
- Join our community discussions on GitHub
|
||||
|
||||
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
343
docs/md_v2/blog/releases/0.7.0.md
Normal file
@@ -0,0 +1,343 @@
|
||||
# 🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update
|
||||
|
||||
*January 28, 2025 • 10 min read*
|
||||
|
||||
---
|
||||
|
||||
Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This release introduces fundamental improvements in how Crawl4AI handles modern web complexity through adaptive learning, intelligent content discovery, and advanced extraction capabilities.
|
||||
|
||||
## 🎯 What's New at a Glance
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
|
||||
**The Problem:** Websites change. Class names shift. IDs disappear. Your carefully crafted selectors break at 3 AM, and you wake up to empty datasets and angry stakeholders.
|
||||
|
||||
**My Solution:** I implemented an adaptive learning system that observes patterns, builds confidence scores, and adjusts extraction strategies on the fly. It's like having a junior developer who gets better at their job with every page they scrape.
|
||||
|
||||
### Technical Deep-Dive
|
||||
|
||||
The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Pattern success rates
|
||||
- Selector stability over time
|
||||
- Content structure variations
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **News Aggregation**: Maintain 95%+ extraction accuracy even as news sites update their templates
|
||||
- **E-commerce Monitoring**: Track product changes across hundreds of stores without constant maintenance
|
||||
- **Research Data Collection**: Build robust academic datasets that survive website redesigns
|
||||
- **Reduced Maintenance**: Cut selector update time by 80% for frequently-changing sites
|
||||
|
||||
## 🌊 Virtual Scroll: Complete Content Capture
|
||||
|
||||
**The Problem:** Modern web apps only render what's visible. Scroll down, new content appears, old content vanishes into the void. Traditional crawlers capture that first viewport and miss 90% of the content. It's like reading only the first page of every book.
|
||||
|
||||
**My Solution:** I built Virtual Scroll support that mimics human browsing behavior, capturing content as it loads and preserving it before the browser's garbage collector strikes.
|
||||
|
||||
### Implementation Details
|
||||
|
||||
```python
|
||||
from crawl4ai import VirtualScrollConfig
|
||||
|
||||
# For social media feeds (Twitter/X style)
|
||||
twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://twitter.com/trending",
|
||||
config=CrawlerRunConfig(
|
||||
virtual_scroll_config=twitter_config,
|
||||
# Combine with other features
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"tweets": {
|
||||
"selector": "[data-testid='tweet']",
|
||||
"fields": {
|
||||
"text": {"selector": "[data-testid='tweetText']", "type": "text"},
|
||||
"likes": {"selector": "[data-testid='like']", "type": "text"}
|
||||
}
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Captured {len(result.extracted_content['tweets'])} tweets")
|
||||
```
|
||||
|
||||
**Key Capabilities:**
|
||||
- **DOM Recycling Awareness**: Detects and handles virtual DOM element recycling
|
||||
- **Smart Scroll Physics**: Three modes - container height, page height, or fixed pixels
|
||||
- **Content Preservation**: Captures content before it's destroyed
|
||||
- **Intelligent Stopping**: Stops when no new content appears
|
||||
- **Memory Efficient**: Streams content instead of holding everything in memory
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Social Media Analysis**: Capture entire Twitter threads with hundreds of replies, not just top 10
|
||||
- **E-commerce Scraping**: Extract 500+ products from infinite scroll catalogs vs. 20-50 with traditional methods
|
||||
- **News Aggregation**: Get all articles from modern news sites, not just above-the-fold content
|
||||
- **Research Applications**: Complete data extraction from academic databases using virtual pagination
|
||||
|
||||
## 🔗 Link Preview: Intelligent Link Analysis and Scoring
|
||||
|
||||
**The Problem:** You crawl a page and get 200 links. Which ones matter? Which lead to the content you actually want? Traditional crawlers force you to follow everything or build complex filters.
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
- **Competitive Analysis**: Automatically identify important pages on competitor sites
|
||||
- **Content Discovery**: Build topic-focused crawlers that stay on track
|
||||
- **SEO Audits**: Identify and prioritize high-value internal linking opportunities
|
||||
|
||||
## 🎣 Async URL Seeder: Automated URL Discovery at Scale
|
||||
|
||||
**The Problem:** You want to crawl an entire domain but only have the homepage. Or worse, you want specific content types across thousands of pages. Manual URL discovery? That's a job for machines, not humans.
|
||||
|
||||
**My Solution:** I built Async URL Seeder—a turbocharged URL discovery engine that combines multiple sources with intelligent filtering and relevance scoring.
|
||||
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
- **Sitemap Mining**: Parses robots.txt and all linked sitemaps
|
||||
- **Common Crawl**: Queries the Common Crawl index for historical URLs
|
||||
- **Intelligent Crawling**: Follows links with smart depth control
|
||||
- **Pattern Analysis**: Learns URL structures and generates variations
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Migration Projects**: Discover 10,000+ URLs from legacy sites in under 60 seconds
|
||||
- **Market Research**: Map entire competitor ecosystems automatically
|
||||
- **Academic Research**: Build comprehensive datasets without manual URL collection
|
||||
- **SEO Audits**: Find every indexable page with content scoring
|
||||
- **Content Archival**: Ensure no content is left behind during site migrations
|
||||
|
||||
## ⚡ Performance Optimizations
|
||||
|
||||
This release includes significant performance improvements through optimized resource handling, better concurrency management, and reduced memory footprint.
|
||||
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
- **Startup Time**: 70% faster browser initialization
|
||||
- **Page Loading**: 40% reduction with smart resource blocking
|
||||
- **Extraction**: 3x faster with compiled CSS selectors
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
### Breaking Changes
|
||||
- `link_extractor` renamed to `link_preview` (better reflects functionality)
|
||||
- Minimum Python version now 3.9
|
||||
- `CrawlerConfig` split into `CrawlerRunConfig` and `BrowserConfig`
|
||||
|
||||
### Migration Guide
|
||||
```python
|
||||
# Old (v0.6.x)
|
||||
from crawl4ai import CrawlerConfig
|
||||
config = CrawlerConfig(timeout=30000)
|
||||
|
||||
# New (v0.7.0)
|
||||
from crawl4ai import CrawlerRunConfig, BrowserConfig
|
||||
browser_config = BrowserConfig(timeout=30000)
|
||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
```
|
||||
|
||||
## 🤖 Coming Soon: Intelligent Web Automation
|
||||
|
||||
I'm currently working on bringing advanced automation capabilities to Crawl4AI. This includes:
|
||||
|
||||
- **Crawl Agents**: Autonomous crawlers that understand your goals and adapt their strategies
|
||||
- **Auto JS Generation**: Automatic JavaScript code generation for complex interactions
|
||||
- **Smart Form Handling**: Intelligent form detection and filling
|
||||
- **Context-Aware Actions**: Crawlers that understand page context and make decisions
|
||||
|
||||
These features are under active development and will revolutionize how we approach web automation. Stay tuned!
|
||||
|
||||
## 🚀 Get Started
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.0
|
||||
```
|
||||
|
||||
Check out the [updated documentation](https://docs.crawl4ai.com).
|
||||
|
||||
Questions? Issues? I'm always listening:
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
Happy crawling! 🕷️
|
||||
|
||||
---
|
||||
|
||||
*P.S. If you're using Crawl4AI in production, I'd love to hear about it. Your use cases inspire the next features.*
|
||||
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# 🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update
|
||||
|
||||
*July 25, 2025 • 3 min read*
|
||||
|
||||
---
|
||||
|
||||
This release introduces automated CI/CD pipelines for seamless releases and optimizes dependencies for a lighter, more efficient package.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### 🔄 Automated Release Pipeline
|
||||
- **GitHub Actions CI/CD**: Automated PyPI and Docker Hub releases on tag push
|
||||
- **Multi-platform Docker images**: Support for both AMD64 and ARM64 architectures
|
||||
- **Version consistency checks**: Ensures tag, package, and Docker versions align
|
||||
- **Automated release notes**: GitHub releases created automatically
|
||||
|
||||
### 📦 Dependency Optimization
|
||||
- **Moved sentence-transformers to optional dependencies**: Significantly reduces default installation size
|
||||
- **Lighter Docker images**: Optimized Dockerfile for faster builds and smaller images
|
||||
- **Better dependency management**: Core vs. optional dependencies clearly separated
|
||||
|
||||
## 🏗️ CI/CD Pipeline
|
||||
|
||||
The new automated release process ensures consistent, reliable releases:
|
||||
|
||||
```yaml
|
||||
# Trigger releases with a simple tag
|
||||
git tag v0.7.2
|
||||
git push origin v0.7.2
|
||||
|
||||
# Automatically:
|
||||
# ✅ Validates version consistency
|
||||
# ✅ Builds and publishes to PyPI
|
||||
# ✅ Builds multi-platform Docker images
|
||||
# ✅ Pushes to Docker Hub with proper tags
|
||||
# ✅ Creates GitHub release
|
||||
```
|
||||
|
||||
## 💾 Lighter Installation
|
||||
|
||||
Default installation is now significantly smaller:
|
||||
|
||||
```bash
|
||||
# Core installation (smaller, faster)
|
||||
pip install crawl4ai==0.7.2
|
||||
|
||||
# With ML features (includes sentence-transformers)
|
||||
pip install crawl4ai[transformer]==0.7.2
|
||||
|
||||
# Full installation
|
||||
pip install crawl4ai[all]==0.7.2
|
||||
```
|
||||
|
||||
## 🐳 Docker Improvements
|
||||
|
||||
Enhanced Docker support with multi-platform images:
|
||||
|
||||
```bash
|
||||
# Pull the latest version
|
||||
docker pull unclecode/crawl4ai:0.7.2
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Available tags:
|
||||
# - unclecode/crawl4ai:0.7.2 (specific version)
|
||||
# - unclecode/crawl4ai:0.7 (minor version)
|
||||
# - unclecode/crawl4ai:0 (major version)
|
||||
# - unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
## 🔧 Technical Details
|
||||
|
||||
### Dependency Changes
|
||||
- `sentence-transformers` moved from required to optional dependencies
|
||||
- Reduces default installation by ~500MB
|
||||
- No impact on functionality when transformer features aren't needed
|
||||
|
||||
### CI/CD Configuration
|
||||
- GitHub Actions workflows for automated releases
|
||||
- Version validation before publishing
|
||||
- Parallel PyPI and Docker Hub deployments
|
||||
- Automatic tagging strategy for Docker images
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.2
|
||||
```
|
||||
|
||||
No breaking changes - direct upgrade from v0.7.0 or v0.7.1.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
*P.S. The new CI/CD pipeline will make future releases faster and more reliable. Thanks for your patience as we improve our release process!*
|
||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Create an adaptive crawler
|
||||
# Create an adaptive crawler (config is optional)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start crawling with a query
|
||||
@@ -59,13 +59,13 @@ async def main():
|
||||
from crawl4ai import AdaptiveConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
||||
top_k_links=3, # Links to follow per page (default: 5)
|
||||
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||
top_k_links=5, # Links to follow per page (default: 3)
|
||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
```
|
||||
|
||||
## Crawling Strategies
|
||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||
- **0.3-0.6**: Partial information, may answer basic queries
|
||||
- **0.6-0.8**: Good coverage, can answer most queries
|
||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
||||
- **0.6-0.7**: Good coverage, can answer most queries
|
||||
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||
|
||||
### Statistics Display
|
||||
|
||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
- Avoid overly broad queries
|
||||
|
||||
### 2. Threshold Tuning
|
||||
- Start with default (0.8) for general use
|
||||
- Lower to 0.6-0.7 for exploratory crawling
|
||||
- Raise to 0.9+ for exhaustive coverage
|
||||
- Start with default (0.7) for general use
|
||||
- Lower to 0.5-0.6 for exploratory crawling
|
||||
- Raise to 0.8+ for exhaustive coverage
|
||||
|
||||
### 3. Performance Optimization
|
||||
- Use appropriate `max_pages` limits
|
||||
|
||||
@@ -209,7 +209,13 @@ class CrawlerRunConfig:
|
||||
- The maximum number of concurrent crawl sessions.
|
||||
- Helps prevent overwhelming the system.
|
||||
|
||||
14. **`display_mode`**:
|
||||
14. **`url_matcher`** & **`match_mode`**:
|
||||
- Enable URL-specific configurations when used with `arun_many()`.
|
||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||
|
||||
15. **`display_mode`**:
|
||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||
- Affects how much information is printed during the crawl.
|
||||
|
||||
|
||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
||||
|
||||
Want to learn by doing? We've got you covered:
|
||||
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||
|
||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
||||
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
### Running the Tutorial Locally
|
||||
|
||||
|
||||
@@ -350,15 +350,22 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI uses `LXMLWebScrapingStrategy` (LXML-based) as the default scraping strategy for HTML content processing. This strategy offers excellent performance, especially for large HTML documents.
|
||||
|
||||
**Note:** For backward compatibility, `WebScrapingStrategy` is still available as an alias for `LXMLWebScrapingStrategy`.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
# Default configuration already uses LXMLWebScrapingStrategy
|
||||
config = CrawlerRunConfig()
|
||||
|
||||
# Or explicitly specify it if desired
|
||||
config_explicit = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
@@ -417,21 +424,20 @@ class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy provides excellent performance, particularly when processing large HTML documents, offering up to 10-20x faster processing compared to BeautifulSoup-based approaches.
|
||||
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
Benefits of LXML strategy:
|
||||
- Fast processing of large HTML documents (especially >100KB)
|
||||
- Efficient memory usage
|
||||
- Good handling of well-formed HTML
|
||||
- Robust table detection and extraction
|
||||
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
### Backward Compatibility
|
||||
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
For users upgrading from earlier versions:
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy`
|
||||
- Existing code using `WebScrapingStrategy` will continue to work without modification
|
||||
- No changes are required to your existing code
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -19,13 +19,15 @@ class MarkdownGenerationResult(BaseModel):
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
fit_html: Optional[str] = None
|
||||
success: bool
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
pdf: Optional[bytes] = None
|
||||
mhtml: Optional[str] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
extracted_content: Optional[str] = None
|
||||
@@ -35,6 +37,12 @@ class CrawlResult(BaseModel):
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
```
|
||||
@@ -45,11 +53,13 @@ class CrawlResult(BaseModel):
|
||||
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
||||
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
||||
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
||||
| **fit_html (`Optional[str]`)** | Preprocessed HTML optimized for extraction and content filtering. |
|
||||
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
||||
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
||||
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
||||
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||
| **js_execution_result (`Optional[Dict[str, Any]]`)** | Results from JavaScript execution during crawling. |
|
||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
||||
@@ -61,6 +71,11 @@ class CrawlResult(BaseModel):
|
||||
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
||||
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||
| **redirected_url (`Optional[str]`)** | The URL after any redirects (different from `url` which is the final URL). |
|
||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -154,6 +154,30 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +692,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -28,11 +28,8 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
<<<<<<< HEAD
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||
=======
|
||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
>>>>>>> feature/progressive-crawling
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
@@ -117,4 +114,4 @@ Some examples may require:
|
||||
|
||||
## Contributing New Examples
|
||||
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
|
||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
||||
The `LinkPreviewConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
link_preview_config = LinkPreviewConfig(
|
||||
# BASIC SETTINGS
|
||||
|
||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
||||
word_count_threshold=300 # Only substantial articles
|
||||
)
|
||||
|
||||
# Extract URLs and stream results as they come
|
||||
# Extract URLs and crawl them
|
||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||
|
||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
||||
|
||||
```python
|
||||
# Use both sources
|
||||
config = SeedingConfig(source="cc+sitemap")
|
||||
config = SeedingConfig(source="sitemap+cc")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
```
|
||||
|
||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
||||
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||
| `live_check` | bool | False | Verify URLs are accessible |
|
||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
||||
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||
| `verbose` | bool | False | Show detailed progress |
|
||||
| `query` | str | None | Search query for BM25 scoring |
|
||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
||||
```python
|
||||
# Find specific products
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Use both sources
|
||||
source="sitemap+cc", # Use both sources
|
||||
extract_head=True,
|
||||
query="wireless headphones noise canceling",
|
||||
scoring_method="bm25",
|
||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
||||
|
||||
# Step 1: Discover relevant URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Maximum coverage
|
||||
source="sitemap+cc", # Maximum coverage
|
||||
extract_head=True, # Get metadata
|
||||
query=topic, # Research topic
|
||||
scoring_method="bm25", # Smart scoring
|
||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
||||
# Extract URLs and crawl all articles
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
results = []
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||
async for result in crawl_results:
|
||||
if result.success:
|
||||
results.append({
|
||||
'url': result.url,
|
||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
||||
# When crawling many URLs
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Assuming urls is a list of URL strings
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
crawl_results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
# Process as they arrive
|
||||
async for result in results:
|
||||
async for result in crawl_results:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
||||
|
||||
# E-commerce product discovery
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
source="sitemap+cc",
|
||||
pattern="*/product/*",
|
||||
extract_head=True,
|
||||
live_check=True
|
||||
|
||||
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# WebScrapingStrategy Migration Guide
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI has simplified its content scraping architecture. The BeautifulSoup-based `WebScrapingStrategy` has been deprecated in favor of the faster LXML-based implementation. However, **no action is required** - your existing code will continue to work.
|
||||
|
||||
## What Changed?
|
||||
|
||||
1. **`WebScrapingStrategy` is now an alias** for `LXMLWebScrapingStrategy`
|
||||
2. **The BeautifulSoup implementation has been removed** (~1000 lines of redundant code)
|
||||
3. **`LXMLWebScrapingStrategy` inherits directly** from `ContentScrapingStrategy`
|
||||
4. **Performance remains optimal** with LXML as the sole implementation
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
**Your existing code continues to work without any changes:**
|
||||
|
||||
```python
|
||||
# This still works perfectly
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, WebScrapingStrategy
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=WebScrapingStrategy() # Works as before
|
||||
)
|
||||
```
|
||||
|
||||
## Migration Options
|
||||
|
||||
You have three options:
|
||||
|
||||
### Option 1: Do Nothing (Recommended)
|
||||
Your code will continue to work. `WebScrapingStrategy` is permanently aliased to `LXMLWebScrapingStrategy`.
|
||||
|
||||
### Option 2: Update Imports (Optional)
|
||||
For clarity, you can update your imports:
|
||||
|
||||
```python
|
||||
# Old (still works)
|
||||
from crawl4ai import WebScrapingStrategy
|
||||
strategy = WebScrapingStrategy()
|
||||
|
||||
# New (more explicit)
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
strategy = LXMLWebScrapingStrategy()
|
||||
```
|
||||
|
||||
### Option 3: Use Default Configuration
|
||||
Since `LXMLWebScrapingStrategy` is the default, you can omit the strategy parameter:
|
||||
|
||||
```python
|
||||
# Simplest approach - uses LXMLWebScrapingStrategy by default
|
||||
config = CrawlerRunConfig()
|
||||
```
|
||||
|
||||
## Type Hints
|
||||
|
||||
If you use type hints, both work:
|
||||
|
||||
```python
|
||||
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
|
||||
def process_with_strategy(strategy: WebScrapingStrategy) -> None:
|
||||
# Works with both WebScrapingStrategy and LXMLWebScrapingStrategy
|
||||
pass
|
||||
|
||||
# Both are valid
|
||||
process_with_strategy(WebScrapingStrategy())
|
||||
process_with_strategy(LXMLWebScrapingStrategy())
|
||||
```
|
||||
|
||||
## Subclassing
|
||||
|
||||
If you've subclassed `WebScrapingStrategy`, it continues to work:
|
||||
|
||||
```python
|
||||
class MyCustomStrategy(WebScrapingStrategy):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Your custom code
|
||||
```
|
||||
|
||||
## Performance Benefits
|
||||
|
||||
By consolidating to LXML:
|
||||
- **10-20x faster** HTML parsing for large documents
|
||||
- **Lower memory usage**
|
||||
- **Consistent behavior** across all use cases
|
||||
- **Simplified maintenance** and bug fixes
|
||||
|
||||
## Summary
|
||||
|
||||
This change simplifies Crawl4AI's internals while maintaining 100% backward compatibility. Your existing code continues to work, and you get better performance automatically.
|
||||
@@ -28,7 +28,7 @@ from rich import box
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai import c4a_compile, CompilationResult
|
||||
|
||||
# Initialize Rich console for beautiful output
|
||||
|
||||
@@ -13,14 +13,13 @@ from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
# New imports for v0.7.0
|
||||
LinkPreviewConfig,
|
||||
VirtualScrollConfig,
|
||||
LinkPreviewConfig,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
CompilationResult
|
||||
)
|
||||
|
||||
|
||||
@@ -170,16 +169,16 @@ async def demo_url_seeder():
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*tutorial*", # URL pattern filter
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python async programming", # For relevance scoring
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("docs.python.org", config)
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
||||
print(f"❌ Compilation error: {result.first_error.message}")
|
||||
|
||||
|
||||
async def demo_pdf_support():
|
||||
"""
|
||||
Demo 6: PDF Parsing Support
|
||||
|
||||
Shows how to extract content from PDF files.
|
||||
Note: Requires 'pip install crawl4ai[pdf]'
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("📄 DEMO 6: PDF Parsing Support")
|
||||
print("="*60)
|
||||
|
||||
try:
|
||||
# Check if PDF support is installed
|
||||
import PyPDF2
|
||||
|
||||
# Example: Process a PDF URL
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True, # Enable PDF generation
|
||||
extract_text_from_pdf=True # Extract text content
|
||||
)
|
||||
|
||||
print("PDF parsing is available!")
|
||||
print("You can now crawl PDF URLs and extract their content.")
|
||||
print("\nExample usage:")
|
||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
||||
|
||||
except ImportError:
|
||||
print("⚠️ PDF support not installed.")
|
||||
print("Install with: pip install crawl4ai[pdf]")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos"""
|
||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||
@@ -289,7 +255,6 @@ async def main():
|
||||
("Virtual Scroll", demo_virtual_scroll),
|
||||
("URL Seeder", demo_url_seeder),
|
||||
("C4A Script", demo_c4a_script),
|
||||
("PDF Support", demo_pdf_support)
|
||||
]
|
||||
|
||||
for name, demo_func in demos:
|
||||
@@ -309,7 +274,6 @@ async def main():
|
||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||
print("• C4A Script: Simple language for complex automations")
|
||||
print("• PDF Support: Extract content from PDF documents")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
site_name: Crawl4AI Documentation (v0.6.x)
|
||||
site_name: Crawl4AI Documentation (v0.7.x)
|
||||
site_favicon: docs/md_v2/favicon.ico
|
||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||
site_url: https://docs.crawl4ai.com
|
||||
@@ -25,6 +25,8 @@ nav:
|
||||
- "Command Line Interface": "core/cli.md"
|
||||
- "Simple Crawling": "core/simple-crawling.md"
|
||||
- "Deep Crawling": "core/deep-crawling.md"
|
||||
- "Adaptive Crawling": "core/adaptive-crawling.md"
|
||||
- "URL Seeding": "core/url-seeding.md"
|
||||
- "C4A-Script": "core/c4a-script.md"
|
||||
- "Crawler Result": "core/crawler-result.md"
|
||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||
@@ -37,6 +39,7 @@ nav:
|
||||
- "Link & Media": "core/link-media.md"
|
||||
- Advanced:
|
||||
- "Overview": "advanced/advanced-features.md"
|
||||
- "Adaptive Strategies": "advanced/adaptive-strategies.md"
|
||||
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
||||
- "File Downloading": "advanced/file-downloading.md"
|
||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||
|
||||
@@ -44,7 +44,6 @@ dependencies = [
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
"lark>=1.2.2",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"alphashape>=1.3.1",
|
||||
"shapely>=2.0.0"
|
||||
]
|
||||
@@ -62,8 +61,8 @@ classifiers = [
|
||||
[project.optional-dependencies]
|
||||
pdf = ["PyPDF2"]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"PyPDF2",
|
||||
@@ -72,8 +71,8 @@ all = [
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium",
|
||||
"PyPDF2"
|
||||
"sentence-transformers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -24,7 +24,6 @@ cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
sentence-transformers>=2.2.0
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
|
||||
@@ -12,11 +12,8 @@ parent_dir = os.path.dirname(
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import (
|
||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
||||
)
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,8 +29,8 @@ class TestResult:
|
||||
|
||||
class StrategyTester:
|
||||
def __init__(self):
|
||||
self.new_scraper = WebScrapingStrategy()
|
||||
self.current_scraper = WebScrapingStrategyCurrent()
|
||||
self.new_scraper = LXMLWebScrapingStrategy()
|
||||
self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now
|
||||
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
||||
self.WIKI_HTML = f.read()
|
||||
self.results = {"new": [], "current": []}
|
||||
|
||||
@@ -10,11 +10,13 @@ import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
from crawl4ai import BrowserProfiler
|
||||
from crawl4ai.browser_manager import BrowserManager
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
@@ -25,7 +27,7 @@ async def test_profile_creation():
|
||||
"""Test creating and managing browser profiles."""
|
||||
logger.info("Testing profile creation and management", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
|
||||
try:
|
||||
# List existing profiles
|
||||
@@ -83,7 +85,7 @@ async def test_profile_with_browser():
|
||||
"""Test using a profile with a browser."""
|
||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||
profile_path = None
|
||||
|
||||
@@ -101,6 +103,8 @@ async def test_profile_with_browser():
|
||||
# Now use this profile with a browser
|
||||
browser_config = BrowserConfig(
|
||||
user_data_dir=profile_path,
|
||||
use_managed_browser=True,
|
||||
use_persistent_context=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
|
||||
345
tests/docker/simple_api_test.py
Normal file
345
tests/docker/simple_api_test.py
Normal file
@@ -0,0 +1,345 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||
Uses only built-in Python modules to test all endpoints.
|
||||
"""
|
||||
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Configuration
|
||||
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||
TEST_TIMEOUT = 30
|
||||
|
||||
class SimpleApiTester:
|
||||
def __init__(self, base_url: str = BASE_URL):
|
||||
self.base_url = base_url
|
||||
self.token = None
|
||||
self.results = []
|
||||
|
||||
def log(self, message: str):
|
||||
print(f"[INFO] {message}")
|
||||
|
||||
def test_get_endpoint(self, endpoint: str) -> Dict:
|
||||
"""Test a GET endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
|
||||
"""Test a POST endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
data = json.dumps(payload).encode('utf-8')
|
||||
req = urllib.request.Request(url, data=data, method='POST')
|
||||
req.add_header('Content-Type', 'application/json')
|
||||
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def print_result(self, result: Dict):
|
||||
"""Print a formatted test result"""
|
||||
status_color = {
|
||||
"PASS": "✅",
|
||||
"FAIL": "❌",
|
||||
"SKIP": "⏭️"
|
||||
}
|
||||
|
||||
print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
|
||||
f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")
|
||||
|
||||
if result['status'] == 'FAIL' and 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
self.results.append(result)
|
||||
|
||||
def run_all_tests(self):
|
||||
"""Run all API tests"""
|
||||
print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
|
||||
print(f"📡 Testing server at: {self.base_url}")
|
||||
print("=" * 60)
|
||||
|
||||
# # Test basic endpoints
|
||||
# print("\n=== BASIC ENDPOINTS ===")
|
||||
|
||||
# # Health check
|
||||
# result = self.test_get_endpoint("/health")
|
||||
# self.print_result(result)
|
||||
|
||||
|
||||
# # Schema endpoint
|
||||
# result = self.test_get_endpoint("/schema")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Metrics endpoint
|
||||
# result = self.test_get_endpoint("/metrics")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Root redirect
|
||||
# result = self.test_get_endpoint("/")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Test authentication
|
||||
# print("\n=== AUTHENTICATION ===")
|
||||
|
||||
# # Get token
|
||||
# token_payload = {"email": "test@example.com"}
|
||||
# result = self.test_post_endpoint("/token", token_payload)
|
||||
# self.print_result(result)
|
||||
|
||||
# # Extract token if successful
|
||||
# if result['status'] == 'PASS' and 'data' in result:
|
||||
# token = result['data'].get('access_token')
|
||||
# if token:
|
||||
# self.token = token
|
||||
# self.log(f"Successfully obtained auth token: {token[:20]}...")
|
||||
|
||||
# Test core APIs
|
||||
print("\n=== CORE APIs ===")
|
||||
|
||||
test_url = "https://example.com"
|
||||
|
||||
# Test markdown endpoint
|
||||
md_payload = {
|
||||
"url": test_url,
|
||||
"f": "fit",
|
||||
"q": "test query",
|
||||
"c": "0"
|
||||
}
|
||||
result = self.test_post_endpoint("/md", md_payload)
|
||||
# print(result['data'].get('markdown', ''))
|
||||
self.print_result(result)
|
||||
|
||||
# Test HTML endpoint
|
||||
html_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/html", html_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test screenshot endpoint
|
||||
screenshot_payload = {
|
||||
"url": test_url,
|
||||
"screenshot_wait_for": 2
|
||||
}
|
||||
result = self.test_post_endpoint("/screenshot", screenshot_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test PDF endpoint
|
||||
pdf_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/pdf", pdf_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test JavaScript execution
|
||||
js_payload = {
|
||||
"url": test_url,
|
||||
"scripts": ["(() => document.title)()"]
|
||||
}
|
||||
result = self.test_post_endpoint("/execute_js", js_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl endpoint
|
||||
crawl_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl", crawl_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test config dump
|
||||
config_payload = {"code": "CrawlerRunConfig()"}
|
||||
result = self.test_post_endpoint("/config/dump", config_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test LLM endpoint
|
||||
llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
|
||||
result = self.test_get_endpoint(llm_endpoint)
|
||||
self.print_result(result)
|
||||
|
||||
# Test ask endpoint
|
||||
ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
|
||||
result = self.test_get_endpoint(ask_endpoint)
|
||||
print(result)
|
||||
self.print_result(result)
|
||||
|
||||
# Test job APIs
|
||||
print("\n=== JOB APIs ===")
|
||||
|
||||
# Test LLM job
|
||||
llm_job_payload = {
|
||||
"url": test_url,
|
||||
"q": "Extract main content",
|
||||
"cache": False
|
||||
}
|
||||
result = self.test_post_endpoint("/llm/job", llm_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl job
|
||||
crawl_job_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test MCP
|
||||
print("\n=== MCP APIs ===")
|
||||
|
||||
# Test MCP schema
|
||||
result = self.test_get_endpoint("/mcp/schema")
|
||||
self.print_result(result)
|
||||
|
||||
# Test error handling
|
||||
print("\n=== ERROR HANDLING ===")
|
||||
|
||||
# Test invalid URL
|
||||
invalid_payload = {"url": "invalid-url", "f": "fit"}
|
||||
result = self.test_post_endpoint("/md", invalid_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test invalid endpoint
|
||||
result = self.test_get_endpoint("/nonexistent")
|
||||
self.print_result(result)
|
||||
|
||||
# Print summary
|
||||
self.print_summary()
|
||||
|
||||
def print_summary(self):
|
||||
"""Print test results summary"""
|
||||
print("\n" + "=" * 60)
|
||||
print("📊 TEST RESULTS SUMMARY")
|
||||
print("=" * 60)
|
||||
|
||||
total = len(self.results)
|
||||
passed = sum(1 for r in self.results if r['status'] == 'PASS')
|
||||
failed = sum(1 for r in self.results if r['status'] == 'FAIL')
|
||||
|
||||
print(f"Total Tests: {total}")
|
||||
print(f"✅ Passed: {passed}")
|
||||
print(f"❌ Failed: {failed}")
|
||||
print(f"📈 Success Rate: {(passed/total)*100:.1f}%")
|
||||
|
||||
if failed > 0:
|
||||
print("\n❌ FAILED TESTS:")
|
||||
for result in self.results:
|
||||
if result['status'] == 'FAIL':
|
||||
print(f" • {result['method']} {result['endpoint']}")
|
||||
if 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
# Performance statistics
|
||||
response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
|
||||
if response_times:
|
||||
avg_time = sum(response_times) / len(response_times)
|
||||
max_time = max(response_times)
|
||||
print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
|
||||
print(f"⏱️ Max Response Time: {max_time:.3f}s")
|
||||
|
||||
# Save detailed report
|
||||
report_file = f"crawl4ai_test_report_{int(time.time())}.json"
|
||||
with open(report_file, 'w') as f:
|
||||
json.dump({
|
||||
"timestamp": time.time(),
|
||||
"server_url": self.base_url,
|
||||
"version": "0.7.0",
|
||||
"summary": {
|
||||
"total": total,
|
||||
"passed": passed,
|
||||
"failed": failed
|
||||
},
|
||||
"results": self.results
|
||||
}, f, indent=2)
|
||||
|
||||
print(f"\n📄 Detailed report saved to: {report_file}")
|
||||
|
||||
def main():
|
||||
"""Main test runner"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
|
||||
parser.add_argument('--url', default=BASE_URL, help='Base URL of the server')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
tester = SimpleApiTester(args.url)
|
||||
|
||||
try:
|
||||
tester.run_all_tests()
|
||||
except KeyboardInterrupt:
|
||||
print("\n🛑 Test suite interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"\n💥 Test suite failed with error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
55
tests/profiler/test_keyboard_handle.py
Normal file
55
tests/profiler/test_keyboard_handle.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
from unittest.mock import patch, MagicMock
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
|
||||
async def test_keyboard_input_handling():
|
||||
# Mock sequence of keystrokes: arrow key followed by 'q'
|
||||
mock_keys = [b'\x00K', b'q']
|
||||
mock_kbhit = MagicMock(side_effect=[True, True, False])
|
||||
mock_getch = MagicMock(side_effect=mock_keys)
|
||||
|
||||
with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
|
||||
# profiler = BrowserProfiler()
|
||||
user_done_event = asyncio.Event()
|
||||
|
||||
# Create a local async function to simulate the keyboard input handling
|
||||
async def test_listen_for_quit_command():
|
||||
if sys.platform == "win32":
|
||||
while True:
|
||||
try:
|
||||
if mock_kbhit():
|
||||
raw = mock_getch()
|
||||
try:
|
||||
key = raw.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
if len(key) != 1 or not key.isprintable():
|
||||
continue
|
||||
|
||||
if key.lower() == "q":
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
# Run the listener
|
||||
listener_task = asyncio.create_task(test_listen_for_quit_command())
|
||||
|
||||
# Wait for the event to be set
|
||||
try:
|
||||
await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
|
||||
assert user_done_event.is_set()
|
||||
finally:
|
||||
if not listener_task.done():
|
||||
listener_task.cancel()
|
||||
try:
|
||||
await listener_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
42
tests/test_arun_many.py
Normal file
42
tests/test_arun_many.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Test example for multiple crawler configs feature
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
|
||||
|
||||
async def test_run_many():
|
||||
default_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
# scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
test_urls = [
|
||||
# "https://blog.python.org/", # Blog URL
|
||||
"https://www.python.org/", # Generic HTTPS page
|
||||
"https://www.kidocode.com/", # Generic HTTPS page
|
||||
"https://www.example.com/", # Generic HTTPS page
|
||||
# "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Single config - traditional usage still works
|
||||
print("Test 1: Single config (backwards compatible)")
|
||||
result = await crawler.arun_many(
|
||||
urls=test_urls[:2],
|
||||
config=default_config
|
||||
)
|
||||
print(f"Crawled {len(result)} URLs with single config\n")
|
||||
for item in result:
|
||||
print(f" {item.url} -> {item.status_code}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_run_many())
|
||||
131
tests/test_config_matching_only.py
Normal file
131
tests/test_config_matching_only.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Test only the config matching logic without running crawler
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
|
||||
def test_all_matching_scenarios():
|
||||
print("Testing CrawlerRunConfig.is_match() method")
|
||||
print("=" * 50)
|
||||
|
||||
# Test 1: Single string pattern
|
||||
print("\n1. Single string pattern (glob style)")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
# For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/file.pdf", True),
|
||||
("https://example.com/doc.PDF", False), # Case sensitive
|
||||
("https://example.com/file.txt", False),
|
||||
("file.pdf", True),
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 2: List of patterns with OR
|
||||
print("\n2. List of patterns with OR (default)")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=["*/article/*", "*/blog/*", "*.html"],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/article/news", True),
|
||||
("https://example.com/blog/post", True),
|
||||
("https://example.com/page.html", True),
|
||||
("https://example.com/page.php", False),
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 3: Custom function
|
||||
print("\n3. Custom function matcher")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
|
||||
)
|
||||
test_urls = [
|
||||
("https://api.example.com/data.json", True),
|
||||
("https://api.example.com/data.xml", True),
|
||||
("https://api.example.com/data.html", False),
|
||||
("https://example.com/data.json", False), # No 'api'
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 4: Mixed list with AND
|
||||
print("\n4. Mixed patterns and functions with AND")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: '.com' in url, # Must have .com
|
||||
lambda url: len(url) < 50 # Must be short
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/page", True),
|
||||
("http://example.com/page", False), # Not HTTPS
|
||||
("https://example.org/page", False), # No .com
|
||||
("https://example.com/" + "x" * 50, False), # Too long
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 5: Complex real-world scenario
|
||||
print("\n5. Complex pattern combinations")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*/api/v[0-9]/*", # API versioned endpoints
|
||||
lambda url: 'graphql' in url, # GraphQL endpoints
|
||||
"*.json" # JSON files
|
||||
],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/api/v1/users", True),
|
||||
("https://example.com/api/v2/posts", True),
|
||||
("https://example.com/graphql", True),
|
||||
("https://example.com/data.json", True),
|
||||
("https://example.com/api/users", False), # No version
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 6: Edge cases
|
||||
print("\n6. Edge cases")
|
||||
|
||||
# No matcher
|
||||
config = CrawlerRunConfig()
|
||||
result = config.is_match("https://example.com")
|
||||
print(f" {'✓' if not result else '✗'} No matcher -> {result}")
|
||||
|
||||
# Empty list
|
||||
config = CrawlerRunConfig(url_matcher=[])
|
||||
result = config.is_match("https://example.com")
|
||||
print(f" {'✓' if not result else '✗'} Empty list -> {result}")
|
||||
|
||||
# None in list (should be skipped)
|
||||
config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])
|
||||
result = config.is_match("test.pdf")
|
||||
print(f" {'✓' if result else '✗'} List with None -> {result}")
|
||||
|
||||
print("\n" + "=" * 50)
|
||||
print("All matching tests completed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_all_matching_scenarios()
|
||||
87
tests/test_config_selection.py
Normal file
87
tests/test_config_selection.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Test config selection logic in dispatchers
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
|
||||
|
||||
class TestDispatcher(BaseDispatcher):
|
||||
"""Simple test dispatcher to verify config selection"""
|
||||
|
||||
async def crawl_url(self, url, config, task_id, **kwargs):
|
||||
# Just return which config was selected
|
||||
selected = self.select_config(url, config)
|
||||
return {"url": url, "config_id": id(selected)}
|
||||
|
||||
async def run_urls(self, urls, crawler, config):
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await self.crawl_url(url, config, "test")
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
async def test_dispatcher_config_selection():
|
||||
print("Testing dispatcher config selection")
|
||||
print("=" * 50)
|
||||
|
||||
# Create test configs with different matchers
|
||||
pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
|
||||
default_config = CrawlerRunConfig() # No matcher
|
||||
|
||||
configs = [pdf_config, api_config, default_config]
|
||||
|
||||
# Create test dispatcher
|
||||
dispatcher = TestDispatcher()
|
||||
|
||||
# Test single config
|
||||
print("\nTest 1: Single config")
|
||||
result = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1")
|
||||
assert result["config_id"] == id(pdf_config)
|
||||
print("✓ Single config works")
|
||||
|
||||
# Test config list selection
|
||||
print("\nTest 2: Config list selection")
|
||||
test_cases = [
|
||||
("https://example.com/file.pdf", id(pdf_config)),
|
||||
("https://api.example.com/data", id(api_config)),
|
||||
("https://example.com/page", id(configs[0])), # No match, uses first
|
||||
]
|
||||
|
||||
for url, expected_id in test_cases:
|
||||
result = await dispatcher.crawl_url(url, configs, "test")
|
||||
assert result["config_id"] == expected_id, f"URL {url} got wrong config"
|
||||
print(f"✓ {url} -> correct config selected")
|
||||
|
||||
# Test with MemoryAdaptiveDispatcher
|
||||
print("\nTest 3: MemoryAdaptiveDispatcher config selection")
|
||||
mem_dispatcher = MemoryAdaptiveDispatcher()
|
||||
|
||||
# Test select_config method directly
|
||||
selected = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
|
||||
assert selected == pdf_config
|
||||
print("✓ MemoryAdaptiveDispatcher.select_config works")
|
||||
|
||||
# Test empty config list
|
||||
print("\nTest 4: Edge cases")
|
||||
selected = mem_dispatcher.select_config("https://example.com", [])
|
||||
assert isinstance(selected, CrawlerRunConfig) # Should return default
|
||||
print("✓ Empty config list returns default config")
|
||||
|
||||
# Test None config
|
||||
selected = mem_dispatcher.select_config("https://example.com", None)
|
||||
assert isinstance(selected, CrawlerRunConfig) # Should return default
|
||||
print("✓ None config returns default config")
|
||||
|
||||
print("\n" + "=" * 50)
|
||||
print("All dispatcher tests passed! ✓")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_dispatcher_config_selection())
|
||||
122
tests/test_docker_api_with_llm_provider.py
Normal file
122
tests/test_docker_api_with_llm_provider.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test script to verify Docker API with LLM provider configuration."""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
|
||||
BASE_URL = "http://localhost:11235"
|
||||
|
||||
def test_health():
|
||||
"""Test health endpoint."""
|
||||
print("1. Testing health endpoint...")
|
||||
response = requests.get(f"{BASE_URL}/health")
|
||||
print(f" Status: {response.status_code}")
|
||||
print(f" Response: {response.json()}")
|
||||
print()
|
||||
|
||||
def test_schema():
|
||||
"""Test schema endpoint to see configuration."""
|
||||
print("2. Testing schema endpoint...")
|
||||
response = requests.get(f"{BASE_URL}/schema")
|
||||
print(f" Status: {response.status_code}")
|
||||
# Print only browser config to keep output concise
|
||||
print(f" Browser config keys: {list(response.json().get('browser', {}).keys())[:5]}...")
|
||||
print()
|
||||
|
||||
def test_markdown_with_llm_filter():
|
||||
"""Test markdown endpoint with LLM filter (should use configured provider)."""
|
||||
print("3. Testing markdown endpoint with LLM filter...")
|
||||
print(" This should use the Groq provider from LLM_PROVIDER env var")
|
||||
|
||||
# Note: This will fail with dummy API keys, but we can see if it tries to use Groq
|
||||
payload = {
|
||||
"url": "https://httpbin.org/html",
|
||||
"f": "llm",
|
||||
"q": "Extract the main content"
|
||||
}
|
||||
|
||||
response = requests.post(f"{BASE_URL}/md", json=payload)
|
||||
print(f" Status: {response.status_code}")
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f" Error: {response.text[:200]}...")
|
||||
else:
|
||||
print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
|
||||
print()
|
||||
|
||||
def test_markdown_with_provider_override():
|
||||
"""Test markdown endpoint with provider override in request."""
|
||||
print("4. Testing markdown endpoint with provider override...")
|
||||
print(" This should use OpenAI provider from request parameter")
|
||||
|
||||
payload = {
|
||||
"url": "https://httpbin.org/html",
|
||||
"f": "llm",
|
||||
"q": "Extract the main content",
|
||||
"provider": "openai/gpt-4" # Override to use OpenAI
|
||||
}
|
||||
|
||||
response = requests.post(f"{BASE_URL}/md", json=payload)
|
||||
print(f" Status: {response.status_code}")
|
||||
|
||||
if response.status_code != 200:
|
||||
print(f" Error: {response.text[:200]}...")
|
||||
else:
|
||||
print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
|
||||
print()
|
||||
|
||||
def test_simple_crawl():
|
||||
"""Test simple crawl without LLM."""
|
||||
print("5. Testing simple crawl (no LLM required)...")
|
||||
|
||||
payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"cache_mode": "bypass"}
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(f"{BASE_URL}/crawl", json=payload)
|
||||
print(f" Status: {response.status_code}")
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
print(f" Success: {result.get('success')}")
|
||||
print(f" Results count: {len(result.get('results', []))}")
|
||||
if result.get('results'):
|
||||
print(f" First result success: {result['results'][0].get('success')}")
|
||||
else:
|
||||
print(f" Error: {response.text[:200]}...")
|
||||
print()
|
||||
|
||||
def test_playground():
|
||||
"""Test if playground is accessible."""
|
||||
print("6. Testing playground interface...")
|
||||
response = requests.get(f"{BASE_URL}/playground")
|
||||
print(f" Status: {response.status_code}")
|
||||
print(f" Content-Type: {response.headers.get('content-type')}")
|
||||
print()
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("=== Crawl4AI Docker API Tests ===\n")
|
||||
print(f"Testing API at {BASE_URL}\n")
|
||||
|
||||
# Wait a bit for server to be fully ready
|
||||
time.sleep(2)
|
||||
|
||||
test_health()
|
||||
test_schema()
|
||||
test_simple_crawl()
|
||||
test_playground()
|
||||
|
||||
print("\nTesting LLM functionality (these may fail with dummy API keys):\n")
|
||||
test_markdown_with_llm_filter()
|
||||
test_markdown_with_provider_override()
|
||||
|
||||
print("\nTests completed!")
|
||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
||||
|
||||
from crawl4ai.models import Link
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
print(" Usage:")
|
||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
||||
print(" from crawl4ai import LinkPreviewConfig")
|
||||
print(" config = CrawlerRunConfig(")
|
||||
print(" link_preview_config=LinkPreviewConfig(")
|
||||
for key, value in config_dict.items():
|
||||
|
||||
71
tests/test_memory_macos.py
Executable file
71
tests/test_memory_macos.py
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test script to verify macOS memory calculation accuracy."""
|
||||
|
||||
import psutil
|
||||
import platform
|
||||
import time
|
||||
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
|
||||
|
||||
|
||||
def test_memory_calculation():
|
||||
"""Test and compare memory calculations."""
|
||||
print(f"Platform: {platform.system()}")
|
||||
print(f"Python version: {platform.python_version()}")
|
||||
print("-" * 60)
|
||||
|
||||
# Get psutil's view
|
||||
vm = psutil.virtual_memory()
|
||||
psutil_percent = vm.percent
|
||||
psutil_available_gb = vm.available / (1024**3)
|
||||
total_gb = vm.total / (1024**3)
|
||||
|
||||
# Get our corrected view
|
||||
true_percent = get_true_memory_usage_percent()
|
||||
true_available_gb = get_true_available_memory_gb()
|
||||
true_percent_calc, available_calc, total_calc = get_memory_stats()
|
||||
|
||||
print("Memory Statistics Comparison:")
|
||||
print(f"Total Memory: {total_gb:.2f} GB")
|
||||
print()
|
||||
|
||||
print("PSUtil (Standard) Calculation:")
|
||||
print(f" - Memory Used: {psutil_percent:.1f}%")
|
||||
print(f" - Available: {psutil_available_gb:.2f} GB")
|
||||
print()
|
||||
|
||||
print("Platform-Aware Calculation:")
|
||||
print(f" - Memory Used: {true_percent:.1f}%")
|
||||
print(f" - Available: {true_available_gb:.2f} GB")
|
||||
print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
|
||||
print()
|
||||
|
||||
# Show the impact on dispatcher behavior
|
||||
print("Impact on MemoryAdaptiveDispatcher:")
|
||||
thresholds = {
|
||||
"Normal": 90.0,
|
||||
"Critical": 95.0,
|
||||
"Recovery": 85.0
|
||||
}
|
||||
|
||||
for name, threshold in thresholds.items():
|
||||
psutil_triggered = psutil_percent >= threshold
|
||||
true_triggered = true_percent >= threshold
|
||||
print(f" - {name} Threshold ({threshold}%):")
|
||||
print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
|
||||
print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
|
||||
if psutil_triggered != true_triggered:
|
||||
print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
|
||||
print()
|
||||
|
||||
# Monitor for a few seconds
|
||||
print("Monitoring memory for 10 seconds...")
|
||||
for i in range(10):
|
||||
vm = psutil.virtual_memory()
|
||||
true_pct = get_true_memory_usage_percent()
|
||||
print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
|
||||
time.sleep(1)
|
||||
print("\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_memory_calculation()
|
||||
117
tests/test_multi_config.py
Normal file
117
tests/test_multi_config.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
Test example for multiple crawler configs feature
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
||||
|
||||
async def test_multi_config():
|
||||
# Create different configs for different URL patterns
|
||||
|
||||
# Config for PDF files
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
)
|
||||
|
||||
# Config for articles (using multiple patterns with OR logic)
|
||||
article_config = CrawlerRunConfig(
|
||||
url_matcher=["*/news/*", "*blog*", "*/article/*"],
|
||||
match_mode=MatchMode.OR,
|
||||
screenshot=True,
|
||||
)
|
||||
|
||||
# Config using custom matcher function
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or 'json' in url,
|
||||
)
|
||||
|
||||
# Config combining patterns and functions with AND logic
|
||||
secure_docs_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.doc*", # Matches .doc, .docx
|
||||
lambda url: url.startswith('https://') # Must be HTTPS
|
||||
],
|
||||
match_mode=MatchMode.AND,
|
||||
)
|
||||
|
||||
# Default config (no url_matcher means it won't match anything unless it's the fallback)
|
||||
default_config = CrawlerRunConfig(
|
||||
# cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
|
||||
# List of configs - order matters! First match wins
|
||||
configs = [
|
||||
pdf_config,
|
||||
article_config,
|
||||
api_config,
|
||||
secure_docs_config,
|
||||
default_config # Fallback
|
||||
]
|
||||
|
||||
# Test URLs - using real URLs that exist
|
||||
test_urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # Real PDF
|
||||
"https://www.bbc.com/news/articles/c5y3e3glnldo", # News article
|
||||
"https://blog.python.org/", # Blog URL
|
||||
"https://api.github.com/users/github", # GitHub API (returns JSON)
|
||||
"https://httpbin.org/json", # API endpoint that returns JSON
|
||||
"https://www.python.org/", # Generic HTTPS page
|
||||
"http://info.cern.ch/", # HTTP (not HTTPS) page
|
||||
"https://example.com/", # → Default config
|
||||
]
|
||||
|
||||
# Test the matching logic
|
||||
print("Config matching test:")
|
||||
print("-" * 50)
|
||||
for url in test_urls:
|
||||
for i, config in enumerate(configs):
|
||||
if config.is_match(url):
|
||||
print(f"{url} -> Config {i} matches")
|
||||
break
|
||||
else:
|
||||
print(f"{url} -> No match, will use fallback (first config)")
|
||||
|
||||
print("\n" + "=" * 50 + "\n")
|
||||
|
||||
# Now test with actual crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Single config - traditional usage still works
|
||||
print("Test 1: Single config (backwards compatible)")
|
||||
result = await crawler.arun_many(
|
||||
urls=["https://www.python.org/"],
|
||||
config=default_config
|
||||
)
|
||||
print(f"Crawled {len(result)} URLs with single config\n")
|
||||
|
||||
# Multiple configs - new feature
|
||||
print("Test 2: Multiple configs")
|
||||
# Just test with 2 URLs to avoid timeout
|
||||
results = await crawler.arun_many(
|
||||
urls=test_urls[:2], # Just test first 2 URLs
|
||||
config=configs # Pass list of configs
|
||||
)
|
||||
print(f"Crawled {len(results)} URLs with multiple configs")
|
||||
|
||||
# Using custom matcher inline
|
||||
print("\nTest 3: Inline custom matcher")
|
||||
custom_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
|
||||
verbose=False
|
||||
)
|
||||
results = await crawler.arun_many(
|
||||
urls=[
|
||||
"https://docs.python.org/3/library/asyncio.html", # Long URL with 'python'
|
||||
"https://python.org/", # Short URL with 'python' - won't match
|
||||
"https://www.google.com/" # No 'python' - won't match
|
||||
],
|
||||
config=[custom_config, default_config]
|
||||
)
|
||||
print(f"Crawled {len(results)} URLs with custom matcher")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_multi_config())
|
||||
Reference in New Issue
Block a user