Compare commits

47 Commits
next...fix/exit_w

| Author | SHA1 | Date |
|---|---|---|
| | 0541b61405 | |
| | 6735c68288 | |
| | ff6ea41ac3 | |
| | 31a435fb0e | |
| | 5de6a28055 | |
| | de1561ad14 | |
| | 337b588732 | |
| | 7a6ad547f0 | |
| | e6692b987d | |
| | 307fe28b32 | |
| | 438a103b17 | |
| | a03e68fa2f | |
| | 864d87afb2 | |
| | 508b6fc233 | |
| | e3281935bc | |
| | 48647300b4 | |
| | 9f9ea3bb3b | |
| | d58b93c207 | |
| | e2b4705010 | |
| | 4a1abd5086 | |
| | 04258cd4f2 | |
| | 84e462d9f8 | |
| | 9546773a07 | |
| | 66a979ad11 | |
| | 0c31e91b53 | |
| | 1b6a31f88f | |
| | b8c261780f | |
| | db6ad7a79d | |
| | 004d514f33 | |
| | 3a9e2c716e | |
| | 0163bd797c | |
| | 26bad799e4 | |
| | cf8badfe27 | |
| | ccbe3c105c | |
| | 761c19d54b | |
| | 14b0ecb137 | |
| | 0eaa9f9895 | |
| | 1d1970ae69 | |
| | 205df1e330 | |
| | 2640dc73a5 | |
| | 58024755c5 | |
| | dd5ee752cf | |
| | bde1bba6a2 | |
| | ee25c771d8 | |
| | c4d625fb3c | |
| | ef722766f0 | |
| | 4bcb7171a3 | |
.github/workflows/release.yml (vendored, new file, 142 lines)

@@ -0,0 +1,142 @@
name: Release Pipeline

on:
  push:
    tags:
      - 'v*'
      - '!test-v*'  # Exclude test tags

jobs:
  release:
    runs-on: ubuntu-latest
    permissions:
      contents: write  # Required for creating releases

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Extract version from tag
        id: get_version
        run: |
          TAG_VERSION=${GITHUB_REF#refs/tags/v}
          echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
          echo "Releasing version: $TAG_VERSION"

      - name: Install package dependencies
        run: |
          pip install -e .

      - name: Check version consistency
        run: |
          TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
          PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")

          echo "Tag version: $TAG_VERSION"
          echo "Package version: $PACKAGE_VERSION"

          if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
            echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
            echo "Please update crawl4ai/__version__.py to match the tag version"
            exit 1
          fi
          echo "✅ Version check passed: $TAG_VERSION"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: python -m build

      - name: Check package
        run: twine check dist/*

      - name: Upload to PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          echo "📦 Uploading to PyPI..."
          twine upload dist/*
          echo "✅ Package uploaded to https://pypi.org/project/crawl4ai/"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Extract major and minor versions
        id: versions
        run: |
          VERSION=${{ steps.get_version.outputs.VERSION }}
          MAJOR=$(echo $VERSION | cut -d. -f1)
          MINOR=$(echo $VERSION | cut -d. -f1-2)
          echo "MAJOR=$MAJOR" >> $GITHUB_OUTPUT
          echo "MINOR=$MINOR" >> $GITHUB_OUTPUT

      - name: Build and push Docker images
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: |
            unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
            unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}
            unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}
            unclecode/crawl4ai:latest
          platforms: linux/amd64,linux/arm64

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          tag_name: v${{ steps.get_version.outputs.VERSION }}
          name: Release v${{ steps.get_version.outputs.VERSION }}
          body: |
            ## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!

            ### 📦 Installation

            **PyPI:**
            ```bash
            pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}
            ```

            **Docker:**
            ```bash
            docker pull unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}
            docker pull unclecode/crawl4ai:latest
            ```

            ### 📝 What's Changed
            See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
          draft: false
          prerelease: false
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Summary
        run: |
          echo "## 🚀 Release Complete!" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 📦 PyPI Package" >> $GITHUB_STEP_SUMMARY
          echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo "- URL: https://pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
          echo "- Install: \`pip install crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 🐳 Docker Images" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MINOR }}\`" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:${{ steps.versions.outputs.MAJOR }}\`" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:latest\`" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 📋 GitHub Release" >> $GITHUB_STEP_SUMMARY
          echo "https://github.com/${{ github.repository }}/releases/tag/v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
.github/workflows/test-release.yml.disabled (vendored, new file, 116 lines)

@@ -0,0 +1,116 @@
name: Test Release Pipeline

on:
  push:
    tags:
      - 'test-v*'

jobs:
  test-release:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Extract version from tag
        id: get_version
        run: |
          TAG_VERSION=${GITHUB_REF#refs/tags/test-v}
          echo "VERSION=$TAG_VERSION" >> $GITHUB_OUTPUT
          echo "Testing with version: $TAG_VERSION"

      - name: Install package dependencies
        run: |
          pip install -e .

      - name: Check version consistency
        run: |
          TAG_VERSION=${{ steps.get_version.outputs.VERSION }}
          PACKAGE_VERSION=$(python -c "from crawl4ai.__version__ import __version__; print(__version__)")

          echo "Tag version: $TAG_VERSION"
          echo "Package version: $PACKAGE_VERSION"

          if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
            echo "❌ Version mismatch! Tag: $TAG_VERSION, Package: $PACKAGE_VERSION"
            echo "Please update crawl4ai/__version__.py to match the tag version"
            exit 1
          fi
          echo "✅ Version check passed: $TAG_VERSION"

      - name: Install build dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: python -m build

      - name: Check package
        run: twine check dist/*

      - name: Upload to Test PyPI
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.TEST_PYPI_TOKEN }}
        run: |
          echo "📦 Uploading to Test PyPI..."
          twine upload --repository testpypi dist/* || {
            if [ $? -eq 1 ]; then
              echo "⚠️ Upload failed - likely version already exists on Test PyPI"
              echo "Continuing anyway for test purposes..."
            else
              exit 1
            fi
          }
          echo "✅ Test PyPI step complete"

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Docker test images
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: |
            unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}
            unclecode/crawl4ai:test-latest
          platforms: linux/amd64,linux/arm64
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Summary
        run: |
          echo "## 🎉 Test Release Complete!" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 📦 Test PyPI Package" >> $GITHUB_STEP_SUMMARY
          echo "- Version: ${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo "- URL: https://test.pypi.org/project/crawl4ai/" >> $GITHUB_STEP_SUMMARY
          echo "- Install: \`pip install -i https://test.pypi.org/simple/ crawl4ai==${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 🐳 Docker Test Images" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}\`" >> $GITHUB_STEP_SUMMARY
          echo "- \`unclecode/crawl4ai:test-latest\`" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "### 🧹 Cleanup Commands" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`bash" >> $GITHUB_STEP_SUMMARY
          echo "# Remove test tag" >> $GITHUB_STEP_SUMMARY
          echo "git tag -d test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo "git push origin :test-v${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo "" >> $GITHUB_STEP_SUMMARY
          echo "# Remove Docker test images" >> $GITHUB_STEP_SUMMARY
          echo "docker rmi unclecode/crawl4ai:test-${{ steps.get_version.outputs.VERSION }}" >> $GITHUB_STEP_SUMMARY
          echo "docker rmi unclecode/crawl4ai:test-latest" >> $GITHUB_STEP_SUMMARY
          echo "\`\`\`" >> $GITHUB_STEP_SUMMARY
CHANGELOG.md (15 added lines)

@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0)

## [Unreleased]

### Added
- **Flexible LLM Provider Configuration** (Docker):
  - Support for `LLM_PROVIDER` environment variable to override default provider
  - Per-request provider override via optional `provider` parameter in API endpoints
  - Automatic provider validation with clear error messages
  - Updated Docker documentation and examples

### Changed
- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
  - `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
  - Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
  - `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
  - All existing code using `WebScrapingStrategy` continues to work without modification
  - Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance

### Added
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
  - Discover URLs from sitemaps and Common Crawl index
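The backward-compatibility claim in the Changed entry above can be illustrated with a short sketch; it assumes only what the changelog states, namely that `WebScrapingStrategy` is retained as an alias of `LXMLWebScrapingStrategy`:

```python
# Existing code keeps working after the refactor described above:
# WebScrapingStrategy is kept as a backward-compatibility alias of the
# lxml-based implementation, so old imports and instantiations are unchanged.
from crawl4ai.content_scraping_strategy import (
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
)

strategy = WebScrapingStrategy()  # resolves to the lxml-based strategy
print(isinstance(strategy, LXMLWebScrapingStrategy))  # expected: True
```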
README.md (19 changed lines)

@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant

[✨ Check out latest update v0.7.0](#-recent-updates)

🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)

<details>
<summary>🤓 <strong>My Personal Story</strong></summary>

@@ -523,15 +523,18 @@ async def test_news_crawl():

- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
  ```python
  config = AdaptiveConfig(
      confidence_threshold=0.7,
      max_history=100,
      learning_rate=0.2
      confidence_threshold=0.7,  # Min confidence to stop crawling
      max_depth=5,               # Maximum crawl depth
      max_pages=20,              # Maximum number of pages to crawl
      strategy="statistical"
  )

  result = await crawler.arun(
      "https://news.example.com",
      config=CrawlerRunConfig(adaptive_config=config)
  )
  async with AsyncWebCrawler() as crawler:
      adaptive_crawler = AdaptiveCrawler(crawler, config)
      state = await adaptive_crawler.digest(
          start_url="https://news.example.com",
          query="latest news content"
      )
      # Crawler learns patterns and improves extraction over time
  ```
@@ -3,12 +3,12 @@ import warnings

from .async_webcrawler import AsyncWebCrawler, CacheMode
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode

from .content_scraping_strategy import (
    ContentScrapingStrategy,
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
    WebScrapingStrategy,  # Backward compatibility alias
)
from .async_logger import (
    AsyncLoggerBase,

@@ -132,6 +132,7 @@ __all__ = [
    "CrawlResult",
    "CrawlerHub",
    "CacheMode",
    "MatchMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",

@@ -173,6 +174,7 @@ __all__ = [
    "CompilationResult",
    "ValidationResult",
    "ErrorDetail",
    "LinkPreviewConfig"
]
@@ -1,7 +1,7 @@
# crawl4ai/__version__.py

# This is the version that will be used for stable releases
__version__ = "0.7.0"
__version__ = "0.7.2"

# For nightly builds, this gets set during build process
__nightly_version__ = None
@@ -18,17 +18,24 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking

from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy

from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy

from typing import Union, List
from typing import Union, List, Callable
import inspect
from typing import Any, Dict, Optional
from enum import Enum

# Type alias for URL matching
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]

class MatchMode(Enum):
    OR = "or"
    AND = "and"

# from .proxy_strategy import ProxyConfig

@@ -862,7 +869,7 @@ class CrawlerRunConfig():
        parser_type (str): Type of parser to use for HTML parsing.
            Default: "lxml".
        scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
            Default: WebScrapingStrategy.
            Default: LXMLWebScrapingStrategy.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
            If None, no additional proxy config. Default: None.

@@ -1113,6 +1120,9 @@ class CrawlerRunConfig():
        link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
        # Virtual Scroll Parameters
        virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
        # URL Matching Parameters
        url_matcher: Optional[UrlMatcher] = None,
        match_mode: MatchMode = MatchMode.OR,
        # Experimental Parameters
        experimental: Dict[str, Any] = None,
    ):

@@ -1266,6 +1276,10 @@ class CrawlerRunConfig():
        else:
            raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")

        # URL Matching Parameters
        self.url_matcher = url_matcher
        self.match_mode = match_mode

        # Experimental Parameters
        self.experimental = experimental or {}

@@ -1321,6 +1335,51 @@ class CrawlerRunConfig():
            if "compilation error" not in str(e).lower():
                raise ValueError(f"Failed to compile C4A script: {str(e)}")
            raise

    def is_match(self, url: str) -> bool:
        """Check if this config matches the given URL.

        Args:
            url: The URL to check against this config's matcher

        Returns:
            bool: True if this config should be used for the URL or if no matcher is set.
        """
        if self.url_matcher is None:
            return True

        if callable(self.url_matcher):
            # Single function matcher
            return self.url_matcher(url)

        elif isinstance(self.url_matcher, str):
            # Single pattern string
            from fnmatch import fnmatch
            return fnmatch(url, self.url_matcher)

        elif isinstance(self.url_matcher, list):
            # List of mixed matchers
            if not self.url_matcher:  # Empty list
                return False

            results = []
            for matcher in self.url_matcher:
                if callable(matcher):
                    results.append(matcher(url))
                elif isinstance(matcher, str):
                    from fnmatch import fnmatch
                    results.append(fnmatch(url, matcher))
                else:
                    # Skip invalid matchers
                    continue

            # Apply match mode logic
            if self.match_mode == MatchMode.OR:
                return any(results) if results else False
            else:  # AND mode
                return all(results) if results else False

        return False

    def __getattr__(self, name):

@@ -1443,6 +1502,9 @@ class CrawlerRunConfig():
            # Link Extraction Parameters
            link_preview_config=kwargs.get("link_preview_config"),
            url=kwargs.get("url"),
            # URL Matching Parameters
            url_matcher=kwargs.get("url_matcher"),
            match_mode=kwargs.get("match_mode", MatchMode.OR),
            # Experimental Parameters
            experimental=kwargs.get("experimental"),
        )

@@ -1540,6 +1602,8 @@ class CrawlerRunConfig():
            "deep_crawl_strategy": self.deep_crawl_strategy,
            "link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
            "url": self.url,
            "url_matcher": self.url_matcher,
            "match_mode": self.match_mode,
            "experimental": self.experimental,
        }
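To make the new matching parameters concrete, here is a minimal sketch of how `url_matcher`, `match_mode`, and `is_match()` from the hunks above fit together; the glob pattern, the lambda, and the URLs are illustrative placeholders, not part of the diff:

```python
from crawl4ai import CrawlerRunConfig, MatchMode

# Match either a glob pattern or a custom callable (MatchMode.OR is the default).
pdf_config = CrawlerRunConfig(
    url_matcher=["*.pdf", lambda url: url.endswith("/download")],
    match_mode=MatchMode.OR,
)

print(pdf_config.is_match("https://example.com/report.pdf"))      # True (glob match)
print(pdf_config.is_match("https://example.com/files/download"))  # True (callable match)
print(pdf_config.is_match("https://example.com/index.html"))      # False (no matcher hits)
```

A config created without `url_matcher` keeps the old behavior: `is_match()` returns True for every URL.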
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        except Error:
            visibility_info = await self.check_visibility(page)

            if self.browser_config.config.verbose:
            if self.browser_config.verbose:
                self.logger.debug(
                    message="Body visibility info: {info}",
                    tag="DEBUG",
@@ -1,4 +1,4 @@
from typing import Dict, Optional, List, Tuple
from typing import Dict, Optional, List, Tuple, Union
from .async_configs import CrawlerRunConfig
from .models import (
    CrawlResult,

@@ -22,6 +22,8 @@ from urllib.parse import urlparse
import random
from abc import ABC, abstractmethod

from .memory_utils import get_true_memory_usage_percent


class RateLimiter:
    def __init__(

@@ -96,11 +98,37 @@ class BaseDispatcher(ABC):
        self.rate_limiter = rate_limiter
        self.monitor = monitor

    def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
        """Select the appropriate config for a given URL.

        Args:
            url: The URL to match against
            configs: Single config or list of configs to choose from

        Returns:
            The matching config, or None if no match found
        """
        # Single config - return as is
        if isinstance(configs, CrawlerRunConfig):
            return configs

        # Empty list - return None
        if not configs:
            return None

        # Find first matching config
        for config in configs:
            if config.is_match(url):
                return config

        # No match found - return None to indicate URL should be skipped
        return None

    @abstractmethod
    async def crawl_url(
        self,
        url: str,
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        monitor: Optional[CrawlerMonitor] = None,
    ) -> CrawlerTaskResult:

@@ -111,7 +139,7 @@ class BaseDispatcher(ABC):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,  # noqa: F821
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        monitor: Optional[CrawlerMonitor] = None,
    ) -> List[CrawlerTaskResult]:
        pass

@@ -147,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    async def _memory_monitor_task(self):
        """Background task to continuously monitor memory usage and update state"""
        while True:
            self.current_memory_percent = psutil.virtual_memory().percent
            self.current_memory_percent = get_true_memory_usage_percent()

            # Enter memory pressure mode if we cross the threshold
            if self.current_memory_percent >= self.memory_threshold_percent:

@@ -200,7 +228,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
    async def crawl_url(
        self,
        url: str,
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        retry_count: int = 0,
    ) -> CrawlerTaskResult:

@@ -208,6 +236,37 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        error_message = ""
        memory_usage = peak_memory = 0.0

        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)

        # If no config matches, return failed result
        if selected_config is None:
            error_message = f"No matching configuration found for URL: {url}"
            if self.monitor:
                self.monitor.update_task(
                    task_id,
                    status=CrawlStatus.FAILED,
                    error_message=error_message
                )

            return CrawlerTaskResult(
                task_id=task_id,
                url=url,
                result=CrawlResult(
                    url=url,
                    html="",
                    metadata={"status": "no_config_match"},
                    success=False,
                    error_message=error_message
                ),
                memory_usage=0,
                peak_memory=0,
                start_time=start_time,
                end_time=time.time(),
                error_message=error_message,
                retry_count=retry_count
            )

        # Get starting memory for accurate measurement
        process = psutil.Process()
        start_memory = process.memory_info().rss / (1024 * 1024)

@@ -257,8 +316,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
                retry_count=retry_count + 1
            )

        # Execute the crawl
        result = await self.crawler.arun(url, config=config, session_id=task_id)
        # Execute the crawl with selected config
        result = await self.crawler.arun(url, config=selected_config, session_id=task_id)

        # Measure memory usage
        end_memory = process.memory_info().rss / (1024 * 1024)

@@ -316,7 +375,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> List[CrawlerTaskResult]:
        self.crawler = crawler

@@ -470,7 +529,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
        self,
        urls: List[str],
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> AsyncGenerator[CrawlerTaskResult, None]:
        self.crawler = crawler

@@ -572,7 +631,7 @@ class SemaphoreDispatcher(BaseDispatcher):
    async def crawl_url(
        self,
        url: str,
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
        task_id: str,
        semaphore: asyncio.Semaphore = None,
    ) -> CrawlerTaskResult:

@@ -580,6 +639,36 @@ class SemaphoreDispatcher(BaseDispatcher):
        error_message = ""
        memory_usage = peak_memory = 0.0

        # Select appropriate config for this URL
        selected_config = self.select_config(url, config)

        # If no config matches, return failed result
        if selected_config is None:
            error_message = f"No matching configuration found for URL: {url}"
            if self.monitor:
                self.monitor.update_task(
                    task_id,
                    status=CrawlStatus.FAILED,
                    error_message=error_message
                )

            return CrawlerTaskResult(
                task_id=task_id,
                url=url,
                result=CrawlResult(
                    url=url,
                    html="",
                    metadata={"status": "no_config_match"},
                    success=False,
                    error_message=error_message
                ),
                memory_usage=0,
                peak_memory=0,
                start_time=start_time,
                end_time=time.time(),
                error_message=error_message
            )

        try:
            if self.monitor:
                self.monitor.update_task(

@@ -592,7 +681,7 @@ class SemaphoreDispatcher(BaseDispatcher):
            async with semaphore:
                process = psutil.Process()
                start_memory = process.memory_info().rss / (1024 * 1024)
                result = await self.crawler.arun(url, config=config, session_id=task_id)
                result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
                end_memory = process.memory_info().rss / (1024 * 1024)

                memory_usage = peak_memory = end_memory - start_memory

@@ -654,7 +743,7 @@ class SemaphoreDispatcher(BaseDispatcher):
        self,
        crawler: AsyncWebCrawler,  # noqa: F821
        urls: List[str],
        config: CrawlerRunConfig,
        config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
    ) -> List[CrawlerTaskResult]:
        self.crawler = crawler
        if self.monitor:
@@ -829,7 +829,7 @@ class AsyncUrlSeeder:

    async def _iter_sitemap(self, url: str):
        try:
            r = await self.client.get(url, timeout=15)
            r = await self.client.get(url, timeout=15, follow_redirects=True)
            r.raise_for_status()
        except httpx.HTTPStatusError as e:
            self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
            media = result.media.model_dump()
            tables = media.pop("tables", [])
            links = result.links.model_dump()
            # media = result.media.model_dump()
            # tables = media.pop("tables", [])
            # links = result.links.model_dump()
            media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
            tables = media.pop("tables", []) if isinstance(media, dict) else []
            links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
            metadata = result.metadata

        fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

@@ -650,7 +653,7 @@ class AsyncWebCrawler:
    async def arun_many(
        self,
        urls: List[str],
        config: Optional[CrawlerRunConfig] = None,
        config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
        dispatcher: Optional[BaseDispatcher] = None,
        # Legacy parameters maintained for backwards compatibility
        # word_count_threshold=MIN_WORD_THRESHOLD,

@@ -671,7 +674,9 @@ class AsyncWebCrawler:

        Args:
            urls: List of URLs to crawl
            config: Configuration object controlling crawl behavior for all URLs
            config: Configuration object(s) controlling crawl behavior. Can be:
                - Single CrawlerRunConfig: Used for all URLs
                - List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
            dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
            [other parameters maintained for backwards compatibility]

@@ -736,7 +741,11 @@ class AsyncWebCrawler:
                    or task_result.result
                )

            stream = config.stream
            # Handle stream setting - use first config's stream setting if config is a list
            if isinstance(config, list):
                stream = config[0].stream if config else False
            else:
                stream = config.stream

            if stream:
|
||||
from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from playwright_stealth import StealthConfig
|
||||
from .utils import get_chromium_path
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
chrome_csi=True,
|
||||
chrome_load_times=True,
|
||||
chrome_runtime=True,
|
||||
navigator_languages=True,
|
||||
navigator_plugins=True,
|
||||
navigator_permissions=True,
|
||||
webgl_vendor=True,
|
||||
outerdimensions=True,
|
||||
navigator_hardware_concurrency=True,
|
||||
media_codecs=True,
|
||||
)
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
"--disable-background-networking",
|
||||
|
||||
@@ -65,6 +65,213 @@ class BrowserProfiler:
        self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
        os.makedirs(self.builtin_browser_dir, exist_ok=True)

    def _is_windows(self) -> bool:
        """Check if running on Windows platform."""
        return sys.platform.startswith('win') or sys.platform == 'cygwin'

    def _is_macos(self) -> bool:
        """Check if running on macOS platform."""
        return sys.platform == 'darwin'

    def _is_linux(self) -> bool:
        """Check if running on Linux platform."""
        return sys.platform.startswith('linux')

    def _get_quit_message(self, tag: str) -> str:
        """Get appropriate quit message based on context."""
        if tag == "PROFILE":
            return "Closing browser and saving profile..."
        elif tag == "CDP":
            return "Closing browser..."
        else:
            return "Closing browser..."

    async def _listen_windows(self, user_done_event, check_browser_process, tag: str):
        """Windows-specific keyboard listener using msvcrt."""
        try:
            import msvcrt
        except ImportError:
            raise ImportError("msvcrt module not available on this platform")

        while True:
            try:
                # Check for keyboard input
                if msvcrt.kbhit():
                    raw = msvcrt.getch()

                    # Handle Unicode decoding more robustly
                    key = None
                    try:
                        key = raw.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            # Try different encodings
                            key = raw.decode("latin1")
                        except UnicodeDecodeError:
                            # Skip if we can't decode
                            continue

                    # Validate key
                    if not key or len(key) != 1:
                        continue

                    # Check for printable characters only
                    if not key.isprintable():
                        continue

                    # Check for quit command
                    if key.lower() == "q":
                        self.logger.info(
                            self._get_quit_message(tag),
                            tag=tag,
                            base_color=LogColor.GREEN
                        )
                        user_done_event.set()
                        return

                # Check if browser process ended
                if await check_browser_process():
                    return

                # Small delay to prevent busy waiting
                await asyncio.sleep(0.1)

            except Exception as e:
                self.logger.warning(f"Error in Windows keyboard listener: {e}", tag=tag)
                # Continue trying instead of failing completely
                await asyncio.sleep(0.1)
                continue

    async def _listen_unix(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
        """Unix/Linux/macOS keyboard listener using termios and select."""
        try:
            import termios
            import tty
            import select
        except ImportError:
            raise ImportError("termios/tty/select modules not available on this platform")

        # Get stdin file descriptor
        try:
            fd = sys.stdin.fileno()
        except (AttributeError, OSError):
            raise ImportError("stdin is not a terminal")

        # Save original terminal settings
        old_settings = None
        try:
            old_settings = termios.tcgetattr(fd)
        except termios.error as e:
            raise ImportError(f"Cannot get terminal attributes: {e}")

        try:
            # Switch to non-canonical mode (cbreak mode)
            tty.setcbreak(fd)

            while True:
                try:
                    # Use select to check if input is available (non-blocking)
                    # Timeout of 0.5 seconds to periodically check browser process
                    readable, _, _ = select.select([sys.stdin], [], [], 0.5)

                    if readable:
                        # Read one character
                        key = sys.stdin.read(1)

                        if key and key.lower() == "q":
                            self.logger.info(
                                self._get_quit_message(tag),
                                tag=tag,
                                base_color=LogColor.GREEN
                            )
                            user_done_event.set()
                            return

                    # Check if browser process ended
                    if await check_browser_process():
                        return

                    # Small delay to prevent busy waiting
                    await asyncio.sleep(0.1)

                except (KeyboardInterrupt, EOFError):
                    # Handle Ctrl+C or EOF gracefully
                    self.logger.info("Keyboard interrupt received", tag=tag)
                    user_done_event.set()
                    return
                except Exception as e:
                    self.logger.warning(f"Error in Unix keyboard listener: {e}", tag=tag)
                    await asyncio.sleep(0.1)
                    continue

        finally:
            # Always restore terminal settings
            if old_settings is not None:
                try:
                    termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
                except Exception as e:
                    self.logger.error(f"Failed to restore terminal settings: {e}", tag=tag)

    async def _listen_fallback(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
        """Fallback keyboard listener using simple input() method."""
        self.logger.info("Using fallback input mode. Type 'q' and press Enter to quit.", tag=tag)

        # Run input in a separate thread to avoid blocking
        import threading
        import queue

        input_queue = queue.Queue()

        def input_thread():
            """Thread function to handle input."""
            try:
                while not user_done_event.is_set():
                    try:
                        # Use input() with a prompt
                        user_input = input("Press 'q' + Enter to quit: ").strip().lower()
                        input_queue.put(user_input)
                        if user_input == 'q':
                            break
                    except (EOFError, KeyboardInterrupt):
                        input_queue.put('q')
                        break
                    except Exception as e:
                        self.logger.warning(f"Error in input thread: {e}", tag=tag)
                        break
            except Exception as e:
                self.logger.error(f"Input thread failed: {e}", tag=tag)

        # Start input thread
        thread = threading.Thread(target=input_thread, daemon=True)
        thread.start()

        try:
            while not user_done_event.is_set():
                # Check for user input
                try:
                    user_input = input_queue.get_nowait()
                    if user_input == 'q':
                        self.logger.info(
                            self._get_quit_message(tag),
                            tag=tag,
                            base_color=LogColor.GREEN
                        )
                        user_done_event.set()
                        return
                except queue.Empty:
                    pass

                # Check if browser process ended
                if await check_browser_process():
                    return

                # Small delay
                await asyncio.sleep(0.5)

        except Exception as e:
            self.logger.error(f"Fallback listener failed: {e}", tag=tag)
            user_done_event.set()

    async def create_profile(self,
                             profile_name: Optional[str] = None,
                             browser_config: Optional[BrowserConfig] = None) -> Optional[str]:

@@ -180,42 +387,38 @@ class BrowserProfiler:

        # Run keyboard input loop in a separate task
        async def listen_for_quit_command():
            import termios
            import tty
            import select

            """Cross-platform keyboard listener that waits for 'q' key press."""
            # First output the prompt
            self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE")

            # Save original terminal settings
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)

            self.logger.info(
                "Press {segment} when you've finished using the browser...",
                tag="PROFILE",
                params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
                base_color=LogColor.CYAN
            )

            async def check_browser_process():
                """Check if browser process is still running."""
                if (
                    managed_browser.browser_process
                    and managed_browser.browser_process.poll() is not None
                ):
                    self.logger.info(
                        "Browser already closed. Ending input listener.", tag="PROFILE"
                    )
                    user_done_event.set()
                    return True
                return False

            # Try platform-specific implementations with fallback
            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)

                while True:
                    # Check if input is available (non-blocking)
                    readable, _, _ = select.select([sys.stdin], [], [], 0.5)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN)
                            user_done_event.set()
                            return

                    # Check if the browser process has already exited
                    if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
                        user_done_event.set()
                        return

                    await asyncio.sleep(0.1)

            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
                if self._is_windows():
                    await self._listen_windows(user_done_event, check_browser_process, "PROFILE")
                else:
                    await self._listen_unix(user_done_event, check_browser_process, "PROFILE")
            except Exception as e:
                self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="PROFILE")
                self.logger.info("Falling back to simple input mode...", tag="PROFILE")
                await self._listen_fallback(user_done_event, check_browser_process, "PROFILE")

        try:
            from playwright.async_api import async_playwright

@@ -682,42 +885,33 @@ class BrowserProfiler:

        # Run keyboard input loop in a separate task
        async def listen_for_quit_command():
            import termios
            import tty
            import select

            """Cross-platform keyboard listener that waits for 'q' key press."""
            # First output the prompt
            self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP")

            # Save original terminal settings
            fd = sys.stdin.fileno()
            old_settings = termios.tcgetattr(fd)

            self.logger.info(
                "Press {segment} to stop the browser and exit...",
                tag="CDP",
                params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
                base_color=LogColor.CYAN
            )

            async def check_browser_process():
                """Check if browser process is still running."""
                if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                    self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
                    user_done_event.set()
                    return True
                return False

            # Try platform-specific implementations with fallback
            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)

                while True:
                    # Check if input is available (non-blocking)
                    readable, _, _ = select.select([sys.stdin], [], [], 0.5)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info("Closing browser...", tag="CDP")
                            user_done_event.set()
                            return

                    # Check if the browser process has already exited
                    if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
                        user_done_event.set()
                        return

                    await asyncio.sleep(0.1)

            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
                if self._is_windows():
                    await self._listen_windows(user_done_event, check_browser_process, "CDP")
                else:
                    await self._listen_unix(user_done_event, check_browser_process, "CDP")
            except Exception as e:
                self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="CDP")
                self.logger.info("Falling back to simple input mode...", tag="CDP")
                await self._listen_fallback(user_done_event, check_browser_process, "CDP")

        # Function to retrieve and display CDP JSON config
        async def get_cdp_json(port):
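For context on the refactored keyboard handling above, a small sketch of how the profiler is typically driven; the profile name is a placeholder, and the "press 'q' to quit" flow is exactly what the platform-specific listeners implement:

```python
import asyncio
from crawl4ai import BrowserProfiler

async def main():
    profiler = BrowserProfiler()
    # Opens a browser for manual login/setup; pressing 'q' in the terminal
    # (handled by _listen_windows/_listen_unix, or _listen_fallback when no
    # TTY features are available) closes the browser and saves the profile.
    profile_path = await profiler.create_profile(profile_name="my-profile")
    print("Profile saved at:", profile_path)

asyncio.run(main())
```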
@@ -27,7 +27,10 @@ from crawl4ai import (
    PruningContentFilter,
    BrowserProfiler,
    DefaultMarkdownGenerator,
    LLMConfig
    LLMConfig,
    BFSDeepCrawlStrategy,
    DFSDeepCrawlStrategy,
    BestFirstCrawlingStrategy,
)
from crawl4ai.config import USER_SETTINGS
from litellm import completion

@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
              extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
    """Crawl a website and extract content

    Simple Usage:

@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""

        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()

        # Handle deep crawling configuration
        if deep_crawl:
            if deep_crawl == "bfs":
                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
                    max_depth=3,
                    max_pages=max_pages
                )
            elif deep_crawl == "dfs":
                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
                    max_depth=3,
                    max_pages=max_pages
                )
            elif deep_crawl == "best-first":
                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
                    max_depth=3,
                    max_pages=max_pages
                )

            if verbose:
                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")

        config = get_global_config()

        browser_cfg.verbose = config.get("VERBOSE", False)

@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
            verbose
        )

        # Handle deep crawl results (list) vs single result
        if isinstance(result, list):
            if len(result) == 0:
                click.echo("No results found during deep crawling")
                return
            # Use the first result for question answering and output
            main_result = result[0]
            all_results = result
        else:
            # Single result from regular crawling
            main_result = result
            all_results = [result]

        # Handle question
        if question:
            provider, token = setup_llm_config()
            markdown = result.markdown.raw_markdown
            markdown = main_result.markdown.raw_markdown
            anyio.run(stream_llm_response, url, markdown, question, provider, token)
            return

        # Handle output
        if not output_file:
            if output == "all":
                click.echo(json.dumps(result.model_dump(), indent=2))
                if isinstance(result, list):
                    output_data = [r.model_dump() for r in all_results]
                    click.echo(json.dumps(output_data, indent=2))
                else:
                    click.echo(json.dumps(main_result.model_dump(), indent=2))
            elif output == "json":
                print(result.extracted_content)
                extracted_items = json.loads(result.extracted_content)
                print(main_result.extracted_content)
                extracted_items = json.loads(main_result.extracted_content)
                click.echo(json.dumps(extracted_items, indent=2))

            elif output in ["markdown", "md"]:
                click.echo(result.markdown.raw_markdown)
                click.echo(main_result.markdown.raw_markdown)
            elif output in ["markdown-fit", "md-fit"]:
                click.echo(result.markdown.fit_markdown)
                click.echo(main_result.markdown.fit_markdown)
        else:
            if output == "all":
                with open(output_file, "w") as f:
                    f.write(json.dumps(result.model_dump(), indent=2))
                    if isinstance(result, list):
                        output_data = [r.model_dump() for r in all_results]
                        f.write(json.dumps(output_data, indent=2))
                    else:
                        f.write(json.dumps(main_result.model_dump(), indent=2))
            elif output == "json":
                with open(output_file, "w") as f:
                    f.write(result.extracted_content)
                    f.write(main_result.extracted_content)
            elif output in ["markdown", "md"]:
                with open(output_file, "w") as f:
                    f.write(result.markdown.raw_markdown)
                    f.write(main_result.markdown.raw_markdown)
            elif output in ["markdown-fit", "md-fit"]:
                with open(output_file, "w") as f:
                    f.write(result.markdown.fit_markdown)
                    f.write(main_result.markdown.fit_markdown)

    except Exception as e:
        raise click.ClickException(str(e))

@@ -1354,9 +1401,11 @@ def profiles_cmd():
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
            extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:

@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
        bypass_cache=bypass_cache,
        question=question,
        verbose=verbose,
        profile=profile
        profile=profile,
        deep_crawl=deep_crawl,
        max_pages=max_pages
    )

def main():
@@ -98,20 +98,20 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
LXML-based implementation for fast web content scraping.
|
||||
|
||||
This is the primary scraping strategy in Crawl4AI, providing high-performance
|
||||
HTML parsing and content extraction using the lxml library.
|
||||
|
||||
Note: WebScrapingStrategy is now an alias for this class to maintain
|
||||
backward compatibility.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||
"""Helper method to safely use logger."""
|
||||
@@ -132,7 +132,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(actual_url, html, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -196,376 +196,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
|
||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
||||
"""
|
||||
Determine if a table element is a data table (not a layout table).
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
**kwargs: Additional keyword arguments including table_score_threshold
|
||||
|
||||
Returns:
|
||||
bool: True if the table is a data table, False otherwise
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.select('thead')) > 0
|
||||
has_tbody = len(table.select('tbody')) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.select('th'))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.select('table')) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get('role', '').lower()
|
||||
if role in {'presentation', 'none'}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.select('tr')
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
col_counts = [len(row.select('td, th')) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.select('caption'):
|
||||
score += 2
|
||||
if table.has_attr('summary') and table['summary']:
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get('table_score_threshold', 7)
|
||||
return score >= threshold
|
||||
|
||||

    def extract_table_data(self, table: Tag) -> dict:
        """
        Extract structured data from a table element.

        Args:
            table (Tag): BeautifulSoup Tag representing a table element

        Returns:
            dict: Dictionary containing table data (headers, rows, caption, summary)
        """
        caption_elem = table.select_one('caption')
        caption = caption_elem.get_text().strip() if caption_elem else ""
        summary = table.get('summary', '').strip()

        # Extract headers with colspan handling
        headers = []
        thead_rows = table.select('thead tr')
        if thead_rows:
            header_cells = thead_rows[0].select('th')
            for cell in header_cells:
                text = cell.get_text().strip()
                colspan = int(cell.get('colspan', 1))
                headers.extend([text] * colspan)
        else:
            first_row = table.select('tr:first-child')
            if first_row:
                for cell in first_row[0].select('th, td'):
                    text = cell.get_text().strip()
                    colspan = int(cell.get('colspan', 1))
                    headers.extend([text] * colspan)

        # Extract rows with colspan handling
        rows = []
        all_rows = table.select('tr')
        thead = table.select_one('thead')
        tbody_rows = []

        if thead:
            thead_rows = thead.select('tr')
            tbody_rows = [row for row in all_rows if row not in thead_rows]
        else:
            if all_rows and all_rows[0].select('th'):
                tbody_rows = all_rows[1:]
            else:
                tbody_rows = all_rows

        for row in tbody_rows:
            # for row in table.select('tr:not(:has(ancestor::thead))'):
            row_data = []
            for cell in row.select('td'):
                text = cell.get_text().strip()
                colspan = int(cell.get('colspan', 1))
                row_data.extend([text] * colspan)
            if row_data:
                rows.append(row_data)

        # Align rows with headers
        max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
        aligned_rows = []
        for row in rows:
            aligned = row[:max_columns] + [''] * (max_columns - len(row))
            aligned_rows.append(aligned)

        if not headers:
            headers = [f"Column {i+1}" for i in range(max_columns)]

        return {
            "headers": headers,
            "rows": aligned_rows,
            "caption": caption,
            "summary": summary,
        }
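
    # Usage sketch (illustrative, not part of the class): writing one extracted
    # table to CSV with the standard library; `strategy` and `table_tag` are an
    # instantiated scraping strategy and a BeautifulSoup <table> Tag obtained
    # elsewhere.
    #
    #     import csv
    #
    #     data = strategy.extract_table_data(table_tag)
    #     with open("table.csv", "w", newline="", encoding="utf-8") as f:
    #         writer = csv.writer(f)
    #         writer.writerow(data["headers"])
    #         writer.writerows(data["rows"])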

    def flatten_nested_elements(self, node):
        """
        Flatten nested elements in an HTML tree.

        Args:
            node (Tag): The root node of the HTML tree.

        Returns:
            Tag: The flattened HTML tree.
        """
        if isinstance(node, NavigableString):
            return node
        if (
            len(node.contents) == 1
            and isinstance(node.contents[0], Tag)
            and node.contents[0].name == node.name
        ):
            return self.flatten_nested_elements(node.contents[0])
        node.contents = [self.flatten_nested_elements(child) for child in node.contents]
        return node

    def find_closest_parent_with_useful_text(self, tag, **kwargs):
        """
        Find the closest parent with useful text.

        Args:
            tag (Tag): The starting tag to search from.
            **kwargs: Additional keyword arguments.

        Returns:
            str: The text content of the closest parent that meets the minimum
            word threshold, or None if no such parent is found.
        """
        image_description_min_word_threshold = kwargs.get(
            "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
        )
        current_tag = tag
        while current_tag:
            current_tag = current_tag.parent
            # Get the text content of the parent tag
            if current_tag:
                text_content = current_tag.get_text(separator=" ", strip=True)
                # Check if the text content meets the minimum word threshold
                if len(text_content.split()) >= image_description_min_word_threshold:
                    return text_content
        return None

    def remove_unwanted_attributes(
        self, element, important_attrs, keep_data_attributes=False
    ):
        """
        Remove unwanted attributes from an HTML element.

        Args:
            element (Tag): The HTML element to remove attributes from.
            important_attrs (list): List of important attributes to keep.
            keep_data_attributes (bool): Whether to keep data attributes.

        Returns:
            None
        """
        attrs_to_remove = []
        for attr in element.attrs:
            if attr not in important_attrs:
                if keep_data_attributes:
                    if not attr.startswith("data-"):
                        attrs_to_remove.append(attr)
                else:
                    attrs_to_remove.append(attr)

        for attr in attrs_to_remove:
            del element[attr]

    def process_image(self, img, url, index, total_images, **kwargs):
        """
        Process an image element.

        How it works:
        1. Check whether the image is visibly displayed and not inside undesired HTML elements.
        2. Score the image for its usefulness.
        3. Extract image file metadata to determine size and extension.
        4. Generate a dictionary with the processed image information.
        5. Return the processed image information.

        Args:
            img (Tag): The image element to process.
            url (str): The URL of the page containing the image.
            index (int): The index of the image in the list of images.
            total_images (int): The total number of images in the list.
            **kwargs: Additional keyword arguments.

        Returns:
            dict: A dictionary containing the processed image information.
        """
        # parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
        #                            if ' ' in u else None}
        #                           for u in [f"http{p}" for p in s.split("http") if p]]

        # Constants for checks
        classes_to_check = frozenset(["button", "icon", "logo"])
        tags_to_check = frozenset(["button", "input"])
        image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])

        # Pre-fetch commonly used attributes
        style = img.get("style", "")
        alt = img.get("alt", "")
        src = img.get("src", "")
        data_src = img.get("data-src", "")
        srcset = img.get("srcset", "")
        data_srcset = img.get("data-srcset", "")
        width = img.get("width")
        height = img.get("height")
        parent = img.parent
        parent_classes = parent.get("class", [])

        # Quick validation checks
        if (
            "display:none" in style
            or parent.name in tags_to_check
            or any(c in cls for c in parent_classes for cls in classes_to_check)
            or any(c in src for c in classes_to_check)
            or any(c in alt for c in classes_to_check)
        ):
            return None

        # Quick score calculation
        score = 0
        if width and width.isdigit():
            width_val = int(width)
            score += 1 if width_val > 150 else 0
        if height and height.isdigit():
            height_val = int(height)
            score += 1 if height_val > 150 else 0
        if alt:
            score += 1
        score += index / total_images < 0.5

        # image_format = ''
        # if "data:image/" in src:
        #     image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
        # else:
        #     image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]

        # if image_format in ('jpg', 'png', 'webp', 'avif'):
        #     score += 1

        # Check for image format in all possible sources
        def has_image_format(url):
            return any(fmt in url.lower() for fmt in image_formats)

        # Score for having proper image sources
        if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
            score += 1
        if srcset or data_srcset:
            score += 1
        if img.find_parent("picture"):
            score += 1

        # Detect format from any available source
        detected_format = None
        for url in [src, data_src, srcset, data_srcset]:
            if url:
                format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
                if format_matches:
                    detected_format = format_matches[0]
                    break

        if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
            return None

        # Use set for deduplication
        unique_urls = set()
        image_variants = []

        # Generate a unique group ID for this set of variants
        group_id = index

        # Base image info template
        base_info = {
            "alt": alt,
            "desc": self.find_closest_parent_with_useful_text(img, **kwargs),
            "score": score,
            "type": "image",
            "group_id": group_id,  # Group ID for this set of variants
            "format": detected_format,
        }

        # Inline function for adding variants
        def add_variant(src, width=None):
            if src and not src.startswith("data:") and src not in unique_urls:
                unique_urls.add(src)
                image_variants.append({**base_info, "src": src, "width": width})

        # Process all sources
        add_variant(src)
        add_variant(data_src)

        # Handle srcset and data-srcset in one pass
        for attr in ("srcset", "data-srcset"):
            if value := img.get(attr):
                for source in parse_srcset(value):
                    add_variant(source["url"], source["width"])

        # Quick picture element check
        if picture := img.find_parent("picture"):
            for source in picture.find_all("source"):
                if srcset := source.get("srcset"):
                    for src in parse_srcset(srcset):
                        add_variant(src["url"], src["width"])

        # Framework-specific attributes in one pass
        for attr, value in img.attrs.items():
            if (
                attr.startswith("data-")
                and ("src" in attr or "srcset" in attr)
                and "http" in value
            ):
                add_variant(value)

        return image_variants if image_variants else None

def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
@@ -577,7 +210,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
element (lhtml.HtmlElement): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
@@ -595,514 +228,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"external_links_dict": external_links_dict,
|
||||
}
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url,
|
||||
element: PageElement,
|
||||
media: Dict[str, Any],
|
||||
internal_links_dict: Dict[str, Any],
|
||||
external_links_dict: Dict[str, Any],
|
||||
**kwargs,
|
||||
) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
element.extract()
|
||||
return False
|
||||
|
||||
# if element.name == 'img':
|
||||
# process_image(element, url, 0, 1)
|
||||
# return True
|
||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
||||
|
||||
if element.name in ["script", "style", "link", "meta", "noscript"]:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
# Special case for table elements - always preserve structure
|
||||
if element.name in ["tr", "td", "th"]:
|
||||
keep_element = True
|
||||
|
||||
exclude_domains = kwargs.get("exclude_domains", [])
|
||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
# exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
try:
|
||||
if element.name == "a" and element.get("href"):
|
||||
href = element.get("href", "").strip()
|
||||
if not href: # Skip empty hrefs
|
||||
return False
|
||||
|
||||
# url_base = url.split("/")[2]
|
||||
|
||||
# Normalize the URL
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
except ValueError:
|
||||
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
|
||||
return False
|
||||
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": element.get_text().strip(),
|
||||
"title": element.get("title", "").strip(),
|
||||
"base_domain": base_domain,
|
||||
}
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
|
||||
keep_element = True
|
||||
|
||||
# Handle external link exclusions
|
||||
if is_external:
|
||||
link_base_domain = get_base_domain(normalized_href)
|
||||
link_data["base_domain"] = link_base_domain
|
||||
if kwargs.get("exclude_external_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# elif kwargs.get('exclude_social_media_links', False):
|
||||
# if link_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
elif exclude_domains:
|
||||
if link_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
if is_external:
|
||||
if normalized_href not in external_links_dict:
|
||||
external_links_dict[normalized_href] = link_data
|
||||
else:
|
||||
if kwargs.get("exclude_internal_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
if normalized_href not in internal_links_dict:
|
||||
internal_links_dict[normalized_href] = link_data
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Error processing links: {str(e)}")
|
||||
|
||||
try:
|
||||
if element.name == "img":
|
||||
potential_sources = [
    "src",
    "data-src",
    "srcset",
    "data-lazy-src",
    "data-original",
]
|
||||
src = element.get("src", "")
|
||||
while not src and potential_sources:
|
||||
src = element.get(potential_sources.pop(0), "")
|
||||
if not src:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# If it is srcset pick up the first image
|
||||
if "srcset" in element.attrs:
|
||||
src = element.attrs["srcset"].split(",")[0].split(" ")[0]
|
||||
|
||||
# If image src is internal, then skip
|
||||
if not is_external_url(src, base_domain):
|
||||
return True
|
||||
|
||||
image_src_base_domain = get_base_domain(src)
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
# Handle relative URLs (which are always from the same domain)
|
||||
if not src.startswith('http') and not src.startswith('//'):
|
||||
return True # Keep relative URLs
|
||||
|
||||
# For absolute URLs, compare the base domains using the existing function
|
||||
src_base_domain = get_base_domain(src)
|
||||
url_base_domain = get_base_domain(url)
|
||||
|
||||
# If the domains don't match and both are valid, the image is external
|
||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if any(domain in src for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
# Handle exclude domains
|
||||
if exclude_domains:
|
||||
if image_src_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
return True # Always keep image elements
|
||||
except Exception as e:
    raise Exception(f"Error processing images: {str(e)}")
|
||||
|
||||
# Check if flag to remove all forms is set
|
||||
if kwargs.get("remove_forms", False) and element.name == "form":
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
if element.name in ["video", "audio"]:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": element.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
source_tags = element.find_all("source")
|
||||
for source_tag in source_tags:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": source_tag.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
if kwargs.get("only_text", False):
|
||||
element.replace_with(element.get_text())
|
||||
|
||||
try:
|
||||
self.remove_unwanted_attributes(
|
||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
except Exception as e:
|
||||
# print('Error removing unwanted attributes:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error removing unwanted attributes: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
if isinstance(child, NavigableString) and not isinstance(
|
||||
child, Comment
|
||||
):
|
||||
if len(child.strip()) > 0:
|
||||
keep_element = True
|
||||
else:
|
||||
if self._process_element(
|
||||
url,
|
||||
child,
|
||||
media,
|
||||
internal_links_dict,
|
||||
external_links_dict,
|
||||
**kwargs,
|
||||
):
|
||||
keep_element = True
|
||||
|
||||
# Check word count
|
||||
word_count_threshold = kwargs.get(
|
||||
"word_count_threshold", MIN_WORD_THRESHOLD
|
||||
)
|
||||
if not keep_element:
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
keep_element = word_count >= word_count_threshold
|
||||
|
||||
if not keep_element:
|
||||
element.decompose()
|
||||
|
||||
return keep_element
|
||||
except Exception as e:
|
||||
# print('Error processing element:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error processing element: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
return False
|
||||
|
||||
def _scrap(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
parser_type = kwargs.get("parser", "lxml")
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
if body is None:
|
||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This happens before any processing to minimize memory usage
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.find_all('img'):
|
||||
img.decompose()
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"error",
|
||||
message="Error extracting metadata: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
meta = {}
|
||||
|
||||
# Handle tag-based removal first - faster than CSS selection
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if excluded_tags:
|
||||
for element in body.find_all(lambda tag: tag.name in excluded_tags):
|
||||
element.extract()
|
||||
|
||||
# Handle CSS selector-based removal
|
||||
excluded_selector = kwargs.get("excluded_selector", "")
|
||||
if excluded_selector:
|
||||
is_single_selector = (
|
||||
"," not in excluded_selector and " " not in excluded_selector
|
||||
)
|
||||
if is_single_selector:
|
||||
while element := body.select_one(excluded_selector):
|
||||
element.extract()
|
||||
else:
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(copy.deepcopy(el))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
)
|
||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||
if kwargs.get("exclude_social_media_links", False):
|
||||
kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
|
||||
kwargs["exclude_social_media_domains"]
|
||||
)
|
||||
|
||||
result_obj = self.process_element(
|
||||
url,
|
||||
body,
|
||||
word_count_threshold=word_count_threshold,
|
||||
base_domain=base_domain,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
links = {"internal": [], "external": []}
|
||||
media = result_obj["media"]
|
||||
internal_links_dict = result_obj["internal_links_dict"]
|
||||
external_links_dict = result_obj["external_links_dict"]
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
|
||||
media["images"] = [
|
||||
img
|
||||
for result in (
|
||||
self.process_image(img, url, i, len(imgs), **kwargs)
|
||||
for i, img in enumerate(imgs)
|
||||
)
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
# Process tables if not excluded
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.find_all('table')
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
body = self.flatten_nested_elements(body)
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
for img in imgs:
|
||||
src = img.get("src", "")
|
||||
if base64_pattern.match(src):
|
||||
# Replace base64 data with empty string
|
||||
img["src"] = base64_pattern.sub("", src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str_body = content_element.encode_contents().decode("utf-8")
|
||||
except Exception:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
body = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Create a new div with a special ID
|
||||
error_div = body.new_tag("div", id="crawl4ai_error_message")
|
||||
error_div.string = """
|
||||
Crawl4AI Error: This page is not fully supported.
|
||||
|
||||
Possible reasons:
|
||||
1. The page may have restrictions that prevent crawling.
|
||||
2. The page might not be fully loaded.
|
||||
|
||||
Suggestions:
|
||||
- Try calling the crawl function with these parameters:
|
||||
magic=True,
|
||||
- Set headless=False to visualize what's happening on the page.
|
||||
|
||||
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||
"""
|
||||
|
||||
# Append the error div to the body
|
||||
body.append(error_div)
|
||||
str_body = body.encode_contents().decode("utf-8")
|
||||
|
||||
print(
|
||||
"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
|
||||
)
|
||||
self._log(
|
||||
"error",
|
||||
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
|
||||
tag="SCRAPE",
|
||||
)
|
||||
|
||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
def __init__(self, logger=None):
|
||||
super().__init__(logger)
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url: str,
|
||||
@@ -1145,10 +270,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
link_data["intrinsic_score"] = 0
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
@@ -1862,3 +987,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
}
|
||||
|
||||
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = LXMLWebScrapingStrategy
|
||||
|
||||
@@ -11,7 +11,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
|
||||
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import psutil
import platform
import subprocess
from typing import Tuple


def get_true_available_memory_gb() -> float:
    """Get truly available memory including inactive pages (cross-platform)."""
    vm = psutil.virtual_memory()

    if platform.system() == 'Darwin':  # macOS
        # On macOS, we need to include inactive memory too
        try:
            # Use vm_stat to get accurate values
            result = subprocess.run(['vm_stat'], capture_output=True, text=True)
            lines = result.stdout.split('\n')

            page_size = 16384  # macOS page size (Apple Silicon; Intel Macs use 4096)
            pages = {}

            for line in lines:
                if 'Pages free:' in line:
                    pages['free'] = int(line.split()[-1].rstrip('.'))
                elif 'Pages inactive:' in line:
                    pages['inactive'] = int(line.split()[-1].rstrip('.'))
                elif 'Pages speculative:' in line:
                    pages['speculative'] = int(line.split()[-1].rstrip('.'))
                elif 'Pages purgeable:' in line:
                    pages['purgeable'] = int(line.split()[-1].rstrip('.'))

            # Calculate total available (free + inactive + speculative + purgeable)
            total_available_pages = (
                pages.get('free', 0) +
                pages.get('inactive', 0) +
                pages.get('speculative', 0) +
                pages.get('purgeable', 0)
            )
            available_gb = (total_available_pages * page_size) / (1024**3)

            return available_gb
        except Exception:
            # Fallback to psutil
            return vm.available / (1024**3)
    else:
        # For Windows and Linux, psutil.available is accurate
        return vm.available / (1024**3)


def get_true_memory_usage_percent() -> float:
    """
    Get memory usage percentage that accounts for platform differences.

    Returns:
        float: Memory usage percentage (0-100)
    """
    vm = psutil.virtual_memory()
    total_gb = vm.total / (1024**3)
    available_gb = get_true_available_memory_gb()

    # Calculate used percentage based on truly available memory
    used_percent = 100.0 * (total_gb - available_gb) / total_gb

    # Ensure it's within valid range
    return max(0.0, min(100.0, used_percent))


def get_memory_stats() -> Tuple[float, float, float]:
    """
    Get comprehensive memory statistics.

    Returns:
        Tuple[float, float, float]: (used_percent, available_gb, total_gb)
    """
    vm = psutil.virtual_memory()
    total_gb = vm.total / (1024**3)
    available_gb = get_true_available_memory_gb()
    used_percent = get_true_memory_usage_percent()

    return used_percent, available_gb, total_gb
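
# Usage sketch (illustrative, not part of the module): a dispatcher can consult
# these helpers before scheduling more crawl tasks. The 90.0 threshold below is
# an assumed example value, not a constant defined in this project.
#
#     used_percent, available_gb, total_gb = get_memory_stats()
#     print(f"{used_percent:.1f}% used, {available_gb:.2f} GiB free of {total_gb:.2f} GiB")
#     if used_percent > 90.0:
#         ...  # throttle or postpone new work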
@@ -23,8 +23,9 @@ SeedingConfig = Union['SeedingConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
|
||||
# Proxy types
|
||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||
@@ -114,7 +115,6 @@ if TYPE_CHECKING:
|
||||
# Content scraping imports
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||
WebScrapingStrategy as WebScrapingStrategyType,
|
||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||
)
|
||||
|
||||
|
||||
@@ -1517,8 +1517,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
head = head[0]
|
||||
|
||||
# Title - using XPath
|
||||
# title = head.xpath(".//title/text()")
|
||||
# metadata["title"] = title[0].strip() if title else None
|
||||
|
||||
# === Title Extraction - New Approach ===
|
||||
# Attempt to extract <title> using XPath
|
||||
title = head.xpath(".//title/text()")
|
||||
metadata["title"] = title[0].strip() if title else None
|
||||
title = title[0] if title else None
|
||||
|
||||
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||
if not title:
|
||||
title_el = doc.find(".//title")
|
||||
title = title_el.text if title_el is not None else None
|
||||
|
||||
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||
if not title:
|
||||
title_candidates = (
|
||||
doc.xpath("//meta[@property='og:title']/@content") or
|
||||
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||
)
|
||||
title = title_candidates[0] if title_candidates else None
|
||||
|
||||
# Strip and assign title
|
||||
metadata["title"] = title.strip() if title else None
|
||||
|
||||
# Meta description - using XPath with multiple attribute conditions
|
||||
description = head.xpath('.//meta[@name="description"]/@content')
|
||||
@@ -3342,7 +3363,13 @@ async def get_text_embeddings(
|
||||
# Default: use sentence-transformers
|
||||
else:
|
||||
# Lazy load to avoid importing heavy libraries unless needed
|
||||
from sentence_transformers import SentenceTransformer
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"sentence-transformers is required for local embeddings. "
|
||||
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
|
||||
)
|
||||
|
||||
# Cache the model in function attribute to avoid reloading
|
||||
if not hasattr(get_text_embeddings, '_models'):
|
||||
|
||||
@@ -5,4 +5,9 @@ ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
|
||||
# Optional: Override the default LLM provider
|
||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
@@ -154,6 +154,29 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**

The Docker setup now supports flexible LLM provider configuration through three methods:

1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
   ```bash
   export LLM_PROVIDER="anthropic/claude-3-opus"
   # Or in your .llm.env file:
   # LLM_PROVIDER=anthropic/claude-3-opus
   ```

2. **API Request Parameter**: Specify provider per request
   ```json
   {
     "url": "https://example.com",
     "provider": "groq/mixtral-8x7b"
   }
   ```

3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)

The system automatically selects the appropriate API key based on the provider.
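
As a minimal sketch of the per-request override, assuming the server is reachable on its default port (11235) and the markdown route is mounted at `/md` (both assumptions, not verified from this diff):

```python
import requests

# Illustrative request: per-call provider override via the MarkdownRequest payload.
resp = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com",
        "f": "llm",
        "q": "Extract main content",
        "provider": "anthropic/claude-3-opus",
    },
)
print(resp.json())
```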
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +691,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
|
||||
from functools import partial
|
||||
from uuid import uuid4
|
||||
from datetime import datetime
|
||||
from base64 import b64encode
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -39,7 +40,9 @@ from utils import (
|
||||
get_base_url,
|
||||
is_task_id,
|
||||
should_cleanup_task,
|
||||
decode_redis_hash
|
||||
decode_redis_hash,
|
||||
get_llm_api_key,
|
||||
validate_llm_provider
|
||||
)
|
||||
|
||||
import psutil, time
|
||||
@@ -88,10 +91,12 @@ async def handle_llm_qa(
|
||||
|
||||
Answer:"""
|
||||
|
||||
# api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
|
||||
response = perform_completion_with_backoff(
|
||||
provider=config["llm"]["provider"],
|
||||
prompt_with_variables=prompt,
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
api_token=get_llm_api_key(config)
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
@@ -109,19 +114,23 @@ async def process_llm_extraction(
|
||||
url: str,
|
||||
instruction: str,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0"
|
||||
cache: str = "0",
|
||||
provider: Optional[str] = None
|
||||
) -> None:
|
||||
"""Process LLM extraction in background."""
|
||||
try:
|
||||
# If config['llm'] has api_key then ignore the api_key_env
|
||||
api_key = ""
|
||||
if "api_key" in config["llm"]:
|
||||
api_key = config["llm"]["api_key"]
|
||||
else:
|
||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||
# Validate provider
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": error_msg
|
||||
})
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=api_key
|
||||
),
|
||||
instruction=instruction,
|
||||
@@ -168,10 +177,19 @@ async def handle_markdown_request(
|
||||
filter_type: FilterType,
|
||||
query: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
try:
|
||||
# Validate provider if using LLM filter
|
||||
if filter_type == FilterType.LLM:
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=error_msg
|
||||
)
|
||||
decoded_url = unquote(url)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
@@ -184,8 +202,8 @@ async def handle_markdown_request(
|
||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=get_llm_api_key(config, provider),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
@@ -229,7 +247,8 @@ async def handle_llm_request(
|
||||
query: Optional[str] = None,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Handle LLM extraction requests."""
|
||||
base_url = get_base_url(request)
|
||||
@@ -259,7 +278,8 @@ async def handle_llm_request(
|
||||
schema,
|
||||
cache,
|
||||
base_url,
|
||||
config
|
||||
config,
|
||||
provider
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -303,7 +323,8 @@ async def create_new_task(
|
||||
schema: Optional[str],
|
||||
cache: str,
|
||||
base_url: str,
|
||||
config: dict
|
||||
config: dict,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Create and initialize a new task."""
|
||||
decoded_url = unquote(input_path)
|
||||
@@ -327,7 +348,8 @@ async def create_new_task(
|
||||
decoded_url,
|
||||
query,
|
||||
schema,
|
||||
cache
|
||||
cache,
|
||||
provider
|
||||
)
|
||||
|
||||
return JSONResponse({
|
||||
@@ -371,6 +393,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
||||
server_memory_mb = _get_memory_mb()
|
||||
result_dict = result.model_dump()
|
||||
result_dict['server_memory_mb'] = server_memory_mb
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||
yield data.encode('utf-8')
|
||||
@@ -443,10 +468,19 @@ async def handle_crawl_request(
|
||||
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
|
||||
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
|
||||
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
|
||||
|
||||
|
||||
# Process results to handle PDF bytes
|
||||
processed_results = []
|
||||
for result in results:
|
||||
result_dict = result.model_dump()
|
||||
# If PDF exists, encode it to base64
|
||||
if result_dict.get('pdf') is not None:
|
||||
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
|
||||
processed_results.append(result_dict)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results],
|
||||
"results": processed_results,
|
||||
"server_processing_time_s": end_time - start_time,
|
||||
"server_memory_delta_mb": mem_delta_mb,
|
||||
"server_peak_memory_mb": peak_mem_mb
|
||||
|
||||
@@ -36,6 +36,7 @@ class LlmJobPayload(BaseModel):
|
||||
q: str
|
||||
schema: Optional[str] = None
|
||||
cache: bool = False
|
||||
provider: Optional[str] = None
|
||||
|
||||
|
||||
class CrawlJobPayload(BaseModel):
|
||||
@@ -61,6 +62,7 @@ async def llm_job_enqueue(
|
||||
schema=payload.schema,
|
||||
cache=payload.cache,
|
||||
config=_config,
|
||||
provider=payload.provider,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ class MarkdownRequest(BaseModel):
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
|
||||
|
||||
class RawCode(BaseModel):
|
||||
|
||||
@@ -241,7 +241,7 @@ async def get_markdown(
|
||||
raise HTTPException(
|
||||
400, "URL must be absolute and start with http/https")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config
|
||||
body.url, body.f, body.q, body.c, config, body.provider
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import dns.resolver
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@@ -19,10 +20,24 @@ class FilterType(str, Enum):
|
||||
LLM = "llm"
|
||||
|
||||
def load_config() -> Dict:
|
||||
"""Load and return application configuration."""
|
||||
"""Load and return application configuration with environment variable overrides."""
|
||||
config_path = Path(__file__).parent / "config.yml"
|
||||
with open(config_path, "r") as config_file:
|
||||
return yaml.safe_load(config_file)
|
||||
config = yaml.safe_load(config_file)
|
||||
|
||||
# Override LLM provider from environment if set
|
||||
llm_provider = os.environ.get("LLM_PROVIDER")
|
||||
if llm_provider:
|
||||
config["llm"]["provider"] = llm_provider
|
||||
logging.info(f"LLM provider overridden from environment: {llm_provider}")
|
||||
|
||||
# Also support direct API key from environment if the provider-specific key isn't set
|
||||
llm_api_key = os.environ.get("LLM_API_KEY")
|
||||
if llm_api_key and "api_key" not in config["llm"]:
|
||||
config["llm"]["api_key"] = llm_api_key
|
||||
logging.info("LLM API key loaded from LLM_API_KEY environment variable")
|
||||
|
||||
return config
|
||||
|
||||
def setup_logging(config: Dict) -> None:
|
||||
"""Configure application logging."""
|
||||
@@ -56,6 +71,52 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
||||
|
||||
|
||||
|
||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
|
||||
"""Get the appropriate API key based on the LLM provider.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
The API key for the provider, or empty string if not found
|
||||
"""
|
||||
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Check if direct API key is configured
|
||||
if "api_key" in config["llm"]:
|
||||
return config["llm"]["api_key"]
|
||||
|
||||
# Fall back to the configured api_key_env if no match
|
||||
return os.environ.get(config["llm"].get("api_key_env", ""), "")
|
||||
|
||||
|
||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
||||
"""Validate that the LLM provider has an associated API key.
|
||||
|
||||
Args:
|
||||
config: The application configuration dictionary
|
||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||
|
||||
Returns:
|
||||
Tuple of (is_valid, error_message)
|
||||
"""
|
||||
# Use provided provider or fall back to config
|
||||
if not provider:
|
||||
provider = config["llm"]["provider"]
|
||||
|
||||
# Get the API key for this provider
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
|
||||
if not api_key:
|
||||
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
|
||||
|
||||
return True, ""
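
# Usage sketch (illustrative, not part of this module): how a request handler can
# chain the two helpers above, mirroring the pattern used by the server endpoints.
# The provider string below is only an example value.
#
#     is_valid, error_msg = validate_llm_provider(config, provider="groq/mixtral-8x7b")
#     if not is_valid:
#         raise ValueError(error_msg)
#     api_key = get_llm_api_key(config, provider="groq/mixtral-8x7b")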
|
||||
|
||||
|
||||
def verify_email_domain(email: str) -> bool:
|
||||
try:
|
||||
domain = email.split('@')[1]
|
||||
|
||||
@@ -14,6 +14,7 @@ x-base-config: &base-config
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm # Chromium performance
|
||||
deploy:
|
||||
|
||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **PDF Parsing**: Extract data from PDF documents
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
# Initialize with custom learning parameters
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
||||
max_history=100, # Remember last 100 crawls per domain
|
||||
learning_rate=0.2, # How quickly to adapt to changes
|
||||
patterns_per_page=3, # Patterns to learn per page type
|
||||
extraction_strategy='css' # 'css' or 'xpath'
|
||||
)
|
||||
|
||||
adaptive_crawler = AdaptiveCrawler(config)
|
||||
|
||||
# First crawl - crawler learns the structure
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com/article/12345",
|
||||
config=CrawlerRunConfig(
|
||||
adaptive_config=config,
|
||||
extraction_hints={ # Optional hints to speed up learning
|
||||
"title": "article h1",
|
||||
"content": "article .body-content"
|
||||
}
|
||||
)
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
# Crawler identifies and stores patterns
|
||||
if result.success:
|
||||
state = adaptive_crawler.get_state("news.example.com")
|
||||
print(f"Learned {len(state.patterns)} patterns")
|
||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
# Subsequent crawls - uses learned patterns
|
||||
result2 = await crawler.arun(
|
||||
"https://news.example.com/article/67890",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Automatically extracts using learned patterns!
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0, # Let content load
|
||||
capture_method="incremental", # Capture new content on each scroll
|
||||
deduplicate=True # Remove duplicate elements
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5, # Images need time
|
||||
stop_on_no_change=True # Smart stopping
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5,
|
||||
wait_for_selector=".article-card", # Wait for specific elements
|
||||
timeout=30000 # Max 30 seconds total
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### The Three-Layer Scoring System
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
# What to analyze
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
max_links=100, # Analyze top 100 links
|
||||
|
||||
# Relevance scoring
|
||||
query="machine learning tutorials", # Your interest
|
||||
score_threshold=0.3, # Minimum relevance score
|
||||
|
||||
# Performance
|
||||
concurrent_requests=10, # Parallel processing
|
||||
timeout_per_link=5000, # 5s per link
|
||||
|
||||
# Advanced scoring weights
|
||||
scoring_weights={
|
||||
"intrinsic": 0.3, # Link quality indicators
|
||||
"contextual": 0.5, # Relevance to query
|
||||
"popularity": 0.2 # Link prominence
|
||||
}
|
||||
)
|
||||
|
||||
# Use in your crawl
|
||||
result = await crawler.arun(
|
||||
"https://tech-blog.example.com",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
print(f"Score: {link['total_score']:.3f}")
|
||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Title: {link['head_data']['title']}")
|
||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score (0-1)**: Relevance to your query
|
||||
- Semantic similarity using embeddings
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Weighted combination for final ranking
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
# Basic discovery - find all product pages
|
||||
seeder_config = SeedingConfig(
|
||||
# Discovery sources
|
||||
source="sitemap+cc", # Sitemap + Common Crawl
|
||||
|
||||
# Filtering
|
||||
pattern="*/product/*", # URL pattern matching
|
||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
||||
|
||||
# Validation
|
||||
live_check=True, # Verify URLs are alive
|
||||
max_urls=5000, # Stop at 5000 URLs
|
||||
|
||||
# Performance
|
||||
concurrency=100, # Parallel requests
|
||||
hits_per_sec=10 # Rate limiting
|
||||
)
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
seeder = AsyncUrlSeeder(seeder_config)
|
||||
urls = await seeder.discover("https://shop.example.com")
|
||||
|
||||
# Advanced: Relevance-based discovery
|
||||
research_config = SeedingConfig(
|
||||
source="crawl+sitemap", # Deep crawl + sitemap
|
||||
pattern="*/blog/*", # Blog posts only
|
||||
|
||||
# Content relevance
|
||||
extract_head=True, # Get meta tags
|
||||
query="quantum computing tutorials",
|
||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
||||
score_threshold=0.4, # High relevance only
|
||||
|
||||
# Smart filtering
|
||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||
min_content_length=500, # Skip thin content
|
||||
|
||||
force=True # Bypass cache
|
||||
)
|
||||
|
||||
# Discover with progress tracking
|
||||
discovered = []
|
||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
||||
discovered.extend(batch)
|
||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
||||
|
||||
# Results include scores and metadata
|
||||
for url_data in discovered[:5]:
|
||||
print(f"URL: {url_data['url']}")
|
||||
print(f"Score: {url_data['score']:.3f}")
|
||||
print(f"Title: {url_data['title']}")
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Before v0.7.0 (slow)
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(url)
|
||||
results.append(result)
|
||||
|
||||
# After v0.7.0 (fast)
|
||||
# Automatic batching and connection pooling
|
||||
results = await crawler.arun_batch(
|
||||
urls,
|
||||
config=CrawlerRunConfig(
|
||||
# New performance options
|
||||
batch_size=10, # Process 10 URLs concurrently
|
||||
reuse_browser=True, # Keep browser warm
|
||||
eager_loading=False, # Load only what's needed
|
||||
streaming_extraction=True, # Stream large extractions
|
||||
|
||||
# Optimized defaults
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
exclude_external_resources=True, # Skip third-party assets
|
||||
block_ads=True # Ad blocking built-in
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Memory-efficient streaming for large crawls
|
||||
async for result in crawler.arun_stream(large_url_list):
|
||||
# Process results as they complete
|
||||
await process_result(result)
|
||||
# Memory is freed after each iteration
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
## 📄 PDF Support
|
||||
|
||||
PDF extraction is now natively supported in Crawl4AI.
|
||||
|
||||
```python
|
||||
# Extract data from PDF documents
|
||||
result = await crawler.arun(
|
||||
"https://example.com/report.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
pdf_extraction=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
# Works on converted PDF structure
|
||||
"title": {"selector": "h1", "type": "text"},
|
||||
"sections": {"selector": "h2", "type": "list"}
|
||||
})
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
|
||||
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed the unused `StealthConfig` import and configuration, which were not referenced anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
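For context, stealth through JavaScript injection generally means running a small script before any page script executes, for example to hide `navigator.webdriver`. The snippet below is a generic Playwright illustration of that idea, not Crawl4AI's actual implementation.

```python
# Generic illustration of JS-injection stealth with plain Playwright;
# not Crawl4AI's actual stealth code.
import asyncio
from playwright.async_api import async_playwright

STEALTH_JS = "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});"

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        await context.add_init_script(STEALTH_JS)  # runs before any page script
        page = await context.new_page()
        await page.goto("https://example.com")
        print(await page.evaluate("navigator.webdriver"))  # None after the override
        await browser.close()

asyncio.run(main())
```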
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
303
docs/examples/demo_multi_config_clean.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how to use different crawler configurations for different URL patterns
|
||||
in a single crawl batch with Crawl4AI's multi-config feature.
|
||||
|
||||
Part 1: Understanding URL Matching (Pattern Testing)
|
||||
Part 2: Practical Example with Real Crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
MatchMode
|
||||
)
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
def print_section(title):
|
||||
"""Print a formatted section header"""
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"{title}")
|
||||
print(f"{'=' * 60}\n")
|
||||
|
||||
|
||||
def test_url_matching(config, test_urls, config_name):
|
||||
"""Test URL matching for a config and show results"""
|
||||
print(f"Config: {config_name}")
|
||||
print(f"Matcher: {config.url_matcher}")
|
||||
if hasattr(config, 'match_mode'):
|
||||
print(f"Mode: {config.match_mode.value}")
|
||||
print("-" * 40)
|
||||
|
||||
for url in test_urls:
|
||||
matches = config.is_match(url)
|
||||
symbol = "✓" if matches else "✗"
|
||||
print(f"{symbol} {url}")
|
||||
print()
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 1: Understanding URL Matching
|
||||
# ==============================================================================
|
||||
|
||||
def demo_part1_pattern_matching():
|
||||
"""Part 1: Learn how URL matching works without crawling"""
|
||||
|
||||
print_section("PART 1: Understanding URL Matching")
|
||||
print("Let's explore different ways to match URLs with configs.\n")
|
||||
|
||||
# Test URLs we'll use throughout
|
||||
test_urls = [
|
||||
"https://example.com/report.pdf",
|
||||
"https://example.com/data.json",
|
||||
"https://example.com/blog/post-1",
|
||||
"https://example.com/article/news",
|
||||
"https://api.example.com/v1/users",
|
||||
"https://example.com/about"
|
||||
]
|
||||
|
||||
# 1.1 Simple String Pattern
|
||||
print("1.1 Simple String Pattern Matching")
|
||||
print("-" * 40)
|
||||
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf"
|
||||
)
|
||||
|
||||
test_url_matching(pdf_config, test_urls, "PDF Config")
|
||||
|
||||
|
||||
# 1.2 Multiple String Patterns
|
||||
print("1.2 Multiple String Patterns (OR logic)")
|
||||
print("-" * 40)
|
||||
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # This is default, shown for clarity
|
||||
)
|
||||
|
||||
test_url_matching(blog_config, test_urls, "Blog/Article Config")
|
||||
|
||||
|
||||
# 1.3 Single Function Matcher
|
||||
print("1.3 Function-based Matching")
|
||||
print("-" * 40)
|
||||
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json')
|
||||
)
|
||||
|
||||
test_url_matching(api_config, test_urls, "API Config")
|
||||
|
||||
|
||||
# 1.4 List of Functions
|
||||
print("1.4 Multiple Functions with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be HTTPS AND contain 'api' AND have version number
|
||||
secure_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'),
|
||||
lambda url: 'api' in url,
|
||||
lambda url: '/v' in url # Version indicator
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(secure_api_config, test_urls, "Secure API Config")
|
||||
|
||||
|
||||
# 1.5 Mixed: String and Function Together
|
||||
print("1.5 Mixed Patterns: String + Function")
|
||||
print("-" * 40)
|
||||
|
||||
# Match JSON files OR any API endpoint
|
||||
json_or_api_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern
|
||||
lambda url: 'api' in url # Function
|
||||
],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
|
||||
test_url_matching(json_or_api_config, test_urls, "JSON or API Config")
|
||||
|
||||
|
||||
# 1.6 Complex: Multiple Strings + Multiple Functions
|
||||
print("1.6 Complex Matcher: Mixed Types with AND Logic")
|
||||
print("-" * 40)
|
||||
|
||||
# Must be: HTTPS AND (.com domain) AND (blog OR article) AND NOT a PDF
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Function: HTTPS check
|
||||
"*.com/*", # String: .com domain
|
||||
lambda url: any(pattern in url for pattern in ['/blog/', '/article/']), # Function: Blog OR article
|
||||
lambda url: not url.endswith('.pdf') # Function: Not PDF
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
|
||||
test_url_matching(complex_config, test_urls, "Complex Mixed Config")
|
||||
|
||||
print("\n✅ Key Takeaway: First matching config wins when passed to arun_many()!")
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 2: Practical Multi-URL Crawling
|
||||
# ==============================================================================
|
||||
|
||||
async def demo_part2_practical_crawling():
|
||||
"""Part 2: Real-world example with different content types"""
|
||||
|
||||
print_section("PART 2: Practical Multi-URL Crawling")
|
||||
print("Now let's see multi-config in action with real URLs.\n")
|
||||
|
||||
# Create specialized configs for different content types
|
||||
configs = [
|
||||
# Config 1: PDF documents - only match files ending with .pdf
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Config 2: Blog/article pages with content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Config 3: Dynamic pages requiring JavaScript
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);" # Scroll to load content
|
||||
),
|
||||
|
||||
# Config 4: Mixed matcher - API endpoints (string OR function)
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*.json", # String pattern for JSON files
|
||||
lambda url: 'api' in url or 'httpbin.org' in url # Function for API endpoints
|
||||
],
|
||||
match_mode=MatchMode.OR,
|
||||
),
|
||||
|
||||
# Config 5: Complex matcher - Secure documentation sites
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # String: .org domain
|
||||
lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']), # Has docs
|
||||
lambda url: not url.endswith(('.pdf', '.json')) # Not PDF or JSON
|
||||
],
|
||||
match_mode=MatchMode.AND,
|
||||
# wait_for="css:.content, css:article" # Wait for content to load
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
# CrawlerRunConfig() # No url_matcher means it matches everything (use it as fallback)
|
||||
]
|
||||
|
||||
# URLs to crawl - each will use a different config
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → PDF config
|
||||
"https://blog.python.org/", # → Blog config with content filter
|
||||
"https://github.com/microsoft/playwright", # → JS config
|
||||
"https://httpbin.org/json", # → Mixed matcher config (API)
|
||||
"https://docs.python.org/3/reference/", # → Complex matcher config
|
||||
"https://www.w3schools.com/", # → Default config, if you uncomment the default config line above, if not you will see `Error: No matching configuration`
|
||||
]
|
||||
|
||||
print("URLs to crawl:")
|
||||
for i, url in enumerate(urls, 1):
|
||||
print(f"{i}. {url}")
|
||||
|
||||
print("\nCrawling with appropriate config for each URL...\n")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs
|
||||
)
|
||||
|
||||
# Display results
|
||||
print("Results:")
|
||||
print("-" * 60)
|
||||
|
||||
for result in results:
|
||||
if result.success:
|
||||
# Determine which config was used
|
||||
config_type = "Default"
|
||||
if result.url.endswith('.pdf'):
|
||||
config_type = "PDF Strategy"
|
||||
elif any(pattern in result.url for pattern in ['blog', 'python.org']) and 'docs' not in result.url:
|
||||
config_type = "Blog + Content Filter"
|
||||
elif 'github.com' in result.url:
|
||||
config_type = "JavaScript Enabled"
|
||||
elif 'httpbin.org' in result.url or result.url.endswith('.json'):
|
||||
config_type = "Mixed Matcher (API)"
|
||||
elif 'docs.python.org' in result.url:
|
||||
config_type = "Complex Matcher (Secure Docs)"
|
||||
|
||||
print(f"\n✓ {result.url}")
|
||||
print(f" Config used: {config_type}")
|
||||
print(f" Content size: {len(result.markdown)} chars")
|
||||
|
||||
# Show if we have fit_markdown (from content filter)
|
||||
if hasattr(result.markdown, 'fit_markdown') and result.markdown.fit_markdown:
|
||||
print(f" Fit markdown size: {len(result.markdown.fit_markdown)} chars")
|
||||
reduction = (1 - len(result.markdown.fit_markdown) / len(result.markdown)) * 100
|
||||
print(f" Content reduced by: {reduction:.1f}%")
|
||||
|
||||
# Show extracted data if using extraction strategy
|
||||
if hasattr(result, 'extracted_content') and result.extracted_content:
|
||||
print(f" Extracted data: {str(result.extracted_content)[:100]}...")
|
||||
else:
|
||||
print(f"\n✗ {result.url}")
|
||||
print(f" Error: {result.error_message}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("✅ Multi-config crawling complete!")
|
||||
print("\nBenefits demonstrated:")
|
||||
print("- PDFs handled with specialized scraper")
|
||||
print("- Blog content filtered for relevance")
|
||||
print("- JavaScript executed only where needed")
|
||||
print("- Mixed matchers (string + function) for flexible matching")
|
||||
print("- Complex matchers for precise URL targeting")
|
||||
print("- Each URL got optimal configuration automatically!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run both parts of the demo"""
|
||||
|
||||
print("""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how Crawl4AI can use different configurations
|
||||
for different URLs in a single batch.
|
||||
""")
|
||||
|
||||
# Part 1: Pattern matching
|
||||
demo_part1_pattern_matching()
|
||||
|
||||
print("\nPress Enter to continue to Part 2...")
|
||||
try:
|
||||
input()
|
||||
except EOFError:
|
||||
# Running in non-interactive mode, skip input
|
||||
pass
|
||||
|
||||
# Part 2: Practical crawling
|
||||
await demo_part2_practical_crawling()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -18,7 +18,7 @@ Usage:
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
@@ -57,7 +58,7 @@ methods_to_profile = [
|
||||
|
||||
|
||||
# Apply decorators to both strategies
|
||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
||||
for strategy, name in [(LXMLWebScrapingStrategy, "LXML")]:
|
||||
for method in methods_to_profile:
|
||||
apply_decorators(strategy, method, name)
|
||||
|
||||
@@ -85,7 +86,7 @@ def generate_large_html(n_elements=1000):
|
||||
|
||||
def test_scraping():
|
||||
# Initialize both scrapers
|
||||
original_scraper = WebScrapingStrategy()
|
||||
original_scraper = LXMLWebScrapingStrategy()
|
||||
selected_scraper = LXMLWebScrapingStrategy()
|
||||
|
||||
# Generate test HTML
|
||||
|
||||
@@ -404,7 +404,182 @@ for result in results:
|
||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||
```
|
||||
|
||||
## 6. Summary
|
||||
## 6. URL-Specific Configurations
|
||||
|
||||
When crawling diverse content types, you often need different configurations for different URLs. For example:
|
||||
- PDFs need specialized extraction
|
||||
- Blog pages benefit from content filtering
|
||||
- Dynamic sites need JavaScript execution
|
||||
- API endpoints need JSON parsing
|
||||
|
||||
### 6.1 Basic URL Pattern Matching
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def crawl_mixed_content():
|
||||
# Configure different strategies for different content
|
||||
configs = [
|
||||
# PDF files - specialized extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
),
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custom settings for JSON extraction
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||
]
|
||||
|
||||
# Mixed URLs
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
"https://blog.python.org/",
|
||||
"https://github.com/microsoft/playwright",
|
||||
"https://httpbin.org/json",
|
||||
"https://example.com/"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs # Pass list of configs
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"{result.url}: {len(result.markdown)} chars")
|
||||
```
|
||||
|
||||
### 6.2 Advanced Pattern Matching
|
||||
|
||||
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||
|
||||
The `url_matcher` parameter supports three types of patterns:
|
||||
|
||||
#### Glob Patterns (Strings)
|
||||
```python
|
||||
# Simple patterns
|
||||
"*.pdf" # Any PDF file
|
||||
"*/api/*" # Any URL with /api/ in path
|
||||
"https://*.example.com/*" # Subdomain matching
|
||||
"*://example.com/blog/*" # Any protocol
|
||||
```
|
||||
|
||||
#### Custom Functions
|
||||
```python
|
||||
# Complex logic with lambdas
|
||||
lambda url: url.startswith('https://') and 'secure' in url
|
||||
lambda url: len(url) > 50 and url.count('/') > 5
|
||||
lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.'])
|
||||
```
|
||||
|
||||
#### Mixed Lists with AND/OR Logic
|
||||
```python
|
||||
# Combine multiple conditions
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: 'internal' in url, # Must contain 'internal'
|
||||
lambda url: not url.endswith('.pdf') # Must not be PDF
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
```
|
||||
|
||||
### 6.3 Practical Example: News Site Crawler
|
||||
|
||||
```python
|
||||
async def crawl_news_site():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0,
|
||||
rate_limiter=RateLimiter(base_delay=(1.0, 2.0))
|
||||
)
|
||||
|
||||
configs = [
|
||||
# Homepage - light extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com',
|
||||
css_selector="nav, .headline",
|
||||
extraction_strategy=None
|
||||
),
|
||||
|
||||
# Article pages - full extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/article/*",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="article content",
|
||||
word_count_threshold=100
|
||||
),
|
||||
screenshot=True,
|
||||
excluded_tags=["nav", "aside", "footer"]
|
||||
),
|
||||
|
||||
# Author pages - metadata focus
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/author/*",
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"name": "h1.author-name",
|
||||
"bio": ".author-bio",
|
||||
"articles": "article.post-card h2"
|
||||
})
|
||||
),
|
||||
|
||||
# Everything else
|
||||
CrawlerRunConfig()
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=news_urls,
|
||||
config=configs,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### 6.4 Best Practices
|
||||
|
||||
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||
2. **Default Config Behavior**:
|
||||
- A config without `url_matcher` matches ALL URLs
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||
```python
|
||||
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||
|
||||
default_config = CrawlerRunConfig() # No url_matcher
|
||||
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||
```
|
||||
4. **Optimize for Performance**:
|
||||
- Disable JS for static content
|
||||
- Skip screenshots for data APIs
|
||||
- Use appropriate extraction strategies
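Putting these practices together, a performance-lean configuration for mostly static pages could look like the sketch below; the specific parameter values are illustrative, so adjust them to your own targets.

```python
from crawl4ai import CrawlerRunConfig, CacheMode

# Illustrative performance-lean config for mostly static pages:
# no JavaScript, no screenshots, fast load condition, caching enabled.
static_config = CrawlerRunConfig(
    url_matcher=["*/docs/*", "*.html"],        # example patterns, adjust to your site
    screenshot=False,                          # skip screenshots for data/API pages
    wait_until="domcontentloaded",             # faster than waiting for networkidle
    cache_mode=CacheMode.ENABLED,              # reuse previously crawled results
    excluded_tags=["nav", "aside", "footer"],  # trim boilerplate before extraction
)
```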
|
||||
|
||||
## 7. Summary
|
||||
|
||||
1. **Two Dispatcher Types**:
|
||||
|
||||
|
||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
session_id = "github_commits_session"
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
all_commits = []
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
session_id = "wait_for_session"
|
||||
all_commits = []
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
||||
"fields": [{
|
||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
||||
}],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
js_next_page = """
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length > 0) {
|
||||
window.lastCommit = commits[0].textContent.trim();
|
||||
}
|
||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||
if (button) {button.click(); console.log('button clicked') }
|
||||
"""
|
||||
|
||||
# JavaScript and wait configurations
|
||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
||||
|
||||
# Crawl multiple pages
|
||||
wait_for = """() => {
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length === 0) return false;
|
||||
const firstCommit = commits[0].textContent.trim();
|
||||
return firstCommit !== window.lastCommit;
|
||||
}"""
|
||||
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li[data-testid='commit-row-item']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h4 a",
|
||||
"type": "text",
|
||||
"transform": "strip",
|
||||
},
|
||||
],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
for page in range(3):
|
||||
config = CrawlerRunConfig(
|
||||
url=url,
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
css_selector="li[data-testid='commit-row-item']",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
wait_for=wait_for if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
capture_console_messages=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(config=config)
|
||||
if result.success:
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.console_messages:
|
||||
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||
|
||||
if result.extracted_content:
|
||||
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||
commits = json.loads(result.extracted_content)
|
||||
all_commits.extend(commits)
|
||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||
else:
|
||||
print(f"Page {page + 1}: No content extracted")
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
return all_commits
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
||||
wait_after_scroll=1.0 # Twitter needs time to load
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config,
|
||||
# Optional: Set headless=False to watch it work
|
||||
# browser_config=BrowserConfig(headless=False)
|
||||
virtual_scroll_config=virtual_config
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://twitter.com/search?q=AI",
|
||||
config=config
|
||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
||||
Virtual Scroll works seamlessly with extraction strategies:
|
||||
|
||||
```python
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
||||
scroll_count=20
|
||||
),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="openai/gpt-4o-mini",
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||
schema=schema
|
||||
)
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
```python
|
||||
async def arun_many(
|
||||
urls: Union[List[str], List[Any]],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
...
|
||||
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
@@ -15,7 +15,9 @@ async def arun_many(
|
||||
Crawl multiple URLs concurrently or in batches.
|
||||
|
||||
:param urls: A list of URLs (or tasks) to crawl.
|
||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||
:param config: (Optional) Either:
|
||||
- A single `CrawlerRunConfig` applying to all URLs
|
||||
- A list of `CrawlerRunConfig` objects with url_matcher patterns
|
||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||
...
|
||||
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||
@@ -95,10 +97,70 @@ results = await crawler.arun_many(
|
||||
)
|
||||
```
|
||||
|
||||
### URL-Specific Configurations
|
||||
|
||||
Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
# PDF files - specialized extraction
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
)
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
github_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
)
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custom settings for JSON extraction
|
||||
)
|
||||
|
||||
# Default fallback config
|
||||
default_config = CrawlerRunConfig() # No url_matcher means it matches ALL URLs, so it acts as the fallback
|
||||
|
||||
# Pass the list of configs - first match wins!
|
||||
results = await crawler.arun_many(
|
||||
urls=[
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config
|
||||
"https://blog.python.org/", # → blog_config
|
||||
"https://github.com/microsoft/playwright", # → github_config
|
||||
"https://httpbin.org/json", # → api_config
|
||||
"https://example.com/" # → default_config
|
||||
],
|
||||
config=[pdf_config, blog_config, github_config, api_config, default_config]
|
||||
)
|
||||
```
|
||||
|
||||
**URL Matching Features**:
|
||||
- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
|
||||
- **Function matchers**: `lambda url: 'api' in url`
|
||||
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
|
||||
- **First match wins**: Configs are evaluated in order
|
||||
|
||||
**Key Points**:
|
||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
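Conceptually, the first-match rule behaves like a simple loop over the config list using each config's `is_match()`. The helper below is a sketch of that pairing logic for reasoning and testing, not the internal dispatcher code.

```python
from crawl4ai import CrawlerRunConfig

def pick_config(url: str, configs: list) -> CrawlerRunConfig:
    """Return the first config whose matcher accepts the URL (sketch only)."""
    for cfg in configs:
        if cfg.is_match(url):  # a config without url_matcher matches every URL
            return cfg
    raise ValueError(f"No matching configuration found for {url}")

# Reusing the configs defined above: first match wins.
ordered = [pdf_config, blog_config, github_config, api_config, default_config]
print(pick_config("https://example.com/report.pdf", ordered) is pdf_config)  # True
```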
|
||||
|
||||
### Return Value
|
||||
|
||||
|
||||
@@ -208,6 +208,71 @@ config = CrawlerRunConfig(
|
||||
|
||||
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
### I) **URL Matching Configuration**
|
||||
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||
|
||||
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Simple string pattern (glob-style)
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Multiple patterns with OR logic (default)
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # Any pattern matches
|
||||
)
|
||||
|
||||
# Function matcher
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Other settings like extraction_strategy
|
||||
)
|
||||
|
||||
# Mixed: String + Function with AND logic
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # Must be .org domain
|
||||
lambda url: 'docs' in url # Must contain 'docs'
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
|
||||
# Combined patterns and functions with AND logic
|
||||
secure_docs = CrawlerRunConfig(
|
||||
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||
)
|
||||
|
||||
# Default config - matches ALL URLs
|
||||
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||
```
|
||||
|
||||
**UrlMatcher Types:**
|
||||
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
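For quick experimentation, glob-style string patterns behave roughly like `fnmatch` applied to the full URL. The snippet below is an approximation for testing pattern ideas; the library's own matcher may differ in edge cases.

```python
# Rough approximation of glob-style URL matching using the standard library;
# Crawl4AI's own matcher may differ in details (e.g. case handling).
from fnmatch import fnmatch

patterns = ["*.pdf", "*/api/*", "https://*.example.com/*"]
urls = [
    "https://example.com/report.pdf",
    "https://api.example.com/v1/users",
    "https://docs.example.com/guide",
]
for url in urls:
    matched = [p for p in patterns if fnmatch(url, p)]
    print(url, "->", matched or "no match")
```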
|
||||
|
||||
**Important Behavior:**
|
||||
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
|
||||
---

## 2.2 Helper Methods
|
||||
|
||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
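A minimal sketch of typical usage, assuming `clone()` accepts keyword overrides for the fields you want to change:

```python
from crawl4ai import CrawlerRunConfig, CacheMode

# Start from a shared base configuration.
base_config = CrawlerRunConfig(
    wait_until="domcontentloaded",
    cache_mode=CacheMode.ENABLED,
)

# Same settings as base_config, but bypass the cache and capture a screenshot.
debug_config = base_config.clone(
    cache_mode=CacheMode.BYPASS,
    screenshot=True,
)
```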
|
||||
|
||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **PDF Parsing**: Extract data from PDF documents
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
|
||||
# Initialize with custom learning parameters
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
||||
max_history=100, # Remember last 100 crawls per domain
|
||||
learning_rate=0.2, # How quickly to adapt to changes
|
||||
patterns_per_page=3, # Patterns to learn per page type
|
||||
extraction_strategy='css' # 'css' or 'xpath'
|
||||
)
|
||||
|
||||
adaptive_crawler = AdaptiveCrawler(config)
|
||||
|
||||
# First crawl - crawler learns the structure
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com/article/12345",
|
||||
config=CrawlerRunConfig(
|
||||
adaptive_config=config,
|
||||
extraction_hints={ # Optional hints to speed up learning
|
||||
"title": "article h1",
|
||||
"content": "article .body-content"
|
||||
}
|
||||
)
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
)
|
||||
|
||||
# Crawler identifies and stores patterns
|
||||
if result.success:
|
||||
state = adaptive_crawler.get_state("news.example.com")
|
||||
print(f"Learned {len(state.patterns)} patterns")
|
||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
|
||||
# Subsequent crawls - uses learned patterns
|
||||
result2 = await crawler.arun(
|
||||
"https://news.example.com/article/67890",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Automatically extracts using learned patterns!
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0, # Let content load
|
||||
capture_method="incremental", # Capture new content on each scroll
|
||||
deduplicate=True # Remove duplicate elements
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5, # Images need time
|
||||
stop_on_no_change=True # Smart stopping
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5,
|
||||
wait_for_selector=".article-card", # Wait for specific elements
|
||||
timeout=30000 # Max 30 seconds total
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### The Three-Layer Scoring System
|
||||
### Intelligent Link Analysis and Scoring
|
||||
|
||||
```python
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
# What to analyze
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
max_links=100, # Analyze top 100 links
|
||||
|
||||
# Relevance scoring
|
||||
query="machine learning tutorials", # Your interest
|
||||
score_threshold=0.3, # Minimum relevance score
|
||||
|
||||
# Performance
|
||||
concurrent_requests=10, # Parallel processing
|
||||
timeout_per_link=5000, # 5s per link
|
||||
|
||||
# Advanced scoring weights
|
||||
scoring_weights={
|
||||
"intrinsic": 0.3, # Link quality indicators
|
||||
"contextual": 0.5, # Relevance to query
|
||||
"popularity": 0.2 # Link prominence
|
||||
}
|
||||
)
|
||||
|
||||
# Use in your crawl
|
||||
result = await crawler.arun(
|
||||
"https://tech-blog.example.com",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
)
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
print(f"Score: {link['total_score']:.3f}")
|
||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Title: {link['head_data']['title']}")
|
||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score (0-1)**: Relevance to your query
|
||||
- Semantic similarity using embeddings
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Weighted combination for final ranking
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
# Basic discovery - find all product pages
|
||||
seeder_config = SeedingConfig(
|
||||
# Discovery sources
|
||||
source="sitemap+cc", # Sitemap + Common Crawl
|
||||
|
||||
# Filtering
|
||||
pattern="*/product/*", # URL pattern matching
|
||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
||||
|
||||
# Validation
|
||||
live_check=True, # Verify URLs are alive
|
||||
max_urls=5000, # Stop at 5000 URLs
|
||||
|
||||
# Performance
|
||||
concurrency=100, # Parallel requests
|
||||
hits_per_sec=10 # Rate limiting
|
||||
)
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
|
||||
seeder = AsyncUrlSeeder(seeder_config)
|
||||
urls = await seeder.discover("https://shop.example.com")
|
||||
|
||||
# Advanced: Relevance-based discovery
|
||||
research_config = SeedingConfig(
|
||||
source="crawl+sitemap", # Deep crawl + sitemap
|
||||
pattern="*/blog/*", # Blog posts only
|
||||
|
||||
# Content relevance
|
||||
extract_head=True, # Get meta tags
|
||||
query="quantum computing tutorials",
|
||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
||||
score_threshold=0.4, # High relevance only
|
||||
|
||||
# Smart filtering
|
||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||
min_content_length=500, # Skip thin content
|
||||
|
||||
force=True # Bypass cache
|
||||
)
|
||||
|
||||
# Discover with progress tracking
|
||||
discovered = []
|
||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
||||
discovered.extend(batch)
|
||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
||||
|
||||
# Results include scores and metadata
|
||||
for url_data in discovered[:5]:
|
||||
print(f"URL: {url_data['url']}")
|
||||
print(f"Score: {url_data['score']:.3f}")
|
||||
print(f"Title: {url_data['title']}")
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Before v0.7.0 (slow)
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(url)
|
||||
results.append(result)
|
||||
|
||||
# After v0.7.0 (fast)
|
||||
# Automatic batching and connection pooling
|
||||
results = await crawler.arun_batch(
|
||||
urls,
|
||||
config=CrawlerRunConfig(
|
||||
# New performance options
|
||||
batch_size=10, # Process 10 URLs concurrently
|
||||
reuse_browser=True, # Keep browser warm
|
||||
eager_loading=False, # Load only what's needed
|
||||
streaming_extraction=True, # Stream large extractions
|
||||
|
||||
# Optimized defaults
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
exclude_external_resources=True, # Skip third-party assets
|
||||
block_ads=True # Ad blocking built-in
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Memory-efficient streaming for large crawls
|
||||
async for result in crawler.arun_stream(large_url_list):
|
||||
# Process results as they complete
|
||||
await process_result(result)
|
||||
# Memory is freed after each iteration
|
||||
results.append(result)
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
## 📄 PDF Support
|
||||
|
||||
PDF extraction is now natively supported in Crawl4AI.
|
||||
|
||||
```python
|
||||
# Extract data from PDF documents
|
||||
result = await crawler.arun(
|
||||
"https://example.com/report.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
pdf_extraction=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
# Works on converted PDF structure
|
||||
"title": {"selector": "h1", "type": "text"},
|
||||
"sections": {"selector": "h2", "type": "list"}
|
||||
})
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
|
||||
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed the unused `StealthConfig` import and configuration, which were not referenced anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# 🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update
|
||||
|
||||
*July 25, 2025 • 3 min read*
|
||||
|
||||
---
|
||||
|
||||
This release introduces automated CI/CD pipelines for seamless releases and optimizes dependencies for a lighter, more efficient package.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### 🔄 Automated Release Pipeline
|
||||
- **GitHub Actions CI/CD**: Automated PyPI and Docker Hub releases on tag push
|
||||
- **Multi-platform Docker images**: Support for both AMD64 and ARM64 architectures
|
||||
- **Version consistency checks**: Ensures tag, package, and Docker versions align
|
||||
- **Automated release notes**: GitHub releases created automatically
|
||||
|
||||
### 📦 Dependency Optimization
|
||||
- **Moved sentence-transformers to optional dependencies**: Significantly reduces default installation size
|
||||
- **Lighter Docker images**: Optimized Dockerfile for faster builds and smaller images
|
||||
- **Better dependency management**: Core vs. optional dependencies clearly separated
|
||||
|
||||
## 🏗️ CI/CD Pipeline
|
||||
|
||||
The new automated release process ensures consistent, reliable releases:
|
||||
|
||||
```bash
|
||||
# Trigger releases with a simple tag
|
||||
git tag v0.7.2
|
||||
git push origin v0.7.2
|
||||
|
||||
# Automatically:
|
||||
# ✅ Validates version consistency
|
||||
# ✅ Builds and publishes to PyPI
|
||||
# ✅ Builds multi-platform Docker images
|
||||
# ✅ Pushes to Docker Hub with proper tags
|
||||
# ✅ Creates GitHub release
|
||||
```
|
||||
|
||||
## 💾 Lighter Installation
|
||||
|
||||
Default installation is now significantly smaller:
|
||||
|
||||
```bash
|
||||
# Core installation (smaller, faster)
|
||||
pip install crawl4ai==0.7.2
|
||||
|
||||
# With ML features (includes sentence-transformers)
|
||||
pip install crawl4ai[transformer]==0.7.2
|
||||
|
||||
# Full installation
|
||||
pip install crawl4ai[all]==0.7.2
|
||||
```
|
||||
|
||||
## 🐳 Docker Improvements
|
||||
|
||||
Enhanced Docker support with multi-platform images:
|
||||
|
||||
```bash
|
||||
# Pull the latest version
|
||||
docker pull unclecode/crawl4ai:0.7.2
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Available tags:
|
||||
# - unclecode/crawl4ai:0.7.2 (specific version)
|
||||
# - unclecode/crawl4ai:0.7 (minor version)
|
||||
# - unclecode/crawl4ai:0 (major version)
|
||||
# - unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
## 🔧 Technical Details
|
||||
|
||||
### Dependency Changes
|
||||
- `sentence-transformers` moved from required to optional dependencies
|
||||
- Reduces default installation by ~500MB
|
||||
- No impact on functionality when transformer features aren't needed
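As an illustration of what optional means in practice, downstream code can guard the import and only fail when the ML features are actually requested. This is a generic sketch, not Crawl4AI's internal handling:

```python
# Generic sketch of guarding an optional dependency.
try:
    from sentence_transformers import SentenceTransformer
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

def embed(texts):
    if not HAS_TRANSFORMERS:
        raise RuntimeError(
            "Semantic features need the optional extra: pip install crawl4ai[transformer]"
        )
    model = SentenceTransformer("all-MiniLM-L6-v2")
    return model.encode(texts)
```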
|
||||
|
||||
### CI/CD Configuration
|
||||
- GitHub Actions workflows for automated releases
|
||||
- Version validation before publishing
|
||||
- Parallel PyPI and Docker Hub deployments
|
||||
- Automatic tagging strategy for Docker images
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.2
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0 or v0.7.1.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
*P.S. The new CI/CD pipeline will make future releases faster and more reliable. Thanks for your patience as we improve our release process!*
|
||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Create an adaptive crawler
|
||||
# Create an adaptive crawler (config is optional)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start crawling with a query
|
||||
@@ -59,13 +59,13 @@ async def main():
|
||||
from crawl4ai import AdaptiveConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
||||
top_k_links=3, # Links to follow per page (default: 5)
|
||||
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||
top_k_links=5, # Links to follow per page (default: 3)
|
||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
```
|
||||
|
||||
## Crawling Strategies
|
||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||
- **0.3-0.6**: Partial information, may answer basic queries
|
||||
- **0.6-0.8**: Good coverage, can answer most queries
|
||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
||||
- **0.6-0.7**: Good coverage, can answer most queries
|
||||
- **0.7-1.0**: Excellent coverage, comprehensive information
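As a rough sketch of acting on these bands (the `digest()` call and the `confidence` property are assumptions based on the adaptive crawling examples; adjust if the actual API differs):

```python
# Sketch only: continues from the AdaptiveCrawler setup shown earlier in this doc.
# Method and attribute names are assumptions, not a definitive API reference.
state = await adaptive.digest(
    start_url="https://docs.example.com",
    query="api authentication",
)

if adaptive.confidence >= 0.7:
    print("Excellent coverage - answer directly from the knowledge base")
elif adaptive.confidence >= 0.3:
    print("Partial coverage - expect to handle only basic queries")
else:
    print("Insufficient information - widen the crawl or raise max_pages")
```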
|
||||
|
||||
### Statistics Display
|
||||
|
||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
- Avoid overly broad queries
|
||||
|
||||
### 2. Threshold Tuning
|
||||
- Start with default (0.8) for general use
|
||||
- Lower to 0.6-0.7 for exploratory crawling
|
||||
- Raise to 0.9+ for exhaustive coverage
|
||||
- Start with default (0.7) for general use
|
||||
- Lower to 0.5-0.6 for exploratory crawling
|
||||
- Raise to 0.8+ for exhaustive coverage
|
||||
|
||||
### 3. Performance Optimization
|
||||
- Use appropriate `max_pages` limits
|
||||
|
||||
@@ -209,7 +209,13 @@ class CrawlerRunConfig:
|
||||
- The maximum number of concurrent crawl sessions.
|
||||
- Helps prevent overwhelming the system.
|
||||
|
||||
14. **`display_mode`**:
|
||||
14. **`url_matcher`** & **`match_mode`**:
|
||||
- Enable URL-specific configurations when used with `arun_many()`.
|
||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples, and the short sketch after this list.
|
||||
|
||||
15. **`display_mode`**:
|
||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||
- Affects how much information is printed during the crawl.
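A brief sketch of how `url_matcher` and `match_mode` could combine with `arun_many()`. Passing a list of configs and first-match selection are assumptions based on the description above and the dispatcher tests later in this changeset, not a definitive API contract:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import MatchMode

pdf_config = CrawlerRunConfig(url_matcher="*.pdf")            # glob pattern
api_config = CrawlerRunConfig(
    url_matcher=["https://*", lambda url: "api" in url],      # pattern + function
    match_mode=MatchMode.AND,                                  # every matcher must pass
)
fallback = CrawlerRunConfig()                                  # no matcher -> default

async def crawl(urls):
    async with AsyncWebCrawler() as crawler:
        # Assumed behavior: each URL gets the first config whose matcher accepts it.
        return await crawler.arun_many(urls, config=[pdf_config, api_config, fallback])
```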
|
||||
|
||||
|
||||
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
|
||||
|
||||
Want to learn by doing? We've got you covered:
|
||||
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
|
||||
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
|
||||
|
||||
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
|
||||
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
|
||||
|
||||
### Running the Tutorial Locally
|
||||
|
||||
|
||||
@@ -350,15 +350,22 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI uses `LXMLWebScrapingStrategy` (LXML-based) as the default scraping strategy for HTML content processing. This strategy offers excellent performance, especially for large HTML documents.
|
||||
|
||||
**Note:** For backward compatibility, `WebScrapingStrategy` is still available as an alias for `LXMLWebScrapingStrategy`.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
# Default configuration already uses LXMLWebScrapingStrategy
|
||||
config = CrawlerRunConfig()
|
||||
|
||||
# Or explicitly specify it if desired
|
||||
config_explicit = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
@@ -417,21 +424,20 @@ class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy provides excellent performance, particularly on large HTML documents, running up to 10-20x faster than BeautifulSoup-based approaches.
|
||||
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
Benefits of LXML strategy:
|
||||
- Fast processing of large HTML documents (especially >100KB)
|
||||
- Efficient memory usage
|
||||
- Good handling of well-formed HTML
|
||||
- Robust table detection and extraction
|
||||
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
### Backward Compatibility
|
||||
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
For users upgrading from earlier versions:
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy`
|
||||
- Existing code using `WebScrapingStrategy` will continue to work without modification
|
||||
- No changes are required to your existing code
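A minimal sketch of what this means in practice, assuming the old name is a straight alias (as the migration guide later in this changeset describes):

```python
from crawl4ai import CrawlerRunConfig, WebScrapingStrategy, LXMLWebScrapingStrategy

# Pre-0.7.x code keeps working: the old name now points at the LXML implementation.
config = CrawlerRunConfig(scraping_strategy=WebScrapingStrategy())

# If the alias is a direct assignment (assumed), both names refer to the same class.
print(WebScrapingStrategy is LXMLWebScrapingStrategy)
```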
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -19,13 +19,15 @@ class MarkdownGenerationResult(BaseModel):
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
fit_html: Optional[str] = None
|
||||
success: bool
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
pdf: Optional[bytes] = None
|
||||
mhtml: Optional[str] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
extracted_content: Optional[str] = None
|
||||
@@ -35,6 +37,12 @@ class CrawlResult(BaseModel):
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
```
|
||||
@@ -45,11 +53,13 @@ class CrawlResult(BaseModel):
|
||||
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
||||
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
||||
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
||||
| **fit_html (`Optional[str]`)** | Preprocessed HTML optimized for extraction and content filtering. |
|
||||
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
||||
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
||||
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
||||
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||
| **js_execution_result (`Optional[Dict[str, Any]]`)** | Results from JavaScript execution during crawling. |
|
||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
||||
@@ -61,6 +71,11 @@ class CrawlResult(BaseModel):
|
||||
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
||||
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||
| **redirected_url (`Optional[str]`)** | The URL after any redirects (different from `url` which is the final URL). |
|
||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
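A short sketch reading a few of the newer fields after a crawl. Field names follow the table above; the two `capture_*` flags are assumed to live on `CrawlerRunConfig`, as the table implies:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def inspect_result():
    config = CrawlerRunConfig(capture_network_requests=True, capture_console_messages=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        if result.success:
            print(result.redirected_url or result.url)       # URL after any redirects
            for table in result.tables:                       # [{headers, rows, caption, summary}]
                print(table["headers"], len(table["rows"]))
            if result.network_requests:                       # needs capture_network_requests=True
                print(f"{len(result.network_requests)} network events")
            if result.console_messages:                       # needs capture_console_messages=True
                print(f"{len(result.console_messages)} console messages")
```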
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -154,6 +154,30 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
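For example, a per-request override (method 2 above) could look like the following sketch; the port and `/md` endpoint follow the test scripts later in this changeset, and the exact payload keys should be treated as illustrative:

```python
import requests

# Per-request provider override; falls back to LLM_PROVIDER or config.yml otherwise.
resp = requests.post(
    "http://localhost:11235/md",
    json={"url": "https://example.com", "f": "llm", "provider": "groq/mixtral-8x7b"},
    timeout=60,
)
resp.raise_for_status()
print(len(resp.json().get("markdown", "")), "chars of markdown")
```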
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +692,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -28,11 +28,8 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
@@ -117,4 +114,4 @@ Some examples may require:
|
||||
|
||||
## Contributing New Examples
|
||||
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
|
||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
||||
The `LinkPreviewConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
link_preview_config = LinkPreviewConfig(
|
||||
# BASIC SETTINGS
|
||||
|
||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
||||
word_count_threshold=300 # Only substantial articles
|
||||
)
|
||||
|
||||
# Extract URLs and stream results as they come
|
||||
# Extract URLs and crawl them
|
||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||
|
||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
||||
|
||||
```python
|
||||
# Use both sources
|
||||
config = SeedingConfig(source="cc+sitemap")
|
||||
config = SeedingConfig(source="sitemap+cc")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
```
|
||||
|
||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
||||
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||
| `live_check` | bool | False | Verify URLs are accessible |
|
||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
||||
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||
| `verbose` | bool | False | Show detailed progress |
|
||||
| `query` | str | None | Search query for BM25 scoring |
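Putting a few of these options together, a discovery call might look like the following sketch; values are illustrative, and the bare `AsyncUrlSeeder()` construction is assumed from the examples elsewhere in these docs:

```python
from crawl4ai import AsyncUrlSeeder, SeedingConfig

config = SeedingConfig(
    source="sitemap+cc",           # default: sitemaps plus Common Crawl
    pattern="*/blog/*",            # keep only blog URLs
    extract_head=True,             # pull <head> metadata for scoring
    query="rust async runtime",    # BM25 relevance query
    scoring_method="bm25",
    max_urls=100,
    hits_per_sec=5,                # default rate limit
)

async def discover():
    seeder = AsyncUrlSeeder()      # construction assumed; see the full examples below
    return await seeder.urls("example.com", config)
```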
|
||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
||||
```python
|
||||
# Find specific products
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Use both sources
|
||||
source="sitemap+cc", # Use both sources
|
||||
extract_head=True,
|
||||
query="wireless headphones noise canceling",
|
||||
scoring_method="bm25",
|
||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
||||
|
||||
# Step 1: Discover relevant URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap", # Maximum coverage
|
||||
source="sitemap+cc", # Maximum coverage
|
||||
extract_head=True, # Get metadata
|
||||
query=topic, # Research topic
|
||||
scoring_method="bm25", # Smart scoring
|
||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
||||
# Extract URLs and crawl all articles
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
results = []
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||
async for result in crawl_results:
|
||||
if result.success:
|
||||
results.append({
|
||||
'url': result.url,
|
||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
||||
# When crawling many URLs
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Assuming urls is a list of URL strings
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
crawl_results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
# Process as they arrive
|
||||
async for result in results:
|
||||
async for result in crawl_results:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
||||
|
||||
# E-commerce product discovery
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
source="sitemap+cc",
|
||||
pattern="*/product/*",
|
||||
extract_head=True,
|
||||
live_check=True
|
||||
|
||||
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# WebScrapingStrategy Migration Guide
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI has simplified its content scraping architecture. The BeautifulSoup-based `WebScrapingStrategy` has been deprecated in favor of the faster LXML-based implementation. However, **no action is required** - your existing code will continue to work.
|
||||
|
||||
## What Changed?
|
||||
|
||||
1. **`WebScrapingStrategy` is now an alias** for `LXMLWebScrapingStrategy`
|
||||
2. **The BeautifulSoup implementation has been removed** (~1000 lines of redundant code)
|
||||
3. **`LXMLWebScrapingStrategy` inherits directly** from `ContentScrapingStrategy`
|
||||
4. **Performance remains optimal** with LXML as the sole implementation
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
**Your existing code continues to work without any changes:**
|
||||
|
||||
```python
|
||||
# This still works perfectly
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, WebScrapingStrategy
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=WebScrapingStrategy() # Works as before
|
||||
)
|
||||
```
|
||||
|
||||
## Migration Options
|
||||
|
||||
You have three options:
|
||||
|
||||
### Option 1: Do Nothing (Recommended)
|
||||
Your code will continue to work. `WebScrapingStrategy` is permanently aliased to `LXMLWebScrapingStrategy`.
|
||||
|
||||
### Option 2: Update Imports (Optional)
|
||||
For clarity, you can update your imports:
|
||||
|
||||
```python
|
||||
# Old (still works)
|
||||
from crawl4ai import WebScrapingStrategy
|
||||
strategy = WebScrapingStrategy()
|
||||
|
||||
# New (more explicit)
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
strategy = LXMLWebScrapingStrategy()
|
||||
```
|
||||
|
||||
### Option 3: Use Default Configuration
|
||||
Since `LXMLWebScrapingStrategy` is the default, you can omit the strategy parameter:
|
||||
|
||||
```python
|
||||
# Simplest approach - uses LXMLWebScrapingStrategy by default
|
||||
config = CrawlerRunConfig()
|
||||
```
|
||||
|
||||
## Type Hints
|
||||
|
||||
If you use type hints, both work:
|
||||
|
||||
```python
|
||||
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
|
||||
def process_with_strategy(strategy: WebScrapingStrategy) -> None:
|
||||
# Works with both WebScrapingStrategy and LXMLWebScrapingStrategy
|
||||
pass
|
||||
|
||||
# Both are valid
|
||||
process_with_strategy(WebScrapingStrategy())
|
||||
process_with_strategy(LXMLWebScrapingStrategy())
|
||||
```
|
||||
|
||||
## Subclassing
|
||||
|
||||
If you've subclassed `WebScrapingStrategy`, it continues to work:
|
||||
|
||||
```python
|
||||
class MyCustomStrategy(WebScrapingStrategy):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Your custom code
|
||||
```
|
||||
|
||||
## Performance Benefits
|
||||
|
||||
By consolidating to LXML:
|
||||
- **10-20x faster** HTML parsing for large documents
|
||||
- **Lower memory usage**
|
||||
- **Consistent behavior** across all use cases
|
||||
- **Simplified maintenance** and bug fixes
|
||||
|
||||
## Summary
|
||||
|
||||
This change simplifies Crawl4AI's internals while maintaining 100% backward compatibility. Your existing code continues to work, and you get better performance automatically.
|
||||
@@ -28,7 +28,7 @@ from rich import box
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai import c4a_compile, CompilationResult
|
||||
|
||||
# Initialize Rich console for beautiful output
|
||||
|
||||
@@ -13,14 +13,13 @@ from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
# New imports for v0.7.0
|
||||
LinkPreviewConfig,
|
||||
VirtualScrollConfig,
|
||||
LinkPreviewConfig,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
CompilationResult
|
||||
)
|
||||
|
||||
|
||||
@@ -170,16 +169,16 @@ async def demo_url_seeder():
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*tutorial*", # URL pattern filter
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python async programming", # For relevance scoring
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("docs.python.org", config)
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
||||
print(f"❌ Compilation error: {result.first_error.message}")
|
||||
|
||||
|
||||
async def demo_pdf_support():
|
||||
"""
|
||||
Demo 6: PDF Parsing Support
|
||||
|
||||
Shows how to extract content from PDF files.
|
||||
Note: Requires 'pip install crawl4ai[pdf]'
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("📄 DEMO 6: PDF Parsing Support")
|
||||
print("="*60)
|
||||
|
||||
try:
|
||||
# Check if PDF support is installed
|
||||
import PyPDF2
|
||||
|
||||
# Example: Process a PDF URL
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True, # Enable PDF generation
|
||||
extract_text_from_pdf=True # Extract text content
|
||||
)
|
||||
|
||||
print("PDF parsing is available!")
|
||||
print("You can now crawl PDF URLs and extract their content.")
|
||||
print("\nExample usage:")
|
||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
||||
|
||||
except ImportError:
|
||||
print("⚠️ PDF support not installed.")
|
||||
print("Install with: pip install crawl4ai[pdf]")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos"""
|
||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||
@@ -289,7 +255,6 @@ async def main():
|
||||
("Virtual Scroll", demo_virtual_scroll),
|
||||
("URL Seeder", demo_url_seeder),
|
||||
("C4A Script", demo_c4a_script),
|
||||
("PDF Support", demo_pdf_support)
|
||||
]
|
||||
|
||||
for name, demo_func in demos:
|
||||
@@ -309,7 +274,6 @@ async def main():
|
||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||
print("• C4A Script: Simple language for complex automations")
|
||||
print("• PDF Support: Extract content from PDF documents")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -25,6 +25,8 @@ nav:
|
||||
- "Command Line Interface": "core/cli.md"
|
||||
- "Simple Crawling": "core/simple-crawling.md"
|
||||
- "Deep Crawling": "core/deep-crawling.md"
|
||||
- "Adaptive Crawling": "core/adaptive-crawling.md"
|
||||
- "URL Seeding": "core/url-seeding.md"
|
||||
- "C4A-Script": "core/c4a-script.md"
|
||||
- "Crawler Result": "core/crawler-result.md"
|
||||
- "Browser, Crawler & LLM Config": "core/browser-crawler-config.md"
|
||||
@@ -37,6 +39,7 @@ nav:
|
||||
- "Link & Media": "core/link-media.md"
|
||||
- Advanced:
|
||||
- "Overview": "advanced/advanced-features.md"
|
||||
- "Adaptive Strategies": "advanced/adaptive-strategies.md"
|
||||
- "Virtual Scroll": "advanced/virtual-scroll.md"
|
||||
- "File Downloading": "advanced/file-downloading.md"
|
||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||
|
||||
@@ -44,7 +44,6 @@ dependencies = [
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
"lark>=1.2.2",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"alphashape>=1.3.1",
|
||||
"shapely>=2.0.0"
|
||||
]
|
||||
@@ -62,8 +61,8 @@ classifiers = [
|
||||
[project.optional-dependencies]
|
||||
pdf = ["PyPDF2"]
|
||||
torch = ["torch", "nltk", "scikit-learn"]
|
||||
transformer = ["transformers", "tokenizers"]
|
||||
cosine = ["torch", "transformers", "nltk"]
|
||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||
sync = ["selenium"]
|
||||
all = [
|
||||
"PyPDF2",
|
||||
@@ -72,8 +71,8 @@ all = [
|
||||
"scikit-learn",
|
||||
"transformers",
|
||||
"tokenizers",
|
||||
"selenium",
|
||||
"PyPDF2"
|
||||
"sentence-transformers",
|
||||
"selenium"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -24,7 +24,6 @@ cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
sentence-transformers>=2.2.0
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
|
||||
@@ -12,11 +12,8 @@ parent_dir = os.path.dirname(
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import (
|
||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
||||
)
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,8 +29,8 @@ class TestResult:
|
||||
|
||||
class StrategyTester:
|
||||
def __init__(self):
|
||||
self.new_scraper = WebScrapingStrategy()
|
||||
self.current_scraper = WebScrapingStrategyCurrent()
|
||||
self.new_scraper = LXMLWebScrapingStrategy()
|
||||
self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now
|
||||
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
||||
self.WIKI_HTML = f.read()
|
||||
self.results = {"new": [], "current": []}
|
||||
|
||||
@@ -10,11 +10,13 @@ import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
from crawl4ai import BrowserProfiler
|
||||
from crawl4ai.browser_manager import BrowserManager
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
@@ -25,7 +27,7 @@ async def test_profile_creation():
|
||||
"""Test creating and managing browser profiles."""
|
||||
logger.info("Testing profile creation and management", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
|
||||
try:
|
||||
# List existing profiles
|
||||
@@ -83,7 +85,7 @@ async def test_profile_with_browser():
|
||||
"""Test using a profile with a browser."""
|
||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||
profile_path = None
|
||||
|
||||
@@ -101,6 +103,8 @@ async def test_profile_with_browser():
|
||||
# Now use this profile with a browser
|
||||
browser_config = BrowserConfig(
|
||||
user_data_dir=profile_path,
|
||||
use_managed_browser=True,
|
||||
use_persistent_context=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
|
||||
345
tests/docker/simple_api_test.py
Normal file
@@ -0,0 +1,345 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||
Uses only built-in Python modules to test all endpoints.
|
||||
"""
|
||||
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Configuration
|
||||
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||
TEST_TIMEOUT = 30
|
||||
|
||||
class SimpleApiTester:
|
||||
def __init__(self, base_url: str = BASE_URL):
|
||||
self.base_url = base_url
|
||||
self.token = None
|
||||
self.results = []
|
||||
|
||||
def log(self, message: str):
|
||||
print(f"[INFO] {message}")
|
||||
|
||||
def test_get_endpoint(self, endpoint: str) -> Dict:
|
||||
"""Test a GET endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
|
||||
"""Test a POST endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
data = json.dumps(payload).encode('utf-8')
|
||||
req = urllib.request.Request(url, data=data, method='POST')
|
||||
req.add_header('Content-Type', 'application/json')
|
||||
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def print_result(self, result: Dict):
|
||||
"""Print a formatted test result"""
|
||||
status_color = {
|
||||
"PASS": "✅",
|
||||
"FAIL": "❌",
|
||||
"SKIP": "⏭️"
|
||||
}
|
||||
|
||||
print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
|
||||
f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")
|
||||
|
||||
if result['status'] == 'FAIL' and 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
self.results.append(result)
|
||||
|
||||
def run_all_tests(self):
|
||||
"""Run all API tests"""
|
||||
print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
|
||||
print(f"📡 Testing server at: {self.base_url}")
|
||||
print("=" * 60)
|
||||
|
||||
# # Test basic endpoints
|
||||
# print("\n=== BASIC ENDPOINTS ===")
|
||||
|
||||
# # Health check
|
||||
# result = self.test_get_endpoint("/health")
|
||||
# self.print_result(result)
|
||||
|
||||
|
||||
# # Schema endpoint
|
||||
# result = self.test_get_endpoint("/schema")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Metrics endpoint
|
||||
# result = self.test_get_endpoint("/metrics")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Root redirect
|
||||
# result = self.test_get_endpoint("/")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Test authentication
|
||||
# print("\n=== AUTHENTICATION ===")
|
||||
|
||||
# # Get token
|
||||
# token_payload = {"email": "test@example.com"}
|
||||
# result = self.test_post_endpoint("/token", token_payload)
|
||||
# self.print_result(result)
|
||||
|
||||
# # Extract token if successful
|
||||
# if result['status'] == 'PASS' and 'data' in result:
|
||||
# token = result['data'].get('access_token')
|
||||
# if token:
|
||||
# self.token = token
|
||||
# self.log(f"Successfully obtained auth token: {token[:20]}...")
|
||||
|
||||
# Test core APIs
|
||||
print("\n=== CORE APIs ===")
|
||||
|
||||
test_url = "https://example.com"
|
||||
|
||||
# Test markdown endpoint
|
||||
md_payload = {
|
||||
"url": test_url,
|
||||
"f": "fit",
|
||||
"q": "test query",
|
||||
"c": "0"
|
||||
}
|
||||
result = self.test_post_endpoint("/md", md_payload)
|
||||
# print(result['data'].get('markdown', ''))
|
||||
self.print_result(result)
|
||||
|
||||
# Test HTML endpoint
|
||||
html_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/html", html_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test screenshot endpoint
|
||||
screenshot_payload = {
|
||||
"url": test_url,
|
||||
"screenshot_wait_for": 2
|
||||
}
|
||||
result = self.test_post_endpoint("/screenshot", screenshot_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test PDF endpoint
|
||||
pdf_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/pdf", pdf_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test JavaScript execution
|
||||
js_payload = {
|
||||
"url": test_url,
|
||||
"scripts": ["(() => document.title)()"]
|
||||
}
|
||||
result = self.test_post_endpoint("/execute_js", js_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl endpoint
|
||||
crawl_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl", crawl_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test config dump
|
||||
config_payload = {"code": "CrawlerRunConfig()"}
|
||||
result = self.test_post_endpoint("/config/dump", config_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test LLM endpoint
|
||||
llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
|
||||
result = self.test_get_endpoint(llm_endpoint)
|
||||
self.print_result(result)
|
||||
|
||||
# Test ask endpoint
|
||||
ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
|
||||
result = self.test_get_endpoint(ask_endpoint)
|
||||
print(result)
|
||||
self.print_result(result)
|
||||
|
||||
# Test job APIs
|
||||
print("\n=== JOB APIs ===")
|
||||
|
||||
# Test LLM job
|
||||
llm_job_payload = {
|
||||
"url": test_url,
|
||||
"q": "Extract main content",
|
||||
"cache": False
|
||||
}
|
||||
result = self.test_post_endpoint("/llm/job", llm_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl job
|
||||
crawl_job_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test MCP
|
||||
print("\n=== MCP APIs ===")
|
||||
|
||||
# Test MCP schema
|
||||
result = self.test_get_endpoint("/mcp/schema")
|
||||
self.print_result(result)
|
||||
|
||||
# Test error handling
|
||||
print("\n=== ERROR HANDLING ===")
|
||||
|
||||
# Test invalid URL
|
||||
invalid_payload = {"url": "invalid-url", "f": "fit"}
|
||||
result = self.test_post_endpoint("/md", invalid_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test invalid endpoint
|
||||
result = self.test_get_endpoint("/nonexistent")
|
||||
self.print_result(result)
|
||||
|
||||
# Print summary
|
||||
self.print_summary()
|
||||
|
||||
def print_summary(self):
|
||||
"""Print test results summary"""
|
||||
print("\n" + "=" * 60)
|
||||
print("📊 TEST RESULTS SUMMARY")
|
||||
print("=" * 60)
|
||||
|
||||
total = len(self.results)
|
||||
passed = sum(1 for r in self.results if r['status'] == 'PASS')
|
||||
failed = sum(1 for r in self.results if r['status'] == 'FAIL')
|
||||
|
||||
print(f"Total Tests: {total}")
|
||||
print(f"✅ Passed: {passed}")
|
||||
print(f"❌ Failed: {failed}")
|
||||
print(f"📈 Success Rate: {(passed/total)*100:.1f}%")
|
||||
|
||||
if failed > 0:
|
||||
print("\n❌ FAILED TESTS:")
|
||||
for result in self.results:
|
||||
if result['status'] == 'FAIL':
|
||||
print(f" • {result['method']} {result['endpoint']}")
|
||||
if 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
# Performance statistics
|
||||
response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
|
||||
if response_times:
|
||||
avg_time = sum(response_times) / len(response_times)
|
||||
max_time = max(response_times)
|
||||
print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
|
||||
print(f"⏱️ Max Response Time: {max_time:.3f}s")
|
||||
|
||||
# Save detailed report
|
||||
report_file = f"crawl4ai_test_report_{int(time.time())}.json"
|
||||
with open(report_file, 'w') as f:
|
||||
json.dump({
|
||||
"timestamp": time.time(),
|
||||
"server_url": self.base_url,
|
||||
"version": "0.7.0",
|
||||
"summary": {
|
||||
"total": total,
|
||||
"passed": passed,
|
||||
"failed": failed
|
||||
},
|
||||
"results": self.results
|
||||
}, f, indent=2)
|
||||
|
||||
print(f"\n📄 Detailed report saved to: {report_file}")
|
||||
|
||||
def main():
|
||||
"""Main test runner"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
|
||||
parser.add_argument('--url', default=BASE_URL, help='Base URL of the server')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
tester = SimpleApiTester(args.url)
|
||||
|
||||
try:
|
||||
tester.run_all_tests()
|
||||
except KeyboardInterrupt:
|
||||
print("\n🛑 Test suite interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"\n💥 Test suite failed with error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
55
tests/profiler/test_keyboard_handle.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
from unittest.mock import patch, MagicMock
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
|
||||
async def test_keyboard_input_handling():
|
||||
# Mock sequence of keystrokes: arrow key followed by 'q'
|
||||
mock_keys = [b'\x00K', b'q']
|
||||
mock_kbhit = MagicMock(side_effect=[True, True, False])
|
||||
mock_getch = MagicMock(side_effect=mock_keys)
|
||||
|
||||
with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
|
||||
# profiler = BrowserProfiler()
|
||||
user_done_event = asyncio.Event()
|
||||
|
||||
# Create a local async function to simulate the keyboard input handling
|
||||
async def test_listen_for_quit_command():
|
||||
if sys.platform == "win32":
|
||||
while True:
|
||||
try:
|
||||
if mock_kbhit():
|
||||
raw = mock_getch()
|
||||
try:
|
||||
key = raw.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
continue
|
||||
|
||||
if len(key) != 1 or not key.isprintable():
|
||||
continue
|
||||
|
||||
if key.lower() == "q":
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
except Exception as e:
|
||||
continue
|
||||
|
||||
# Run the listener
|
||||
listener_task = asyncio.create_task(test_listen_for_quit_command())
|
||||
|
||||
# Wait for the event to be set
|
||||
try:
|
||||
await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
|
||||
assert user_done_event.is_set()
|
||||
finally:
|
||||
if not listener_task.done():
|
||||
listener_task.cancel()
|
||||
try:
|
||||
await listener_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
42
tests/test_arun_many.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Test example for multiple crawler configs feature
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
|
||||
|
||||
async def test_run_many():
|
||||
default_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
# scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
test_urls = [
|
||||
# "https://blog.python.org/", # Blog URL
|
||||
"https://www.python.org/", # Generic HTTPS page
|
||||
"https://www.kidocode.com/", # Generic HTTPS page
|
||||
"https://www.example.com/", # Generic HTTPS page
|
||||
# "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Single config - traditional usage still works
|
||||
print("Test 1: Single config (backwards compatible)")
|
||||
result = await crawler.arun_many(
|
||||
urls=test_urls[:2],
|
||||
config=default_config
|
||||
)
|
||||
print(f"Crawled {len(result)} URLs with single config\n")
|
||||
for item in result:
|
||||
print(f" {item.url} -> {item.status_code}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_run_many())
|
||||
131
tests/test_config_matching_only.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Test only the config matching logic without running crawler
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
|
||||
def test_all_matching_scenarios():
|
||||
print("Testing CrawlerRunConfig.is_match() method")
|
||||
print("=" * 50)
|
||||
|
||||
# Test 1: Single string pattern
|
||||
print("\n1. Single string pattern (glob style)")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
# For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/file.pdf", True),
|
||||
("https://example.com/doc.PDF", False), # Case sensitive
|
||||
("https://example.com/file.txt", False),
|
||||
("file.pdf", True),
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 2: List of patterns with OR
|
||||
print("\n2. List of patterns with OR (default)")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=["*/article/*", "*/blog/*", "*.html"],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/article/news", True),
|
||||
("https://example.com/blog/post", True),
|
||||
("https://example.com/page.html", True),
|
||||
("https://example.com/page.php", False),
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 3: Custom function
|
||||
print("\n3. Custom function matcher")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
|
||||
)
|
||||
test_urls = [
|
||||
("https://api.example.com/data.json", True),
|
||||
("https://api.example.com/data.xml", True),
|
||||
("https://api.example.com/data.html", False),
|
||||
("https://example.com/data.json", False), # No 'api'
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 4: Mixed list with AND
|
||||
print("\n4. Mixed patterns and functions with AND")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: '.com' in url, # Must have .com
|
||||
lambda url: len(url) < 50 # Must be short
|
||||
],
|
||||
match_mode=MatchMode.AND
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/page", True),
|
||||
("http://example.com/page", False), # Not HTTPS
|
||||
("https://example.org/page", False), # No .com
|
||||
("https://example.com/" + "x" * 50, False), # Too long
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 5: Complex real-world scenario
|
||||
print("\n5. Complex pattern combinations")
|
||||
config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"*/api/v[0-9]/*", # API versioned endpoints
|
||||
lambda url: 'graphql' in url, # GraphQL endpoints
|
||||
"*.json" # JSON files
|
||||
],
|
||||
match_mode=MatchMode.OR
|
||||
)
|
||||
test_urls = [
|
||||
("https://example.com/api/v1/users", True),
|
||||
("https://example.com/api/v2/posts", True),
|
||||
("https://example.com/graphql", True),
|
||||
("https://example.com/data.json", True),
|
||||
("https://example.com/api/users", False), # No version
|
||||
]
|
||||
for url, expected in test_urls:
|
||||
result = config.is_match(url)
|
||||
status = "✓" if result == expected else "✗"
|
||||
print(f" {status} {url} -> {result}")
|
||||
|
||||
# Test 6: Edge cases
|
||||
print("\n6. Edge cases")
|
||||
|
||||
# No matcher
|
||||
config = CrawlerRunConfig()
|
||||
result = config.is_match("https://example.com")
|
||||
print(f" {'✓' if not result else '✗'} No matcher -> {result}")
|
||||
|
||||
# Empty list
|
||||
config = CrawlerRunConfig(url_matcher=[])
|
||||
result = config.is_match("https://example.com")
|
||||
print(f" {'✓' if not result else '✗'} Empty list -> {result}")
|
||||
|
||||
# None in list (should be skipped)
|
||||
config = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"])
|
||||
result = config.is_match("test.pdf")
|
||||
print(f" {'✓' if result else '✗'} List with None -> {result}")
|
||||
|
||||
print("\n" + "=" * 50)
|
||||
print("All matching tests completed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_all_matching_scenarios()
|
||||
87
tests/test_config_selection.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Test config selection logic in dispatchers
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
|
||||
|
||||
class TestDispatcher(BaseDispatcher):
|
||||
"""Simple test dispatcher to verify config selection"""
|
||||
|
||||
async def crawl_url(self, url, config, task_id, **kwargs):
|
||||
# Just return which config was selected
|
||||
selected = self.select_config(url, config)
|
||||
return {"url": url, "config_id": id(selected)}
|
||||
|
||||
async def run_urls(self, urls, crawler, config):
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await self.crawl_url(url, config, "test")
|
||||
results.append(result)
|
||||
return results
|
||||
|
||||
async def test_dispatcher_config_selection():
|
||||
print("Testing dispatcher config selection")
|
||||
print("=" * 50)
|
||||
|
||||
# Create test configs with different matchers
|
||||
pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
|
||||
default_config = CrawlerRunConfig() # No matcher
|
||||
|
||||
configs = [pdf_config, api_config, default_config]
|
||||
|
||||
# Create test dispatcher
|
||||
dispatcher = TestDispatcher()
|
||||
|
||||
# Test single config
|
||||
print("\nTest 1: Single config")
|
||||
result = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1")
|
||||
assert result["config_id"] == id(pdf_config)
|
||||
print("✓ Single config works")
|
||||
|
||||
# Test config list selection
|
||||
print("\nTest 2: Config list selection")
|
||||
test_cases = [
|
||||
("https://example.com/file.pdf", id(pdf_config)),
|
||||
("https://api.example.com/data", id(api_config)),
|
||||
("https://example.com/page", id(configs[0])), # No match, uses first
|
||||
]
|
||||
|
||||
for url, expected_id in test_cases:
|
||||
result = await dispatcher.crawl_url(url, configs, "test")
|
||||
assert result["config_id"] == expected_id, f"URL {url} got wrong config"
|
||||
print(f"✓ {url} -> correct config selected")
|
||||
|
||||
# Test with MemoryAdaptiveDispatcher
|
||||
print("\nTest 3: MemoryAdaptiveDispatcher config selection")
|
||||
mem_dispatcher = MemoryAdaptiveDispatcher()
|
||||
|
||||
# Test select_config method directly
|
||||
selected = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
|
||||
assert selected == pdf_config
|
||||
print("✓ MemoryAdaptiveDispatcher.select_config works")
|
||||
|
||||
# Test empty config list
|
||||
print("\nTest 4: Edge cases")
|
||||
selected = mem_dispatcher.select_config("https://example.com", [])
|
||||
assert isinstance(selected, CrawlerRunConfig) # Should return default
|
||||
print("✓ Empty config list returns default config")
|
||||
|
||||
# Test None config
|
||||
selected = mem_dispatcher.select_config("https://example.com", None)
|
||||
assert isinstance(selected, CrawlerRunConfig) # Should return default
|
||||
print("✓ None config returns default config")
|
||||
|
||||
print("\n" + "=" * 50)
|
||||
print("All dispatcher tests passed! ✓")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_dispatcher_config_selection())
|
||||
122
tests/test_docker_api_with_llm_provider.py
Normal file
@@ -0,0 +1,122 @@
#!/usr/bin/env python3
"""Test script to verify Docker API with LLM provider configuration."""

import requests
import json
import time

BASE_URL = "http://localhost:11235"

def test_health():
    """Test health endpoint."""
    print("1. Testing health endpoint...")
    response = requests.get(f"{BASE_URL}/health")
    print(f" Status: {response.status_code}")
    print(f" Response: {response.json()}")
    print()

def test_schema():
    """Test schema endpoint to see configuration."""
    print("2. Testing schema endpoint...")
    response = requests.get(f"{BASE_URL}/schema")
    print(f" Status: {response.status_code}")
    # Print only browser config to keep output concise
    print(f" Browser config keys: {list(response.json().get('browser', {}).keys())[:5]}...")
    print()

def test_markdown_with_llm_filter():
    """Test markdown endpoint with LLM filter (should use configured provider)."""
    print("3. Testing markdown endpoint with LLM filter...")
    print(" This should use the Groq provider from LLM_PROVIDER env var")

    # Note: This will fail with dummy API keys, but we can see if it tries to use Groq
    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content"
    }

    response = requests.post(f"{BASE_URL}/md", json=payload)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()

def test_markdown_with_provider_override():
    """Test markdown endpoint with provider override in request."""
    print("4. Testing markdown endpoint with provider override...")
    print(" This should use OpenAI provider from request parameter")

    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content",
        "provider": "openai/gpt-4"  # Override to use OpenAI
    }

    response = requests.post(f"{BASE_URL}/md", json=payload)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()

def test_simple_crawl():
    """Test simple crawl without LLM."""
    print("5. Testing simple crawl (no LLM required)...")

    payload = {
        "urls": ["https://httpbin.org/html"],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"}
        }
    }

    response = requests.post(f"{BASE_URL}/crawl", json=payload)
    print(f" Status: {response.status_code}")

    if response.status_code == 200:
        result = response.json()
        print(f" Success: {result.get('success')}")
        print(f" Results count: {len(result.get('results', []))}")
        if result.get('results'):
            print(f" First result success: {result['results'][0].get('success')}")
    else:
        print(f" Error: {response.text[:200]}...")
    print()

def test_playground():
    """Test if playground is accessible."""
    print("6. Testing playground interface...")
    response = requests.get(f"{BASE_URL}/playground")
    print(f" Status: {response.status_code}")
    print(f" Content-Type: {response.headers.get('content-type')}")
    print()

if __name__ == "__main__":
    print("=== Crawl4AI Docker API Tests ===\n")
    print(f"Testing API at {BASE_URL}\n")

    # Wait a bit for server to be fully ready
    time.sleep(2)

    test_health()
    test_schema()
    test_simple_crawl()
    test_playground()

    print("\nTesting LLM functionality (these may fail with dummy API keys):\n")
    test_markdown_with_llm_filter()
    test_markdown_with_provider_override()

    print("\nTests completed!")
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
 
 from crawl4ai.models import Link
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig
 import asyncio
 import sys
 import os
@@ -237,7 +237,7 @@ def test_config_examples():
         print(f" {key}: {value}")
 
     print(" Usage:")
-    print(" from crawl4ai.async_configs import LinkPreviewConfig")
+    print(" from crawl4ai import LinkPreviewConfig")
     print(" config = CrawlerRunConfig(")
    print("     link_preview_config=LinkPreviewConfig(")
    for key, value in config_dict.items():
71
tests/test_memory_macos.py
Executable file
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""Test script to verify macOS memory calculation accuracy."""

import psutil
import platform
import time
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb


def test_memory_calculation():
    """Test and compare memory calculations."""
    print(f"Platform: {platform.system()}")
    print(f"Python version: {platform.python_version()}")
    print("-" * 60)

    # Get psutil's view
    vm = psutil.virtual_memory()
    psutil_percent = vm.percent
    psutil_available_gb = vm.available / (1024**3)
    total_gb = vm.total / (1024**3)

    # Get our corrected view
    true_percent = get_true_memory_usage_percent()
    true_available_gb = get_true_available_memory_gb()
    true_percent_calc, available_calc, total_calc = get_memory_stats()

    print("Memory Statistics Comparison:")
    print(f"Total Memory: {total_gb:.2f} GB")
    print()

    print("PSUtil (Standard) Calculation:")
    print(f" - Memory Used: {psutil_percent:.1f}%")
    print(f" - Available: {psutil_available_gb:.2f} GB")
    print()

    print("Platform-Aware Calculation:")
    print(f" - Memory Used: {true_percent:.1f}%")
    print(f" - Available: {true_available_gb:.2f} GB")
    print(f" - Difference: {true_available_gb - psutil_available_gb:.2f} GB of reclaimable memory")
    print()

    # Show the impact on dispatcher behavior
    print("Impact on MemoryAdaptiveDispatcher:")
    thresholds = {
        "Normal": 90.0,
        "Critical": 95.0,
        "Recovery": 85.0
    }

    for name, threshold in thresholds.items():
        psutil_triggered = psutil_percent >= threshold
        true_triggered = true_percent >= threshold
        print(f" - {name} Threshold ({threshold}%):")
        print(f"     PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
        print(f"     Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
        if psutil_triggered != true_triggered:
            print(f"     → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
        print()

    # Monitor for a few seconds
    print("Monitoring memory for 10 seconds...")
    for i in range(10):
        vm = psutil.virtual_memory()
        true_pct = get_true_memory_usage_percent()
        print(f" {i+1}s - PSUtil: {vm.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
        time.sleep(1)
    print("\n")


if __name__ == "__main__":
    test_memory_calculation()
117
tests/test_multi_config.py
Normal file
@@ -0,0 +1,117 @@
"""
Test example for multiple crawler configs feature
"""
import asyncio
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode

async def test_multi_config():
    # Create different configs for different URL patterns

    # Config for PDF files
    pdf_config = CrawlerRunConfig(
        url_matcher="*.pdf",
    )

    # Config for articles (using multiple patterns with OR logic)
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # Config using custom matcher function
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Config combining patterns and functions with AND logic
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",  # Matches .doc, .docx
            lambda url: url.startswith('https://')  # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # Default config (no url_matcher means it won't match anything unless it's the fallback)
    default_config = CrawlerRunConfig(
        # cache_mode=CacheMode.BYPASS,
    )

    # List of configs - order matters! First match wins
    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config  # Fallback
    ]

    # Test URLs - using real URLs that exist
    test_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        "https://blog.python.org/",  # Blog URL
        "https://api.github.com/users/github",  # GitHub API (returns JSON)
        "https://httpbin.org/json",  # API endpoint that returns JSON
        "https://www.python.org/",  # Generic HTTPS page
        "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # → Default config
    ]

    # Test the matching logic
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        for i, config in enumerate(configs):
            if config.is_match(url):
                print(f"{url} -> Config {i} matches")
                break
        else:
            print(f"{url} -> No match, will use fallback (first config)")

    print("\n" + "=" * 50 + "\n")

    # Now test with actual crawler
    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        result = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config
        )
        print(f"Crawled {len(result)} URLs with single config\n")

        # Multiple configs - new feature
        print("Test 2: Multiple configs")
        # Just test with 2 URLs to avoid timeout
        results = await crawler.arun_many(
            urls=test_urls[:2],  # Just test first 2 URLs
            config=configs  # Pass list of configs
        )
        print(f"Crawled {len(results)} URLs with multiple configs")

        # Using custom matcher inline
        print("\nTest 3: Inline custom matcher")
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
            verbose=False
        )
        results = await crawler.arun_many(
            urls=[
                "https://docs.python.org/3/library/asyncio.html",  # Long URL with 'python'
                "https://python.org/",  # Short URL with 'python' - won't match
                "https://www.google.com/"  # No 'python' - won't match
            ],
            config=[custom_config, default_config]
        )
        print(f"Crawled {len(results)} URLs with custom matcher")

if __name__ == "__main__":
    asyncio.run(test_multi_config())