Compare commits
18 Commits
v0.7.2
...
fix/exit_w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0541b61405 | ||
|
|
6735c68288 | ||
|
|
ff6ea41ac3 | ||
|
|
31a435fb0e | ||
|
|
5de6a28055 | ||
|
|
de1561ad14 | ||
|
|
337b588732 | ||
|
|
7a6ad547f0 | ||
|
|
e6692b987d | ||
|
|
307fe28b32 | ||
|
|
438a103b17 | ||
|
|
a03e68fa2f | ||
|
|
864d87afb2 | ||
|
|
508b6fc233 | ||
|
|
e3281935bc | ||
|
|
c4d625fb3c | ||
|
|
ef722766f0 | ||
|
|
4bcb7171a3 |
9
.github/workflows/release.yml
vendored
9
.github/workflows/release.yml
vendored
@@ -8,6 +8,8 @@ on:
|
||||
jobs:
|
||||
release:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: write # Required for creating releases
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -95,12 +97,10 @@ jobs:
|
||||
platforms: linux/amd64,linux/arm64
|
||||
|
||||
- name: Create GitHub Release
|
||||
uses: actions/create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: softprops/action-gh-release@v2
|
||||
with:
|
||||
tag_name: v${{ steps.get_version.outputs.VERSION }}
|
||||
release_name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
name: Release v${{ steps.get_version.outputs.VERSION }}
|
||||
body: |
|
||||
## 🎉 Crawl4AI v${{ steps.get_version.outputs.VERSION }} Released!
|
||||
|
||||
@@ -121,6 +121,7 @@ jobs:
|
||||
See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md) for details.
|
||||
draft: false
|
||||
prerelease: false
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Summary
|
||||
run: |
|
||||
|
||||
15
CHANGELOG.md
15
CHANGELOG.md
@@ -21,6 +21,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- **Flexible LLM Provider Configuration** (Docker):
|
||||
- Support for `LLM_PROVIDER` environment variable to override default provider
|
||||
- Per-request provider override via optional `provider` parameter in API endpoints
|
||||
- Automatic provider validation with clear error messages
|
||||
- Updated Docker documentation and examples
|
||||
|
||||
### Changed
|
||||
- **WebScrapingStrategy Refactoring**: Simplified content scraping architecture
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy` for backward compatibility
|
||||
- Removed redundant BeautifulSoup-based implementation (~1000 lines of code)
|
||||
- `LXMLWebScrapingStrategy` now inherits directly from `ContentScrapingStrategy`
|
||||
- All existing code using `WebScrapingStrategy` continues to work without modification
|
||||
- Default scraping strategy remains `LXMLWebScrapingStrategy` for optimal performance
|
||||
|
||||
### Added
|
||||
- **AsyncUrlSeeder**: High-performance URL discovery system for intelligent crawling at scale
|
||||
- Discover URLs from sitemaps and Common Crawl index
|
||||
|
||||
@@ -3,12 +3,12 @@ import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig, MatchMode
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
WebScrapingStrategy,
|
||||
LXMLWebScrapingStrategy,
|
||||
WebScrapingStrategy, # Backward compatibility alias
|
||||
)
|
||||
from .async_logger import (
|
||||
AsyncLoggerBase,
|
||||
@@ -132,6 +132,7 @@ __all__ = [
|
||||
"CrawlResult",
|
||||
"CrawlerHub",
|
||||
"CacheMode",
|
||||
"MatchMode",
|
||||
"ContentScrapingStrategy",
|
||||
"WebScrapingStrategy",
|
||||
"LXMLWebScrapingStrategy",
|
||||
|
||||
@@ -18,17 +18,24 @@ from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
from typing import Union, List
|
||||
from typing import Union, List, Callable
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
# Type alias for URL matching
|
||||
UrlMatcher = Union[str, Callable[[str], bool], List[Union[str, Callable[[str], bool]]]]
|
||||
|
||||
class MatchMode(Enum):
    """How the verdicts of several URL matchers in a list are combined."""

    # The URL matches when at least one matcher in the list accepts it.
    OR = "or"
    # The URL matches only when every matcher in the list accepts it.
    AND = "and"
|
||||
|
||||
# from .proxy_strategy import ProxyConfig
|
||||
|
||||
|
||||
@@ -862,7 +869,7 @@ class CrawlerRunConfig():
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
Default: LXMLWebScrapingStrategy.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
@@ -1113,6 +1120,9 @@ class CrawlerRunConfig():
|
||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||
# Virtual Scroll Parameters
|
||||
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
||||
# URL Matching Parameters
|
||||
url_matcher: Optional[UrlMatcher] = None,
|
||||
match_mode: MatchMode = MatchMode.OR,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -1266,6 +1276,10 @@ class CrawlerRunConfig():
|
||||
else:
|
||||
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
||||
|
||||
# URL Matching Parameters
|
||||
self.url_matcher = url_matcher
|
||||
self.match_mode = match_mode
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
@@ -1321,6 +1335,51 @@ class CrawlerRunConfig():
|
||||
if "compilation error" not in str(e).lower():
|
||||
raise ValueError(f"Failed to compile C4A script: {str(e)}")
|
||||
raise
|
||||
|
||||
def is_match(self, url: str) -> bool:
    """Check whether this config applies to the given URL.

    ``self.url_matcher`` may be ``None`` (matches everything), a callable
    taking the URL, a glob pattern string, or a list mixing callables and
    pattern strings. List entries of any other type are ignored.

    Patterns are compared with ``fnmatchcase`` so that matching is
    case-sensitive and identical on every operating system (plain
    ``fnmatch`` normalises case and slashes on Windows, which is wrong
    for URLs).

    Args:
        url: The URL to check against this config's matcher.

    Returns:
        bool: True if this config should be used for the URL or if no
        matcher is set. An empty list, a list containing no usable
        matcher, or a matcher of an unsupported type yields False.
    """
    from fnmatch import fnmatchcase

    def _evaluate(candidate):
        """Return one matcher's verdict as a bool, or None if unusable."""
        if callable(candidate):
            # Coerce so truthy non-bool results (e.g. re.Match) honour
            # the declared ``-> bool`` contract.
            return bool(candidate(url))
        if isinstance(candidate, str):
            return fnmatchcase(url, candidate)
        return None

    matcher = self.url_matcher
    if matcher is None:
        return True

    # Single callable or single pattern string.
    verdict = _evaluate(matcher)
    if verdict is not None:
        return verdict

    if isinstance(matcher, list):
        # Every usable matcher is evaluated; unusable entries are skipped.
        verdicts = [v for v in map(_evaluate, matcher) if v is not None]
        if not verdicts:
            # Empty list, or nothing usable in it: never a match
            # (guards against all([]) being True in AND mode).
            return False
        combine = any if self.match_mode == MatchMode.OR else all
        return combine(verdicts)

    return False
|
||||
|
||||
|
||||
def __getattr__(self, name):
|
||||
@@ -1443,6 +1502,9 @@ class CrawlerRunConfig():
|
||||
# Link Extraction Parameters
|
||||
link_preview_config=kwargs.get("link_preview_config"),
|
||||
url=kwargs.get("url"),
|
||||
# URL Matching Parameters
|
||||
url_matcher=kwargs.get("url_matcher"),
|
||||
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
)
|
||||
@@ -1540,6 +1602,8 @@ class CrawlerRunConfig():
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"link_preview_config": self.link_preview_config.to_dict() if self.link_preview_config else None,
|
||||
"url": self.url,
|
||||
"url_matcher": self.url_matcher,
|
||||
"match_mode": self.match_mode,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Dict, Optional, List, Tuple
|
||||
from typing import Dict, Optional, List, Tuple, Union
|
||||
from .async_configs import CrawlerRunConfig
|
||||
from .models import (
|
||||
CrawlResult,
|
||||
@@ -22,6 +22,8 @@ from urllib.parse import urlparse
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .memory_utils import get_true_memory_usage_percent
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
def __init__(
|
||||
@@ -96,11 +98,37 @@ class BaseDispatcher(ABC):
|
||||
self.rate_limiter = rate_limiter
|
||||
self.monitor = monitor
|
||||
|
||||
def select_config(self, url: str, configs: Union[CrawlerRunConfig, List[CrawlerRunConfig]]) -> Optional[CrawlerRunConfig]:
    """Pick the config that should be used to crawl ``url``.

    Args:
        url: The URL to match against.
        configs: Either one config, or a list of configs to choose from.

    Returns:
        The single config when one was passed directly; otherwise the
        first list entry whose ``is_match(url)`` is truthy. ``None`` when
        the list is empty or nothing matches, which callers treat as
        "skip this URL".
    """
    # A lone config is used unconditionally; its matcher is not consulted.
    if isinstance(configs, CrawlerRunConfig):
        return configs

    # ``or ()`` covers the empty/falsy case; the ``None`` default covers
    # the case where no candidate accepts the URL.
    candidates = configs or ()
    return next(
        (candidate for candidate in candidates if candidate.is_match(url)),
        None,
    )
|
||||
|
||||
@abstractmethod
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -111,7 +139,7 @@ class BaseDispatcher(ABC):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
pass
|
||||
@@ -147,7 +175,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def _memory_monitor_task(self):
|
||||
"""Background task to continuously monitor memory usage and update state"""
|
||||
while True:
|
||||
self.current_memory_percent = psutil.virtual_memory().percent
|
||||
self.current_memory_percent = get_true_memory_usage_percent()
|
||||
|
||||
# Enter memory pressure mode if we cross the threshold
|
||||
if self.current_memory_percent >= self.memory_threshold_percent:
|
||||
@@ -200,7 +228,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
retry_count: int = 0,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -208,6 +236,37 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message,
|
||||
retry_count=retry_count
|
||||
)
|
||||
|
||||
# Get starting memory for accurate measurement
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -257,8 +316,8 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
retry_count=retry_count + 1
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
# Execute the crawl with selected config
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
|
||||
# Measure memory usage
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
@@ -316,7 +375,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -470,7 +529,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
||||
self.crawler = crawler
|
||||
|
||||
@@ -572,7 +631,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
task_id: str,
|
||||
semaphore: asyncio.Semaphore = None,
|
||||
) -> CrawlerTaskResult:
|
||||
@@ -580,6 +639,36 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
# Select appropriate config for this URL
|
||||
selected_config = self.select_config(url, config)
|
||||
|
||||
# If no config matches, return failed result
|
||||
if selected_config is None:
|
||||
error_message = f"No matching configuration found for URL: {url}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url,
|
||||
html="",
|
||||
metadata={"status": "no_config_match"},
|
||||
success=False,
|
||||
error_message=error_message
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message
|
||||
)
|
||||
|
||||
try:
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
@@ -592,7 +681,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
async with semaphore:
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
result = await self.crawler.arun(url, config=selected_config, session_id=task_id)
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
memory_usage = peak_memory = end_memory - start_memory
|
||||
@@ -654,7 +743,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
self,
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
urls: List[str],
|
||||
config: CrawlerRunConfig,
|
||||
config: Union[CrawlerRunConfig, List[CrawlerRunConfig]],
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
if self.monitor:
|
||||
|
||||
@@ -829,7 +829,7 @@ class AsyncUrlSeeder:
|
||||
|
||||
async def _iter_sitemap(self, url: str):
|
||||
try:
|
||||
r = await self.client.get(url, timeout=15)
|
||||
r = await self.client.get(url, timeout=15, follow_redirects=True)
|
||||
r.raise_for_status()
|
||||
except httpx.HTTPStatusError as e:
|
||||
self._log("warning", "Failed to fetch sitemap {url}: HTTP {status_code}",
|
||||
|
||||
@@ -653,7 +653,7 @@ class AsyncWebCrawler:
|
||||
async def arun_many(
|
||||
self,
|
||||
urls: List[str],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
@@ -674,7 +674,9 @@ class AsyncWebCrawler:
|
||||
|
||||
Args:
|
||||
urls: List of URLs to crawl
|
||||
config: Configuration object controlling crawl behavior for all URLs
|
||||
config: Configuration object(s) controlling crawl behavior. Can be:
|
||||
- Single CrawlerRunConfig: Used for all URLs
|
||||
- List[CrawlerRunConfig]: Configs with url_matcher for URL-specific settings
|
||||
dispatcher: The dispatcher strategy instance to use. Defaults to MemoryAdaptiveDispatcher
|
||||
[other parameters maintained for backwards compatibility]
|
||||
|
||||
@@ -739,7 +741,11 @@ class AsyncWebCrawler:
|
||||
or task_result.result
|
||||
)
|
||||
|
||||
stream = config.stream
|
||||
# Handle stream setting - use first config's stream setting if config is a list
|
||||
if isinstance(config, list):
|
||||
stream = config[0].stream if config else False
|
||||
else:
|
||||
stream = config.stream
|
||||
|
||||
if stream:
|
||||
|
||||
|
||||
@@ -65,6 +65,213 @@ class BrowserProfiler:
|
||||
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
|
||||
os.makedirs(self.builtin_browser_dir, exist_ok=True)
|
||||
|
||||
def _is_windows(self) -> bool:
|
||||
"""Check if running on Windows platform."""
|
||||
return sys.platform.startswith('win') or sys.platform == 'cygwin'
|
||||
|
||||
def _is_macos(self) -> bool:
|
||||
"""Check if running on macOS platform."""
|
||||
return sys.platform == 'darwin'
|
||||
|
||||
def _is_linux(self) -> bool:
|
||||
"""Check if running on Linux platform."""
|
||||
return sys.platform.startswith('linux')
|
||||
|
||||
def _get_quit_message(self, tag: str) -> str:
|
||||
"""Get appropriate quit message based on context."""
|
||||
if tag == "PROFILE":
|
||||
return "Closing browser and saving profile..."
|
||||
elif tag == "CDP":
|
||||
return "Closing browser..."
|
||||
else:
|
||||
return "Closing browser..."
|
||||
|
||||
async def _listen_windows(self, user_done_event, check_browser_process, tag: str):
|
||||
"""Windows-specific keyboard listener using msvcrt."""
|
||||
try:
|
||||
import msvcrt
|
||||
except ImportError:
|
||||
raise ImportError("msvcrt module not available on this platform")
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Check for keyboard input
|
||||
if msvcrt.kbhit():
|
||||
raw = msvcrt.getch()
|
||||
|
||||
# Handle Unicode decoding more robustly
|
||||
key = None
|
||||
try:
|
||||
key = raw.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
# Try different encodings
|
||||
key = raw.decode("latin1")
|
||||
except UnicodeDecodeError:
|
||||
# Skip if we can't decode
|
||||
continue
|
||||
|
||||
# Validate key
|
||||
if not key or len(key) != 1:
|
||||
continue
|
||||
|
||||
# Check for printable characters only
|
||||
if not key.isprintable():
|
||||
continue
|
||||
|
||||
# Check for quit command
|
||||
if key.lower() == "q":
|
||||
self.logger.info(
|
||||
self._get_quit_message(tag),
|
||||
tag=tag,
|
||||
base_color=LogColor.GREEN
|
||||
)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if browser process ended
|
||||
if await check_browser_process():
|
||||
return
|
||||
|
||||
# Small delay to prevent busy waiting
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error in Windows keyboard listener: {e}", tag=tag)
|
||||
# Continue trying instead of failing completely
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
|
||||
async def _listen_unix(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
|
||||
"""Unix/Linux/macOS keyboard listener using termios and select."""
|
||||
try:
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
except ImportError:
|
||||
raise ImportError("termios/tty/select modules not available on this platform")
|
||||
|
||||
# Get stdin file descriptor
|
||||
try:
|
||||
fd = sys.stdin.fileno()
|
||||
except (AttributeError, OSError):
|
||||
raise ImportError("stdin is not a terminal")
|
||||
|
||||
# Save original terminal settings
|
||||
old_settings = None
|
||||
try:
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
except termios.error as e:
|
||||
raise ImportError(f"Cannot get terminal attributes: {e}")
|
||||
|
||||
try:
|
||||
# Switch to non-canonical mode (cbreak mode)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Use select to check if input is available (non-blocking)
|
||||
# Timeout of 0.5 seconds to periodically check browser process
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
|
||||
if readable:
|
||||
# Read one character
|
||||
key = sys.stdin.read(1)
|
||||
|
||||
if key and key.lower() == "q":
|
||||
self.logger.info(
|
||||
self._get_quit_message(tag),
|
||||
tag=tag,
|
||||
base_color=LogColor.GREEN
|
||||
)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if browser process ended
|
||||
if await check_browser_process():
|
||||
return
|
||||
|
||||
# Small delay to prevent busy waiting
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
except (KeyboardInterrupt, EOFError):
|
||||
# Handle Ctrl+C or EOF gracefully
|
||||
self.logger.info("Keyboard interrupt received", tag=tag)
|
||||
user_done_event.set()
|
||||
return
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Error in Unix keyboard listener: {e}", tag=tag)
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
|
||||
finally:
|
||||
# Always restore terminal settings
|
||||
if old_settings is not None:
|
||||
try:
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to restore terminal settings: {e}", tag=tag)
|
||||
|
||||
async def _listen_fallback(self, user_done_event: asyncio.Event, check_browser_process, tag: str):
    """Line-based quit listener used when raw key handling is unavailable.

    A daemon thread repeatedly calls ``input()`` and forwards each
    stripped, lower-cased line through a queue. This coroutine drains the
    queue once per cycle and finishes when it sees 'q', when the browser
    process is reported as finished, or when the event is already set.

    Args:
        user_done_event: Event set when the user types 'q' or when the
            listener itself fails.
        check_browser_process: Awaitable callable returning True once the
            browser has exited; a True result ends the listener.
        tag: Logging tag, also used to pick the quit message.
    """
    self.logger.info("Using fallback input mode. Type 'q' and press Enter to quit.", tag=tag)

    # input() blocks, so it lives in its own thread and reports via a queue.
    import threading
    import queue

    typed_lines = queue.Queue()

    def read_lines():
        """Forward typed lines to the queue until 'q', EOF or an error."""
        try:
            while not user_done_event.is_set():
                try:
                    line = input("Press 'q' + Enter to quit: ").strip().lower()
                except (EOFError, KeyboardInterrupt):
                    # A closed or interrupted stdin counts as a quit request.
                    typed_lines.put('q')
                    return
                except Exception as e:
                    self.logger.warning(f"Error in input thread: {e}", tag=tag)
                    return
                typed_lines.put(line)
                if line == 'q':
                    return
        except Exception as e:
            self.logger.error(f"Input thread failed: {e}", tag=tag)

    # Daemon thread: a reader stuck in input() must not keep the process alive.
    reader = threading.Thread(target=read_lines, daemon=True)
    reader.start()

    try:
        while not user_done_event.is_set():
            # Take at most one queued line per cycle.
            try:
                latest = typed_lines.get_nowait()
            except queue.Empty:
                latest = None

            if latest == 'q':
                self.logger.info(
                    self._get_quit_message(tag),
                    tag=tag,
                    base_color=LogColor.GREEN
                )
                user_done_event.set()
                return

            # Stop as soon as the browser is gone.
            if await check_browser_process():
                return

            # Pause between polling cycles.
            await asyncio.sleep(0.5)

    except Exception as e:
        self.logger.error(f"Fallback listener failed: {e}", tag=tag)
        user_done_event.set()
|
||||
|
||||
async def create_profile(self,
|
||||
profile_name: Optional[str] = None,
|
||||
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
||||
@@ -180,42 +387,38 @@ class BrowserProfiler:
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||
# First output the prompt
|
||||
self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
self.logger.info(
|
||||
"Press {segment} when you've finished using the browser...",
|
||||
tag="PROFILE",
|
||||
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||
base_color=LogColor.CYAN
|
||||
)
|
||||
|
||||
async def check_browser_process():
|
||||
"""Check if browser process is still running."""
|
||||
if (
|
||||
managed_browser.browser_process
|
||||
and managed_browser.browser_process.poll() is not None
|
||||
):
|
||||
self.logger.info(
|
||||
"Browser already closed. Ending input listener.", tag="PROFILE"
|
||||
)
|
||||
user_done_event.set()
|
||||
return True
|
||||
return False
|
||||
|
||||
# Try platform-specific implementations with fallback
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN)
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
if self._is_windows():
|
||||
await self._listen_windows(user_done_event, check_browser_process, "PROFILE")
|
||||
else:
|
||||
await self._listen_unix(user_done_event, check_browser_process, "PROFILE")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="PROFILE")
|
||||
self.logger.info("Falling back to simple input mode...", tag="PROFILE")
|
||||
await self._listen_fallback(user_done_event, check_browser_process, "PROFILE")
|
||||
|
||||
try:
|
||||
from playwright.async_api import async_playwright
|
||||
@@ -682,42 +885,33 @@ class BrowserProfiler:
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
"""Cross-platform keyboard listener that waits for 'q' key press."""
|
||||
# First output the prompt
|
||||
self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
self.logger.info(
|
||||
"Press {segment} to stop the browser and exit...",
|
||||
tag="CDP",
|
||||
params={"segment": "'q'"}, colors={"segment": LogColor.YELLOW},
|
||||
base_color=LogColor.CYAN
|
||||
)
|
||||
|
||||
async def check_browser_process():
|
||||
"""Check if browser process is still running."""
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
||||
user_done_event.set()
|
||||
return True
|
||||
return False
|
||||
|
||||
# Try platform-specific implementations with fallback
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info("Closing browser...", tag="CDP")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
if self._is_windows():
|
||||
await self._listen_windows(user_done_event, check_browser_process, "CDP")
|
||||
else:
|
||||
await self._listen_unix(user_done_event, check_browser_process, "CDP")
|
||||
except Exception as e:
|
||||
self.logger.warning(f"Platform-specific keyboard listener failed: {e}", tag="CDP")
|
||||
self.logger.info("Falling back to simple input mode...", tag="CDP")
|
||||
await self._listen_fallback(user_done_event, check_browser_process, "CDP")
|
||||
|
||||
# Function to retrieve and display CDP JSON config
|
||||
async def get_cdp_json(port):
|
||||
|
||||
@@ -98,20 +98,20 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
LXML-based implementation for fast web content scraping.
|
||||
|
||||
This is the primary scraping strategy in Crawl4AI, providing high-performance
|
||||
HTML parsing and content extraction using the lxml library.
|
||||
|
||||
Note: WebScrapingStrategy is now an alias for this class to maintain
|
||||
backward compatibility.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _log(self, level, message, tag="SCRAPE", **kwargs):
|
||||
"""Helper method to safely use logger."""
|
||||
@@ -132,7 +132,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
actual_url = kwargs.get("redirected_url", url)
|
||||
raw_result = self._scrap(actual_url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(actual_url, html, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
@@ -196,376 +196,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
|
||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
||||
"""
|
||||
Determine if a table element is a data table (not a layout table).
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
**kwargs: Additional keyword arguments including table_score_threshold
|
||||
|
||||
Returns:
|
||||
bool: True if the table is a data table, False otherwise
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.select('thead')) > 0
|
||||
has_tbody = len(table.select('tbody')) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.select('th'))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.select('table')) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get('role', '').lower()
|
||||
if role in {'presentation', 'none'}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.select('tr')
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
col_counts = [len(row.select('td, th')) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.select('caption'):
|
||||
score += 2
|
||||
if table.has_attr('summary') and table['summary']:
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get('table_score_threshold', 7)
|
||||
return score >= threshold
|
||||
|
||||
def extract_table_data(self, table: Tag) -> dict:
|
||||
"""
|
||||
Extract structured data from a table element.
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
|
||||
Returns:
|
||||
dict: Dictionary containing table data (headers, rows, caption, summary)
|
||||
"""
|
||||
caption_elem = table.select_one('caption')
|
||||
caption = caption_elem.get_text().strip() if caption_elem else ""
|
||||
summary = table.get('summary', '').strip()
|
||||
|
||||
# Extract headers with colspan handling
|
||||
headers = []
|
||||
thead_rows = table.select('thead tr')
|
||||
if thead_rows:
|
||||
header_cells = thead_rows[0].select('th')
|
||||
for cell in header_cells:
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
else:
|
||||
first_row = table.select('tr:first-child')
|
||||
if first_row:
|
||||
for cell in first_row[0].select('th, td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
|
||||
# Extract rows with colspan handling
|
||||
rows = []
|
||||
all_rows = table.select('tr')
|
||||
thead = table.select_one('thead')
|
||||
tbody_rows = []
|
||||
|
||||
if thead:
|
||||
thead_rows = thead.select('tr')
|
||||
tbody_rows = [row for row in all_rows if row not in thead_rows]
|
||||
else:
|
||||
if all_rows and all_rows[0].select('th'):
|
||||
tbody_rows = all_rows[1:]
|
||||
else:
|
||||
tbody_rows = all_rows
|
||||
|
||||
for row in tbody_rows:
|
||||
# for row in table.select('tr:not(:has(ancestor::thead))'):
|
||||
row_data = []
|
||||
for cell in row.select('td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
row_data.extend([text] * colspan)
|
||||
if row_data:
|
||||
rows.append(row_data)
|
||||
|
||||
# Align rows with headers
|
||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
||||
aligned_rows = []
|
||||
for row in rows:
|
||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
||||
aligned_rows.append(aligned)
|
||||
|
||||
if not headers:
|
||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
||||
|
||||
return {
|
||||
"headers": headers,
|
||||
"rows": aligned_rows,
|
||||
"caption": caption,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in a HTML tree.
|
||||
|
||||
Args:
|
||||
node (Tag): The root node of the HTML tree.
|
||||
|
||||
Returns:
|
||||
Tag: The flattened HTML tree.
|
||||
"""
|
||||
if isinstance(node, NavigableString):
|
||||
return node
|
||||
if (
|
||||
len(node.contents) == 1
|
||||
and isinstance(node.contents[0], Tag)
|
||||
and node.contents[0].name == node.name
|
||||
):
|
||||
return self.flatten_nested_elements(node.contents[0])
|
||||
node.contents = [self.flatten_nested_elements(child) for child in node.contents]
|
||||
return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
||||
"""
|
||||
Find the closest parent with useful text.
|
||||
|
||||
Args:
|
||||
tag (Tag): The starting tag to search from.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Tag: The closest parent with useful text, or None if not found.
|
||||
"""
|
||||
image_description_min_word_threshold = kwargs.get(
|
||||
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
|
||||
)
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
current_tag = current_tag.parent
|
||||
# Get the text content of the parent tag
|
||||
if current_tag:
|
||||
text_content = current_tag.get_text(separator=" ", strip=True)
|
||||
# Check if the text content has at least word_count_threshold
|
||||
if len(text_content.split()) >= image_description_min_word_threshold:
|
||||
return text_content
|
||||
return None
|
||||
|
||||
def remove_unwanted_attributes(
|
||||
self, element, important_attrs, keep_data_attributes=False
|
||||
):
|
||||
"""
|
||||
Remove unwanted attributes from an HTML element.
|
||||
|
||||
Args:
|
||||
element (Tag): The HTML element to remove attributes from.
|
||||
important_attrs (list): List of important attributes to keep.
|
||||
keep_data_attributes (bool): Whether to keep data attributes.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
if attr not in important_attrs:
|
||||
if keep_data_attributes:
|
||||
if not attr.startswith("data-"):
|
||||
attrs_to_remove.append(attr)
|
||||
else:
|
||||
attrs_to_remove.append(attr)
|
||||
|
||||
for attr in attrs_to_remove:
|
||||
del element[attr]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
|
||||
"""
|
||||
Process an image element.
|
||||
|
||||
How it works:
|
||||
1. Check if the image has valid display and inside undesired html elements.
|
||||
2. Score an image for it's usefulness.
|
||||
3. Extract image file metadata to extract size and extension.
|
||||
4. Generate a dictionary with the processed image information.
|
||||
5. Return the processed image information.
|
||||
|
||||
Args:
|
||||
img (Tag): The image element to process.
|
||||
url (str): The URL of the page containing the image.
|
||||
index (int): The index of the image in the list of images.
|
||||
total_images (int): The total number of images in the list.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed image information.
|
||||
"""
|
||||
# parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
# if ' ' in u else None}
|
||||
# for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
|
||||
# Constants for checks
|
||||
classes_to_check = frozenset(["button", "icon", "logo"])
|
||||
tags_to_check = frozenset(["button", "input"])
|
||||
image_formats = frozenset(["jpg", "jpeg", "png", "webp", "avif", "gif"])
|
||||
|
||||
# Pre-fetch commonly used attributes
|
||||
style = img.get("style", "")
|
||||
alt = img.get("alt", "")
|
||||
src = img.get("src", "")
|
||||
data_src = img.get("data-src", "")
|
||||
srcset = img.get("srcset", "")
|
||||
data_srcset = img.get("data-srcset", "")
|
||||
width = img.get("width")
|
||||
height = img.get("height")
|
||||
parent = img.parent
|
||||
parent_classes = parent.get("class", [])
|
||||
|
||||
# Quick validation checks
|
||||
if (
|
||||
"display:none" in style
|
||||
or parent.name in tags_to_check
|
||||
or any(c in cls for c in parent_classes for cls in classes_to_check)
|
||||
or any(c in src for c in classes_to_check)
|
||||
or any(c in alt for c in classes_to_check)
|
||||
):
|
||||
return None
|
||||
|
||||
# Quick score calculation
|
||||
score = 0
|
||||
if width and width.isdigit():
|
||||
width_val = int(width)
|
||||
score += 1 if width_val > 150 else 0
|
||||
if height and height.isdigit():
|
||||
height_val = int(height)
|
||||
score += 1 if height_val > 150 else 0
|
||||
if alt:
|
||||
score += 1
|
||||
score += index / total_images < 0.5
|
||||
|
||||
# image_format = ''
|
||||
# if "data:image/" in src:
|
||||
# image_format = src.split(',')[0].split(';')[0].split('/')[1].split(';')[0]
|
||||
# else:
|
||||
# image_format = os.path.splitext(src)[1].lower().strip('.').split('?')[0]
|
||||
|
||||
# if image_format in ('jpg', 'png', 'webp', 'avif'):
|
||||
# score += 1
|
||||
|
||||
# Check for image format in all possible sources
|
||||
def has_image_format(url):
|
||||
return any(fmt in url.lower() for fmt in image_formats)
|
||||
|
||||
# Score for having proper image sources
|
||||
if any(has_image_format(url) for url in [src, data_src, srcset, data_srcset]):
|
||||
score += 1
|
||||
if srcset or data_srcset:
|
||||
score += 1
|
||||
if img.find_parent("picture"):
|
||||
score += 1
|
||||
|
||||
# Detect format from any available source
|
||||
detected_format = None
|
||||
for url in [src, data_src, srcset, data_srcset]:
|
||||
if url:
|
||||
format_matches = [fmt for fmt in image_formats if fmt in url.lower()]
|
||||
if format_matches:
|
||||
detected_format = format_matches[0]
|
||||
break
|
||||
|
||||
if score <= kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD):
|
||||
return None
|
||||
|
||||
# Use set for deduplication
|
||||
unique_urls = set()
|
||||
image_variants = []
|
||||
|
||||
# Generate a unique group ID for this set of variants
|
||||
group_id = index
|
||||
|
||||
# Base image info template
|
||||
base_info = {
|
||||
"alt": alt,
|
||||
"desc": self.find_closest_parent_with_useful_text(img, **kwargs),
|
||||
"score": score,
|
||||
"type": "image",
|
||||
"group_id": group_id, # Group ID for this set of variants
|
||||
"format": detected_format,
|
||||
}
|
||||
|
||||
# Inline function for adding variants
|
||||
def add_variant(src, width=None):
|
||||
if src and not src.startswith("data:") and src not in unique_urls:
|
||||
unique_urls.add(src)
|
||||
image_variants.append({**base_info, "src": src, "width": width})
|
||||
|
||||
# Process all sources
|
||||
add_variant(src)
|
||||
add_variant(data_src)
|
||||
|
||||
# Handle srcset and data-srcset in one pass
|
||||
for attr in ("srcset", "data-srcset"):
|
||||
if value := img.get(attr):
|
||||
for source in parse_srcset(value):
|
||||
add_variant(source["url"], source["width"])
|
||||
|
||||
# Quick picture element check
|
||||
if picture := img.find_parent("picture"):
|
||||
for source in picture.find_all("source"):
|
||||
if srcset := source.get("srcset"):
|
||||
for src in parse_srcset(srcset):
|
||||
add_variant(src["url"], src["width"])
|
||||
|
||||
# Framework-specific attributes in one pass
|
||||
for attr, value in img.attrs.items():
|
||||
if (
|
||||
attr.startswith("data-")
|
||||
and ("src" in attr or "srcset" in attr)
|
||||
and "http" in value
|
||||
):
|
||||
add_variant(value)
|
||||
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
def process_element(self, url, element: lhtml.HtmlElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
@@ -577,7 +210,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
element (lhtml.HtmlElement): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
@@ -595,514 +228,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"external_links_dict": external_links_dict,
|
||||
}
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url,
|
||||
element: PageElement,
|
||||
media: Dict[str, Any],
|
||||
internal_links_dict: Dict[str, Any],
|
||||
external_links_dict: Dict[str, Any],
|
||||
**kwargs,
|
||||
) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
element.extract()
|
||||
return False
|
||||
|
||||
# if element.name == 'img':
|
||||
# process_image(element, url, 0, 1)
|
||||
# return True
|
||||
base_domain = kwargs.get("base_domain", get_base_domain(url))
|
||||
|
||||
if element.name in ["script", "style", "link", "meta", "noscript"]:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
keep_element = False
|
||||
# Special case for table elements - always preserve structure
|
||||
if element.name in ["tr", "td", "th"]:
|
||||
keep_element = True
|
||||
|
||||
exclude_domains = kwargs.get("exclude_domains", [])
|
||||
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
|
||||
# exclude_social_media_domains = SOCIAL_MEDIA_DOMAINS + kwargs.get('exclude_social_media_domains', [])
|
||||
# exclude_social_media_domains = list(set(exclude_social_media_domains))
|
||||
|
||||
try:
|
||||
if element.name == "a" and element.get("href"):
|
||||
href = element.get("href", "").strip()
|
||||
if not href: # Skip empty hrefs
|
||||
return False
|
||||
|
||||
# url_base = url.split("/")[2]
|
||||
|
||||
# Normalize the URL
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
except ValueError:
|
||||
# logging.warning(f"Invalid URL format: {href}, Error: {str(e)}")
|
||||
return False
|
||||
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": element.get_text().strip(),
|
||||
"title": element.get("title", "").strip(),
|
||||
"base_domain": base_domain,
|
||||
}
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
|
||||
keep_element = True
|
||||
|
||||
# Handle external link exclusions
|
||||
if is_external:
|
||||
link_base_domain = get_base_domain(normalized_href)
|
||||
link_data["base_domain"] = link_base_domain
|
||||
if kwargs.get("exclude_external_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
# elif kwargs.get('exclude_social_media_links', False):
|
||||
# if link_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# if any(domain in normalized_href.lower() for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
elif exclude_domains:
|
||||
if link_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in normalized_href.lower() for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
if is_external:
|
||||
if normalized_href not in external_links_dict:
|
||||
external_links_dict[normalized_href] = link_data
|
||||
else:
|
||||
if kwargs.get("exclude_internal_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
if normalized_href not in internal_links_dict:
|
||||
internal_links_dict[normalized_href] = link_data
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Error processing links: {str(e)}")
|
||||
|
||||
try:
|
||||
if element.name == "img":
|
||||
potential_sources = [
|
||||
"src",
|
||||
"data-src",
|
||||
"srcset" "data-lazy-src",
|
||||
"data-original",
|
||||
]
|
||||
src = element.get("src", "")
|
||||
while not src and potential_sources:
|
||||
src = element.get(potential_sources.pop(0), "")
|
||||
if not src:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# If it is srcset pick up the first image
|
||||
if "srcset" in element.attrs:
|
||||
src = element.attrs["srcset"].split(",")[0].split(" ")[0]
|
||||
|
||||
# If image src is internal, then skip
|
||||
if not is_external_url(src, base_domain):
|
||||
return True
|
||||
|
||||
image_src_base_domain = get_base_domain(src)
|
||||
|
||||
# Check flag if we should remove external images
|
||||
if kwargs.get("exclude_external_images", False):
|
||||
# Handle relative URLs (which are always from the same domain)
|
||||
if not src.startswith('http') and not src.startswith('//'):
|
||||
return True # Keep relative URLs
|
||||
|
||||
# For absolute URLs, compare the base domains using the existing function
|
||||
src_base_domain = get_base_domain(src)
|
||||
url_base_domain = get_base_domain(url)
|
||||
|
||||
# If the domains don't match and both are valid, the image is external
|
||||
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
# if kwargs.get('exclude_social_media_links', False):
|
||||
# if image_src_base_domain in exclude_social_media_domains:
|
||||
# element.decompose()
|
||||
# return False
|
||||
# src_url_base = src.split('/')[2]
|
||||
# url_base = url.split('/')[2]
|
||||
# if any(domain in src for domain in exclude_social_media_domains):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
# Handle exclude domains
|
||||
if exclude_domains:
|
||||
if image_src_base_domain in exclude_domains:
|
||||
element.decompose()
|
||||
return False
|
||||
# if any(domain in src for domain in kwargs.get('exclude_domains', [])):
|
||||
# element.decompose()
|
||||
# return False
|
||||
|
||||
return True # Always keep image elements
|
||||
except Exception:
|
||||
raise "Error processing images"
|
||||
|
||||
# Check if flag to remove all forms is set
|
||||
if kwargs.get("remove_forms", False) and element.name == "form":
|
||||
element.decompose()
|
||||
return False
|
||||
|
||||
if element.name in ["video", "audio"]:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": element.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
source_tags = element.find_all("source")
|
||||
for source_tag in source_tags:
|
||||
media[f"{element.name}s"].append(
|
||||
{
|
||||
"src": source_tag.get("src"),
|
||||
"alt": element.get("alt"),
|
||||
"type": element.name,
|
||||
"description": self.find_closest_parent_with_useful_text(
|
||||
element, **kwargs
|
||||
),
|
||||
}
|
||||
)
|
||||
return True # Always keep video and audio elements
|
||||
|
||||
if element.name in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
if kwargs.get("only_text", False):
|
||||
element.replace_with(element.get_text())
|
||||
|
||||
try:
|
||||
self.remove_unwanted_attributes(
|
||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
except Exception as e:
|
||||
# print('Error removing unwanted attributes:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error removing unwanted attributes: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
# Process children
|
||||
for child in list(element.children):
|
||||
if isinstance(child, NavigableString) and not isinstance(
|
||||
child, Comment
|
||||
):
|
||||
if len(child.strip()) > 0:
|
||||
keep_element = True
|
||||
else:
|
||||
if self._process_element(
|
||||
url,
|
||||
child,
|
||||
media,
|
||||
internal_links_dict,
|
||||
external_links_dict,
|
||||
**kwargs,
|
||||
):
|
||||
keep_element = True
|
||||
|
||||
# Check word count
|
||||
word_count_threshold = kwargs.get(
|
||||
"word_count_threshold", MIN_WORD_THRESHOLD
|
||||
)
|
||||
if not keep_element:
|
||||
word_count = len(element.get_text(strip=True).split())
|
||||
keep_element = word_count >= word_count_threshold
|
||||
|
||||
if not keep_element:
|
||||
element.decompose()
|
||||
|
||||
return keep_element
|
||||
except Exception as e:
|
||||
# print('Error processing element:', str(e))
|
||||
self._log(
|
||||
"error",
|
||||
message="Error processing element: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
return False
|
||||
|
||||
def _scrap(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
parser_type = kwargs.get("parser", "lxml")
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
if body is None:
|
||||
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This happens before any processing to minimize memory usage
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.find_all('img'):
|
||||
img.decompose()
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
except Exception as e:
|
||||
self._log(
|
||||
"error",
|
||||
message="Error extracting metadata: {error}",
|
||||
tag="SCRAPE",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
meta = {}
|
||||
|
||||
# Handle tag-based removal first - faster than CSS selection
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if excluded_tags:
|
||||
for element in body.find_all(lambda tag: tag.name in excluded_tags):
|
||||
element.extract()
|
||||
|
||||
# Handle CSS selector-based removal
|
||||
excluded_selector = kwargs.get("excluded_selector", "")
|
||||
if excluded_selector:
|
||||
is_single_selector = (
|
||||
"," not in excluded_selector and " " not in excluded_selector
|
||||
)
|
||||
if is_single_selector:
|
||||
while element := body.select_one(excluded_selector):
|
||||
element.extract()
|
||||
else:
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(copy.deepcopy(el))
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
)
|
||||
kwargs["exclude_domains"] = set(kwargs.get("exclude_domains", []))
|
||||
if kwargs.get("exclude_social_media_links", False):
|
||||
kwargs["exclude_domains"] = kwargs["exclude_domains"].union(
|
||||
kwargs["exclude_social_media_domains"]
|
||||
)
|
||||
|
||||
result_obj = self.process_element(
|
||||
url,
|
||||
body,
|
||||
word_count_threshold=word_count_threshold,
|
||||
base_domain=base_domain,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
links = {"internal": [], "external": []}
|
||||
media = result_obj["media"]
|
||||
internal_links_dict = result_obj["internal_links_dict"]
|
||||
external_links_dict = result_obj["external_links_dict"]
|
||||
|
||||
# Update the links dictionary with unique links
|
||||
links["internal"] = list(internal_links_dict.values())
|
||||
links["external"] = list(external_links_dict.values())
|
||||
|
||||
# Extract head content for links if configured
|
||||
link_preview_config = kwargs.get("link_preview_config")
|
||||
if link_preview_config is not None:
|
||||
try:
|
||||
import asyncio
|
||||
from .link_preview import LinkPreview
|
||||
from .models import Links, Link
|
||||
|
||||
verbose = link_preview_config.verbose
|
||||
|
||||
if verbose:
|
||||
self._log("info", "Starting link head extraction for {internal} internal and {external} external links",
|
||||
params={"internal": len(links["internal"]), "external": len(links["external"])}, tag="LINK_EXTRACT")
|
||||
|
||||
# Convert dict links to Link objects
|
||||
internal_links = [Link(**link_data) for link_data in links["internal"]]
|
||||
external_links = [Link(**link_data) for link_data in links["external"]]
|
||||
links_obj = Links(internal=internal_links, external=external_links)
|
||||
|
||||
# Create a config object for LinkPreview
|
||||
class TempCrawlerRunConfig:
|
||||
def __init__(self, link_config, score_links):
|
||||
self.link_preview_config = link_config
|
||||
self.score_links = score_links
|
||||
|
||||
config = TempCrawlerRunConfig(link_preview_config, kwargs.get("score_links", False))
|
||||
|
||||
# Extract head content (run async operation in sync context)
|
||||
async def extract_links():
|
||||
async with LinkPreview(self.logger) as extractor:
|
||||
return await extractor.extract_link_heads(links_obj, config)
|
||||
|
||||
# Run the async operation
|
||||
try:
|
||||
# Check if we're already in an async context
|
||||
loop = asyncio.get_running_loop()
|
||||
# If we're in an async context, we need to run in a thread
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
future = executor.submit(asyncio.run, extract_links())
|
||||
updated_links = future.result()
|
||||
except RuntimeError:
|
||||
# No running loop, we can use asyncio.run directly
|
||||
updated_links = asyncio.run(extract_links())
|
||||
|
||||
# Convert back to dict format
|
||||
links["internal"] = [link.dict() for link in updated_links.internal]
|
||||
links["external"] = [link.dict() for link in updated_links.external]
|
||||
|
||||
if verbose:
|
||||
successful_internal = len([l for l in updated_links.internal if l.head_extraction_status == "valid"])
|
||||
successful_external = len([l for l in updated_links.external if l.head_extraction_status == "valid"])
|
||||
self._log("info", "Link head extraction completed: {internal_success}/{internal_total} internal, {external_success}/{external_total} external",
|
||||
params={
|
||||
"internal_success": successful_internal,
|
||||
"internal_total": len(updated_links.internal),
|
||||
"external_success": successful_external,
|
||||
"external_total": len(updated_links.external)
|
||||
}, tag="LINK_EXTRACT")
|
||||
else:
|
||||
self._log("info", "Link head extraction completed successfully", tag="LINK_EXTRACT")
|
||||
|
||||
except Exception as e:
|
||||
self._log("error", f"Link head extraction failed: {str(e)}", tag="LINK_EXTRACT")
|
||||
# Continue with original links if extraction fails
|
||||
|
||||
# # Process images using ThreadPoolExecutor
|
||||
imgs = body.find_all("img")
|
||||
|
||||
media["images"] = [
|
||||
img
|
||||
for result in (
|
||||
self.process_image(img, url, i, len(imgs), **kwargs)
|
||||
for i, img in enumerate(imgs)
|
||||
)
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
# Process tables if not excluded
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.find_all('table')
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
body = self.flatten_nested_elements(body)
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
for img in imgs:
|
||||
src = img.get("src", "")
|
||||
if base64_pattern.match(src):
|
||||
# Replace base64 data with empty string
|
||||
img["src"] = base64_pattern.sub("", src)
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str_body = content_element.encode_contents().decode("utf-8")
|
||||
except Exception:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
body = BeautifulSoup(html, "html.parser")
|
||||
|
||||
# Create a new div with a special ID
|
||||
error_div = body.new_tag("div", id="crawl4ai_error_message")
|
||||
error_div.string = """
|
||||
Crawl4AI Error: This page is not fully supported.
|
||||
|
||||
Possible reasons:
|
||||
1. The page may have restrictions that prevent crawling.
|
||||
2. The page might not be fully loaded.
|
||||
|
||||
Suggestions:
|
||||
- Try calling the crawl function with these parameters:
|
||||
magic=True,
|
||||
- Set headless=False to visualize what's happening on the page.
|
||||
|
||||
If the issue persists, please check the page's structure and any potential anti-crawling measures.
|
||||
"""
|
||||
|
||||
# Append the error div to the body
|
||||
body.append(error_div)
|
||||
str_body = body.encode_contents().decode("utf-8")
|
||||
|
||||
print(
|
||||
"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details."
|
||||
)
|
||||
self._log(
|
||||
"error",
|
||||
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
|
||||
tag="SCRAPE",
|
||||
)
|
||||
|
||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
||||
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
"links": links,
|
||||
"metadata": meta,
|
||||
}
|
||||
|
||||
|
||||
class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
def __init__(self, logger=None):
|
||||
super().__init__(logger)
|
||||
self.DIMENSION_REGEX = re.compile(r"(\d+)(\D*)")
|
||||
self.BASE64_PATTERN = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
|
||||
def _process_element(
|
||||
self,
|
||||
url: str,
|
||||
@@ -1862,3 +987,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
}
|
||||
|
||||
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = LXMLWebScrapingStrategy
|
||||
|
||||
@@ -11,7 +11,7 @@ from .extraction_strategy import *
|
||||
from .crawler_strategy import *
|
||||
from typing import List
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from .content_scraping_strategy import WebScrapingStrategy
|
||||
from ..content_scraping_strategy import LXMLWebScrapingStrategy as WebScrapingStrategy
|
||||
from .config import *
|
||||
import warnings
|
||||
import json
|
||||
|
||||
79
crawl4ai/memory_utils.py
Normal file
79
crawl4ai/memory_utils.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import psutil
|
||||
import platform
|
||||
import subprocess
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
def _vm_stat_available_bytes(vm_stat_output: str) -> int:
    """Return the reclaimable memory, in bytes, described by ``vm_stat`` output.

    Sums the "Pages free", "Pages inactive", "Pages speculative" and
    "Pages purgeable" counters and multiplies by the page size announced in
    the output's header line.

    Args:
        vm_stat_output: Captured standard output of the ``vm_stat`` command.

    Returns:
        Total bytes in the four counters listed above.

    Raises:
        ValueError: If the page size or all four counters are missing, or a
            number cannot be parsed.
        IndexError: If the page-size header is truncated.
    """
    wanted = ('Pages free', 'Pages inactive', 'Pages speculative', 'Pages purgeable')
    page_size = None
    total_pages = 0
    counters_seen = 0

    for line in vm_stat_output.splitlines():
        if 'page size of' in line:
            # The header carries the page size used for every counter below
            # it, e.g. "(page size of 16384 bytes)"; it differs between
            # machines, so it must be read rather than assumed.
            page_size = int(line.split('page size of', 1)[1].split()[0])
            continue
        label, sep, value = line.partition(':')
        if sep and label.strip() in wanted:
            # Counter values are printed with a trailing full stop.
            total_pages += int(value.strip().rstrip('.'))
            counters_seen += 1

    if page_size is None or counters_seen == 0:
        raise ValueError("unrecognised vm_stat output")
    return total_pages * page_size


def get_true_available_memory_gb() -> float:
    """Get truly available memory including inactive pages (cross-platform).

    On macOS the figure is computed from ``vm_stat`` as
    free + inactive + speculative + purgeable pages. On every other platform,
    and whenever ``vm_stat`` cannot be run or parsed, the value of
    ``psutil.virtual_memory().available`` is used.

    Returns:
        Available memory in GiB (bytes / 1024**3).
    """
    vm = psutil.virtual_memory()
    psutil_available_gb = vm.available / (1024**3)

    if platform.system() != 'Darwin':
        # For Windows and Linux, psutil.available is used as-is.
        return psutil_available_gb

    try:
        result = subprocess.run(
            ['vm_stat'], capture_output=True, text=True, check=True, timeout=5
        )
        return _vm_stat_available_bytes(result.stdout) / (1024**3)
    except (OSError, ValueError, IndexError, subprocess.SubprocessError):
        # Best effort only: a missing binary, non-zero exit, timeout or
        # unexpected output must not report "0 GB available".
        return psutil_available_gb
|
||||
|
||||
|
||||
def get_true_memory_usage_percent() -> float:
    """
    Report how much of total memory is in use, as a percentage.

    "In use" is total memory minus the figure returned by
    get_true_available_memory_gb(), so platform-specific adjustments made
    there carry over to this value.

    Returns:
        float: Memory usage percentage, clamped to 0-100
    """
    bytes_per_gb = 1024**3
    total_gb = psutil.virtual_memory().total / bytes_per_gb
    free_gb = get_true_available_memory_gb()

    raw_percent = 100.0 * (total_gb - free_gb) / total_gb

    # Clamp: cap at 100 first, then floor at 0.
    capped = min(100.0, raw_percent)
    return max(0.0, capped)
|
||||
|
||||
|
||||
def get_memory_stats() -> Tuple[float, float, float]:
    """
    Get comprehensive memory statistics from a single measurement.

    The usage percentage is derived from the same ``available_gb`` value that
    is returned, so the three numbers always agree with each other and the
    available-memory probe runs only once per call.

    Returns:
        Tuple[float, float, float]: (used_percent, available_gb, total_gb),
        where used_percent is clamped to 0-100 and the other two are in GiB.
    """
    vm = psutil.virtual_memory()
    total_gb = vm.total / (1024**3)
    available_gb = get_true_available_memory_gb()

    if total_gb > 0:
        used_percent = 100.0 * (total_gb - available_gb) / total_gb
        # Ensure it's within valid range
        used_percent = max(0.0, min(100.0, used_percent))
    else:
        # Avoid dividing by zero if the total is reported as 0.
        used_percent = 0.0

    return used_percent, available_gb, total_gb
|
||||
@@ -23,8 +23,9 @@ SeedingConfig = Union['SeedingConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
# Backward compatibility alias
|
||||
WebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
|
||||
# Proxy types
|
||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||
@@ -114,7 +115,6 @@ if TYPE_CHECKING:
|
||||
# Content scraping imports
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||
WebScrapingStrategy as WebScrapingStrategyType,
|
||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||
)
|
||||
|
||||
|
||||
@@ -1517,8 +1517,29 @@ def extract_metadata_using_lxml(html, doc=None):
|
||||
head = head[0]
|
||||
|
||||
# Title - using XPath
|
||||
# title = head.xpath(".//title/text()")
|
||||
# metadata["title"] = title[0].strip() if title else None
|
||||
|
||||
# === Title Extraction - New Approach ===
|
||||
# Attempt to extract <title> using XPath
|
||||
title = head.xpath(".//title/text()")
|
||||
metadata["title"] = title[0].strip() if title else None
|
||||
title = title[0] if title else None
|
||||
|
||||
# Fallback: Use .find() in case XPath fails due to malformed HTML
|
||||
if not title:
|
||||
title_el = doc.find(".//title")
|
||||
title = title_el.text if title_el is not None else None
|
||||
|
||||
# Final fallback: Use OpenGraph or Twitter title if <title> is missing or empty
|
||||
if not title:
|
||||
title_candidates = (
|
||||
doc.xpath("//meta[@property='og:title']/@content") or
|
||||
doc.xpath("//meta[@name='twitter:title']/@content")
|
||||
)
|
||||
title = title_candidates[0] if title_candidates else None
|
||||
|
||||
# Strip and assign title
|
||||
metadata["title"] = title.strip() if title else None
|
||||
|
||||
# Meta description - using XPath with multiple attribute conditions
|
||||
description = head.xpath('.//meta[@name="description"]/@content')
|
||||
|
||||
@@ -5,4 +5,9 @@ ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
|
||||
# Optional: Override the default LLM provider
|
||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
@@ -154,6 +154,29 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable**: Set `LLM_PROVIDER` to override the `config.yml` default (a per-request `provider` parameter, when supplied, takes precedence over it)
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the provider.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +691,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -40,7 +40,9 @@ from utils import (
|
||||
get_base_url,
|
||||
is_task_id,
|
||||
should_cleanup_task,
|
||||
decode_redis_hash
|
||||
decode_redis_hash,
|
||||
get_llm_api_key,
|
||||
validate_llm_provider
|
||||
)
|
||||
|
||||
import psutil, time
|
||||
@@ -89,10 +91,12 @@ async def handle_llm_qa(
|
||||
|
||||
Answer:"""
|
||||
|
||||
# api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
|
||||
response = perform_completion_with_backoff(
|
||||
provider=config["llm"]["provider"],
|
||||
prompt_with_variables=prompt,
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
|
||||
api_token=get_llm_api_key(config)
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
@@ -110,19 +114,23 @@ async def process_llm_extraction(
|
||||
url: str,
|
||||
instruction: str,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0"
|
||||
cache: str = "0",
|
||||
provider: Optional[str] = None
|
||||
) -> None:
|
||||
"""Process LLM extraction in background."""
|
||||
try:
|
||||
# If config['llm'] has api_key then ignore the api_key_env
|
||||
api_key = ""
|
||||
if "api_key" in config["llm"]:
|
||||
api_key = config["llm"]["api_key"]
|
||||
else:
|
||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||
# Validate provider
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
await redis.hset(f"task:{task_id}", mapping={
|
||||
"status": TaskStatus.FAILED,
|
||||
"error": error_msg
|
||||
})
|
||||
return
|
||||
api_key = get_llm_api_key(config, provider)
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=api_key
|
||||
),
|
||||
instruction=instruction,
|
||||
@@ -169,10 +177,19 @@ async def handle_markdown_request(
|
||||
filter_type: FilterType,
|
||||
query: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> str:
|
||||
"""Handle markdown generation requests."""
|
||||
try:
|
||||
# Validate provider if using LLM filter
|
||||
if filter_type == FilterType.LLM:
|
||||
is_valid, error_msg = validate_llm_provider(config, provider)
|
||||
if not is_valid:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=error_msg
|
||||
)
|
||||
decoded_url = unquote(url)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
@@ -185,8 +202,8 @@ async def handle_markdown_request(
|
||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
provider=provider or config["llm"]["provider"],
|
||||
api_token=get_llm_api_key(config, provider),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
@@ -230,7 +247,8 @@ async def handle_llm_request(
|
||||
query: Optional[str] = None,
|
||||
schema: Optional[str] = None,
|
||||
cache: str = "0",
|
||||
config: Optional[dict] = None
|
||||
config: Optional[dict] = None,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Handle LLM extraction requests."""
|
||||
base_url = get_base_url(request)
|
||||
@@ -260,7 +278,8 @@ async def handle_llm_request(
|
||||
schema,
|
||||
cache,
|
||||
base_url,
|
||||
config
|
||||
config,
|
||||
provider
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -304,7 +323,8 @@ async def create_new_task(
|
||||
schema: Optional[str],
|
||||
cache: str,
|
||||
base_url: str,
|
||||
config: dict
|
||||
config: dict,
|
||||
provider: Optional[str] = None
|
||||
) -> JSONResponse:
|
||||
"""Create and initialize a new task."""
|
||||
decoded_url = unquote(input_path)
|
||||
@@ -328,7 +348,8 @@ async def create_new_task(
|
||||
decoded_url,
|
||||
query,
|
||||
schema,
|
||||
cache
|
||||
cache,
|
||||
provider
|
||||
)
|
||||
|
||||
return JSONResponse({
|
||||
|
||||
@@ -36,6 +36,7 @@ class LlmJobPayload(BaseModel):
|
||||
q: str
|
||||
schema: Optional[str] = None
|
||||
cache: bool = False
|
||||
provider: Optional[str] = None
|
||||
|
||||
|
||||
class CrawlJobPayload(BaseModel):
|
||||
@@ -61,6 +62,7 @@ async def llm_job_enqueue(
|
||||
schema=payload.schema,
|
||||
cache=payload.cache,
|
||||
config=_config,
|
||||
provider=payload.provider,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ class MarkdownRequest(BaseModel):
|
||||
f: FilterType = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
|
||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||
|
||||
|
||||
class RawCode(BaseModel):
|
||||
|
||||
@@ -241,7 +241,7 @@ async def get_markdown(
|
||||
raise HTTPException(
|
||||
400, "URL must be absolute and start with http/https")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config
|
||||
body.url, body.f, body.q, body.c, config, body.provider
|
||||
)
|
||||
return JSONResponse({
|
||||
"url": body.url,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import dns.resolver
|
||||
import logging
|
||||
import yaml
|
||||
import os
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
@@ -19,10 +20,24 @@ class FilterType(str, Enum):
|
||||
LLM = "llm"
|
||||
|
||||
def load_config() -> Dict:
    """Load and return application configuration with environment variable overrides.

    Reads ``config.yml`` from the directory containing this module, then:

    * ``LLM_PROVIDER`` (if set and non-empty) replaces
      ``config["llm"]["provider"]``.
    * ``LLM_API_KEY`` (if set and non-empty) is stored as
      ``config["llm"]["api_key"]`` unless the file already defines
      ``api_key``.

    Returns:
        The parsed configuration dictionary (empty if the file is empty).
    """
    config_path = Path(__file__).parent / "config.yml"
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(config_path, "r", encoding="utf-8") as config_file:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(config_file) or {}

    llm_provider = os.environ.get("LLM_PROVIDER")
    llm_api_key = os.environ.get("LLM_API_KEY")

    if llm_provider or llm_api_key:
        # Only touch the "llm" section when there is something to override,
        # creating it if the file omitted it or left it empty.
        llm_config = config.get("llm")
        if not isinstance(llm_config, dict):
            llm_config = config["llm"] = {}

        # Override LLM provider from environment if set
        if llm_provider:
            llm_config["provider"] = llm_provider
            logging.info("LLM provider overridden from environment: %s", llm_provider)

        # Also support direct API key from environment if the config file
        # does not already carry one
        if llm_api_key and "api_key" not in llm_config:
            llm_config["api_key"] = llm_api_key
            logging.info("LLM API key loaded from LLM_API_KEY environment variable")

    return config
|
||||
|
||||
def setup_logging(config: Dict) -> None:
|
||||
"""Configure application logging."""
|
||||
@@ -56,6 +71,52 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
||||
|
||||
|
||||
|
||||
# Environment variable conventionally holding the API key for each provider
# prefix (the part of "<prefix>/<model>" before the slash). Names follow the
# variables listed for the Docker deployment; presumably they match what the
# underlying LLM client expects - verify when adding providers.
_PROVIDER_API_KEY_ENV = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "deepseek": "DEEPSEEK_API_KEY",
    "groq": "GROQ_API_KEY",
    "together": "TOGETHER_API_KEY",
    "together_ai": "TOGETHER_API_KEY",
    "mistral": "MISTRAL_API_KEY",
    "gemini": "GEMINI_API_TOKEN",
}


def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
    """Get the appropriate API key based on the LLM provider.

    Resolution order:

    1. ``config["llm"]["api_key"]`` if present (a directly configured key
       always wins).
    2. The provider-specific environment variable for the provider's prefix
       (e.g. ``GROQ_API_KEY`` for ``"groq/mixtral-8x7b"``), if it is set and
       non-empty.
    3. The environment variable named by ``config["llm"]["api_key_env"]``.

    Args:
        config: The application configuration dictionary
        provider: Optional provider override (e.g., "openai/gpt-4")

    Returns:
        The API key for the provider, or empty string if not found

    Raises:
        KeyError: If ``provider`` is not given and the configuration has no
            ``llm.provider`` entry.
    """
    llm_config = config["llm"]

    # Use provided provider or fall back to config
    if not provider:
        provider = llm_config["provider"]

    # Check if direct API key is configured
    if "api_key" in llm_config:
        return llm_config["api_key"]

    # Prefer the key that belongs to the requested provider, so that a
    # per-request or environment override is not paired with another
    # provider's credentials.
    if isinstance(provider, str):
        prefix = provider.split("/", 1)[0].strip().lower()
        env_name = _PROVIDER_API_KEY_ENV.get(prefix)
        if env_name:
            provider_key = os.environ.get(env_name, "")
            if provider_key:
                return provider_key

    # Fall back to the configured api_key_env if no match
    return os.environ.get(llm_config.get("api_key_env", ""), "")
|
||||
|
||||
|
||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
    """Check whether an API key can be resolved for the LLM provider.

    Args:
        config: The application configuration dictionary
        provider: Optional provider override (e.g., "openai/gpt-4"); when
            omitted or empty, the provider from ``config["llm"]`` is used

    Returns:
        ``(True, "")`` when ``get_llm_api_key`` yields a non-empty key,
        otherwise ``(False, <message naming the provider>)``
    """
    effective_provider = provider if provider else config["llm"]["provider"]

    if get_llm_api_key(config, effective_provider):
        return True, ""

    return (
        False,
        f"No API key found for provider '{effective_provider}'. Please set the appropriate environment variable.",
    )
|
||||
|
||||
|
||||
def verify_email_domain(email: str) -> bool:
|
||||
try:
|
||||
domain = email.split('@')[1]
|
||||
|
||||
@@ -14,6 +14,7 @@ x-base-config: &base-config
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm # Chromium performance
|
||||
deploy:
|
||||
|
||||
303
docs/examples/demo_multi_config_clean.py
Normal file
303
docs/examples/demo_multi_config_clean.py
Normal file
@@ -0,0 +1,303 @@
|
||||
"""
|
||||
🎯 Multi-Config URL Matching Demo
|
||||
=================================
|
||||
Learn how to use different crawler configurations for different URL patterns
|
||||
in a single crawl batch with Crawl4AI's multi-config feature.
|
||||
|
||||
Part 1: Understanding URL Matching (Pattern Testing)
|
||||
Part 2: Practical Example with Real Crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
MatchMode
|
||||
)
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
def print_section(title):
    """Print *title* between two 60-character ``=`` rules, padded by blank lines."""
    rule = '=' * 60
    banner_lines = (f"\n{rule}", f"{title}", f"{rule}\n")
    for banner_line in banner_lines:
        print(banner_line)
|
||||
|
||||
|
||||
def test_url_matching(config, test_urls, config_name):
    """Print which of *test_urls* the given config matches.

    Output is a short header (config name, its ``url_matcher`` and, when the
    config has a ``match_mode`` attribute, that mode's value), a dashed rule,
    then one line per URL prefixed with ✓ or ✗ according to
    ``config.is_match(url)``, followed by a blank line.
    """
    header = [f"Config: {config_name}", f"Matcher: {config.url_matcher}"]
    if hasattr(config, 'match_mode'):
        header.append(f"Mode: {config.match_mode.value}")
    header.append("-" * 40)
    for header_line in header:
        print(header_line)

    for candidate in test_urls:
        if config.is_match(candidate):
            mark = "✓"
        else:
            mark = "✗"
        print(f"{mark} {candidate}")

    print()
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 1: Understanding URL Matching
|
||||
# ==============================================================================
|
||||
|
||||
def demo_part1_pattern_matching():
    """Part 1: Learn how URL matching works without crawling.

    Builds six CrawlerRunConfig objects, each using a different form of
    ``url_matcher`` (single glob string, list of globs, single function,
    list of functions, and mixed lists combined with MatchMode.OR or
    MatchMode.AND), and prints via test_url_matching() which of a fixed list
    of sample URLs each config matches. No crawler is created here.
    """

    print_section("PART 1: Understanding URL Matching")
    print("Let's explore different ways to match URLs with configs.\n")

    # Test URLs we'll use throughout
    test_urls = [
        "https://example.com/report.pdf",
        "https://example.com/data.json",
        "https://example.com/blog/post-1",
        "https://example.com/article/news",
        "https://api.example.com/v1/users",
        "https://example.com/about"
    ]

    # 1.1 Simple String Pattern
    print("1.1 Simple String Pattern Matching")
    print("-" * 40)

    pdf_config = CrawlerRunConfig(
        url_matcher="*.pdf"
    )

    test_url_matching(pdf_config, test_urls, "PDF Config")

    # 1.2 Multiple String Patterns
    print("1.2 Multiple String Patterns (OR logic)")
    print("-" * 40)

    blog_config = CrawlerRunConfig(
        url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
        match_mode=MatchMode.OR  # This is default, shown for clarity
    )

    test_url_matching(blog_config, test_urls, "Blog/Article Config")

    # 1.3 Single Function Matcher
    print("1.3 Function-based Matching")
    print("-" * 40)

    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or url.endswith('.json')
    )

    test_url_matching(api_config, test_urls, "API Config")

    # 1.4 List of Functions
    print("1.4 Multiple Functions with AND Logic")
    print("-" * 40)

    # Must be HTTPS AND contain 'api' AND have version number
    secure_api_config = CrawlerRunConfig(
        url_matcher=[
            lambda url: url.startswith('https://'),
            lambda url: 'api' in url,
            lambda url: '/v' in url  # Version indicator
        ],
        match_mode=MatchMode.AND
    )

    test_url_matching(secure_api_config, test_urls, "Secure API Config")

    # 1.5 Mixed: String and Function Together
    print("1.5 Mixed Patterns: String + Function")
    print("-" * 40)

    # Match JSON files OR any API endpoint
    json_or_api_config = CrawlerRunConfig(
        url_matcher=[
            "*.json",  # String pattern
            lambda url: 'api' in url  # Function
        ],
        match_mode=MatchMode.OR
    )

    test_url_matching(json_or_api_config, test_urls, "JSON or API Config")

    # 1.6 Complex: Multiple Strings + Multiple Functions
    print("1.6 Complex Matcher: Mixed Types with AND Logic")
    print("-" * 40)

    # Must be: HTTPS AND (.com domain) AND (blog OR article) AND NOT a PDF
    complex_config = CrawlerRunConfig(
        url_matcher=[
            lambda url: url.startswith('https://'),  # Function: HTTPS check
            "*.com/*",  # String: .com domain
            lambda url: any(pattern in url for pattern in ['/blog/', '/article/']),  # Function: Blog OR article
            lambda url: not url.endswith('.pdf')  # Function: Not PDF
        ],
        match_mode=MatchMode.AND
    )

    test_url_matching(complex_config, test_urls, "Complex Mixed Config")

    print("\n✅ Key Takeaway: First matching config wins when passed to arun_many()!")
|
||||
|
||||
|
||||
# ==============================================================================
|
||||
# PART 2: Practical Multi-URL Crawling
|
||||
# ==============================================================================
|
||||
|
||||
async def demo_part2_practical_crawling():
    """Part 2: Real-world example with different content types.

    Passes a list of five URL-specific CrawlerRunConfig objects together with
    six URLs to ``AsyncWebCrawler.arun_many`` and prints, per result, the URL,
    a label for the config that presumably handled it, and content sizes
    (or the error message for failed results).
    """

    print_section("PART 2: Practical Multi-URL Crawling")
    print("Now let's see multi-config in action with real URLs.\n")

    # Create specialized configs for different content types
    configs = [
        # Config 1: PDF documents - only match files ending with .pdf
        CrawlerRunConfig(
            url_matcher="*.pdf",
            scraping_strategy=PDFContentScrapingStrategy()
        ),

        # Config 2: Blog/article pages with content filtering
        CrawlerRunConfig(
            url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(threshold=0.48)
            )
        ),

        # Config 3: Dynamic pages requiring JavaScript
        CrawlerRunConfig(
            url_matcher=lambda url: 'github.com' in url,
            js_code="window.scrollTo(0, 500);"  # Scroll to load content
        ),

        # Config 4: Mixed matcher - API endpoints (string OR function)
        CrawlerRunConfig(
            url_matcher=[
                "*.json",  # String pattern for JSON files
                lambda url: 'api' in url or 'httpbin.org' in url  # Function for API endpoints
            ],
            match_mode=MatchMode.OR,
        ),

        # Config 5: Complex matcher - Secure documentation sites
        CrawlerRunConfig(
            url_matcher=[
                lambda url: url.startswith('https://'),  # Must be HTTPS
                "*.org/*",  # String: .org domain
                lambda url: any(doc in url for doc in ['docs', 'documentation', 'reference']),  # Has docs
                lambda url: not url.endswith(('.pdf', '.json'))  # Not PDF or JSON
            ],
            match_mode=MatchMode.AND,
            # wait_for="css:.content, css:article"  # Wait for content to load
        ),

        # Default config for everything else
        # CrawlerRunConfig()  # No url_matcher means it matches everything (use it as fallback)
    ]

    # URLs to crawl - each will use a different config
    urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # → PDF config
        "https://blog.python.org/",  # → Blog config with content filter
        "https://github.com/microsoft/playwright",  # → JS config
        "https://httpbin.org/json",  # → Mixed matcher config (API)
        "https://docs.python.org/3/reference/",  # → Complex matcher config
        "https://www.w3schools.com/",  # → Matches none of the configs above; uncomment the fallback CrawlerRunConfig() to crawl it, otherwise expect `Error: No matching configuration`
    ]

    print("URLs to crawl:")
    for i, url in enumerate(urls, 1):
        print(f"{i}. {url}")

    print("\nCrawling with appropriate config for each URL...\n")

    # NOTE(review): the scraped diff lost all indentation, so whether the
    # reporting code below sits inside or after the `async with` block could
    # not be recovered; it is placed inside here - confirm against the repo.
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=configs
        )

        # Display results
        print("Results:")
        print("-" * 60)

        for result in results:
            if result.success:
                # Determine which config was used.
                # This label is re-derived from the URL with checks that
                # mirror the matchers above; it is not read from the result.
                config_type = "Default"
                if result.url.endswith('.pdf'):
                    config_type = "PDF Strategy"
                elif any(pattern in result.url for pattern in ['blog', 'python.org']) and 'docs' not in result.url:
                    config_type = "Blog + Content Filter"
                elif 'github.com' in result.url:
                    config_type = "JavaScript Enabled"
                elif 'httpbin.org' in result.url or result.url.endswith('.json'):
                    config_type = "Mixed Matcher (API)"
                elif 'docs.python.org' in result.url:
                    config_type = "Complex Matcher (Secure Docs)"

                print(f"\n✓ {result.url}")
                print(f"  Config used: {config_type}")
                print(f"  Content size: {len(result.markdown)} chars")

                # Show if we have fit_markdown (from content filter)
                if hasattr(result.markdown, 'fit_markdown') and result.markdown.fit_markdown:
                    print(f"  Fit markdown size: {len(result.markdown.fit_markdown)} chars")
                    # Percentage by which the filtered text is shorter than the full markdown.
                    reduction = (1 - len(result.markdown.fit_markdown) / len(result.markdown)) * 100
                    print(f"  Content reduced by: {reduction:.1f}%")

                # Show extracted data if using extraction strategy
                if hasattr(result, 'extracted_content') and result.extracted_content:
                    # Only the first 100 characters are shown.
                    print(f"  Extracted data: {str(result.extracted_content)[:100]}...")
            else:
                print(f"\n✗ {result.url}")
                print(f"  Error: {result.error_message}")

    print("\n" + "=" * 60)
    print("✅ Multi-config crawling complete!")
    print("\nBenefits demonstrated:")
    print("- PDFs handled with specialized scraper")
    print("- Blog content filtered for relevance")
    print("- JavaScript executed only where needed")
    print("- Mixed matchers (string + function) for flexible matching")
    print("- Complex matchers for precise URL targeting")
    print("- Each URL got optimal configuration automatically!")
|
||||
|
||||
|
||||
async def main():
    """Entry point: run the pattern-matching walkthrough, then the live crawl."""
    banner = """
🎯 Multi-Config URL Matching Demo
=================================
Learn how Crawl4AI can use different configurations
for different URLs in a single batch.
"""
    print(banner)

    # First half: synchronous pattern-matching walkthrough.
    demo_part1_pattern_matching()

    print("\nPress Enter to continue to Part 2...")
    try:
        input()
    except EOFError:
        # No stdin available (non-interactive run): carry straight on.
        pass

    # Second half: the actual crawl.
    await demo_part2_practical_crawling()
|
||||
|
||||
|
||||
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,5 +1,6 @@
|
||||
import time, re
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# WebScrapingStrategy is now an alias for LXMLWebScrapingStrategy
|
||||
import time
|
||||
import functools
|
||||
from collections import defaultdict
|
||||
@@ -57,7 +58,7 @@ methods_to_profile = [
|
||||
|
||||
|
||||
# Apply decorators to both strategies
|
||||
for strategy, name in [(WebScrapingStrategy, "Original"), (LXMLWebScrapingStrategy, "LXML")]:
|
||||
for strategy, name in [(LXMLWebScrapingStrategy, "LXML")]:
|
||||
for method in methods_to_profile:
|
||||
apply_decorators(strategy, method, name)
|
||||
|
||||
@@ -85,7 +86,7 @@ def generate_large_html(n_elements=1000):
|
||||
|
||||
def test_scraping():
|
||||
# Initialize both scrapers
|
||||
original_scraper = WebScrapingStrategy()
|
||||
original_scraper = LXMLWebScrapingStrategy()
|
||||
selected_scraper = LXMLWebScrapingStrategy()
|
||||
|
||||
# Generate test HTML
|
||||
|
||||
@@ -404,7 +404,182 @@ for result in results:
|
||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||
```
|
||||
|
||||
## 6. Summary
|
||||
## 6. URL-Specific Configurations
|
||||
|
||||
When crawling diverse content types, you often need different configurations for different URLs. For example:
|
||||
- PDFs need specialized extraction
|
||||
- Blog pages benefit from content filtering
|
||||
- Dynamic sites need JavaScript execution
|
||||
- API endpoints need JSON parsing
|
||||
|
||||
### 6.1 Basic URL Pattern Matching
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def crawl_mixed_content():
|
||||
# Configure different strategies for different content
|
||||
configs = [
|
||||
# PDF files - specialized extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
),
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
),
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
),
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custom settings for JSON extraction
|
||||
),
|
||||
|
||||
# Default config for everything else
|
||||
CrawlerRunConfig() # No url_matcher means it matches ALL URLs (fallback)
|
||||
]
|
||||
|
||||
# Mixed URLs
|
||||
urls = [
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
|
||||
"https://blog.python.org/",
|
||||
"https://github.com/microsoft/playwright",
|
||||
"https://httpbin.org/json",
|
||||
"https://example.com/"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=configs # Pass list of configs
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"{result.url}: {len(result.markdown)} chars")
|
||||
```
|
||||
|
||||
### 6.2 Advanced Pattern Matching
|
||||
|
||||
**Important**: A `CrawlerRunConfig` without `url_matcher` (or with `url_matcher=None`) matches ALL URLs. This makes it perfect as a default/fallback configuration.
|
||||
|
||||
The `url_matcher` parameter supports three types of patterns:
|
||||
|
||||
#### Glob Patterns (Strings)
|
||||
```python
|
||||
# Simple patterns
|
||||
"*.pdf" # Any PDF file
|
||||
"*/api/*" # Any URL with /api/ in path
|
||||
"https://*.example.com/*" # Subdomain matching
|
||||
"*://example.com/blog/*" # Any protocol
|
||||
```
|
||||
|
||||
#### Custom Functions
|
||||
```python
|
||||
# Complex logic with lambdas
|
||||
lambda url: url.startswith('https://') and 'secure' in url
|
||||
lambda url: len(url) > 50 and url.count('/') > 5
|
||||
lambda url: any(domain in url for domain in ['api.', 'data.', 'feed.'])
|
||||
```
|
||||
|
||||
#### Mixed Lists with AND/OR Logic
|
||||
```python
|
||||
# Combine multiple conditions
|
||||
CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
"https://*", # Must be HTTPS
|
||||
lambda url: 'internal' in url, # Must contain 'internal'
|
||||
lambda url: not url.endswith('.pdf') # Must not be PDF
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
```
|
||||
|
||||
### 6.3 Practical Example: News Site Crawler
|
||||
|
||||
```python
|
||||
async def crawl_news_site():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0,
|
||||
rate_limiter=RateLimiter(base_delay=(1.0, 2.0))
|
||||
)
|
||||
|
||||
configs = [
|
||||
# Homepage - light extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher=lambda url: url.rstrip('/') == 'https://news.ycombinator.com',
|
||||
css_selector="nav, .headline",
|
||||
extraction_strategy=None
|
||||
),
|
||||
|
||||
# Article pages - full extraction
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/article/*",
|
||||
extraction_strategy=CosineStrategy(
|
||||
semantic_filter="article content",
|
||||
word_count_threshold=100
|
||||
),
|
||||
screenshot=True,
|
||||
excluded_tags=["nav", "aside", "footer"]
|
||||
),
|
||||
|
||||
# Author pages - metadata focus
|
||||
CrawlerRunConfig(
|
||||
url_matcher="*/author/*",
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
"name": "h1.author-name",
|
||||
"bio": ".author-bio",
|
||||
"articles": "article.post-card h2"
|
||||
})
|
||||
),
|
||||
|
||||
# Everything else
|
||||
CrawlerRunConfig()
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=news_urls,
|
||||
config=configs,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### 6.4 Best Practices
|
||||
|
||||
1. **Order Matters**: Configs are evaluated in order - put specific patterns before general ones
|
||||
2. **Default Config Behavior**:
|
||||
- A config without `url_matcher` matches ALL URLs
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
- Without a default config, unmatched URLs will fail with "No matching configuration found"
|
||||
3. **Test Your Patterns**: Use the config's `is_match()` method to test patterns:
|
||||
```python
|
||||
config = CrawlerRunConfig(url_matcher="*.pdf")
|
||||
print(config.is_match("https://example.com/doc.pdf")) # True
|
||||
|
||||
default_config = CrawlerRunConfig() # No url_matcher
|
||||
print(default_config.is_match("https://any-url.com")) # True - matches everything!
|
||||
```
|
||||
4. **Optimize for Performance**:
|
||||
- Disable JS for static content
|
||||
- Skip screenshots for data APIs
|
||||
- Use appropriate extraction strategies
|
||||
|
||||
## 7. Summary
|
||||
|
||||
1. **Two Dispatcher Types**:
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
```python
|
||||
async def arun_many(
|
||||
urls: Union[List[str], List[Any]],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
config: Optional[Union[CrawlerRunConfig, List[CrawlerRunConfig]]] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
...
|
||||
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||
@@ -15,7 +15,9 @@ async def arun_many(
|
||||
Crawl multiple URLs concurrently or in batches.
|
||||
|
||||
:param urls: A list of URLs (or tasks) to crawl.
|
||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||
:param config: (Optional) Either:
|
||||
- A single `CrawlerRunConfig` applying to all URLs
|
||||
- A list of `CrawlerRunConfig` objects with url_matcher patterns
|
||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||
...
|
||||
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||
@@ -95,10 +97,70 @@ results = await crawler.arun_many(
|
||||
)
|
||||
```
|
||||
|
||||
### URL-Specific Configurations
|
||||
|
||||
Instead of using one config for all URLs, provide a list of configs with `url_matcher` patterns:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
# PDF files - specialized extraction
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Blog/article pages - content filtering
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*python.org*"],
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
)
|
||||
|
||||
# Dynamic pages - JavaScript execution
|
||||
github_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'github.com' in url,
|
||||
js_code="window.scrollTo(0, 500);"
|
||||
)
|
||||
|
||||
# API endpoints - JSON extraction
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Custom settings for JSON extraction
|
||||
)
|
||||
|
||||
# Default fallback config
|
||||
default_config = CrawlerRunConfig() # No url_matcher means it matches ALL URLs (use as the last-item fallback)
|
||||
|
||||
# Pass the list of configs - first match wins!
|
||||
results = await crawler.arun_many(
|
||||
urls=[
|
||||
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf", # → pdf_config
|
||||
"https://blog.python.org/", # → blog_config
|
||||
"https://github.com/microsoft/playwright", # → github_config
|
||||
"https://httpbin.org/json", # → api_config
|
||||
"https://example.com/" # → default_config
|
||||
],
|
||||
config=[pdf_config, blog_config, github_config, api_config, default_config]
|
||||
)
|
||||
```
|
||||
|
||||
**URL Matching Features**:
|
||||
- **String patterns**: `"*.pdf"`, `"*/blog/*"`, `"*python.org*"`
|
||||
- **Function matchers**: `lambda url: 'api' in url`
|
||||
- **Mixed patterns**: Combine strings and functions with `MatchMode.OR` or `MatchMode.AND`
|
||||
- **First match wins**: Configs are evaluated in order
|
||||
|
||||
**Key Points**:
|
||||
- Each URL is processed by the same or separate sessions, depending on the dispatcher’s strategy.
|
||||
- `dispatch_result` in each `CrawlResult` (if using concurrency) can hold memory and timing info.
|
||||
- If you need to handle authentication or session IDs, pass them in each individual task or within your run config.
|
||||
- **Important**: Always include a default config (without `url_matcher`) as the last item if you want to handle all URLs. Otherwise, unmatched URLs will fail.
|
||||
|
||||
### Return Value
|
||||
|
||||
|
||||
@@ -208,6 +208,71 @@ config = CrawlerRunConfig(
|
||||
|
||||
See [Virtual Scroll documentation](../../advanced/virtual-scroll.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
### I) **URL Matching Configuration**
|
||||
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|------------------------|------------------------------|-------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`url_matcher`** | `UrlMatcher` (None) | Pattern(s) to match URLs against. Can be: string (glob), function, or list of mixed types. **None means match ALL URLs** |
|
||||
| **`match_mode`** | `MatchMode` (MatchMode.OR) | How to combine multiple matchers in a list: `MatchMode.OR` (any match) or `MatchMode.AND` (all must match) |
|
||||
|
||||
The `url_matcher` parameter enables URL-specific configurations when used with `arun_many()`:
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Simple string pattern (glob-style)
|
||||
pdf_config = CrawlerRunConfig(
|
||||
url_matcher="*.pdf",
|
||||
scraping_strategy=PDFContentScrapingStrategy()
|
||||
)
|
||||
|
||||
# Multiple patterns with OR logic (default)
|
||||
blog_config = CrawlerRunConfig(
|
||||
url_matcher=["*/blog/*", "*/article/*", "*/news/*"],
|
||||
match_mode=MatchMode.OR # Any pattern matches
|
||||
)
|
||||
|
||||
# Function matcher
|
||||
api_config = CrawlerRunConfig(
|
||||
url_matcher=lambda url: 'api' in url or url.endswith('.json'),
|
||||
# Other settings like extraction_strategy
|
||||
)
|
||||
|
||||
# Mixed: String + Function with AND logic
|
||||
complex_config = CrawlerRunConfig(
|
||||
url_matcher=[
|
||||
lambda url: url.startswith('https://'), # Must be HTTPS
|
||||
"*.org/*", # Must be .org domain
|
||||
lambda url: 'docs' in url # Must contain 'docs'
|
||||
],
|
||||
match_mode=MatchMode.AND # ALL conditions must match
|
||||
)
|
||||
|
||||
# Combined patterns and functions with AND logic
|
||||
secure_docs = CrawlerRunConfig(
|
||||
url_matcher=["https://*", lambda url: '.doc' in url],
|
||||
match_mode=MatchMode.AND # Must be HTTPS AND contain .doc
|
||||
)
|
||||
|
||||
# Default config - matches ALL URLs
|
||||
default_config = CrawlerRunConfig() # No url_matcher = matches everything
|
||||
```
|
||||
|
||||
**UrlMatcher Types:**
|
||||
- **None (default)**: When `url_matcher` is None or not set, the config matches ALL URLs
|
||||
- **String patterns**: Glob-style patterns like `"*.pdf"`, `"*/api/*"`, `"https://*.example.com/*"`
|
||||
- **Functions**: `lambda url: bool` - Custom logic for complex matching
|
||||
- **Lists**: Mix strings and functions, combined with `MatchMode.OR` or `MatchMode.AND`
|
||||
|
||||
**Important Behavior:**
|
||||
- When passing a list of configs to `arun_many()`, URLs are matched against each config's `url_matcher` in order. First match wins!
|
||||
- If no config matches a URL and there's no default config (one without `url_matcher`), the URL will fail with "No matching configuration found"
|
||||
- Always include a default config as the last item if you want to handle all URLs
|
||||
|
||||
---

## 2.2 Helper Methods
|
||||
|
||||
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||
|
||||
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
43
docs/md_v2/blog/releases/0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
98
docs/md_v2/blog/releases/0.7.2.md
Normal file
@@ -0,0 +1,98 @@
|
||||
# 🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update
|
||||
|
||||
*July 25, 2025 • 3 min read*
|
||||
|
||||
---
|
||||
|
||||
This release introduces automated CI/CD pipelines for seamless releases and optimizes dependencies for a lighter, more efficient package.
|
||||
|
||||
## 🎯 What's New
|
||||
|
||||
### 🔄 Automated Release Pipeline
|
||||
- **GitHub Actions CI/CD**: Automated PyPI and Docker Hub releases on tag push
|
||||
- **Multi-platform Docker images**: Support for both AMD64 and ARM64 architectures
|
||||
- **Version consistency checks**: Ensures tag, package, and Docker versions align
|
||||
- **Automated release notes**: GitHub releases created automatically
|
||||
|
||||
### 📦 Dependency Optimization
|
||||
- **Moved sentence-transformers to optional dependencies**: Significantly reduces default installation size
|
||||
- **Lighter Docker images**: Optimized Dockerfile for faster builds and smaller images
|
||||
- **Better dependency management**: Core vs. optional dependencies clearly separated
|
||||
|
||||
## 🏗️ CI/CD Pipeline
|
||||
|
||||
The new automated release process ensures consistent, reliable releases:
|
||||
|
||||
```bash
|
||||
# Trigger releases with a simple tag
|
||||
git tag v0.7.2
|
||||
git push origin v0.7.2
|
||||
|
||||
# Automatically:
|
||||
# ✅ Validates version consistency
|
||||
# ✅ Builds and publishes to PyPI
|
||||
# ✅ Builds multi-platform Docker images
|
||||
# ✅ Pushes to Docker Hub with proper tags
|
||||
# ✅ Creates GitHub release
|
||||
```
|
||||
|
||||
## 💾 Lighter Installation
|
||||
|
||||
Default installation is now significantly smaller:
|
||||
|
||||
```bash
|
||||
# Core installation (smaller, faster)
|
||||
pip install crawl4ai==0.7.2
|
||||
|
||||
# With ML features (includes sentence-transformers)
|
||||
pip install crawl4ai[transformer]==0.7.2
|
||||
|
||||
# Full installation
|
||||
pip install crawl4ai[all]==0.7.2
|
||||
```
|
||||
|
||||
## 🐳 Docker Improvements
|
||||
|
||||
Enhanced Docker support with multi-platform images:
|
||||
|
||||
```bash
|
||||
# Pull the latest version
|
||||
docker pull unclecode/crawl4ai:0.7.2
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Available tags:
|
||||
# - unclecode/crawl4ai:0.7.2 (specific version)
|
||||
# - unclecode/crawl4ai:0.7 (minor version)
|
||||
# - unclecode/crawl4ai:0 (major version)
|
||||
# - unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
## 🔧 Technical Details
|
||||
|
||||
### Dependency Changes
|
||||
- `sentence-transformers` moved from required to optional dependencies
|
||||
- Reduces default installation by ~500MB
|
||||
- No impact on functionality when transformer features aren't needed
|
||||
|
||||
### CI/CD Configuration
|
||||
- GitHub Actions workflows for automated releases
|
||||
- Version validation before publishing
|
||||
- Parallel PyPI and Docker Hub deployments
|
||||
- Automatic tagging strategy for Docker images
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.2
|
||||
```
|
||||
|
||||
No breaking changes - direct upgrade from v0.7.0 or v0.7.1.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
- Twitter: [@unclecode](https://x.com/unclecode)
|
||||
|
||||
*P.S. The new CI/CD pipeline will make future releases faster and more reliable. Thanks for your patience as we improve our release process!*
|
||||
@@ -209,7 +209,13 @@ class CrawlerRunConfig:
|
||||
- The maximum number of concurrent crawl sessions.
|
||||
- Helps prevent overwhelming the system.
|
||||
|
||||
14. **`display_mode`**:
|
||||
14. **`url_matcher`** & **`match_mode`**:
|
||||
- Enable URL-specific configurations when used with `arun_many()`.
|
||||
- Set `url_matcher` to patterns (glob, function, or list) to match specific URLs.
|
||||
- Use `match_mode` (OR/AND) to control how multiple patterns combine.
|
||||
- See [URL-Specific Configurations](../api/arun_many.md#url-specific-configurations) for examples.
|
||||
|
||||
15. **`display_mode`**:
|
||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||
- Affects how much information is printed during the crawl.
|
||||
|
||||
|
||||
@@ -350,15 +350,22 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI uses `LXMLWebScrapingStrategy` (LXML-based) as the default scraping strategy for HTML content processing. This strategy offers excellent performance, especially for large HTML documents.
|
||||
|
||||
**Note:** For backward compatibility, `WebScrapingStrategy` is still available as an alias for `LXMLWebScrapingStrategy`.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
# Default configuration already uses LXMLWebScrapingStrategy
|
||||
config = CrawlerRunConfig()
|
||||
|
||||
# Or explicitly specify it if desired
|
||||
config_explicit = CrawlerRunConfig(
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
@@ -417,21 +424,20 @@ class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy provides excellent performance, particularly when processing large HTML documents, offering up to 10-20x faster processing compared to BeautifulSoup-based approaches.
|
||||
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
Benefits of LXML strategy:
|
||||
- Fast processing of large HTML documents (especially >100KB)
|
||||
- Efficient memory usage
|
||||
- Good handling of well-formed HTML
|
||||
- Robust table detection and extraction
|
||||
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
### Backward Compatibility
|
||||
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
For users upgrading from earlier versions:
|
||||
- `WebScrapingStrategy` is now an alias for `LXMLWebScrapingStrategy`
|
||||
- Existing code using `WebScrapingStrategy` will continue to work without modification
|
||||
- No changes are required to your existing code
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -19,13 +19,15 @@ class MarkdownGenerationResult(BaseModel):
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
fit_html: Optional[str] = None
|
||||
success: bool
|
||||
cleaned_html: Optional[str] = None
|
||||
media: Dict[str, List[Dict]] = {}
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
js_execution_result: Optional[Dict[str, Any]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
pdf: Optional[bytes] = None
|
||||
mhtml: Optional[str] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
extracted_content: Optional[str] = None
|
||||
@@ -35,6 +37,12 @@ class CrawlResult(BaseModel):
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list)
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
```
|
||||
@@ -45,11 +53,13 @@ class CrawlResult(BaseModel):
|
||||
|-------------------------------------------|-----------------------------------------------------------------------------------------------------|
|
||||
| **url (`str`)** | The final or actual URL crawled (in case of redirects). |
|
||||
| **html (`str`)** | Original, unmodified page HTML. Good for debugging or custom processing. |
|
||||
| **fit_html (`Optional[str]`)** | Preprocessed HTML optimized for extraction and content filtering. |
|
||||
| **success (`bool`)** | `True` if the crawl completed without major errors, else `False`. |
|
||||
| **cleaned_html (`Optional[str]`)** | Sanitized HTML with scripts/styles removed; can exclude tags if configured via `excluded_tags` etc. |
|
||||
| **media (`Dict[str, List[Dict]]`)** | Extracted media info (images, audio, etc.), each with attributes like `src`, `alt`, `score`, etc. |
|
||||
| **links (`Dict[str, List[Dict]]`)** | Extracted link data, split by `internal` and `external`. Each link usually has `href`, `text`, etc. |
|
||||
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
|
||||
| **js_execution_result (`Optional[Dict[str, Any]]`)** | Results from JavaScript execution during crawling. |
|
||||
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
|
||||
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
|
||||
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
|
||||
@@ -61,6 +71,11 @@ class CrawlResult(BaseModel):
|
||||
| **response_headers (`Optional[dict]`)** | HTTP response headers, if captured. |
|
||||
| **status_code (`Optional[int]`)** | HTTP status code (e.g., 200 for OK). |
|
||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||
| **redirected_url (`Optional[str]`)** | The URL after any redirects (different from `url` which is the final URL). |
|
||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -154,6 +154,30 @@ cp deploy/docker/.llm.env.example .llm.env
|
||||
# Now edit .llm.env and add your API keys
|
||||
```
|
||||
|
||||
**Flexible LLM Provider Configuration:**
|
||||
|
||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||
|
||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||
```bash
|
||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||
# Or in your .llm.env file:
|
||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||
```
|
||||
|
||||
2. **API Request Parameter**: Specify provider per request
|
||||
```json
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"f": "llm",
|
||||
"provider": "groq/mixtral-8x7b"
|
||||
}
|
||||
```
|
||||
|
||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||
|
||||
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||
@@ -668,7 +692,7 @@ app:
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
|
||||
@@ -28,11 +28,8 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
<<<<<<< HEAD
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||
=======
|
||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
>>>>>>> feature/progressive-crawling
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
@@ -117,4 +114,4 @@ Some examples may require:
|
||||
|
||||
## Contributing New Examples
|
||||
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
|
||||
|
||||
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
92
docs/md_v2/migration/webscraping-strategy-migration.md
Normal file
@@ -0,0 +1,92 @@
|
||||
# WebScrapingStrategy Migration Guide
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI has simplified its content scraping architecture. The BeautifulSoup-based `WebScrapingStrategy` has been deprecated in favor of the faster LXML-based implementation. However, **no action is required** - your existing code will continue to work.
|
||||
|
||||
## What Changed?
|
||||
|
||||
1. **`WebScrapingStrategy` is now an alias** for `LXMLWebScrapingStrategy`
|
||||
2. **The BeautifulSoup implementation has been removed** (~1000 lines of redundant code)
|
||||
3. **`LXMLWebScrapingStrategy` inherits directly** from `ContentScrapingStrategy`
|
||||
4. **Performance remains optimal** with LXML as the sole implementation
|
||||
|
||||
## Backward Compatibility
|
||||
|
||||
**Your existing code continues to work without any changes:**
|
||||
|
||||
```python
|
||||
# This still works perfectly
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, WebScrapingStrategy
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
scraping_strategy=WebScrapingStrategy() # Works as before
|
||||
)
|
||||
```
|
||||
|
||||
## Migration Options
|
||||
|
||||
You have three options:
|
||||
|
||||
### Option 1: Do Nothing (Recommended)
|
||||
Your code will continue to work. `WebScrapingStrategy` is permanently aliased to `LXMLWebScrapingStrategy`.
|
||||
|
||||
### Option 2: Update Imports (Optional)
|
||||
For clarity, you can update your imports:
|
||||
|
||||
```python
|
||||
# Old (still works)
|
||||
from crawl4ai import WebScrapingStrategy
|
||||
strategy = WebScrapingStrategy()
|
||||
|
||||
# New (more explicit)
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
strategy = LXMLWebScrapingStrategy()
|
||||
```
|
||||
|
||||
### Option 3: Use Default Configuration
|
||||
Since `LXMLWebScrapingStrategy` is the default, you can omit the strategy parameter:
|
||||
|
||||
```python
|
||||
# Simplest approach - uses LXMLWebScrapingStrategy by default
|
||||
config = CrawlerRunConfig()
|
||||
```
|
||||
|
||||
## Type Hints
|
||||
|
||||
If you use type hints, both work:
|
||||
|
||||
```python
|
||||
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
|
||||
def process_with_strategy(strategy: WebScrapingStrategy) -> None:
|
||||
# Works with both WebScrapingStrategy and LXMLWebScrapingStrategy
|
||||
pass
|
||||
|
||||
# Both are valid
|
||||
process_with_strategy(WebScrapingStrategy())
|
||||
process_with_strategy(LXMLWebScrapingStrategy())
|
||||
```
|
||||
|
||||
## Subclassing
|
||||
|
||||
If you've subclassed `WebScrapingStrategy`, it continues to work:
|
||||
|
||||
```python
|
||||
class MyCustomStrategy(WebScrapingStrategy):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
# Your custom code
|
||||
```
|
||||
|
||||
## Performance Benefits
|
||||
|
||||
By consolidating to LXML:
|
||||
- **10-20x faster** HTML parsing for large documents
|
||||
- **Lower memory usage**
|
||||
- **Consistent behavior** across all use cases
|
||||
- **Simplified maintenance** and bug fixes
|
||||
|
||||
## Summary
|
||||
|
||||
This change simplifies Crawl4AI's internals while maintaining 100% backward compatibility. Your existing code continues to work, and you get better performance automatically.
|
||||
@@ -12,11 +12,8 @@ parent_dir = os.path.dirname(
|
||||
sys.path.append(parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
from crawl4ai.content_scraping_strategy import WebScrapingStrategy
|
||||
from crawl4ai.content_scraping_strategy import (
|
||||
WebScrapingStrategy as WebScrapingStrategyCurrent,
|
||||
)
|
||||
# from crawl4ai.content_scrapping_strategy_current import WebScrapingStrategy as WebScrapingStrategyCurrent
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
# This test compares the same strategy with itself now since WebScrapingStrategy is deprecated
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -32,8 +29,8 @@ class TestResult:
|
||||
|
||||
class StrategyTester:
|
||||
def __init__(self):
|
||||
self.new_scraper = WebScrapingStrategy()
|
||||
self.current_scraper = WebScrapingStrategyCurrent()
|
||||
self.new_scraper = LXMLWebScrapingStrategy()
|
||||
self.current_scraper = LXMLWebScrapingStrategy() # Same strategy now
|
||||
with open(__location__ + "/sample_wikipedia.html", "r", encoding="utf-8") as f:
|
||||
self.WIKI_HTML = f.read()
|
||||
self.results = {"new": [], "current": []}
|
||||
|
||||
@@ -10,11 +10,13 @@ import sys
|
||||
import uuid
|
||||
import shutil
|
||||
|
||||
from crawl4ai import BrowserProfiler
|
||||
from crawl4ai.browser_manager import BrowserManager
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager, BrowserProfileManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
@@ -25,7 +27,7 @@ async def test_profile_creation():
|
||||
"""Test creating and managing browser profiles."""
|
||||
logger.info("Testing profile creation and management", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
|
||||
try:
|
||||
# List existing profiles
|
||||
@@ -83,7 +85,7 @@ async def test_profile_with_browser():
|
||||
"""Test using a profile with a browser."""
|
||||
logger.info("Testing using a profile with a browser", tag="TEST")
|
||||
|
||||
profile_manager = BrowserProfileManager(logger=logger)
|
||||
profile_manager = BrowserProfiler(logger=logger)
|
||||
test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
|
||||
profile_path = None
|
||||
|
||||
@@ -101,6 +103,8 @@ async def test_profile_with_browser():
|
||||
# Now use this profile with a browser
|
||||
browser_config = BrowserConfig(
|
||||
user_data_dir=profile_path,
|
||||
use_managed_browser=True,
|
||||
use_persistent_context=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
|
||||
55
tests/profiler/test_keyboard_handle.py
Normal file
55
tests/profiler/test_keyboard_handle.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
from unittest.mock import patch, MagicMock
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.skipif(sys.platform != "win32", reason="Windows-specific msvcrt test")
async def test_keyboard_input_handling():
    """Check that the quit listener skips a special key and stops on 'q'.

    The listener below is a local re-creation of a Windows keyboard polling
    loop, driven by mocked ``kbhit``/``getch`` callables.  The first mocked
    keystroke decodes to two characters and must be ignored; the second is
    ``q`` and must set ``user_done_event``.

    The test fails (via ``asyncio.wait_for`` timing out) if the event is not
    set within one second.
    """
    # Mock sequence of keystrokes: arrow key followed by 'q'
    mock_keys = [b'\x00K', b'q']
    mock_kbhit = MagicMock(side_effect=[True, True, False])
    mock_getch = MagicMock(side_effect=mock_keys)

    with patch('msvcrt.kbhit', mock_kbhit), patch('msvcrt.getch', mock_getch):
        user_done_event = asyncio.Event()

        # Local coroutine simulating the keyboard input handling.  It is not
        # named ``test_*`` so it cannot be mistaken for a test of its own.
        async def listen_for_quit_command():
            if sys.platform != "win32":
                return

            while True:
                try:
                    if mock_kbhit():
                        raw = mock_getch()
                        try:
                            key = raw.decode("utf-8")
                        except UnicodeDecodeError:
                            continue

                        # Multi-character or non-printable input (e.g. arrow
                        # keys) is not a command; poll again immediately.
                        if len(key) != 1 or not key.isprintable():
                            continue

                        if key.lower() == "q":
                            user_done_event.set()
                            return

                    await asyncio.sleep(0.1)
                except Exception:
                    # Yield to the event loop before retrying.  Without this
                    # await a persistent error (for instance the mocks running
                    # out of side effects) would spin forever and prevent the
                    # wait_for() timeout below from ever firing.
                    await asyncio.sleep(0.1)

        # Run the listener
        listener_task = asyncio.create_task(listen_for_quit_command())

        # Wait for the event to be set
        try:
            await asyncio.wait_for(user_done_event.wait(), timeout=1.0)
            assert user_done_event.is_set()
        finally:
            # Always reap the background task so it does not outlive the test.
            if not listener_task.done():
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass
|
||||
42
tests/test_arun_many.py
Normal file
42
tests/test_arun_many.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Test example for multiple crawler configs feature
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
|
||||
|
||||
async def test_run_many():
    """Crawl two URLs through ``arun_many`` using one shared run config.

    Prints how many results came back and, for each result, its URL and
    status code.  Nothing is asserted; this is a smoke run against live sites.
    """
    shared_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    candidate_urls = [
        "https://www.python.org/",  # Generic HTTPS page
        "https://www.kidocode.com/",  # Generic HTTPS page
        "https://www.example.com/",  # Generic HTTPS page
    ]

    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        crawled = await crawler.arun_many(
            urls=candidate_urls[:2],
            config=shared_config,
        )
        print(f"Crawled {len(crawled)} URLs with single config\n")
        for page in crawled:
            print(f" {page.url} -> {page.status_code}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_run_many())
|
||||
131
tests/test_config_matching_only.py
Normal file
131
tests/test_config_matching_only.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""
|
||||
Test only the config matching logic without running crawler
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
|
||||
def test_all_matching_scenarios():
    """Print a pass/fail mark for a series of ``CrawlerRunConfig.is_match`` cases.

    Each scenario builds a config with a particular ``url_matcher`` and
    reports, per URL, whether ``is_match`` agreed with the expected value.
    The function only prints; it does not assert.
    """

    def report(config, cases):
        # One output line per (url, expected) pair.
        for url, expected in cases:
            matched = config.is_match(url)
            mark = "✓" if matched == expected else "✗"
            print(f" {mark} {url} -> {matched}")

    separator = "=" * 50
    print("Testing CrawlerRunConfig.is_match() method")
    print(separator)

    # Test 1: Single string pattern
    print("\n1. Single string pattern (glob style)")
    report(
        CrawlerRunConfig(
            url_matcher="*.pdf",
            # For example we can set this => scraping_strategy=PDFContentScrapingStrategy()
        ),
        [
            ("https://example.com/file.pdf", True),
            ("https://example.com/doc.PDF", False),  # Case sensitive
            ("https://example.com/file.txt", False),
            ("file.pdf", True),
        ],
    )

    # Test 2: List of patterns with OR
    print("\n2. List of patterns with OR (default)")
    report(
        CrawlerRunConfig(
            url_matcher=["*/article/*", "*/blog/*", "*.html"],
            match_mode=MatchMode.OR,
        ),
        [
            ("https://example.com/article/news", True),
            ("https://example.com/blog/post", True),
            ("https://example.com/page.html", True),
            ("https://example.com/page.php", False),
        ],
    )

    # Test 3: Custom function
    print("\n3. Custom function matcher")
    report(
        CrawlerRunConfig(
            url_matcher=lambda url: 'api' in url and (url.endswith('.json') or url.endswith('.xml'))
        ),
        [
            ("https://api.example.com/data.json", True),
            ("https://api.example.com/data.xml", True),
            ("https://api.example.com/data.html", False),
            ("https://example.com/data.json", False),  # No 'api'
        ],
    )

    # Test 4: Mixed list with AND
    print("\n4. Mixed patterns and functions with AND")
    report(
        CrawlerRunConfig(
            url_matcher=[
                "https://*",  # Must be HTTPS
                lambda url: '.com' in url,  # Must have .com
                lambda url: len(url) < 50,  # Must be short
            ],
            match_mode=MatchMode.AND,
        ),
        [
            ("https://example.com/page", True),
            ("http://example.com/page", False),  # Not HTTPS
            ("https://example.org/page", False),  # No .com
            ("https://example.com/" + "x" * 50, False),  # Too long
        ],
    )

    # Test 5: Complex real-world scenario
    print("\n5. Complex pattern combinations")
    report(
        CrawlerRunConfig(
            url_matcher=[
                "*/api/v[0-9]/*",  # API versioned endpoints
                lambda url: 'graphql' in url,  # GraphQL endpoints
                "*.json",  # JSON files
            ],
            match_mode=MatchMode.OR,
        ),
        [
            ("https://example.com/api/v1/users", True),
            ("https://example.com/api/v2/posts", True),
            ("https://example.com/graphql", True),
            ("https://example.com/data.json", True),
            ("https://example.com/api/users", False),  # No version
        ],
    )

    # Test 6: Edge cases
    print("\n6. Edge cases")

    # No matcher: a falsy result is reported as the expected outcome
    no_matcher = CrawlerRunConfig().is_match("https://example.com")
    print(f" {'✗' if no_matcher else '✓'} No matcher -> {no_matcher}")

    # Empty list: a falsy result is reported as the expected outcome
    empty_list = CrawlerRunConfig(url_matcher=[]).is_match("https://example.com")
    print(f" {'✗' if empty_list else '✓'} Empty list -> {empty_list}")

    # None in list (should be skipped)
    with_none = CrawlerRunConfig(url_matcher=["*.pdf", None, "*.doc"]).is_match("test.pdf")
    print(f" {'✓' if with_none else '✗'} List with None -> {with_none}")

    print("\n" + separator)
    print("All matching tests completed!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_all_matching_scenarios()
|
||||
87
tests/test_config_selection.py
Normal file
87
tests/test_config_selection.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
|
||||
Test config selection logic in dispatchers
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import AsyncMock, MagicMock
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, MatchMode
|
||||
from crawl4ai.async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher
|
||||
|
||||
class TestDispatcher(BaseDispatcher):
    """Simple test dispatcher to verify config selection.

    It performs no crawling: ``crawl_url`` only records which config
    ``select_config`` picked for a URL.
    """

    # The class name starts with "Test", which matches pytest's default class
    # discovery pattern.  This is a helper, not a test case, so opt out of
    # collection explicitly.
    __test__ = False

    async def crawl_url(self, url, config, task_id, **kwargs):
        """Return the URL together with the ``id()`` of the selected config.

        ``config`` is passed straight to ``select_config``; ``task_id`` and
        ``kwargs`` are accepted but unused.
        """
        selected = self.select_config(url, config)
        return {"url": url, "config_id": id(selected)}

    async def run_urls(self, urls, crawler, config):
        """Call ``crawl_url`` for each URL in order and return the results.

        ``crawler`` is accepted but unused.
        """
        results = []
        for url in urls:
            results.append(await self.crawl_url(url, config, "test"))
        return results
|
||||
|
||||
async def test_dispatcher_config_selection():
    """Verify which config the dispatchers select for a given URL.

    Covers a single config, a list of configs, ``MemoryAdaptiveDispatcher``'s
    ``select_config`` called directly, and the empty-list / ``None`` inputs.
    """
    rule = "=" * 50
    print("Testing dispatcher config selection")
    print(rule)

    # Create test configs with different matchers
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")
    api_config = CrawlerRunConfig(url_matcher=lambda url: 'api' in url)
    default_config = CrawlerRunConfig()  # No matcher
    configs = [pdf_config, api_config, default_config]

    # Create test dispatcher
    dispatcher = TestDispatcher()

    # Test single config
    print("\nTest 1: Single config")
    single = await dispatcher.crawl_url("https://example.com/file.pdf", pdf_config, "test1")
    assert single["config_id"] == id(pdf_config)
    print("✓ Single config works")

    # Test config list selection
    print("\nTest 2: Config list selection")
    expected_ids = {
        "https://example.com/file.pdf": id(pdf_config),
        "https://api.example.com/data": id(api_config),
        "https://example.com/page": id(configs[0]),  # No match, uses first
    }
    for url, expected_id in expected_ids.items():
        outcome = await dispatcher.crawl_url(url, configs, "test")
        assert outcome["config_id"] == expected_id, f"URL {url} got wrong config"
        print(f"✓ {url} -> correct config selected")

    # Test with MemoryAdaptiveDispatcher
    print("\nTest 3: MemoryAdaptiveDispatcher config selection")
    mem_dispatcher = MemoryAdaptiveDispatcher()

    # Test select_config method directly
    chosen = mem_dispatcher.select_config("https://example.com/doc.pdf", configs)
    assert chosen == pdf_config
    print("✓ MemoryAdaptiveDispatcher.select_config works")

    # Empty list and None should both yield a default CrawlerRunConfig
    print("\nTest 4: Edge cases")
    edge_inputs = (
        ([], "✓ Empty config list returns default config"),
        (None, "✓ None config returns default config"),
    )
    for edge_config, success_message in edge_inputs:
        chosen = mem_dispatcher.select_config("https://example.com", edge_config)
        assert isinstance(chosen, CrawlerRunConfig)  # Should return default
        print(success_message)

    print("\n" + rule)
    print("All dispatcher tests passed! ✓")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_dispatcher_config_selection())
|
||||
122
tests/test_docker_api_with_llm_provider.py
Normal file
122
tests/test_docker_api_with_llm_provider.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test script to verify Docker API with LLM provider configuration."""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
|
||||
BASE_URL = "http://localhost:11235"
|
||||
|
||||
def test_health():
    """Test health endpoint.

    Sends a GET to ``{BASE_URL}/health`` and prints the status code and the
    decoded JSON body.  Nothing is asserted.
    """
    print("1. Testing health endpoint...")
    # Bound the wait so an unresponsive server raises instead of hanging the
    # whole script; requests applies no timeout by default.
    response = requests.get(f"{BASE_URL}/health", timeout=10)
    print(f" Status: {response.status_code}")
    print(f" Response: {response.json()}")
    print()
|
||||
|
||||
def test_schema():
    """Test schema endpoint to see configuration.

    Sends a GET to ``{BASE_URL}/schema`` and prints the status code plus the
    first five keys found under ``browser`` in the JSON body.  Nothing is
    asserted.
    """
    print("2. Testing schema endpoint...")
    # Bound the wait so an unresponsive server raises instead of hanging the
    # whole script; requests applies no timeout by default.
    response = requests.get(f"{BASE_URL}/schema", timeout=10)
    print(f" Status: {response.status_code}")
    # Print only browser config to keep output concise
    print(f" Browser config keys: {list(response.json().get('browser', {}).keys())[:5]}...")
    print()
|
||||
|
||||
def test_markdown_with_llm_filter():
    """Test markdown endpoint with LLM filter (should use configured provider).

    POSTs to ``{BASE_URL}/md`` with ``f="llm"`` and no ``provider`` field,
    then prints either the start of the error body or the length of the
    returned markdown.  Nothing is asserted.
    """
    print("3. Testing markdown endpoint with LLM filter...")
    print(" This should use the Groq provider from LLM_PROVIDER env var")

    # Note: This will fail with dummy API keys, but we can see if it tries to use Groq
    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content",
    }

    # Generous but finite limit: the request involves a crawl plus an LLM
    # call, and requests would otherwise wait forever on a stalled server.
    response = requests.post(f"{BASE_URL}/md", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()
|
||||
|
||||
def test_markdown_with_provider_override():
    """Test markdown endpoint with provider override in request.

    POSTs to ``{BASE_URL}/md`` with ``f="llm"`` and an explicit ``provider``
    field, then prints either the start of the error body or the length of
    the returned markdown.  Nothing is asserted.
    """
    print("4. Testing markdown endpoint with provider override...")
    print(" This should use OpenAI provider from request parameter")

    payload = {
        "url": "https://httpbin.org/html",
        "f": "llm",
        "q": "Extract the main content",
        "provider": "openai/gpt-4",  # Override to use OpenAI
    }

    # Generous but finite limit: the request involves a crawl plus an LLM
    # call, and requests would otherwise wait forever on a stalled server.
    response = requests.post(f"{BASE_URL}/md", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code != 200:
        print(f" Error: {response.text[:200]}...")
    else:
        print(f" Success! Markdown length: {len(response.json().get('markdown', ''))} chars")
    print()
|
||||
|
||||
def test_simple_crawl():
    """Test simple crawl without LLM.

    POSTs one URL to ``{BASE_URL}/crawl`` with a headless browser config and
    a cache-bypassing crawler config, then prints a short summary of the JSON
    response (or the start of the error body).  Nothing is asserted.
    """
    print("5. Testing simple crawl (no LLM required)...")

    payload = {
        "urls": ["https://httpbin.org/html"],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True},
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"},
        },
    }

    # Finite limit so a stalled crawl raises instead of hanging the script;
    # requests applies no timeout by default.
    response = requests.post(f"{BASE_URL}/crawl", json=payload, timeout=120)
    print(f" Status: {response.status_code}")

    if response.status_code == 200:
        result = response.json()
        print(f" Success: {result.get('success')}")
        print(f" Results count: {len(result.get('results', []))}")
        if result.get('results'):
            print(f" First result success: {result['results'][0].get('success')}")
    else:
        print(f" Error: {response.text[:200]}...")
    print()
|
||||
|
||||
def test_playground():
    """Test if playground is accessible.

    Sends a GET to ``{BASE_URL}/playground`` and prints the status code and
    the ``content-type`` response header.  Nothing is asserted.
    """
    print("6. Testing playground interface...")
    # Bound the wait so an unresponsive server raises instead of hanging the
    # whole script; requests applies no timeout by default.
    response = requests.get(f"{BASE_URL}/playground", timeout=10)
    print(f" Status: {response.status_code}")
    print(f" Content-Type: {response.headers.get('content-type')}")
    print()
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the non-LLM checks first, then the LLM ones.
    print("=== Crawl4AI Docker API Tests ===\n")
    print(f"Testing API at {BASE_URL}\n")

    # Wait a bit for server to be fully ready
    time.sleep(2)

    for basic_check in (test_health, test_schema, test_simple_crawl, test_playground):
        basic_check()

    print("\nTesting LLM functionality (these may fail with dummy API keys):\n")
    for llm_check in (test_markdown_with_llm_filter, test_markdown_with_provider_override):
        llm_check()

    print("\nTests completed!")
|
||||
71
tests/test_memory_macos.py
Executable file
71
tests/test_memory_macos.py
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test script to verify macOS memory calculation accuracy."""
|
||||
|
||||
import psutil
|
||||
import platform
|
||||
import time
|
||||
from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb
|
||||
|
||||
|
||||
def test_memory_calculation():
    """Print psutil's memory figures next to the platform-aware ones.

    Reports total/used/available memory from both sources, shows for three
    fixed thresholds whether each source would be at or above it, and then
    samples both usage percentages once per second for ten seconds.
    Nothing is asserted.
    """
    bytes_per_gb = 1024**3

    print(f"Platform: {platform.system()}")
    print(f"Python version: {platform.python_version()}")
    print("-" * 60)

    # Get psutil's view
    snapshot = psutil.virtual_memory()
    psutil_percent = snapshot.percent
    psutil_available_gb = snapshot.available / bytes_per_gb
    total_gb = snapshot.total / bytes_per_gb

    # Get our corrected view
    true_percent = get_true_memory_usage_percent()
    true_available_gb = get_true_available_memory_gb()
    # Called and unpacked as a three-value result; the values are not used.
    _percent_calc, _available_calc, _total_calc = get_memory_stats()

    print("Memory Statistics Comparison:")
    print(f"Total Memory: {total_gb:.2f} GB")
    print()

    print("PSUtil (Standard) Calculation:")
    print(f" - Memory Used: {psutil_percent:.1f}%")
    print(f" - Available: {psutil_available_gb:.2f} GB")
    print()

    reclaimable_gb = true_available_gb - psutil_available_gb
    print("Platform-Aware Calculation:")
    print(f" - Memory Used: {true_percent:.1f}%")
    print(f" - Available: {true_available_gb:.2f} GB")
    print(f" - Difference: {reclaimable_gb:.2f} GB of reclaimable memory")
    print()

    # Show the impact on dispatcher behavior
    print("Impact on MemoryAdaptiveDispatcher:")
    thresholds = {
        "Normal": 90.0,
        "Critical": 95.0,
        "Recovery": 85.0,
    }

    for name, threshold in thresholds.items():
        psutil_triggered = psutil_percent >= threshold
        true_triggered = true_percent >= threshold
        print(f" - {name} Threshold ({threshold}%):")
        print(f" PSUtil: {'TRIGGERED' if psutil_triggered else 'OK'}")
        print(f" Platform-Aware: {'TRIGGERED' if true_triggered else 'OK'}")
        if psutil_triggered != true_triggered:
            print(f" → Difference: Platform-aware prevents false {'pressure' if psutil_triggered else 'recovery'}")
    print()

    # Monitor for a few seconds
    print("Monitoring memory for 10 seconds...")
    for second in range(1, 11):
        snapshot = psutil.virtual_memory()
        true_pct = get_true_memory_usage_percent()
        # "\r" keeps the sample on a single, continuously overwritten line.
        print(f" {second}s - PSUtil: {snapshot.percent:.1f}% | Platform-Aware: {true_pct:.1f}%", end="\r")
        time.sleep(1)
    print("\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_memory_calculation()
|
||||
117
tests/test_multi_config.py
Normal file
117
tests/test_multi_config.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
Test example for multiple crawler configs feature
|
||||
"""
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, MatchMode, CacheMode
|
||||
|
||||
async def test_multi_config():
    """Exercise ``arun_many`` with one config and with lists of configs.

    First prints, for each sample URL, the index of the first config whose
    ``is_match`` accepts it (or a fallback note).  Then runs three live
    crawls: single config, a list of configs, and a list containing an inline
    lambda matcher.  Only result counts are printed; nothing is asserted.
    """
    # Create different configs for different URL patterns

    # Config for PDF files
    pdf_config = CrawlerRunConfig(url_matcher="*.pdf")

    # Config for articles (using multiple patterns with OR logic)
    article_config = CrawlerRunConfig(
        url_matcher=["*/news/*", "*blog*", "*/article/*"],
        match_mode=MatchMode.OR,
        screenshot=True,
    )

    # Config using custom matcher function
    api_config = CrawlerRunConfig(
        url_matcher=lambda url: 'api' in url or 'json' in url,
    )

    # Config combining patterns and functions with AND logic
    secure_docs_config = CrawlerRunConfig(
        url_matcher=[
            "*.doc*",  # Matches .doc, .docx
            lambda url: url.startswith('https://'),  # Must be HTTPS
        ],
        match_mode=MatchMode.AND,
    )

    # Default config (no url_matcher means it won't match anything unless it's the fallback)
    default_config = CrawlerRunConfig()

    # List of configs - order matters! First match wins
    configs = [
        pdf_config,
        article_config,
        api_config,
        secure_docs_config,
        default_config,  # Fallback
    ]

    # Test URLs - using real URLs that exist
    test_urls = [
        "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",  # Real PDF
        "https://www.bbc.com/news/articles/c5y3e3glnldo",  # News article
        "https://blog.python.org/",  # Blog URL
        "https://api.github.com/users/github",  # GitHub API (returns JSON)
        "https://httpbin.org/json",  # API endpoint that returns JSON
        "https://www.python.org/",  # Generic HTTPS page
        "http://info.cern.ch/",  # HTTP (not HTTPS) page
        "https://example.com/",  # → Default config
    ]

    # Test the matching logic
    print("Config matching test:")
    print("-" * 50)
    for url in test_urls:
        # Index of the first matching config; evaluation stops at the first hit.
        first_hit = next(
            (index for index, candidate in enumerate(configs) if candidate.is_match(url)),
            None,
        )
        if first_hit is None:
            print(f"{url} -> No match, will use fallback (first config)")
        else:
            print(f"{url} -> Config {first_hit} matches")

    print("\n" + "=" * 50 + "\n")

    # Now test with actual crawler
    async with AsyncWebCrawler() as crawler:
        # Single config - traditional usage still works
        print("Test 1: Single config (backwards compatible)")
        single_run = await crawler.arun_many(
            urls=["https://www.python.org/"],
            config=default_config,
        )
        print(f"Crawled {len(single_run)} URLs with single config\n")

        # Multiple configs - new feature
        print("Test 2: Multiple configs")
        # Just test with 2 URLs to avoid timeout
        multi_run = await crawler.arun_many(
            urls=test_urls[:2],  # Just test first 2 URLs
            config=configs,  # Pass list of configs
        )
        print(f"Crawled {len(multi_run)} URLs with multiple configs")

        # Using custom matcher inline
        print("\nTest 3: Inline custom matcher")
        custom_config = CrawlerRunConfig(
            url_matcher=lambda url: len(url) > 50 and 'python' in url.lower(),
            verbose=False,
        )
        inline_urls = [
            "https://docs.python.org/3/library/asyncio.html",  # Long URL with 'python'
            "https://python.org/",  # Short URL with 'python' - won't match
            "https://www.google.com/",  # No 'python' - won't match
        ]
        custom_run = await crawler.arun_many(
            urls=inline_urls,
            config=[custom_config, default_config],
        )
        print(f"Crawled {len(custom_run)} URLs with custom matcher")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_multi_config())
|
||||
Reference in New Issue
Block a user