Files
crawl4ai/crawl4ai/cache_context.py
UncleCode 3a66aa8a60 feat(cache): introduce CacheMode and CacheContext for enhanced caching behavior
chore(requirements): add colorama dependency
refactor(config): add SHOW_DEPRECATION_WARNINGS flag and clean up code
fix(docs): update example scripts for clarity and consistency
2024-11-17 15:30:56 +08:00

80 lines
2.6 KiB
Python

from enum import Enum
class CacheMode(Enum):
"""
Defines the caching behavior for web crawling operations.
Modes:
- ENABLED: Normal caching behavior (read and write)
- DISABLED: No caching at all
- READ_ONLY: Only read from cache, don't write
- WRITE_ONLY: Only write to cache, don't read
- BYPASS: Bypass cache for this operation
"""
ENABLED = "enabled"
DISABLED = "disabled"
READ_ONLY = "read_only"
WRITE_ONLY = "write_only"
BYPASS = "bypass"
class CacheContext:
"""
Encapsulates cache-related decisions and URL handling.
This class centralizes all cache-related logic and URL type checking,
making the caching behavior more predictable and maintainable.
"""
def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
self.url = url
self.cache_mode = cache_mode
self.always_bypass = always_bypass
self.is_cacheable = url.startswith(('http://', 'https://', 'file://'))
self.is_web_url = url.startswith(('http://', 'https://'))
self.is_local_file = url.startswith("file://")
self.is_raw_html = url.startswith("raw:")
self._url_display = url if not self.is_raw_html else "Raw HTML"
def should_read(self) -> bool:
"""Determines if cache should be read based on context."""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
def should_write(self) -> bool:
"""Determines if cache should be written based on context."""
if self.always_bypass or not self.is_cacheable:
return False
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
@property
def display_url(self) -> str:
"""Returns the URL in display format."""
return self._url_display
def _legacy_to_cache_mode(
disable_cache: bool = False,
bypass_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False
) -> CacheMode:
"""
Converts legacy cache parameters to the new CacheMode enum.
This is an internal function to help transition from the old boolean flags
to the new CacheMode system.
"""
if disable_cache:
return CacheMode.DISABLED
if bypass_cache:
return CacheMode.BYPASS
if no_cache_read and no_cache_write:
return CacheMode.DISABLED
if no_cache_read:
return CacheMode.WRITE_ONLY
if no_cache_write:
return CacheMode.READ_ONLY
return CacheMode.ENABLED