feat(logger): add abstract logger base class and file logger implementation

Add an AsyncLoggerBase abstract class to standardize the logger interface, and introduce AsyncFileLogger for file-only logging. Remove the always_bypass_cache parameter along with its deprecated always_by_pass_cache alias, and clean up AsyncWebCrawler initialization.

BREAKING CHANGE: Removed the 'always_bypass_cache' parameter and the deprecated 'always_by_pass_cache' alias. Control caching per run via CrawlerRunConfig.cache_mode instead.
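A minimal migration sketch for callers of the removed flag (the URL is illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # Before this commit: AsyncWebCrawler(always_bypass_cache=True)
    # Now: control caching per request through CrawlerRunConfig
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.success)

if __name__ == "__main__":
    asyncio.run(main())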
UncleCode
2025-02-23 21:23:41 +08:00
parent 46d2f12851
commit c6d48080a4
7 changed files with 198 additions and 63 deletions

View File: crawl4ai/__init__.py

@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .async_logger import (
+    AsyncLoggerBase,
+    AsyncLogger,
+)
 from .proxy_strategy import (
     ProxyRotationStrategy,
     RoundRobinProxyStrategy,

@@ -59,6 +63,8 @@ from .deep_crawling import (
 )

 __all__ = [
+    "AsyncLoggerBase",
+    "AsyncLogger",
     "AsyncWebCrawler",
     "DeepCrawlStrategy",
     "BFSDeepCrawlStrategy",

View File: crawl4ai/__version__.py

@@ -1,3 +1,2 @@
 # crawl4ai/_version.py
-# __version__ = "0.4.3b3"
-__version__ = "0.4.300"
+__version__ = "0.5.0"

View File: crawl4ai/async_logger.py

@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Optional, Dict, Any
 from colorama import Fore, Style, init

@@ -13,7 +14,37 @@ class LogLevel(Enum):
     ERROR = 5


-class AsyncLogger:
+class AsyncLoggerBase(ABC):
+    @abstractmethod
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        pass
+
+    @abstractmethod
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        pass
+
+    @abstractmethod
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        pass
+
+    @abstractmethod
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        pass
+
+    @abstractmethod
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        pass
+
+    @abstractmethod
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        pass
+
+    @abstractmethod
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        pass
+
+
+class AsyncLogger(AsyncLoggerBase):
     """
     Asynchronous logger with support for colored console output and file logging.
     Supports templated messages with colored components.

@@ -225,3 +256,55 @@ class AsyncLogger:
             tag=tag,
             params={"url": url, "url_length": url_length, "error": error},
         )
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)

View File: crawl4ai/async_webcrawler.py

@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
 import os
 import sys
 import time
-import warnings
 from colorama import Fore
 from pathlib import Path
 from typing import Optional, List

@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
     MarkdownGenerationStrategy,
 )
 from .deep_crawling import DeepCrawlDecorator
-from .async_logger import AsyncLogger
+from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import *  # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
         await crawler.close()
     ```

     Attributes:
         browser_config (BrowserConfig): Configuration object for browser settings.
         crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
         logger (AsyncLogger): Logger instance for recording events and errors.
-        always_bypass_cache (bool): Whether to always bypass cache.
         crawl4ai_folder (str): Directory for storing cache.
         base_directory (str): Base directory for storing cache.
         ready (bool): Whether the crawler is ready for use.

     Methods:
         start(): Start the crawler explicitly without using context manager.
         close(): Close the crawler explicitly without using context manager.
         arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
         awarmup(): Perform warmup sequence.
         arun_many(): Run the crawler for multiple sources.
         aprocess_html(): Process HTML content.

     Typical Usage:
         async with AsyncWebCrawler() as crawler:
@@ -116,50 +114,30 @@ class AsyncWebCrawler:
     def __init__(
         self,
-        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
-        config: Optional[BrowserConfig] = None,
-        always_bypass_cache: bool = False,
-        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
+        crawler_strategy: AsyncCrawlerStrategy = None,
+        config: BrowserConfig = None,
         base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         thread_safe: bool = False,
+        logger: AsyncLoggerBase = None,
         **kwargs,
     ):
         """
         Initialize the AsyncWebCrawler.

         Args:
-            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
-            config: Configuration object for browser settings. If None, will be created from kwargs
-            always_bypass_cache: Whether to always bypass cache (new parameter)
-            always_by_pass_cache: Deprecated, use always_bypass_cache instead
+            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
+            config: Configuration object for browser settings. Default BrowserConfig()
             base_directory: Base directory for storing cache
             thread_safe: Whether to use thread-safe operations
             **kwargs: Additional arguments for backwards compatibility
         """
         # Handle browser configuration
-        browser_config = config
-        if browser_config is not None:
-            if any(
-                k in kwargs
-                for k in [
-                    "browser_type",
-                    "headless",
-                    "viewport_width",
-                    "viewport_height",
-                ]
-            ):
-                self.logger.warning(
-                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
-                    tag="WARNING",
-                )
-        else:
-            # Create browser config from kwargs for backwards compatibility
-            browser_config = BrowserConfig.from_kwargs(kwargs)
+        browser_config = config or BrowserConfig()
         self.browser_config = browser_config

         # Initialize logger first since other components may need it
-        self.logger = AsyncLogger(
+        self.logger = logger or AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
             verbose=self.browser_config.verbose,
             tag_width=10,
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
             **params,  # Pass remaining kwargs for backwards compatibility
         )
-
-        # If craweler strategy doesnt have logger, use crawler logger
-        if not self.crawler_strategy.logger:
-            self.crawler_strategy.logger = self.logger
-
-        # Handle deprecated cache parameter
-        if always_by_pass_cache is not None:
-            if kwargs.get("warning", True):
-                warnings.warn(
-                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
-                    "Use 'always_bypass_cache' instead. "
-                    "Pass warning=False to suppress this warning.",
-                    DeprecationWarning,
-                    stacklevel=2,
-                )
-            self.always_bypass_cache = always_by_pass_cache
-        else:
-            self.always_bypass_cache = always_bypass_cache

         # Thread safety setup
         self._lock = asyncio.Lock() if thread_safe else None
@@ -356,7 +316,7 @@ class AsyncWebCrawler:
             # Create cache context
             cache_context = CacheContext(
-                url, config.cache_mode, self.always_bypass_cache
+                url, config.cache_mode, False
             )

             # Initialize processing variables
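With the new logger parameter shown above, any AsyncLoggerBase implementation can replace the default console logger. A minimal sketch (the relative log path is illustrative):

from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_logger import AsyncFileLogger

crawler = AsyncWebCrawler(
    config=BrowserConfig(headless=True),
    logger=AsyncFileLogger("crawl.log"),  # file-only: nothing is printed to the console
)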

View File: crawl4ai/hub.py

@@ -1,6 +1,6 @@
 # crawl4ai/hub.py
 from abc import ABC, abstractmethod
-from typing import Dict, Type
+from typing import Dict, Type, Union
 import logging
 import importlib
 from pathlib import Path

@@ -63,7 +63,7 @@ class CrawlerHub:
         cls._crawlers[name] = obj

     @classmethod
-    def get(cls, name: str) -> Type[BaseCrawler] | None:
+    def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
         if not cls._crawlers:
             cls._discover_crawlers()
         return cls._crawlers.get(name)
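Note: the Union[...] spelling keeps hub.py importable on Python versions below 3.10, where the X | Y annotation syntax raises a TypeError at runtime.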

View File

@@ -1,5 +1,12 @@
 import asyncio
-from crawl4ai import *
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter,
+)

 async def main():

View File

@@ -0,0 +1,80 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
+import os
+from datetime import datetime
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)
+
+
+async def main():
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
+    await crawler.start()
+
+    try:
+        crawl_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+        )
+        # Use the crawler multiple times
+        result = await crawler.arun(
+            url='https://kidocode.com/',
+            config=crawl_config
+        )
+        if result.success:
+            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))
+    finally:
+        # Always ensure we close the crawler
+        await crawler.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
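The AsyncFileLogger defined inline here mirrors the class added to async_logger.py above; importing it instead (e.g. from crawl4ai.async_logger import AsyncFileLogger, assuming that module path) would work equally well.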