feat(logger): add abstract logger base class and file logger implementation

Add an AsyncLoggerBase abstract class to standardize the logger interface, and introduce AsyncFileLogger for file-only logging. Remove the always_bypass_cache parameter along with its deprecated always_by_pass_cache alias, and clean up AsyncWebCrawler initialization.

BREAKING CHANGE: Removed the 'always_bypass_cache' parameter and the deprecated 'always_by_pass_cache' alias. Control caching per run via CrawlerRunConfig.cache_mode instead.
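A minimal migration sketch for callers of the removed flag (the URL is illustrative):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # Before this commit: AsyncWebCrawler(always_bypass_cache=True)
    # Now: control caching per request through CrawlerRunConfig
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.success)

if __name__ == "__main__":
    asyncio.run(main())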
UncleCode
2025-02-23 21:23:41 +08:00
parent 46d2f12851
commit c6d48080a4
7 changed files with 198 additions and 63 deletions

View File: crawl4ai/__init__.py

@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
     WebScrapingStrategy,
     LXMLWebScrapingStrategy,
 )
+from .async_logger import (
+    AsyncLoggerBase,
+    AsyncLogger,
+)
 from .proxy_strategy import (
     ProxyRotationStrategy,
     RoundRobinProxyStrategy,

@@ -59,6 +63,8 @@ from .deep_crawling import (
 )

 __all__ = [
+    "AsyncLoggerBase",
+    "AsyncLogger",
     "AsyncWebCrawler",
     "DeepCrawlStrategy",
     "BFSDeepCrawlStrategy",

View File: crawl4ai/__version__.py

@@ -1,3 +1,2 @@
 # crawl4ai/_version.py
-# __version__ = "0.4.3b3"
-__version__ = "0.4.300"
+__version__ = "0.5.0"

View File: crawl4ai/async_logger.py

@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Optional, Dict, Any
 from colorama import Fore, Style, init

@@ -13,7 +14,37 @@ class LogLevel(Enum):
     ERROR = 5


-class AsyncLogger:
+class AsyncLoggerBase(ABC):
+    @abstractmethod
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        pass
+
+    @abstractmethod
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        pass
+
+    @abstractmethod
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        pass
+
+    @abstractmethod
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        pass
+
+    @abstractmethod
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        pass
+
+    @abstractmethod
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        pass
+
+    @abstractmethod
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        pass
+
+
+class AsyncLogger(AsyncLoggerBase):
     """
     Asynchronous logger with support for colored console output and file logging.
     Supports templated messages with colored components.

@@ -225,3 +256,55 @@ class AsyncLogger:
             tag=tag,
             params={"url": url, "url_length": url_length, "error": error},
         )
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)

View File: crawl4ai/async_webcrawler.py

@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
 import os
 import sys
 import time
-import warnings
 from colorama import Fore
 from pathlib import Path
 from typing import Optional, List

@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
     MarkdownGenerationStrategy,
 )
 from .deep_crawling import DeepCrawlDecorator
-from .async_logger import AsyncLogger
+from .async_logger import AsyncLogger, AsyncLoggerBase
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_dispatcher import *  # noqa: F403
 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
         await crawler.close()
     ```

     Attributes:
         browser_config (BrowserConfig): Configuration object for browser settings.
         crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
         logger (AsyncLogger): Logger instance for recording events and errors.
-        always_bypass_cache (bool): Whether to always bypass cache.
         crawl4ai_folder (str): Directory for storing cache.
         base_directory (str): Base directory for storing cache.
         ready (bool): Whether the crawler is ready for use.

     Methods:
         start(): Start the crawler explicitly without using context manager.
         close(): Close the crawler explicitly without using context manager.
         arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
         awarmup(): Perform warmup sequence.
         arun_many(): Run the crawler for multiple sources.
         aprocess_html(): Process HTML content.

     Typical Usage:
         async with AsyncWebCrawler() as crawler:
@@ -116,50 +114,30 @@ class AsyncWebCrawler:
     def __init__(
         self,
-        crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
-        config: Optional[BrowserConfig] = None,
-        always_bypass_cache: bool = False,
-        always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
+        crawler_strategy: AsyncCrawlerStrategy = None,
+        config: BrowserConfig = None,
         base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         thread_safe: bool = False,
+        logger: AsyncLoggerBase = None,
         **kwargs,
     ):
         """
         Initialize the AsyncWebCrawler.

         Args:
-            crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
-            config: Configuration object for browser settings. If None, will be created from kwargs
-            always_bypass_cache: Whether to always bypass cache (new parameter)
-            always_by_pass_cache: Deprecated, use always_bypass_cache instead
+            crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
+            config: Configuration object for browser settings. Default BrowserConfig()
             base_directory: Base directory for storing cache
             thread_safe: Whether to use thread-safe operations
             **kwargs: Additional arguments for backwards compatibility
         """
         # Handle browser configuration
-        browser_config = config
-        if browser_config is not None:
-            if any(
-                k in kwargs
-                for k in [
-                    "browser_type",
-                    "headless",
-                    "viewport_width",
-                    "viewport_height",
-                ]
-            ):
-                self.logger.warning(
-                    message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
-                    tag="WARNING",
-                )
-        else:
-            # Create browser config from kwargs for backwards compatibility
-            browser_config = BrowserConfig.from_kwargs(kwargs)
+        browser_config = config or BrowserConfig()
         self.browser_config = browser_config

         # Initialize logger first since other components may need it
-        self.logger = AsyncLogger(
+        self.logger = logger or AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
             verbose=self.browser_config.verbose,
             tag_width=10,
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
             **params,  # Pass remaining kwargs for backwards compatibility
         )
-
-        # If craweler strategy doesnt have logger, use crawler logger
-        if not self.crawler_strategy.logger:
-            self.crawler_strategy.logger = self.logger
-
-        # Handle deprecated cache parameter
-        if always_by_pass_cache is not None:
-            if kwargs.get("warning", True):
-                warnings.warn(
-                    "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
-                    "Use 'always_bypass_cache' instead. "
-                    "Pass warning=False to suppress this warning.",
-                    DeprecationWarning,
-                    stacklevel=2,
-                )
-            self.always_bypass_cache = always_by_pass_cache
-        else:
-            self.always_bypass_cache = always_bypass_cache

         # Thread safety setup
         self._lock = asyncio.Lock() if thread_safe else None
@@ -356,7 +316,7 @@ class AsyncWebCrawler:
             # Create cache context
             cache_context = CacheContext(
-                url, config.cache_mode, self.always_bypass_cache
+                url, config.cache_mode, False
             )

             # Initialize processing variables
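With the new logger parameter shown above, any AsyncLoggerBase implementation can replace the default console logger. A minimal sketch (the relative log path is illustrative):

from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_logger import AsyncFileLogger

crawler = AsyncWebCrawler(
    config=BrowserConfig(headless=True),
    logger=AsyncFileLogger("crawl.log"),  # file-only: nothing is printed to the console
)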

View File: crawl4ai/hub.py

@@ -1,6 +1,6 @@
 # crawl4ai/hub.py
 from abc import ABC, abstractmethod
-from typing import Dict, Type
+from typing import Dict, Type, Union
 import logging
 import importlib
 from pathlib import Path

@@ -63,7 +63,7 @@ class CrawlerHub:
         cls._crawlers[name] = obj

     @classmethod
-    def get(cls, name: str) -> Type[BaseCrawler] | None:
+    def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
         if not cls._crawlers:
             cls._discover_crawlers()
         return cls._crawlers.get(name)
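Note: the Union[...] spelling keeps hub.py importable on Python versions below 3.10, where the X | Y annotation syntax raises a TypeError at runtime.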

View File

@@ -1,5 +1,12 @@
 import asyncio
-from crawl4ai import *
+from crawl4ai import (
+    AsyncWebCrawler,
+    BrowserConfig,
+    CrawlerRunConfig,
+    CacheMode,
+    DefaultMarkdownGenerator,
+    PruningContentFilter,
+)

 async def main():

View File

@@ -0,0 +1,80 @@
+import asyncio
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
+import os
+from datetime import datetime
+
+
+class AsyncFileLogger(AsyncLoggerBase):
+    """
+    File-only asynchronous logger that writes logs to a specified file.
+    """
+
+    def __init__(self, log_file: str):
+        """
+        Initialize the file logger.
+
+        Args:
+            log_file: File path for logging
+        """
+        self.log_file = log_file
+        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _write_to_file(self, level: str, message: str, tag: str):
+        """Write a message to the log file."""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
+        with open(self.log_file, "a", encoding="utf-8") as f:
+            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message to file."""
+        self._write_to_file("DEBUG", message, tag)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message to file."""
+        self._write_to_file("INFO", message, tag)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message to file."""
+        self._write_to_file("SUCCESS", message, tag)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message to file."""
+        self._write_to_file("WARNING", message, tag)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message to file."""
+        self._write_to_file("ERROR", message, tag)
+
+    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
+        """Log URL fetch status to file."""
+        status = "SUCCESS" if success else "FAILED"
+        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
+        self._write_to_file("URL_STATUS", message, tag)
+
+    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
+        """Log error status to file."""
+        message = f"{url[:url_length]}... | Error: {error}"
+        self._write_to_file("ERROR", message, tag)
+
+
+async def main():
+    browser_config = BrowserConfig(headless=True, verbose=True)
+    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log"))
+    await crawler.start()
+
+    try:
+        crawl_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+        )
+        # Use the crawler multiple times
+        result = await crawler.arun(
+            url='https://kidocode.com/',
+            config=crawl_config
+        )
+        if result.success:
+            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))
+    finally:
+        # Always ensure we close the crawler
+        await crawler.close()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
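The AsyncFileLogger defined inline here mirrors the class added to async_logger.py above; importing it instead (e.g. from crawl4ai.async_logger import AsyncFileLogger, assuming that module path) would work equally well.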