diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 746df82b..12322540 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -8,6 +8,10 @@ from .content_scraping_strategy import ( WebScrapingStrategy, LXMLWebScrapingStrategy, ) +from .async_logger import ( + AsyncLoggerBase, + AsyncLogger, +) from .proxy_strategy import ( ProxyRotationStrategy, RoundRobinProxyStrategy, @@ -59,6 +63,8 @@ from .deep_crawling import ( ) __all__ = [ + "AsyncLoggerBase", + "AsyncLogger", "AsyncWebCrawler", "DeepCrawlStrategy", "BFSDeepCrawlStrategy", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 58b9c4ec..77366e02 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,2 @@ # crawl4ai/_version.py -# __version__ = "0.4.3b3" -__version__ = "0.4.300" +__version__ = "0.5.0" diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 0e049289..6f89c217 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod from enum import Enum from typing import Optional, Dict, Any from colorama import Fore, Style, init @@ -13,7 +14,37 @@ class LogLevel(Enum): ERROR = 5 -class AsyncLogger: + +class AsyncLoggerBase(ABC): + @abstractmethod + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + pass + + @abstractmethod + def info(self, message: str, tag: str = "INFO", **kwargs): + pass + + @abstractmethod + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + pass + + @abstractmethod + def warning(self, message: str, tag: str = "WARNING", **kwargs): + pass + + @abstractmethod + def error(self, message: str, tag: str = "ERROR", **kwargs): + pass + + @abstractmethod + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + pass + + @abstractmethod + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + pass + +class AsyncLogger(AsyncLoggerBase): """ Asynchronous logger with support for colored console output and file logging. Supports templated messages with colored components. @@ -225,3 +256,55 @@ class AsyncLogger: tag=tag, params={"url": url, "url_length": url_length, "error": error}, ) + +class AsyncFileLogger(AsyncLoggerBase): + """ + File-only asynchronous logger that writes logs to a specified file. + """ + + def __init__(self, log_file: str): + """ + Initialize the file logger. + + Args: + log_file: File path for logging + """ + self.log_file = log_file + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _write_to_file(self, level: str, message: str, tag: str): + """Write a message to the log file.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + with open(self.log_file, "a", encoding="utf-8") as f: + f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n") + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message to file.""" + self._write_to_file("DEBUG", message, tag) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message to file.""" + self._write_to_file("INFO", message, tag) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message to file.""" + self._write_to_file("SUCCESS", message, tag) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message to file.""" + self._write_to_file("WARNING", message, tag) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message to file.""" + self._write_to_file("ERROR", message, tag) + + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + """Log URL fetch status to file.""" + status = "SUCCESS" if success else "FAILED" + message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" + self._write_to_file("URL_STATUS", message, tag) + + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + """Log error status to file.""" + message = f"{url[:url_length]}... | Error: {error}" + self._write_to_file("ERROR", message, tag) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index c68036e8..1a4cdcef 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version import os import sys import time -import warnings from colorama import Fore from pathlib import Path from typing import Optional, List @@ -30,7 +29,7 @@ from .markdown_generation_strategy import ( MarkdownGenerationStrategy, ) from .deep_crawling import DeepCrawlDecorator -from .async_logger import AsyncLogger +from .async_logger import AsyncLogger, AsyncLoggerBase from .async_configs import BrowserConfig, CrawlerRunConfig from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter @@ -80,22 +79,21 @@ class AsyncWebCrawler: await crawler.close() ``` - Attributes: + Attributes: browser_config (BrowserConfig): Configuration object for browser settings. crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. logger (AsyncLogger): Logger instance for recording events and errors. - always_bypass_cache (bool): Whether to always bypass cache. crawl4ai_folder (str): Directory for storing cache. base_directory (str): Base directory for storing cache. ready (bool): Whether the crawler is ready for use. - Methods: - start(): Start the crawler explicitly without using context manager. - close(): Close the crawler explicitly without using context manager. - arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). - awarmup(): Perform warmup sequence. - arun_many(): Run the crawler for multiple sources. - aprocess_html(): Process HTML content. + Methods: + start(): Start the crawler explicitly without using context manager. + close(): Close the crawler explicitly without using context manager. + arun(): Run the crawler for a single source: URL (web, local file, or raw HTML). + awarmup(): Perform warmup sequence. + arun_many(): Run the crawler for multiple sources. + aprocess_html(): Process HTML content. Typical Usage: async with AsyncWebCrawler() as crawler: @@ -116,50 +114,30 @@ class AsyncWebCrawler: def __init__( self, - crawler_strategy: Optional[AsyncCrawlerStrategy] = None, - config: Optional[BrowserConfig] = None, - always_bypass_cache: bool = False, - always_by_pass_cache: Optional[bool] = None, # Deprecated parameter + crawler_strategy: AsyncCrawlerStrategy = None, + config: BrowserConfig = None, base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), thread_safe: bool = False, + logger: AsyncLoggerBase = None, **kwargs, ): """ Initialize the AsyncWebCrawler. Args: - crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy - config: Configuration object for browser settings. If None, will be created from kwargs - always_bypass_cache: Whether to always bypass cache (new parameter) - always_by_pass_cache: Deprecated, use always_bypass_cache instead + crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy + config: Configuration object for browser settings. Default BrowserConfig() base_directory: Base directory for storing cache thread_safe: Whether to use thread-safe operations **kwargs: Additional arguments for backwards compatibility """ # Handle browser configuration - browser_config = config - if browser_config is not None: - if any( - k in kwargs - for k in [ - "browser_type", - "headless", - "viewport_width", - "viewport_height", - ] - ): - self.logger.warning( - message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.", - tag="WARNING", - ) - else: - # Create browser config from kwargs for backwards compatibility - browser_config = BrowserConfig.from_kwargs(kwargs) + browser_config = config or BrowserConfig() self.browser_config = browser_config # Initialize logger first since other components may need it - self.logger = AsyncLogger( + self.logger = logger or AsyncLogger( log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), verbose=self.browser_config.verbose, tag_width=10, @@ -173,24 +151,6 @@ class AsyncWebCrawler: **params, # Pass remaining kwargs for backwards compatibility ) - # If craweler strategy doesnt have logger, use crawler logger - if not self.crawler_strategy.logger: - self.crawler_strategy.logger = self.logger - - # Handle deprecated cache parameter - if always_by_pass_cache is not None: - if kwargs.get("warning", True): - warnings.warn( - "'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " - "Use 'always_bypass_cache' instead. " - "Pass warning=False to suppress this warning.", - DeprecationWarning, - stacklevel=2, - ) - self.always_bypass_cache = always_by_pass_cache - else: - self.always_bypass_cache = always_bypass_cache - # Thread safety setup self._lock = asyncio.Lock() if thread_safe else None @@ -356,7 +316,7 @@ class AsyncWebCrawler: # Create cache context cache_context = CacheContext( - url, config.cache_mode, self.always_bypass_cache + url, config.cache_mode, False ) # Initialize processing variables diff --git a/crawl4ai/hub.py b/crawl4ai/hub.py index e4b0fa3e..75056df7 100644 --- a/crawl4ai/hub.py +++ b/crawl4ai/hub.py @@ -1,6 +1,6 @@ # crawl4ai/hub.py from abc import ABC, abstractmethod -from typing import Dict, Type +from typing import Dict, Type, Union import logging import importlib from pathlib import Path @@ -63,7 +63,7 @@ class CrawlerHub: cls._crawlers[name] = obj @classmethod - def get(cls, name: str) -> Type[BaseCrawler] | None: + def get(cls, name: str) -> Union[Type[BaseCrawler], None]: if not cls._crawlers: cls._discover_crawlers() return cls._crawlers.get(name) \ No newline at end of file diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index 97a8187e..89f3188a 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -1,5 +1,12 @@ import asyncio -from crawl4ai import * +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CacheMode, + DefaultMarkdownGenerator, + PruningContentFilter, +) async def main(): diff --git a/tests/loggers/test_logger.py b/tests/loggers/test_logger.py new file mode 100644 index 00000000..8469b713 --- /dev/null +++ b/tests/loggers/test_logger.py @@ -0,0 +1,80 @@ +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase +import os +from datetime import datetime + +class AsyncFileLogger(AsyncLoggerBase): + """ + File-only asynchronous logger that writes logs to a specified file. + """ + + def __init__(self, log_file: str): + """ + Initialize the file logger. + + Args: + log_file: File path for logging + """ + self.log_file = log_file + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _write_to_file(self, level: str, message: str, tag: str): + """Write a message to the log file.""" + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + with open(self.log_file, "a", encoding="utf-8") as f: + f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n") + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message to file.""" + self._write_to_file("DEBUG", message, tag) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message to file.""" + self._write_to_file("INFO", message, tag) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message to file.""" + self._write_to_file("SUCCESS", message, tag) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message to file.""" + self._write_to_file("WARNING", message, tag) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message to file.""" + self._write_to_file("ERROR", message, tag) + + def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50): + """Log URL fetch status to file.""" + status = "SUCCESS" if success else "FAILED" + message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s" + self._write_to_file("URL_STATUS", message, tag) + + def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): + """Log error status to file.""" + message = f"{url[:url_length]}... | Error: {error}" + self._write_to_file("ERROR", message, tag) + +async def main(): + browser_config = BrowserConfig(headless=True, verbose=True) + crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger("/Users/unclecode/devs/crawl4ai/.private/tmp/crawl.log")) + await crawler.start() + + try: + crawl_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + ) + # Use the crawler multiple times + result = await crawler.arun( + url='https://kidocode.com/', + config=crawl_config + ) + if result.success: + print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown)) + + finally: + # Always ensure we close the crawler + await crawler.close() + +if __name__ == "__main__": + asyncio.run(main())