feat(logger): add abstract logger base class and file logger implementation
Add the AsyncLoggerBase abstract class to standardize the logger interface and introduce AsyncFileLogger for file-only logging. Remove the always_bypass_cache parameter together with its deprecated alias always_by_pass_cache, and clean up AsyncWebCrawler initialization, which now accepts an optional logger argument. BREAKING CHANGE: Removed the 'always_bypass_cache' parameter and its deprecated alias 'always_by_pass_cache'. Use BrowserConfig cache settings instead.
This commit is contained in:
@@ -8,6 +8,10 @@ from .content_scraping_strategy import (
|
|||||||
WebScrapingStrategy,
|
WebScrapingStrategy,
|
||||||
LXMLWebScrapingStrategy,
|
LXMLWebScrapingStrategy,
|
||||||
)
|
)
|
||||||
|
from .async_logger import (
|
||||||
|
AsyncLoggerBase,
|
||||||
|
AsyncLogger,
|
||||||
|
)
|
||||||
from .proxy_strategy import (
|
from .proxy_strategy import (
|
||||||
ProxyRotationStrategy,
|
ProxyRotationStrategy,
|
||||||
RoundRobinProxyStrategy,
|
RoundRobinProxyStrategy,
|
||||||
@@ -59,6 +63,8 @@ from .deep_crawling import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"AsyncLoggerBase",
|
||||||
|
"AsyncLogger",
|
||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
"DeepCrawlStrategy",
|
"DeepCrawlStrategy",
|
||||||
"BFSDeepCrawlStrategy",
|
"BFSDeepCrawlStrategy",
|
||||||
|
|||||||
@@ -1,3 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
# __version__ = "0.4.3b3"
|
__version__ = "0.5.0"
|
||||||
__version__ = "0.4.300"
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Optional, Dict, Any
|
from typing import Optional, Dict, Any
|
||||||
from colorama import Fore, Style, init
|
from colorama import Fore, Style, init
|
||||||
@@ -13,7 +14,37 @@ class LogLevel(Enum):
|
|||||||
ERROR = 5
|
ERROR = 5
|
||||||
|
|
||||||
|
|
||||||
class AsyncLogger:
|
|
||||||
|
class AsyncLoggerBase(ABC):
    """
    Abstract interface that every crawler logger must implement.

    Concrete loggers provide five level-style methods (debug, info, success,
    warning, error) plus two helpers for reporting per-URL outcomes. All
    methods are abstract, so this class cannot be instantiated directly.
    """

    @abstractmethod
    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Record a debug-level message under ``tag``."""

    @abstractmethod
    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Record an informational message under ``tag``."""

    @abstractmethod
    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Record a success message under ``tag``."""

    @abstractmethod
    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Record a warning message under ``tag``."""

    @abstractmethod
    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Record an error message under ``tag``."""

    @abstractmethod
    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """
        Record the outcome of fetching ``url``.

        Args:
            url: The URL that was fetched.
            success: Whether the fetch succeeded.
            timing: Duration of the fetch.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to include.
        """

    @abstractmethod
    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """
        Record an error that occurred while handling ``url``.

        Args:
            url: The URL being processed when the error occurred.
            error: Description of the error.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to include.
        """
|
||||||
|
|
||||||
|
class AsyncLogger(AsyncLoggerBase):
|
||||||
"""
|
"""
|
||||||
Asynchronous logger with support for colored console output and file logging.
|
Asynchronous logger with support for colored console output and file logging.
|
||||||
Supports templated messages with colored components.
|
Supports templated messages with colored components.
|
||||||
@@ -225,3 +256,55 @@ class AsyncLogger:
|
|||||||
tag=tag,
|
tag=tag,
|
||||||
params={"url": url, "url_length": url_length, "error": error},
|
params={"url": url, "url_length": url_length, "error": error},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only logger that appends timestamped records to a single log file.

    Each record is written as ``[timestamp] [LEVEL] [tag] message``. Writes use
    ordinary blocking file I/O; the file is opened and closed for every record.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging. Missing parent directories are
                created.
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    @staticmethod
    def _render(message: str, params=None) -> str:
        """
        Fill a ``str.format`` template with ``params``.

        AsyncLogger in this module pairs templated messages with a ``params``
        mapping; without this step such messages would be written with their
        raw ``{placeholder}`` text. Formatting is best-effort: if the template
        and the mapping do not match, the message is returned unchanged so that
        logging never raises because of a malformed template.
        """
        if not params:
            return message
        try:
            return message.format(**params)
        except (KeyError, IndexError, ValueError, AttributeError, TypeError):
            return message

    def _write_to_file(self, level: str, message: str, tag: str):
        """Append one formatted record to the log file."""
        # Trim microseconds down to milliseconds.
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("DEBUG", self._render(message, kwargs.get("params")), tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("INFO", self._render(message, kwargs.get("params")), tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("SUCCESS", self._render(message, kwargs.get("params")), tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("WARNING", self._render(message, kwargs.get("params")), tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("ERROR", self._render(message, kwargs.get("params")), tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """
        Log URL fetch status to file.

        Args:
            url: The fetched URL; only the first ``url_length`` characters are written.
            success: Written as ``SUCCESS`` when true, ``FAILED`` otherwise.
            timing: Fetch duration, written with two decimals and an ``s`` suffix.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to keep.
        """
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """
        Log error status to file.

        Args:
            url: The URL being processed; only the first ``url_length`` characters are written.
            error: Description of the error.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to keep.
        """
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
|
||||||
|
|||||||
@@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import warnings
|
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List
|
from typing import Optional, List
|
||||||
@@ -30,7 +29,7 @@ from .markdown_generation_strategy import (
|
|||||||
MarkdownGenerationStrategy,
|
MarkdownGenerationStrategy,
|
||||||
)
|
)
|
||||||
from .deep_crawling import DeepCrawlDecorator
|
from .deep_crawling import DeepCrawlDecorator
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .async_dispatcher import * # noqa: F403
|
from .async_dispatcher import * # noqa: F403
|
||||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||||
@@ -80,22 +79,21 @@ class AsyncWebCrawler:
|
|||||||
await crawler.close()
|
await crawler.close()
|
||||||
```
|
```
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
browser_config (BrowserConfig): Configuration object for browser settings.
|
browser_config (BrowserConfig): Configuration object for browser settings.
|
||||||
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
||||||
logger (AsyncLogger): Logger instance for recording events and errors.
|
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||||
always_bypass_cache (bool): Whether to always bypass cache.
|
|
||||||
crawl4ai_folder (str): Directory for storing cache.
|
crawl4ai_folder (str): Directory for storing cache.
|
||||||
base_directory (str): Base directory for storing cache.
|
base_directory (str): Base directory for storing cache.
|
||||||
ready (bool): Whether the crawler is ready for use.
|
ready (bool): Whether the crawler is ready for use.
|
||||||
|
|
||||||
Methods:
|
Methods:
|
||||||
start(): Start the crawler explicitly without using context manager.
|
start(): Start the crawler explicitly without using context manager.
|
||||||
close(): Close the crawler explicitly without using context manager.
|
close(): Close the crawler explicitly without using context manager.
|
||||||
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
||||||
awarmup(): Perform warmup sequence.
|
awarmup(): Perform warmup sequence.
|
||||||
arun_many(): Run the crawler for multiple sources.
|
arun_many(): Run the crawler for multiple sources.
|
||||||
aprocess_html(): Process HTML content.
|
aprocess_html(): Process HTML content.
|
||||||
|
|
||||||
Typical Usage:
|
Typical Usage:
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
@@ -116,50 +114,30 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
crawler_strategy: AsyncCrawlerStrategy = None,
|
||||||
config: Optional[BrowserConfig] = None,
|
config: BrowserConfig = None,
|
||||||
always_bypass_cache: bool = False,
|
|
||||||
always_by_pass_cache: Optional[bool] = None, # Deprecated parameter
|
|
||||||
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||||
thread_safe: bool = False,
|
thread_safe: bool = False,
|
||||||
|
logger: AsyncLoggerBase = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the AsyncWebCrawler.
|
Initialize the AsyncWebCrawler.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy
|
crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
|
||||||
config: Configuration object for browser settings. If None, will be created from kwargs
|
config: Configuration object for browser settings. Default BrowserConfig()
|
||||||
always_bypass_cache: Whether to always bypass cache (new parameter)
|
|
||||||
always_by_pass_cache: Deprecated, use always_bypass_cache instead
|
|
||||||
base_directory: Base directory for storing cache
|
base_directory: Base directory for storing cache
|
||||||
thread_safe: Whether to use thread-safe operations
|
thread_safe: Whether to use thread-safe operations
|
||||||
**kwargs: Additional arguments for backwards compatibility
|
**kwargs: Additional arguments for backwards compatibility
|
||||||
"""
|
"""
|
||||||
# Handle browser configuration
|
# Handle browser configuration
|
||||||
browser_config = config
|
browser_config = config or BrowserConfig()
|
||||||
if browser_config is not None:
|
|
||||||
if any(
|
|
||||||
k in kwargs
|
|
||||||
for k in [
|
|
||||||
"browser_type",
|
|
||||||
"headless",
|
|
||||||
"viewport_width",
|
|
||||||
"viewport_height",
|
|
||||||
]
|
|
||||||
):
|
|
||||||
self.logger.warning(
|
|
||||||
message="Both browser_config and legacy browser parameters provided. browser_config will take precedence.",
|
|
||||||
tag="WARNING",
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Create browser config from kwargs for backwards compatibility
|
|
||||||
browser_config = BrowserConfig.from_kwargs(kwargs)
|
|
||||||
|
|
||||||
self.browser_config = browser_config
|
self.browser_config = browser_config
|
||||||
|
|
||||||
# Initialize logger first since other components may need it
|
# Initialize logger first since other components may need it
|
||||||
self.logger = AsyncLogger(
|
self.logger = logger or AsyncLogger(
|
||||||
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
|
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
|
||||||
verbose=self.browser_config.verbose,
|
verbose=self.browser_config.verbose,
|
||||||
tag_width=10,
|
tag_width=10,
|
||||||
@@ -173,24 +151,6 @@ class AsyncWebCrawler:
|
|||||||
**params, # Pass remaining kwargs for backwards compatibility
|
**params, # Pass remaining kwargs for backwards compatibility
|
||||||
)
|
)
|
||||||
|
|
||||||
# If craweler strategy doesnt have logger, use crawler logger
|
|
||||||
if not self.crawler_strategy.logger:
|
|
||||||
self.crawler_strategy.logger = self.logger
|
|
||||||
|
|
||||||
# Handle deprecated cache parameter
|
|
||||||
if always_by_pass_cache is not None:
|
|
||||||
if kwargs.get("warning", True):
|
|
||||||
warnings.warn(
|
|
||||||
"'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. "
|
|
||||||
"Use 'always_bypass_cache' instead. "
|
|
||||||
"Pass warning=False to suppress this warning.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
self.always_bypass_cache = always_by_pass_cache
|
|
||||||
else:
|
|
||||||
self.always_bypass_cache = always_bypass_cache
|
|
||||||
|
|
||||||
# Thread safety setup
|
# Thread safety setup
|
||||||
self._lock = asyncio.Lock() if thread_safe else None
|
self._lock = asyncio.Lock() if thread_safe else None
|
||||||
|
|
||||||
@@ -356,7 +316,7 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Create cache context
|
# Create cache context
|
||||||
cache_context = CacheContext(
|
cache_context = CacheContext(
|
||||||
url, config.cache_mode, self.always_bypass_cache
|
url, config.cache_mode, False
|
||||||
)
|
)
|
||||||
|
|
||||||
# Initialize processing variables
|
# Initialize processing variables
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# crawl4ai/hub.py
|
# crawl4ai/hub.py
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict, Type
|
from typing import Dict, Type, Union
|
||||||
import logging
|
import logging
|
||||||
import importlib
|
import importlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -63,7 +63,7 @@ class CrawlerHub:
|
|||||||
cls._crawlers[name] = obj
|
cls._crawlers[name] = obj
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get(cls, name: str) -> Type[BaseCrawler] | None:
|
def get(cls, name: str) -> Union[Type[BaseCrawler], None]:
|
||||||
if not cls._crawlers:
|
if not cls._crawlers:
|
||||||
cls._discover_crawlers()
|
cls._discover_crawlers()
|
||||||
return cls._crawlers.get(name)
|
return cls._crawlers.get(name)
|
||||||
@@ -1,5 +1,12 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import *
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
PruningContentFilter,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
|
|||||||
80
tests/loggers/test_logger.py
Normal file
80
tests/loggers/test_logger.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, AsyncLoggerBase
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
class AsyncFileLogger(AsyncLoggerBase):
    """
    File-only logger that appends timestamped records to a single log file.

    Each record is written as ``[timestamp] [LEVEL] [tag] message``. Writes use
    ordinary blocking file I/O; the file is opened and closed for every record.
    """

    def __init__(self, log_file: str):
        """
        Initialize the file logger.

        Args:
            log_file: File path for logging. Missing parent directories are
                created.
        """
        self.log_file = log_file
        os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    @staticmethod
    def _render(message: str, params=None) -> str:
        """
        Fill a ``str.format`` template with ``params``.

        Without this step a message passed together with a ``params`` mapping
        would be written with its raw ``{placeholder}`` text. Formatting is
        best-effort: if the template and the mapping do not match, the message
        is returned unchanged so that logging never raises because of a
        malformed template.
        """
        if not params:
            return message
        try:
            return message.format(**params)
        except (KeyError, IndexError, ValueError, AttributeError, TypeError):
            return message

    def _write_to_file(self, level: str, message: str, tag: str):
        """Append one formatted record to the log file."""
        # Trim microseconds down to milliseconds.
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"[{timestamp}] [{level}] [{tag}] {message}\n")

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("DEBUG", self._render(message, kwargs.get("params")), tag)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("INFO", self._render(message, kwargs.get("params")), tag)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("SUCCESS", self._render(message, kwargs.get("params")), tag)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("WARNING", self._render(message, kwargs.get("params")), tag)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message to file; an optional ``params`` mapping fills the template."""
        self._write_to_file("ERROR", self._render(message, kwargs.get("params")), tag)

    def url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", url_length: int = 50):
        """
        Log URL fetch status to file.

        Args:
            url: The fetched URL; only the first ``url_length`` characters are written.
            success: Written as ``SUCCESS`` when true, ``FAILED`` otherwise.
            timing: Fetch duration, written with two decimals and an ``s`` suffix.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to keep.
        """
        status = "SUCCESS" if success else "FAILED"
        message = f"{url[:url_length]}... | Status: {status} | Time: {timing:.2f}s"
        self._write_to_file("URL_STATUS", message, tag)

    def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50):
        """
        Log error status to file.

        Args:
            url: The URL being processed; only the first ``url_length`` characters are written.
            error: Description of the error.
            tag: Label attached to the record.
            url_length: Number of leading URL characters to keep.
        """
        message = f"{url[:url_length]}... | Error: {error}"
        self._write_to_file("ERROR", message, tag)
|
||||||
|
|
||||||
|
async def main():
    """Crawl one page while routing all crawler logging through AsyncFileLogger."""
    import tempfile  # local import: only this script entry point needs it

    # Write under the system temp directory instead of a developer-specific
    # absolute path so the script runs on any machine.
    log_path = os.path.join(tempfile.gettempdir(), "crawl4ai", "crawl.log")

    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config, logger=AsyncFileLogger(log_path))
    await crawler.start()

    try:
        crawl_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        # Run a single crawl with the custom file logger attached
        result = await crawler.arun(
            url='https://kidocode.com/',
            config=crawl_config
        )
        if result.success:
            print("First crawl - Raw Markdown Length:", len(result.markdown_v2.raw_markdown))

    finally:
        # Always ensure we close the crawler
        await crawler.close()


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user