diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 7866e36f..067e7a19 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -18,6 +18,24 @@ class LogLevel(Enum): def __str__(self): return self.name.lower() +class LogColor(str, Enum): + """Enum for log colors.""" + + DEBUG = "lightblack" + INFO = "cyan" + SUCCESS = "green" + WARNING = "yellow" + ERROR = "red" + CYAN = "cyan" + GREEN = "green" + YELLOW = "yellow" + MAGENTA = "magenta" + DIM_MAGENTA = "dim magenta" + + def __str__(self): + """Automatically convert rich color to string.""" + return self.value + class AsyncLoggerBase(ABC): @abstractmethod @@ -48,6 +66,7 @@ class AsyncLoggerBase(ABC): def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): pass + class AsyncLogger(AsyncLoggerBase): """ Asynchronous logger with support for colored console output and file logging. @@ -68,11 +87,11 @@ class AsyncLogger(AsyncLoggerBase): } DEFAULT_COLORS = { - LogLevel.DEBUG: "lightblack", - LogLevel.INFO: "cyan", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "red", + LogLevel.DEBUG: LogColor.DEBUG, + LogLevel.INFO: LogColor.INFO, + LogLevel.SUCCESS: LogColor.SUCCESS, + LogLevel.WARNING: LogColor.WARNING, + LogLevel.ERROR: LogColor.ERROR, } def __init__( @@ -81,7 +100,7 @@ class AsyncLogger(AsyncLoggerBase): log_level: LogLevel = LogLevel.DEBUG, tag_width: int = 10, icons: Optional[Dict[str, str]] = None, - colors: Optional[Dict[LogLevel, str]] = None, + colors: Optional[Dict[LogLevel, LogColor]] = None, verbose: bool = True, ): """ @@ -130,9 +149,9 @@ class AsyncLogger(AsyncLoggerBase): message: str, tag: str, params: Optional[Dict[str, Any]] = None, - colors: Optional[Dict[str, str]] = None, + colors: Optional[Dict[str, LogColor]] = None, boxes: Optional[List[str]] = None, - base_color: Optional[str] = None, + base_color: Optional[LogColor] = None, **kwargs, ): """ @@ -152,8 +171,11 @@ class AsyncLogger(AsyncLoggerBase): # 
avoid conflict with rich formatting parsed_message = message.replace("[", "[[").replace("]", "]]") - raw_message = message.format(**params) if params else message if params: + # FIXME: If there are formatting strings in floating point format, + # this may result in colors and boxes not being applied properly. + # such as {value:.2f}, the value is 0.23333 format it to 0.23, + # but we replace("0.23333", "[color]0.23333[/color]") formatted_message = parsed_message.format(**params) for key, value in params.items(): # value_str may discard `[` and `]`, so we need to replace it. @@ -163,17 +185,17 @@ class AsyncLogger(AsyncLoggerBase): color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]" formatted_message = formatted_message.replace(value_str, color_str) value_str = color_str - + # check is need apply box if boxes and key in boxes: - formatted_message = formatted_message.replace(value_str, + formatted_message = formatted_message.replace(value_str, create_box_message(value_str, type=str(level))) - + else: formatted_message = parsed_message # Construct the full log line - color = base_color or self.colors[level] + color: LogColor = base_color or self.colors[level] log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]" # Output to console if verbose @@ -223,17 +245,17 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, - message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + message="{url:.{url_length}}... 
| Status: {status} | Time: {timing}s", tag=tag, params={ "url": url, "url_length": url_length, "status": success, - "timing": timing, + "timing": f"{timing:.2f}", # avoid a format string }, colors={ - "status": "green" if success else "red", - "timing": "yellow", + "status": LogColor.SUCCESS if success else LogColor.ERROR, + "timing": LogColor.WARNING, }, ) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 963c2d05..afaeeb24 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -34,7 +34,7 @@ from .markdown_generation_strategy import ( MarkdownGenerationStrategy, ) from .deep_crawling import DeepCrawlDecorator -from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_logger import AsyncLogger, AsyncLoggerBase, LogColor from .async_configs import BrowserConfig, CrawlerRunConfig from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter @@ -43,7 +43,6 @@ from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, fast_format_html, - create_box_message, get_error_context, RobotsParser, ) @@ -381,8 +380,8 @@ class AsyncWebCrawler: "timing": f"{time.perf_counter() - start_time:.2f}s", }, colors={ - "status": "green" if crawl_result.success else "red", - "timing": "yellow", + "status": LogColor.SUCCESS if crawl_result.success else LogColor.ERROR, + "timing": LogColor.WARNING, }, ) @@ -401,7 +400,10 @@ class AsyncWebCrawler: "status": True, "timing": f"{time.perf_counter() - start_time:.2f}s", }, - colors={"status": "green", "timing": "yellow"}, + colors={ + "status": LogColor.SUCCESS, + "timing": LogColor.WARNING, + }, ) cached_result.success = bool(html) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index f8b9e2b0..c9fd17c4 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -20,7 +20,7 @@ from rich.console import Console from
.async_configs import BrowserConfig from .browser_manager import ManagedBrowser -from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_logger import AsyncLogger, AsyncLoggerBase, LogColor from .utils import get_home_folder @@ -129,16 +129,16 @@ class BrowserProfiler: # Print instructions for the user with rich formatting border = "{'='*80}" - self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": "cyan"}) - self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": "green"}) - self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) + self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN}) + self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": LogColor.GREEN}) + self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) self.logger.info("\nInstructions:", tag="PROFILE") self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") - self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": "cyan"}) - self.logger.info("3. When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": "yellow"}) + self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": LogColor.CYAN}) + self.logger.info("3. 
When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": LogColor.YELLOW}) self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") - self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": "cyan"}) + self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": LogColor.CYAN}) # Create managed browser instance managed_browser = ManagedBrowser( @@ -197,7 +197,7 @@ class BrowserProfiler: if readable: key = sys.stdin.read(1) if key.lower() == 'q': - self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color="green") + self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN) user_done_event.set() return @@ -223,7 +223,7 @@ class BrowserProfiler: self.logger.error("Failed to start browser process.", tag="PROFILE") return None - self.logger.info(f"Browser launched. Waiting for you to finish...", tag="PROFILE") + self.logger.info("Browser launched. Waiting for you to finish...", tag="PROFILE") # Start listening for keyboard input listener_task = asyncio.create_task(listen_for_quit_command()) @@ -440,18 +440,18 @@ class BrowserProfiler: ``` """ while True: - self.logger.info(f"\nProfile Management Options:", tag="MENU") - self.logger.info(f"1. Create a new profile", tag="MENU", base_color="green") - self.logger.info(f"2. List available profiles", tag="MENU", base_color="yellow") - self.logger.info(f"3. Delete a profile", tag="MENU", base_color="red") + self.logger.info("\nProfile Management Options:", tag="MENU") + self.logger.info("1. Create a new profile", tag="MENU", base_color=LogColor.GREEN) + self.logger.info("2. List available profiles", tag="MENU", base_color=LogColor.YELLOW) + self.logger.info("3. 
Delete a profile", tag="MENU", base_color=LogColor.ERROR) # Only show crawl option if callback provided if crawl_callback: - self.logger.info(f"4. Use a profile to crawl a website", tag="MENU", base_color="cyan") - self.logger.info(f"5. Exit", tag="MENU", base_color="magenta") + self.logger.info("4. Use a profile to crawl a website", tag="MENU", base_color=LogColor.CYAN) + self.logger.info("5. Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "5" else: - self.logger.info(f"4. Exit", tag="MENU", base_color="magenta") + self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "4" self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") @@ -475,7 +475,7 @@ class BrowserProfiler: self.logger.info("\nAvailable profiles:", tag="PROFILES") for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") - self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color="yellow") + self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color=LogColor.YELLOW) self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") self.logger.info("", tag="PROFILES") # Empty line for spacing @@ -488,7 +488,7 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") + self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW) for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") @@ -527,7 +527,7 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") + self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW) for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}",
tag="PROFILES") @@ -605,9 +605,9 @@ class BrowserProfiler: # Print initial information border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" self.logger.info(f"\n{border}", tag="CDP") - self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") - self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": "cyan"}) - self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) + self.logger.info("Launching standalone browser with CDP debugging", tag="CDP") + self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN}) + self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") @@ -722,7 +722,7 @@ class BrowserProfiler: self.logger.error("Failed to start browser process.", tag="CDP") return None - self.logger.info(f"Browser launched successfully. Retrieving CDP information...", tag="CDP") + self.logger.info("Browser launched successfully. 
Retrieving CDP information...", tag="CDP") # Get CDP URL and JSON config cdp_url, config_json = await get_cdp_json(debugging_port) @@ -732,10 +732,10 @@ class BrowserProfiler: if config_json: # Display relevant CDP information - self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": "cyan"}) - self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": "cyan"}) + self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": LogColor.CYAN}) + self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": LogColor.CYAN}) if 'webSocketDebuggerUrl' in config_json: - self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": "green"}) + self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": LogColor.GREEN}) else: self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") else: diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 35c6ce8c..4102cbad 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -27,9 +27,7 @@ import json import hashlib from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .async_logger import AsyncLogger, LogLevel -from rich.console import Console -from rich.text import Text +from .async_logger import AsyncLogger, LogLevel, LogColor class RelevantContentFilter(ABC): @@ -847,7 +845,7 @@ class LLMContentFilter(RelevantContentFilter): }, colors={ **AsyncLogger.DEFAULT_COLORS, - LogLevel.INFO: "dim magenta" # Dimmed purple for LLM ops + LogLevel.INFO: LogColor.DIM_MAGENTA # Dimmed purple for LLM 
ops }, ) else: @@ -892,7 +890,7 @@ class LLMContentFilter(RelevantContentFilter): "Starting LLM markdown content filtering process", tag="LLM", params={"provider": self.llm_config.provider}, - colors={"provider": "cyan"}, + colors={"provider": LogColor.CYAN}, ) # Cache handling @@ -929,7 +927,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Split content into {chunk_count} chunks", tag="CHUNK", params={"chunk_count": len(html_chunks)}, - colors={"chunk_count": "yellow"}, + colors={"chunk_count": LogColor.YELLOW}, ) start_time = time.time() @@ -1038,7 +1036,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Completed processing in {time:.2f}s", tag="LLM", params={"time": end_time - start_time}, - colors={"time": "yellow"}, + colors={"time": LogColor.YELLOW}, ) result = ordered_results if ordered_results else []