diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 882f9a50..e5316187 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -15,7 +15,7 @@ import hashlib import json import uuid from .models import AsyncCrawlResponse - +from .utils import create_box_message from playwright_stealth import StealthConfig, stealth_async stealth_config = StealthConfig( @@ -321,10 +321,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - "--ignore-certificate-errors-spki-list", + "--ignore-certificate-errors-spki-list" ] } - + # Add channel if specified (try Chrome first) if self.chrome_channel: browser_args["channel"] = self.chrome_channel @@ -765,12 +765,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook('before_goto', page, context = context) - response = await page.goto( - url, - # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), - wait_until=kwargs.get("wait_until", "domcontentloaded"), - timeout=kwargs.get("page_timeout", 60000) - ) + try: + response = await page.goto( + url, + # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), + wait_until=kwargs.get("wait_until", "domcontentloaded"), + timeout=kwargs.get("page_timeout", 60000), + ) + except Error as e: + raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}") # response = await page.goto("about:blank") # await page.evaluate(f"window.location.href = '{url}'") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 5a46fe39..66b4c21b 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -26,8 +26,10 @@ from .utils import ( sanitize_input_encode, InvalidCSSSelectorError, format_html, - fast_format_html + fast_format_html, + create_box_message ) + from urllib.parse import urlparse import random from .__version__ import 
def create_box_message(
    message: str,
    type: str = "info",
    width: int = 80,
    add_newlines: bool = True,
    double_line: bool = False
) -> str:
    """Render ``message`` inside a colored box drawn with Unicode line characters.

    The message is split on newline characters. Every non-blank line is
    stripped and word-wrapped to ``width - 4`` columns; blank lines are kept
    as empty rows. The first line is prefixed with the icon of the selected
    style, every later line with a single space.

    Args:
        message: Text to put in the box.
        type: Style name, matched case-insensitively: ``"warning"``,
            ``"info"``, ``"success"`` or ``"error"``. Unknown names fall back
            to ``"info"``. (The name shadows the builtin, but it is part of
            the keyword interface callers already use.)
        width: Box width setting. Rows are padded to ``width - 2`` columns
            and the horizontal borders use ``width - 1`` line characters.
            Values below 5 are raised to 5, the smallest setting that leaves
            one column for text; smaller values used to raise ``ValueError``.
        add_newlines: When true, the result is wrapped in one leading and one
            trailing newline.
        double_line: When true, draw the border with double-line characters
            instead of single-line ones.

    Returns:
        The box as one string containing colorama color codes, with
        ``Style.RESET_ALL`` appended after the bottom border.
    """
    # colorama's init() can wrap sys.stdout/sys.stderr again each time it is
    # called, so run it only for the first box instead of once per message.
    if not getattr(create_box_message, "_colorama_ready", False):
        init()
        create_box_message._colorama_ready = True

    # Keep at least one column of text: textwrap rejects a width <= 0 and a
    # negative pad width is not a valid format spec.
    width = max(width, 5)
    text_width = width - 4
    pad_width = width - 2

    # (border color, text color, icon) for each message type.
    styles = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = styles.get(type.lower(), styles["info"])

    # (horizontal, vertical, top-left, top-right, bottom-left, bottom-right).
    box_chars = {
        "single": ("─", "│", "┌", "┐", "└", "┘"),
        "double": ("═", "║", "╔", "╗", "╚", "╝"),
    }
    h_line, v_line, tl, tr, bl, br = box_chars["double" if double_line else "single"]

    # str.split always yields at least one element, so there is always a
    # first line to carry the icon, even for an empty message.
    first_line, *other_lines = message.split("\n")
    formatted_lines = textwrap.wrap(f"{prefix} {first_line.strip()}", width=text_width)
    for raw in other_lines:
        stripped = raw.strip()
        if stripped:
            formatted_lines.extend(textwrap.wrap(f" {stripped}", width=text_width))
        else:
            # Preserve blank lines as empty rows inside the box.
            formatted_lines.append("")

    horizontal_line = h_line * (width - 1)
    box = [
        f"{border_color}{tl}{horizontal_line}{tr}",
        *(
            f"{border_color}{v_line}{text_color} {line:<{pad_width}}{border_color}{v_line}"
            for line in formatted_lines
        ),
        f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}",
    ]

    result = "\n".join(box)
    if add_newlines:
        result = f"\n{result}\n"

    return result