feat: implement create_box_message utility for formatted error messages and enhance error logging in AsyncWebCrawler

This commit is contained in:
UncleCode
2024-11-28 19:24:07 +08:00
parent 0bccf23db3
commit a036b7f122
3 changed files with 77 additions and 16 deletions

View File

@@ -15,7 +15,7 @@ import hashlib
import json import json
import uuid import uuid
from .models import AsyncCrawlResponse from .models import AsyncCrawlResponse
from .utils import create_box_message
from playwright_stealth import StealthConfig, stealth_async from playwright_stealth import StealthConfig, stealth_async
stealth_config = StealthConfig( stealth_config = StealthConfig(
@@ -321,10 +321,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"--disable-infobars", "--disable-infobars",
"--window-position=0,0", "--window-position=0,0",
"--ignore-certificate-errors", "--ignore-certificate-errors",
"--ignore-certificate-errors-spki-list", "--ignore-certificate-errors-spki-list"
] ]
} }
# Add channel if specified (try Chrome first) # Add channel if specified (try Chrome first)
if self.chrome_channel: if self.chrome_channel:
browser_args["channel"] = self.chrome_channel browser_args["channel"] = self.chrome_channel
@@ -765,12 +765,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await self.execute_hook('before_goto', page, context = context) await self.execute_hook('before_goto', page, context = context)
response = await page.goto( try:
url, response = await page.goto(
# wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]), url,
wait_until=kwargs.get("wait_until", "domcontentloaded"), # wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]),
timeout=kwargs.get("page_timeout", 60000) wait_until=kwargs.get("wait_until", "domcontentloaded"),
) timeout=kwargs.get("page_timeout", 60000),
)
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
# response = await page.goto("about:blank") # response = await page.goto("about:blank")
# await page.evaluate(f"window.location.href = '{url}'") # await page.evaluate(f"window.location.href = '{url}'")

View File

@@ -26,8 +26,10 @@ from .utils import (
sanitize_input_encode, sanitize_input_encode,
InvalidCSSSelectorError, InvalidCSSSelectorError,
format_html, format_html,
fast_format_html fast_format_html,
create_box_message
) )
from urllib.parse import urlparse from urllib.parse import urlparse
import random import random
from .__version__ import __version__ as crawl4ai_version from .__version__ import __version__ as crawl4ai_version
@@ -326,15 +328,15 @@ class AsyncWebCrawler:
if not hasattr(e, "msg"): if not hasattr(e, "msg"):
e.msg = str(e) e.msg = str(e)
# print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
self.logger.error_status( self.logger.error_status(
url=cache_context.display_url, url=cache_context.display_url,
error=e.msg, error=create_box_message(e.msg, type = "error"),
tag="ERROR" tag="ERROR"
) )
return CrawlResult( return CrawlResult(
url=url, url=url,
html="", html="",
markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}",
success=False, success=False,
error_message=e.msg error_message=e.msg
) )

View File

@@ -17,7 +17,8 @@ from requests.exceptions import InvalidSchema
import hashlib import hashlib
from typing import Optional, Tuple, Dict, Any from typing import Optional, Tuple, Dict, Any
import xxhash import xxhash
from colorama import Fore, Style, init
import textwrap
from .html2text import HTML2Text from .html2text import HTML2Text
class CustomHTML2Text(HTML2Text): class CustomHTML2Text(HTML2Text):
@@ -103,12 +104,67 @@ class CustomHTML2Text(HTML2Text):
self.preserved_content.append(data) self.preserved_content.append(data)
return return
super().handle_data(data, entity_char) super().handle_data(data, entity_char)
class InvalidCSSSelectorError(Exception):
    """Exception type for invalid-CSS-selector failures.

    Carries no extra state beyond ``Exception``; the raise sites are not
    visible in this part of the file.
    """
    pass
def create_box_message(
    message: str,
    type: str = "info",
    width: int = 80,
    add_newlines: bool = True,
    double_line: bool = False
) -> str:
    """Render ``message`` inside a colored, bordered text box.

    Args:
        message: Text to display. It is split on newlines; every non-blank
            line is stripped and word-wrapped to ``width - 4`` columns, and
            blank lines are kept as empty rows. The first line is prefixed
            with the icon for ``type``.
        type: Box style, one of ``"warning"``, ``"info"``, ``"success"`` or
            ``"error"`` (case-insensitive). Unknown values fall back to
            ``"info"``.
        width: Nominal box width. Each rendered row is ``width + 1``
            printable characters wide (color codes not counted).
        add_newlines: When true, surround the box with one leading and one
            trailing newline.
        double_line: When true, draw the border with double-line box
            characters instead of single-line ones.

    Returns:
        The box as one newline-joined string containing colorama color
        codes, with ``Style.RESET_ALL`` appended to the bottom border.

    Raises:
        ValueError: Propagated from ``textwrap`` when ``width - 4`` is not
            a positive number.
    """
    init()

    # (border color, text color, icon) per message type.
    styles = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "\u26a0"),  # warning sign
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "\u2139"),  # information source
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "\u2713"),  # check mark
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = styles.get(type.lower(), styles["info"])

    # (horizontal, vertical, top-left, top-right, bottom-left, bottom-right)
    # from the Unicode "Box Drawing" block. Written as escapes so the glyphs
    # survive tools that strip non-ASCII characters.
    box_chars = {
        "single": ("\u2500", "\u2502", "\u250c", "\u2510", "\u2514", "\u2518"),
        "double": ("\u2550", "\u2551", "\u2554", "\u2557", "\u255a", "\u255d"),
    }
    h_line, v_line, tl, tr, bl, br = box_chars["double" if double_line else "single"]

    # Wrap narrower than the padded cell so text never touches the border.
    wrap_width = width - 4
    # str.split always yields at least one element, so the unpacking is safe.
    first_raw, *remaining_raw = message.split("\n")

    formatted_lines = textwrap.wrap(f"{prefix} {first_raw.strip()}", width=wrap_width)
    for raw in remaining_raw:
        stripped = raw.strip()
        if stripped:
            formatted_lines.extend(textwrap.wrap(f" {stripped}", width=wrap_width))
        else:
            # Preserve blank lines from the message as empty rows.
            formatted_lines.append("")

    # Top/bottom borders and content rows all come out width + 1 wide.
    horizontal_line = h_line * (width - 1)
    cell_width = width - 2
    rows = [
        f"{border_color}{v_line}{text_color} {line:<{cell_width}}{border_color}{v_line}"
        for line in formatted_lines
    ]
    box = [
        f"{border_color}{tl}{horizontal_line}{tr}",
        *rows,
        f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}",
    ]

    result = "\n".join(box)
    if add_newlines:
        result = f"\n{result}\n"
    return result
def calculate_semaphore_count(): def calculate_semaphore_count():
cpu_count = os.cpu_count() cpu_count = os.cpu_count()
memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB