feat: implement create_box_message utility for formatted error messages and enhance error logging in AsyncWebCrawler
This commit is contained in:
@@ -15,7 +15,7 @@ import hashlib
|
|||||||
import json
|
import json
|
||||||
import uuid
|
import uuid
|
||||||
from .models import AsyncCrawlResponse
|
from .models import AsyncCrawlResponse
|
||||||
|
from .utils import create_box_message
|
||||||
from playwright_stealth import StealthConfig, stealth_async
|
from playwright_stealth import StealthConfig, stealth_async
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
stealth_config = StealthConfig(
|
||||||
@@ -321,7 +321,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"--disable-infobars",
|
"--disable-infobars",
|
||||||
"--window-position=0,0",
|
"--window-position=0,0",
|
||||||
"--ignore-certificate-errors",
|
"--ignore-certificate-errors",
|
||||||
"--ignore-certificate-errors-spki-list",
|
"--ignore-certificate-errors-spki-list"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -765,12 +765,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await self.execute_hook('before_goto', page, context = context)
|
await self.execute_hook('before_goto', page, context = context)
|
||||||
|
|
||||||
|
|
||||||
response = await page.goto(
|
try:
|
||||||
url,
|
response = await page.goto(
|
||||||
# wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]),
|
url,
|
||||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
# wait_until=kwargs.get("wait_until", ["domcontentloaded", "networkidle"]),
|
||||||
timeout=kwargs.get("page_timeout", 60000)
|
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||||
)
|
timeout=kwargs.get("page_timeout", 60000),
|
||||||
|
)
|
||||||
|
except Error as e:
|
||||||
|
raise RuntimeError(f"Failed on navigating ACS-GOTO :\n{str(e)}")
|
||||||
|
|
||||||
# response = await page.goto("about:blank")
|
# response = await page.goto("about:blank")
|
||||||
# await page.evaluate(f"window.location.href = '{url}'")
|
# await page.evaluate(f"window.location.href = '{url}'")
|
||||||
|
|||||||
@@ -26,8 +26,10 @@ from .utils import (
|
|||||||
sanitize_input_encode,
|
sanitize_input_encode,
|
||||||
InvalidCSSSelectorError,
|
InvalidCSSSelectorError,
|
||||||
format_html,
|
format_html,
|
||||||
fast_format_html
|
fast_format_html,
|
||||||
|
create_box_message
|
||||||
)
|
)
|
||||||
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import random
|
import random
|
||||||
from .__version__ import __version__ as crawl4ai_version
|
from .__version__ import __version__ as crawl4ai_version
|
||||||
@@ -326,15 +328,15 @@ class AsyncWebCrawler:
|
|||||||
if not hasattr(e, "msg"):
|
if not hasattr(e, "msg"):
|
||||||
e.msg = str(e)
|
e.msg = str(e)
|
||||||
# print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
|
# print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
|
||||||
|
|
||||||
self.logger.error_status(
|
self.logger.error_status(
|
||||||
url=cache_context.display_url,
|
url=cache_context.display_url,
|
||||||
error=e.msg,
|
error=create_box_message(e.msg, type = "error"),
|
||||||
tag="ERROR"
|
tag="ERROR"
|
||||||
)
|
)
|
||||||
return CrawlResult(
|
return CrawlResult(
|
||||||
url=url,
|
url=url,
|
||||||
html="",
|
html="",
|
||||||
markdown=f"[ERROR] 🚫 arun(): Failed to crawl {cache_context.display_url}, error: {e.msg}",
|
|
||||||
success=False,
|
success=False,
|
||||||
error_message=e.msg
|
error_message=e.msg
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,8 @@ from requests.exceptions import InvalidSchema
|
|||||||
import hashlib
|
import hashlib
|
||||||
from typing import Optional, Tuple, Dict, Any
|
from typing import Optional, Tuple, Dict, Any
|
||||||
import xxhash
|
import xxhash
|
||||||
|
from colorama import Fore, Style, init
|
||||||
|
import textwrap
|
||||||
|
|
||||||
from .html2text import HTML2Text
|
from .html2text import HTML2Text
|
||||||
class CustomHTML2Text(HTML2Text):
|
class CustomHTML2Text(HTML2Text):
|
||||||
@@ -103,12 +104,67 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
self.preserved_content.append(data)
|
self.preserved_content.append(data)
|
||||||
return
|
return
|
||||||
super().handle_data(data, entity_char)
|
super().handle_data(data, entity_char)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
|
class InvalidCSSSelectorError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def create_box_message(
    message: str,
    type: str = "info",
    width: int = 80,
    add_newlines: bool = True,
    double_line: bool = False
) -> str:
    """Render *message* inside a colored box drawn with Unicode line characters.

    The first line of the message is prefixed with an icon chosen by *type*;
    every following non-blank line is stripped and re-indented, and blank
    lines are kept as empty rows. Each line is wrapped to ``width - 4``
    characters before being framed.

    Args:
        message: Text to display; may contain embedded newlines.
        type: One of "warning", "info", "success" or "error"
            (case-insensitive). Unknown values fall back to "info".
        width: Nominal box width used for wrapping, padding and the borders.
        add_newlines: If True, surround the box with one leading and one
            trailing newline.
        double_line: If True, draw the frame with double-line characters
            instead of single-line ones.

    Returns:
        The box as a single string containing colorama color codes, ending
        with ``Style.RESET_ALL`` on the bottom border.
    """
    # colorama's init() replaces sys.stdout/sys.stderr with wrappers each time
    # it runs, so call it only on first use rather than once per rendered box.
    if not getattr(create_box_message, "_colorama_initialized", False):
        init()
        create_box_message._colorama_initialized = True

    # (border color, text color, icon) for each message type.
    styles = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = styles.get(type.lower(), styles["info"])

    # (horizontal, vertical, top-left, top-right, bottom-left, bottom-right).
    box_chars = {
        "single": ("─", "│", "┌", "┐", "└", "┘"),
        "double": ("═", "║", "╔", "╗", "╚", "╝")
    }
    line_style = "double" if double_line else "single"
    h_line, v_line, tl, tr, bl, br = box_chars[line_style]

    text_width = width - 4

    # Coerce defensively so a non-string payload cannot raise while an error
    # is being reported. str.split always yields at least one element, so the
    # first line can be unpacked unconditionally.
    first_line, *remaining_lines = str(message).split('\n')

    # Only the first line carries the icon prefix.
    formatted_lines = textwrap.wrap(f"{prefix} {first_line.strip()}", width=text_width)
    for line in remaining_lines:
        stripped = line.strip()
        if stripped:
            formatted_lines.extend(textwrap.wrap(f" {stripped}", width=text_width))
        else:
            # Preserve blank lines as empty rows inside the box.
            formatted_lines.append("")

    # Assemble the frame: colored borders around left-aligned, padded text.
    horizontal_line = h_line * (width - 1)
    box = [
        f"{border_color}{tl}{horizontal_line}{tr}",
        *[
            f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}"
            for line in formatted_lines
        ],
        f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
    ]

    result = "\n".join(box)
    if add_newlines:
        result = f"\n{result}\n"

    return result
|
||||||
|
|
||||||
def calculate_semaphore_count():
|
def calculate_semaphore_count():
|
||||||
cpu_count = os.cpu_count()
|
cpu_count = os.cpu_count()
|
||||||
memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
|
memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
|
||||||
|
|||||||
Reference in New Issue
Block a user