From 2ab0bf27c21674b8f9f75d7b7324c604769dac7d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 17 Aug 2025 19:14:55 +0800 Subject: [PATCH] refactor(utils): move memory utilities to utils and update imports --- crawl4ai/async_dispatcher.py | 2 +- crawl4ai/memory_utils.py | 79 ----------------------------------- crawl4ai/utils.py | 81 ++++++++++++++++++++++++++++++++++-- tests/test_memory_macos.py | 2 +- 4 files changed, 80 insertions(+), 84 deletions(-) delete mode 100644 crawl4ai/memory_utils.py diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index ce130d02..5bb1a47c 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -22,7 +22,7 @@ from urllib.parse import urlparse import random from abc import ABC, abstractmethod -from .memory_utils import get_true_memory_usage_percent +from .utils import get_true_memory_usage_percent class RateLimiter: diff --git a/crawl4ai/memory_utils.py b/crawl4ai/memory_utils.py deleted file mode 100644 index fa140c93..00000000 --- a/crawl4ai/memory_utils.py +++ /dev/null @@ -1,79 +0,0 @@ -import psutil -import platform -import subprocess -from typing import Tuple - - -def get_true_available_memory_gb() -> float: - """Get truly available memory including inactive pages (cross-platform)""" - vm = psutil.virtual_memory() - - if platform.system() == 'Darwin': # macOS - # On macOS, we need to include inactive memory too - try: - # Use vm_stat to get accurate values - result = subprocess.run(['vm_stat'], capture_output=True, text=True) - lines = result.stdout.split('\n') - - page_size = 16384 # macOS page size - pages = {} - - for line in lines: - if 'Pages free:' in line: - pages['free'] = int(line.split()[-1].rstrip('.')) - elif 'Pages inactive:' in line: - pages['inactive'] = int(line.split()[-1].rstrip('.')) - elif 'Pages speculative:' in line: - pages['speculative'] = int(line.split()[-1].rstrip('.')) - elif 'Pages purgeable:' in line: - pages['purgeable'] = int(line.split()[-1].rstrip('.')) - - # Calculate total available (free + inactive + speculative + purgeable) - total_available_pages = ( - pages.get('free', 0) + - pages.get('inactive', 0) + - pages.get('speculative', 0) + - pages.get('purgeable', 0) - ) - available_gb = (total_available_pages * page_size) / (1024**3) - - return available_gb - except: - # Fallback to psutil - return vm.available / (1024**3) - else: - # For Windows and Linux, psutil.available is accurate - return vm.available / (1024**3) - - -def get_true_memory_usage_percent() -> float: - """ - Get memory usage percentage that accounts for platform differences. - - Returns: - float: Memory usage percentage (0-100) - """ - vm = psutil.virtual_memory() - total_gb = vm.total / (1024**3) - available_gb = get_true_available_memory_gb() - - # Calculate used percentage based on truly available memory - used_percent = 100.0 * (total_gb - available_gb) / total_gb - - # Ensure it's within valid range - return max(0.0, min(100.0, used_percent)) - - -def get_memory_stats() -> Tuple[float, float, float]: - """ - Get comprehensive memory statistics. - - Returns: - Tuple[float, float, float]: (used_percent, available_gb, total_gb) - """ - vm = psutil.virtual_memory() - total_gb = vm.total / (1024**3) - available_gb = get_true_available_memory_gb() - used_percent = get_true_memory_usage_percent() - - return used_percent, available_gb, total_gb \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 4cadfad4..73f1d2a3 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -16,7 +16,7 @@ from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IM import httpx from socket import gaierror from pathlib import Path -from typing import Dict, Any, List, Optional, Callable +from typing import Dict, Any, List, Optional, Callable, Generator, Tuple, Iterable from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema @@ -40,8 +40,7 @@ from typing import Sequence from itertools import chain from collections import deque -from typing import Generator, Iterable - +import psutil import numpy as np from urllib.parse import ( @@ -3414,3 +3413,79 @@ def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float: """Calculate cosine distance (1 - similarity) between two vectors""" return 1 - cosine_similarity(vec1, vec2) + +# Memory utilities + +def get_true_available_memory_gb() -> float: + """Get truly available memory including inactive pages (cross-platform)""" + vm = psutil.virtual_memory() + + if platform.system() == 'Darwin': # macOS + # On macOS, we need to include inactive memory too + try: + # Use vm_stat to get accurate values + result = subprocess.run(['vm_stat'], capture_output=True, text=True) + lines = result.stdout.split('\n') + + page_size = 16384 # macOS page size + pages = {} + + for line in lines: + if 'Pages free:' in line: + pages['free'] = int(line.split()[-1].rstrip('.')) + elif 'Pages inactive:' in line: + pages['inactive'] = int(line.split()[-1].rstrip('.')) + elif 'Pages speculative:' in line: + pages['speculative'] = int(line.split()[-1].rstrip('.')) + elif 'Pages purgeable:' in line: + pages['purgeable'] = int(line.split()[-1].rstrip('.')) + + # Calculate total available (free + inactive + speculative + purgeable) + total_available_pages = ( + pages.get('free', 0) + + pages.get('inactive', 0) + + pages.get('speculative', 0) + + pages.get('purgeable', 0) + ) + available_gb = (total_available_pages * page_size) / (1024**3) + + return available_gb + except: + # Fallback to psutil + return vm.available / (1024**3) + else: + # For Windows and Linux, psutil.available is accurate + return vm.available / (1024**3) + + +def get_true_memory_usage_percent() -> float: + """ + Get memory usage percentage that accounts for platform differences. + + Returns: + float: Memory usage percentage (0-100) + """ + vm = psutil.virtual_memory() + total_gb = vm.total / (1024**3) + available_gb = get_true_available_memory_gb() + + # Calculate used percentage based on truly available memory + used_percent = 100.0 * (total_gb - available_gb) / total_gb + + # Ensure it's within valid range + return max(0.0, min(100.0, used_percent)) + + +def get_memory_stats() -> Tuple[float, float, float]: + """ + Get comprehensive memory statistics. + + Returns: + Tuple[float, float, float]: (used_percent, available_gb, total_gb) + """ + vm = psutil.virtual_memory() + total_gb = vm.total / (1024**3) + available_gb = get_true_available_memory_gb() + used_percent = get_true_memory_usage_percent() + + return used_percent, available_gb, total_gb \ No newline at end of file diff --git a/tests/test_memory_macos.py b/tests/test_memory_macos.py index b94d8a8b..7019ff03 100755 --- a/tests/test_memory_macos.py +++ b/tests/test_memory_macos.py @@ -4,7 +4,7 @@ import psutil import platform import time -from crawl4ai.memory_utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb +from crawl4ai.utils import get_true_memory_usage_percent, get_memory_stats, get_true_available_memory_gb def test_memory_calculation():