diff --git a/README.md b/README.md index b8616694..94dd7a88 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,6 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc - 🔄 Session management for complex multi-page crawling scenarios - 🌐 Asynchronous architecture for improved performance and scalability - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -56,9 +55,21 @@ For basic web crawling and scraping tasks: pip install crawl4ai ``` -By default this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. +By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. - 👉 Note: The standard version of Crawl4AI uses Playwright for asynchronous crawling. If you encounter an error saying that Playwright is not installed, you can run playwright install. However, this should be done automatically during the setup process. +👉 Note: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: + +1. Through the command line: + ```bash + playwright install + ``` + +2. If the above doesn't work, try this more specific command: + ```bash + python -m playwright install chromium + ``` + +This second method has proven to be more reliable in some cases. 
def calculate_semaphore_count():
    """Return how many crawler instances may run concurrently on this host.

    The count is half the logical CPU count (at least 1), capped by the
    total physical memory under the assumption that one instance needs
    about 2 GB.  The result is never lower than 1.

    If the amount of memory cannot be determined (unsupported platform,
    unreadable or malformed system information), the CPU-based count is
    returned on its own instead of raising.

    Returns:
        int: The concurrency limit, always >= 1.
    """
    # os.cpu_count() returns None when the CPU count cannot be determined.
    cpu_count = os.cpu_count() or 1
    base_count = max(1, cpu_count // 2)
    try:
        memory_gb = get_system_memory() / (1024 ** 3)  # Convert to GB
    except OSError:
        # No memory figure available: fall back to the CPU-based limit.
        return base_count
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    # Below 2 GB the cap truncates to 0; a limit of 0 would let nothing run.
    return max(1, min(base_count, memory_based_cap))


def get_system_memory():
    """Return the total physical memory of the host, in bytes.

    Sources per platform (as reported by ``platform.system()``):
      * Linux:   the ``MemTotal:`` entry of ``/proc/meminfo`` (kB -> bytes).
      * Darwin:  the output of ``sysctl -n hw.memsize``.
      * Windows: ``ullTotalPhys`` from ``GlobalMemoryStatusEx``.

    Returns:
        int: Total physical memory in bytes.

    Raises:
        OSError: If the platform is not supported, or if the platform's
            memory information is missing, unreadable or malformed.
    """
    system = platform.system()

    if system == "Linux":
        try:
            with open('/proc/meminfo', 'r') as mem:
                for line in mem:
                    if line.startswith('MemTotal:'):
                        return int(line.split()[1]) * 1024  # Convert KB to bytes
        except (IndexError, ValueError) as err:
            raise OSError("Malformed MemTotal entry in /proc/meminfo") from err
        # Reaching this point means the file had no MemTotal line at all;
        # raise instead of silently returning None.
        raise OSError("MemTotal entry not found in /proc/meminfo")

    if system == "Darwin":  # macOS
        import subprocess
        try:
            output = subprocess.check_output(['sysctl', '-n', 'hw.memsize'])
            return int(output.decode('utf-8').strip())
        except (subprocess.SubprocessError, ValueError) as err:
            raise OSError("Could not read hw.memsize from sysctl") from err

    if system == "Windows":
        import ctypes

        c_ulonglong = ctypes.c_ulonglong

        class MEMORYSTATUSEX(ctypes.Structure):
            _fields_ = [
                ('dwLength', ctypes.c_ulong),
                ('dwMemoryLoad', ctypes.c_ulong),
                ('ullTotalPhys', c_ulonglong),
                ('ullAvailPhys', c_ulonglong),
                ('ullTotalPageFile', c_ulonglong),
                ('ullAvailPageFile', c_ulonglong),
                ('ullTotalVirtual', c_ulonglong),
                ('ullAvailVirtual', c_ulonglong),
                ('ullAvailExtendedVirtual', c_ulonglong),
            ]

        # use_last_error=True keeps the Win32 error code retrievable via
        # ctypes.get_last_error() after the call.
        kernel32 = ctypes.WinDLL('kernel32', use_last_error=True)
        memory_status = MEMORYSTATUSEX()
        # The API requires dwLength to be set before the call.
        memory_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
        # A zero return value signals failure; the structure is then unusable.
        if not kernel32.GlobalMemoryStatusEx(ctypes.byref(memory_status)):
            raise ctypes.WinError(ctypes.get_last_error())
        return memory_status.ullTotalPhys

    raise OSError("Unsupported operating system")
def install_playwright():
    """Download the Playwright browsers with the interpreter running setup.

    Runs ``<sys.executable> -m playwright install``.  Any failure is printed
    together with the command to run by hand; nothing is re-raised, so a
    failed browser download does not abort the surrounding install step.
    """
    manual_hint = "Please run 'python -m playwright install' manually after the installation."
    command = [sys.executable, "-m", "playwright", "install"]

    print("Installing Playwright browsers...")
    try:
        subprocess.check_call(command)
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as err:
        # The playwright CLI ran but exited with a non-zero status.
        print(f"Error during Playwright installation: {err}")
        print(manual_hint)
    except Exception as err:
        # Best-effort step: report anything else rather than fail the install.
        print(f"Unexpected error during Playwright installation: {err}")
        print(manual_hint)


class PostInstallCommand(install):
    """``install`` command that also sets up the Playwright browsers."""

    def run(self):
        """Run the regular install, then download the Playwright browsers."""
        super().run()
        install_playwright()