Remove dependencies on psutil and PyYAML, and extend the requests version range

This commit is contained in:
unclecode
2024-09-29 17:07:06 +08:00
parent 0759503e50
commit bccadec887
5 changed files with 71 additions and 27 deletions

View File

@@ -39,7 +39,6 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc
- 🔄 Session management for complex multi-page crawling scenarios
- 🌐 Asynchronous architecture for improved performance and scalability
## Installation 🛠️
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
@@ -56,9 +55,21 @@ For basic web crawling and scraping tasks:
pip install crawl4ai
```
By default this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
👉 Note: The standard version of Crawl4AI uses Playwright for asynchronous crawling. If you encounter an error saying that Playwright is not installed, run `playwright install`. This should normally be done automatically during the setup process.
👉 Note: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
1. Through the command line:
```bash
playwright install
```
2. If the above doesn't work, try this more specific command:
```bash
python -m playwright install chromium
```
This second method has proven to be more reliable in some cases.
#### Installation with Synchronous Version

View File

@@ -3,24 +3,15 @@ import base64, time
from abc import ABC, abstractmethod
from typing import Callable, Dict, Any, List, Optional
import os
import psutil
from playwright.async_api import async_playwright, Page, Browser, Error
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
from .utils import sanitize_input_encode
from .utils import sanitize_input_encode, calculate_semaphore_count
import json, uuid
import hashlib
from pathlib import Path
from playwright.async_api import ProxySettings
from pydantic import BaseModel
def calculate_semaphore_count():
    """Return a concurrency limit derived from the host's CPUs and memory.

    The result is the smaller of half the logical CPU count and the number
    of 2 GB memory slots in total RAM (as reported by ``psutil``), and is
    never less than 1.

    Returns:
        int: A positive concurrency limit.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single CPU instead of failing on ``None // 2``.
    cpu_count = os.cpu_count() or 1
    memory_gb = psutil.virtual_memory().total / (1024 ** 3)  # Convert to GB
    base_count = max(1, cpu_count // 2)
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    # Hosts with under 2 GB of RAM would otherwise yield 0, which is not a
    # usable limit (presumably it sizes a semaphore, per the function name).
    return max(1, min(base_count, memory_based_cap))
class AsyncCrawlResponse(BaseModel):
html: str
response_headers: Dict[str, str]

View File

@@ -6,6 +6,7 @@ import json
import html
import re
import os
import platform
from html2text import HTML2Text
from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
@@ -18,6 +19,46 @@ from requests.exceptions import InvalidSchema
class InvalidCSSSelectorError(Exception):
    """Exception type for invalid CSS selectors.

    Adds no behavior beyond ``Exception``; the raise sites are outside this
    excerpt.
    """
def calculate_semaphore_count():
    """Return a concurrency limit derived from the host's CPUs and memory.

    The result is the smaller of half the logical CPU count and the number
    of 2 GB memory slots in total RAM (as reported by
    ``get_system_memory``), and is never less than 1.

    Returns:
        int: A positive concurrency limit.

    Raises:
        OSError: Propagated from ``get_system_memory`` when total memory
            cannot be determined.
    """
    # os.cpu_count() returns None when the count cannot be determined;
    # fall back to a single CPU instead of failing on ``None // 2``.
    cpu_count = os.cpu_count() or 1
    memory_gb = get_system_memory() / (1024 ** 3)  # Convert to GB
    base_count = max(1, cpu_count // 2)
    memory_based_cap = int(memory_gb / 2)  # Assume 2GB per instance
    # Hosts with under 2 GB of RAM would otherwise yield 0, which is not a
    # usable limit (presumably it sizes a semaphore, per the function name).
    return max(1, min(base_count, memory_based_cap))
def get_system_memory():
    """Return the host's total physical memory in bytes.

    Uses a platform-specific source: ``/proc/meminfo`` on Linux,
    ``sysctl -n hw.memsize`` on macOS, and ``GlobalMemoryStatusEx`` on
    Windows.

    Returns:
        int: Total physical memory in bytes.

    Raises:
        OSError: If the platform is unsupported, ``/proc/meminfo`` has no
            ``MemTotal:`` entry, or the Windows API call fails.
        subprocess.CalledProcessError: If ``sysctl`` exits non-zero (macOS).
    """
    system = platform.system()

    if system == "Linux":
        with open('/proc/meminfo', 'r') as mem:
            for line in mem:
                if line.startswith('MemTotal:'):
                    return int(line.split()[1]) * 1024  # Convert KB to bytes
        # Fail loudly instead of implicitly returning None, which would
        # surface later as a confusing TypeError in the caller's arithmetic.
        raise OSError("MemTotal entry not found in /proc/meminfo")

    if system == "Darwin":  # macOS
        import subprocess
        output = subprocess.check_output(['sysctl', '-n', 'hw.memsize']).decode('utf-8')
        return int(output.strip())

    if system == "Windows":
        import ctypes
        kernel32 = ctypes.windll.kernel32
        c_ulonglong = ctypes.c_ulonglong

        class MEMORYSTATUSEX(ctypes.Structure):
            _fields_ = [
                ('dwLength', ctypes.c_ulong),
                ('dwMemoryLoad', ctypes.c_ulong),
                ('ullTotalPhys', c_ulonglong),
                ('ullAvailPhys', c_ulonglong),
                ('ullTotalPageFile', c_ulonglong),
                ('ullAvailPageFile', c_ulonglong),
                ('ullTotalVirtual', c_ulonglong),
                ('ullAvailVirtual', c_ulonglong),
                ('ullAvailExtendedVirtual', c_ulonglong),
            ]

        memory_status = MEMORYSTATUSEX()
        # The API requires dwLength to be set before the call.
        memory_status.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
        # A zero return means failure; the struct is then not filled in, so
        # raise rather than report 0 bytes of memory.
        if not kernel32.GlobalMemoryStatusEx(ctypes.byref(memory_status)):
            raise ctypes.WinError()
        return memory_status.ullTotalPhys

    raise OSError("Unsupported operating system")
def get_home_folder():
home_folder = os.path.join(Path.home(), ".crawl4ai")

View File

@@ -6,7 +6,5 @@ numpy>=1.26.0,<2.1.1
pillow==10.4.0
playwright==1.47.0
python-dotenv==1.0.1
requests==2.32.3
PyYAML==6.0.2
beautifulsoup4==4.12.3
psutil==6.0.0
requests>=2.26.0,<2.32.3
beautifulsoup4==4.12.3

View File

@@ -4,6 +4,7 @@ import os
from pathlib import Path
import shutil
import subprocess
import sys
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
@@ -35,21 +36,23 @@ transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
cosine_similarity_requirements = ["torch", "transformers", "nltk", "spacy"]
sync_requirements = ["selenium"]
def post_install():
    """Print a notice that post-installation setup is running."""
    banner = "Running post-installation setup..."
    print(banner)
def install_playwright():
    """Install the Playwright browsers using the current Python interpreter.

    Runs ``<sys.executable> -m playwright install`` exactly once, so the
    install does not depend on a ``playwright`` script being on PATH.
    Failures are reported on stdout with manual-recovery instructions and
    are not re-raised, so a browser download problem does not abort the
    surrounding package installation.
    """
    print("Installing Playwright browsers...")
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
    except subprocess.CalledProcessError as e:
        # The installer ran but exited with a non-zero status.
        print(f"Error during Playwright installation: {e}")
        print("Please run 'python -m playwright install' manually after the installation.")
    except Exception as e:
        # Best-effort step: report anything else (e.g. the interpreter could
        # not be spawned) without failing the install.
        print(f"Unexpected error during Playwright installation: {e}")
        print("Please run 'python -m playwright install' manually after the installation.")
    else:
        print("Playwright installation completed successfully.")
class PostInstallCommand(install):
    """Install command that runs the post-install steps after installing."""

    def run(self):
        """Run the base ``install`` step, then each post-install hook in order."""
        # The base install runs first; the hooks follow it.
        install.run(self)
        for hook in (post_install, install_playwright):
            hook()
setup(
name="Crawl4AI",
version=version,
@@ -61,7 +64,7 @@ setup(
author_email="unclecode@kidocode.com",
license="MIT",
packages=find_packages(),
install_requires=default_requirements,
install_requires=default_requirements + ["playwright"], # Add playwright to default requirements
extras_require={
"torch": torch_requirements,
"transformer": transformer_requirements,