New async database manager and migration support
- Introduced AsyncDatabaseManager for async DB management.
- Added a migration feature to transition to file-based storage.
- Enhanced the web crawler with improved caching logic.
- Updated requirements and setup for async processing.
This commit is contained in:
34
setup.py
34
setup.py
@@ -5,34 +5,37 @@ from pathlib import Path
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import asyncio
|
||||
|
||||
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
|
||||
# If the folder already exists, remove the cache folder
|
||||
# Create the .crawl4ai folder structure
|
||||
crawl4ai_folder = Path.home() / ".crawl4ai"
|
||||
cache_folder = crawl4ai_folder / "cache"
|
||||
content_folders = ['html_content', 'cleaned_html', 'markdown_content',
|
||||
'extracted_content', 'screenshots']
|
||||
|
||||
# Clean up old cache if exists
|
||||
if cache_folder.exists():
|
||||
shutil.rmtree(cache_folder)
|
||||
|
||||
# Create new folder structure
|
||||
crawl4ai_folder.mkdir(exist_ok=True)
|
||||
cache_folder.mkdir(exist_ok=True)
|
||||
for folder in content_folders:
|
||||
(crawl4ai_folder / folder).mkdir(exist_ok=True)
|
||||
|
||||
# Read the requirements from requirements.txt
|
||||
# Read requirements and version
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
with open(os.path.join(__location__, "requirements.txt")) as f:
|
||||
requirements = f.read().splitlines()
|
||||
|
||||
# Read version from crawl4ai/_version.py
|
||||
with open("crawl4ai/_version.py") as f:
|
||||
for line in f:
|
||||
if line.startswith("__version__"):
|
||||
version = line.split("=")[1].strip().strip('"')
|
||||
break
|
||||
|
||||
# Define the requirements for different environments
|
||||
# Define requirements
|
||||
default_requirements = requirements
|
||||
# torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
|
||||
# transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
|
||||
torch_requirements = ["torch", "nltk", "scikit-learn"]
|
||||
transformer_requirements = ["transformers", "tokenizers"]
|
||||
cosine_similarity_requirements = ["torch", "transformers", "nltk" ]
|
||||
@@ -50,10 +53,24 @@ def install_playwright():
|
||||
print(f"Unexpected error during Playwright installation: {e}")
|
||||
print("Please run 'python -m playwright install' manually after the installation.")
|
||||
|
||||
def run_migration():
    """Initialize the database as a best-effort installation step.

    Imports ``async_db_manager`` from ``crawl4ai.async_database`` and runs its
    ``initialize()`` coroutine to completion with ``asyncio.run``. Progress
    and failures are reported with ``print``; no exception is propagated to
    the caller, so a failed initialization does not abort the surrounding
    install.

    Returns:
        None.
    """
    try:
        print("Starting database initialization...")
        # Import inside the try block so that a failed import is reported as
        # a warning below instead of escaping this function.
        from crawl4ai.async_database import async_db_manager as db_manager

        init_coro = db_manager.initialize()
        asyncio.run(init_coro)
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as exc:
        # Any other failure is reported and then ignored.
        failure_notice = f"Warning: Database initialization failed: {exc}"
        print(failure_notice)
        print("Database will be initialized on first use")
|
||||
|
||||
class PostInstallCommand(install):
    """Install command that runs extra setup steps after the base install."""

    def run(self):
        """Run the base install, then the post-install steps.

        After the inherited ``run`` returns, ``install_playwright()`` and
        ``run_migration()`` are called, in that order.
        """
        # The base install runs first; the follow-up steps come afterwards.
        super().run()
        install_playwright()
        run_migration()
|
||||
|
||||
setup(
|
||||
name="Crawl4AI",
|
||||
@@ -66,7 +83,7 @@ setup(
|
||||
author_email="unclecode@kidocode.com",
|
||||
license="MIT",
|
||||
packages=find_packages(),
|
||||
install_requires=default_requirements + ["playwright"], # Add playwright to default requirements
|
||||
install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles
|
||||
extras_require={
|
||||
"torch": torch_requirements,
|
||||
"transformer": transformer_requirements,
|
||||
@@ -77,6 +94,7 @@ setup(
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'crawl4ai-download-models=crawl4ai.model_loader:main',
|
||||
'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command
|
||||
],
|
||||
},
|
||||
classifiers=[
|
||||
|
||||
Reference in New Issue
Block a user