New async database manager and migration support

- Introduced AsyncDatabaseManager for asynchronous database management.
- Added a migration feature to transition to file-based storage.
- Enhanced the web crawler with improved caching logic.
- Updated requirements and setup for async processing.
2024-11-16 14:54:41 +08:00
parent ae7ebc0bd8
commit d0014c6793
8 changed files with 685 additions and 119 deletions
--- a/setup.py
+++ b/setup.py
@@ -5,34 +5,37 @@ from pathlib import Path
 import shutil
 import subprocess
 import sys
+import asyncio

-# Create the .crawl4ai folder in the user's home directory if it doesn't exist
-# If the folder already exists, remove the cache folder
+# Create the .crawl4ai folder structure
 crawl4ai_folder = Path.home() / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
+content_folders = ['html_content', 'cleaned_html', 'markdown_content', 
+                  'extracted_content', 'screenshots']

+# Clean up old cache if exists
 if cache_folder.exists():
    shutil.rmtree(cache_folder)

+# Create new folder structure
 crawl4ai_folder.mkdir(exist_ok=True)
 cache_folder.mkdir(exist_ok=True)
+for folder in content_folders:
+    (crawl4ai_folder / folder).mkdir(exist_ok=True)

-# Read the requirements from requirements.txt
+# Read requirements and version
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()
    
-# Read version from __init__.py
 with open("crawl4ai/_version.py") as f:
    for line in f:
        if line.startswith("__version__"):
            version = line.split("=")[1].strip().strip('"')
            break

-# Define the requirements for different environments
+# Define requirements
 default_requirements = requirements
-# torch_requirements = ["torch", "nltk", "spacy", "scikit-learn"]
-# transformer_requirements = ["transformers", "tokenizers", "onnxruntime"]
 torch_requirements = ["torch", "nltk",  "scikit-learn"]
 transformer_requirements = ["transformers", "tokenizers"]
 cosine_similarity_requirements = ["torch", "transformers", "nltk" ]
@@ -50,10 +53,24 @@ def install_playwright():
        print(f"Unexpected error during Playwright installation: {e}")
        print("Please run 'python -m playwright install' manually after the installation.")

+def run_migration():
+    """Initialize database during installation"""
+    try:
+        print("Starting database initialization...")
+        from crawl4ai.async_database import async_db_manager
+        asyncio.run(async_db_manager.initialize())
+        print("Database initialization completed successfully.")
+    except ImportError:
+        print("Warning: Database module not found. Will initialize on first use.")
+    except Exception as e:
+        print(f"Warning: Database initialization failed: {e}")
+        print("Database will be initialized on first use")
+
 class PostInstallCommand(install):
    def run(self):
        install.run(self)
        install_playwright()
+        run_migration()

 setup(
    name="Crawl4AI",
@@ -66,7 +83,7 @@ setup(
    author_email="unclecode@kidocode.com",
    license="MIT",
    packages=find_packages(),
-    install_requires=default_requirements + ["playwright"],  # Add playwright to default requirements
+    install_requires=default_requirements + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
@@ -77,6 +94,7 @@ setup(
    entry_points={
        'console_scripts': [
            'crawl4ai-download-models=crawl4ai.model_loader:main',
+            'crawl4ai-migrate=crawl4ai.migrations:main',  # Added migration command
        ],
    },
    classifiers=[