feat(database): implement version management and migration checks during initialization

This commit is contained in:
UncleCode
2024-11-17 18:09:33 +08:00
parent 2a82455b3d
commit f9fe6f89fe
4 changed files with 113 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ from .models import CrawlResult
import xxhash
import aiofiles
from .config import NEED_MIGRATION
from .version_manager import VersionManager
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -28,22 +29,49 @@ class AsyncDatabaseManager:
self.connection_pool: Dict[int, aiosqlite.Connection] = {}
self.pool_lock = asyncio.Lock()
self.connection_semaphore = asyncio.Semaphore(pool_size)
self._initialized = False
self.version_manager = VersionManager()
async def initialize(self):
    """Initialize the database file, base schema, and any pending migrations.

    Steps:
      1. Ensure the parent directory of the SQLite file exists.
      2. Create the base ``crawled_data`` table via ``ainit_db``.
      3. If ``NEED_MIGRATION``, verify the table was actually created.
      4. If the installed library version is newer than the recorded one
         (or this is a fresh install), run schema updates and migrations,
         then persist the new version so they are not re-run next start.

    Raises:
        Exception: re-raised after logging if any step fails. The stored
            version is only updated after a *successful* migration, so a
            failed migration is retried on the next startup.
    """
    try:
        logger.info("Initializing database...")
        # Ensure the directory for the SQLite file exists before it is opened.
        os.makedirs(os.path.dirname(self.db_path), exist_ok=True)

        # Decide *before* touching the schema whether this is a fresh
        # install or a version upgrade.
        needs_update = self.version_manager.needs_update()

        # Always ensure the base table exists, regardless of version state.
        await self.ainit_db()

        if NEED_MIGRATION:
            # Sanity-check that ainit_db actually created the table.
            async def verify_table(db):
                async with db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
                ) as cursor:
                    result = await cursor.fetchone()
                    if not result:
                        raise Exception("crawled_data table was not created")

            await self.execute_with_retry(verify_table)

        if needs_update:
            logger.info("New version detected, running updates...")
            await self.update_db_schema()
            from .migrations import run_migration  # local import avoids a circular dependency
            await run_migration()
            logger.info("Database initialization and migration completed successfully")
            # Record the new version only after migration succeeded.
            self.version_manager.update_version()
            logger.info("Version update completed successfully")
        else:
            logger.info("Database initialization completed successfully")
    except Exception as e:
        logger.error(f"Database initialization error: {e}")
        # Re-raise: callers must not proceed with a half-initialized database.
        # (The previous "Database will be initialized on first use" log was
        # misleading — nothing retries initialization after this raise.)
        raise
async def cleanup(self):
"""Cleanup connections when shutting down"""
@@ -55,6 +83,12 @@ class AsyncDatabaseManager:
@asynccontextmanager
async def get_connection(self):
"""Connection pool manager"""
if not self._initialized:
async with self.pool_lock: # Prevent multiple simultaneous initializations
if not self._initialized: # Double-check after acquiring lock
await self.initialize()
self._initialized = True
async with self.connection_semaphore:
task_id = id(asyncio.current_task())
try:
@@ -79,6 +113,7 @@ class AsyncDatabaseManager:
await self.connection_pool[task_id].close()
del self.connection_pool[task_id]
async def execute_with_retry(self, operation, *args):
"""Execute database operations with retry logic"""
for attempt in range(self.max_retries):

View File

@@ -0,0 +1,30 @@
# version_manager.py
import os
from pathlib import Path
from packaging import version
from . import __version__
class VersionManager:
    """Tracks the library version last seen by this local installation.

    A version file under ``~/.crawl4ai`` records the version that last
    completed migrations; comparing it against the current ``__version__``
    tells callers whether database updates need to run.
    """

    def __init__(self):
        # Per-user state directory; created lazily on first write.
        self.home_dir = Path.home() / ".crawl4ai"
        self.version_file = self.home_dir / "version.txt"

    def get_installed_version(self):
        """Return the recorded version, or ``None`` if absent or unreadable."""
        if not self.version_file.exists():
            return None
        try:
            return version.parse(self.version_file.read_text().strip())
        except Exception:
            # Corrupt or unparsable version file: treat as a fresh install
            # rather than crashing initialization. (A bare ``except:`` here
            # would also have swallowed KeyboardInterrupt/SystemExit.)
            return None

    def update_version(self):
        """Record the current library version in the version file.

        Creates ``~/.crawl4ai`` first if missing — without this, a fresh
        install raised FileNotFoundError because the directory was never
        created before writing.
        """
        self.home_dir.mkdir(parents=True, exist_ok=True)
        self.version_file.write_text(__version__)

    def needs_update(self):
        """Return True on a fresh install or when the library is newer than recorded."""
        installed = self.get_installed_version()
        current = version.parse(__version__)
        return installed is None or installed < current

View File

@@ -58,6 +58,51 @@ crawl4ai-download-models
This is optional but will improve the crawler's performance. You only need to do this once after installation.
## Playwright Installation Note for Ubuntu
If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
```bash
sudo apt-get install -y \
libwoff1 \
libopus0 \
libwebp7 \
libwebpdemux2 \
libenchant-2-2 \
libgudev-1.0-0 \
libsecret-1-0 \
libhyphen0 \
libgdk-pixbuf2.0-0 \
libegl1 \
libnotify4 \
libxslt1.1 \
libevent-2.1-7 \
libgles2 \
libxcomposite1 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libepoxy0 \
libgtk-3-0 \
libharfbuzz-icu0 \
libgstreamer-gl1.0-0 \
libgstreamer-plugins-bad1.0-0 \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
libxt6 \
libxaw7 \
xvfb \
fonts-noto-color-emoji \
libfontconfig \
libfreetype6 \
xfonts-cyrillic \
xfonts-scalable \
fonts-liberation \
fonts-ipafont-gothic \
fonts-wqy-zenhei \
fonts-tlwg-loma-otf \
fonts-freefont-ttf
```
## Option 2: Using Docker (Coming Soon)
Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.

View File

@@ -70,7 +70,7 @@ class PostInstallCommand(install):
def run(self):
install.run(self)
install_playwright()
run_migration()
# run_migration()
setup(
name="Crawl4AI",