feat(database): implement version management and migration checks during initialization

This commit is contained in:
UncleCode
2024-11-17 18:09:33 +08:00
parent 2a82455b3d
commit f9fe6f89fe
4 changed files with 113 additions and 3 deletions

View File

@@ -11,6 +11,7 @@ from .models import CrawlResult
import xxhash import xxhash
import aiofiles import aiofiles
from .config import NEED_MIGRATION from .config import NEED_MIGRATION
from .version_manager import VersionManager
# Set up logging # Set up logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -28,22 +29,49 @@ class AsyncDatabaseManager:
self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.connection_pool: Dict[int, aiosqlite.Connection] = {}
self.pool_lock = asyncio.Lock() self.pool_lock = asyncio.Lock()
self.connection_semaphore = asyncio.Semaphore(pool_size) self.connection_semaphore = asyncio.Semaphore(pool_size)
self._initialized = False
self.version_manager = VersionManager()
async def initialize(self): async def initialize(self):
"""Initialize the database and connection pool""" """Initialize the database and connection pool"""
try: try:
logger.info("Initializing database...") logger.info("Initializing database...")
# Ensure the database file exists
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
# Check if version update is needed
needs_update = self.version_manager.needs_update()
# Always ensure base table exists
await self.ainit_db() await self.ainit_db()
if NEED_MIGRATION:
# Verify the table exists
async def verify_table(db):
async with db.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
) as cursor:
result = await cursor.fetchone()
if not result:
raise Exception("crawled_data table was not created")
await self.execute_with_retry(verify_table)
# If version changed or fresh install, run updates
if needs_update:
logger.info("New version detected, running updates...")
await self.update_db_schema() await self.update_db_schema()
from .migrations import run_migration # Import here to avoid circular imports from .migrations import run_migration # Import here to avoid circular imports
await run_migration() await run_migration()
logger.info("Database initialization and migration completed successfully") self.version_manager.update_version() # Update stored version after successful migration
logger.info("Version update completed successfully")
else: else:
logger.info("Database initialization completed successfully") logger.info("Database initialization completed successfully")
except Exception as e: except Exception as e:
logger.error(f"Database initialization error: {e}") logger.error(f"Database initialization error: {e}")
logger.info("Database will be initialized on first use") logger.info("Database will be initialized on first use")
raise
async def cleanup(self): async def cleanup(self):
"""Cleanup connections when shutting down""" """Cleanup connections when shutting down"""
@@ -55,6 +83,12 @@ class AsyncDatabaseManager:
@asynccontextmanager @asynccontextmanager
async def get_connection(self): async def get_connection(self):
"""Connection pool manager""" """Connection pool manager"""
if not self._initialized:
async with self.pool_lock: # Prevent multiple simultaneous initializations
if not self._initialized: # Double-check after acquiring lock
await self.initialize()
self._initialized = True
async with self.connection_semaphore: async with self.connection_semaphore:
task_id = id(asyncio.current_task()) task_id = id(asyncio.current_task())
try: try:
@@ -79,6 +113,7 @@ class AsyncDatabaseManager:
await self.connection_pool[task_id].close() await self.connection_pool[task_id].close()
del self.connection_pool[task_id] del self.connection_pool[task_id]
async def execute_with_retry(self, operation, *args): async def execute_with_retry(self, operation, *args):
"""Execute database operations with retry logic""" """Execute database operations with retry logic"""
for attempt in range(self.max_retries): for attempt in range(self.max_retries):

View File

@@ -0,0 +1,30 @@
# version_manager.py
import os
from pathlib import Path
from packaging import version
from . import __version__
class VersionManager:
    """Track the library version last recorded in the user's home directory.

    Used by the database layer to detect fresh installs and upgrades so it
    knows when schema updates/migrations must run.
    """

    def __init__(self):
        # Per-user state directory; created lazily on first write.
        self.home_dir = Path.home() / ".crawl4ai"
        self.version_file = self.home_dir / "version.txt"

    def get_installed_version(self):
        """Return the recorded version as a ``packaging`` Version, or None.

        None means either no record exists or the record is unreadable —
        both cases are treated as "needs update" by callers.
        """
        if not self.version_file.exists():
            return None
        try:
            return version.parse(self.version_file.read_text().strip())
        except Exception:
            # Was a bare `except:` — that also swallowed KeyboardInterrupt
            # and SystemExit. A corrupt/unparseable file is treated the
            # same as no record, which forces a migration run (safe default).
            return None

    def update_version(self):
        """Record the current library version in the home directory."""
        # Ensure ~/.crawl4ai exists: on a fresh install nothing else is
        # guaranteed to have created it, and write_text would raise
        # FileNotFoundError right after a successful migration.
        self.home_dir.mkdir(parents=True, exist_ok=True)
        self.version_file.write_text(__version__)

    def needs_update(self):
        """Return True on a fresh install or when the installed library
        is newer than the recorded version."""
        installed = self.get_installed_version()
        current = version.parse(__version__)
        return installed is None or installed < current

View File

@@ -58,6 +58,51 @@ crawl4ai-download-models
This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
## Playwright Installation Note for Ubuntu
If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
```bash
sudo apt-get install -y \
libwoff1 \
libopus0 \
libwebp7 \
libwebpdemux2 \
libenchant-2-2 \
libgudev-1.0-0 \
libsecret-1-0 \
libhyphen0 \
libgdk-pixbuf2.0-0 \
libegl1 \
libnotify4 \
libxslt1.1 \
libevent-2.1-7 \
libgles2 \
libxcomposite1 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libepoxy0 \
libgtk-3-0 \
libharfbuzz-icu0 \
libgstreamer-gl1.0-0 \
libgstreamer-plugins-bad1.0-0 \
gstreamer1.0-plugins-good \
gstreamer1.0-plugins-bad \
libxt6 \
libxaw7 \
xvfb \
fonts-noto-color-emoji \
libfontconfig \
libfreetype6 \
xfonts-cyrillic \
xfonts-scalable \
fonts-liberation \
fonts-ipafont-gothic \
fonts-wqy-zenhei \
fonts-tlwg-loma-otf \
fonts-freefont-ttf
```
## Option 2: Using Docker (Coming Soon)
Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.

View File

@@ -70,7 +70,7 @@ class PostInstallCommand(install):
def run(self): def run(self):
install.run(self) install.run(self)
install_playwright() install_playwright()
run_migration() # run_migration()
setup( setup(
name="Crawl4AI", name="Crawl4AI",