diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py
index f97d8131..7809dfe1 100644
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -11,6 +11,7 @@ from .models import CrawlResult
 import xxhash
 import aiofiles
 from .config import NEED_MIGRATION
+from .version_manager import VersionManager
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -28,22 +29,49 @@ class AsyncDatabaseManager:
         self.connection_pool: Dict[int, aiosqlite.Connection] = {}
         self.pool_lock = asyncio.Lock()
         self.connection_semaphore = asyncio.Semaphore(pool_size)
+        self._initialized = False
+        self.version_manager = VersionManager()
+
 
     async def initialize(self):
         """Initialize the database and connection pool"""
         try:
             logger.info("Initializing database...")
+            # Ensure the database file exists
+            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
+
+            # Check if version update is needed
+            needs_update = self.version_manager.needs_update()
+
+            # Always ensure base table exists
             await self.ainit_db()
-            if NEED_MIGRATION:
+
+            # Verify the table exists
+            async def verify_table(db):
+                async with db.execute(
+                    "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
+                ) as cursor:
+                    result = await cursor.fetchone()
+                    if not result:
+                        raise Exception("crawled_data table was not created")
+
+            await self.execute_with_retry(verify_table)
+
+            # If version changed or fresh install, run updates
+            if needs_update:
+                logger.info("New version detected, running updates...")
                 await self.update_db_schema()
                 from .migrations import run_migration  # Import here to avoid circular imports
                 await run_migration()
-                logger.info("Database initialization and migration completed successfully")
+                self.version_manager.update_version()  # Update stored version after successful migration
+                logger.info("Version update completed successfully")
             else:
                 logger.info("Database initialization completed successfully")
+
         except Exception as e:
             logger.error(f"Database initialization error: {e}")
             logger.info("Database will be initialized on first use")
+            raise
 
     async def cleanup(self):
         """Cleanup connections when shutting down"""
@@ -55,6 +83,12 @@ class AsyncDatabaseManager:
     @asynccontextmanager
     async def get_connection(self):
         """Connection pool manager"""
+        if not self._initialized:
+            async with self.pool_lock:  # Prevent multiple simultaneous initializations
+                if not self._initialized:  # Double-check after acquiring lock
+                    await self.initialize()
+                    self._initialized = True
+
         async with self.connection_semaphore:
             task_id = id(asyncio.current_task())
             try:
@@ -79,6 +113,7 @@ class AsyncDatabaseManager:
                 await self.connection_pool[task_id].close()
                 del self.connection_pool[task_id]
 
+
     async def execute_with_retry(self, operation, *args):
         """Execute database operations with retry logic"""
         for attempt in range(self.max_retries):
diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py
new file mode 100644
--- /dev/null
+++ b/crawl4ai/version_manager.py
@@ -0,0 +1,29 @@
+# version_manager.py
+from pathlib import Path
+from packaging import version
+from . import __version__
+
+class VersionManager:
+    def __init__(self):
+        self.home_dir = Path.home() / ".crawl4ai"
+        self.version_file = self.home_dir / "version.txt"
+
+    def get_installed_version(self):
+        """Get the version recorded in home directory"""
+        if not self.version_file.exists():
+            return None
+        try:
+            return version.parse(self.version_file.read_text().strip())
+        except (OSError, version.InvalidVersion):
+            # Unreadable or malformed version file: treat as "no version recorded"
+            return None
+
+    def update_version(self):
+        """Update the version file to current library version"""
+        self.version_file.write_text(__version__)
+
+    def needs_update(self):
+        """Check if database needs update based on version"""
+        installed = self.get_installed_version()
+        current = version.parse(__version__)
+        return installed is None or installed < current
diff --git a/docs/md_v2/basic/installation.md b/docs/md_v2/basic/installation.md
index a4a60857..de8aeafa 100644
--- a/docs/md_v2/basic/installation.md
+++ b/docs/md_v2/basic/installation.md
@@ -58,6 +58,51 @@ crawl4ai-download-models
 
 This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
 
+## Playwright Installation Note for Ubuntu
+
+If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
+
+```bash
+sudo apt-get install -y \
+    libwoff1 \
+    libopus0 \
+    libwebp7 \
+    libwebpdemux2 \
+    libenchant-2-2 \
+    libgudev-1.0-0 \
+    libsecret-1-0 \
+    libhyphen0 \
+    libgdk-pixbuf2.0-0 \
+    libegl1 \
+    libnotify4 \
+    libxslt1.1 \
+    libevent-2.1-7 \
+    libgles2 \
+    libxcomposite1 \
+    libatk1.0-0 \
+    libatk-bridge2.0-0 \
+    libepoxy0 \
+    libgtk-3-0 \
+    libharfbuzz-icu0 \
+    libgstreamer-gl1.0-0 \
+    libgstreamer-plugins-bad1.0-0 \
+    gstreamer1.0-plugins-good \
+    gstreamer1.0-plugins-bad \
+    libxt6 \
+    libxaw7 \
+    xvfb \
+    fonts-noto-color-emoji \
+    libfontconfig \
+    libfreetype6 \
+    xfonts-cyrillic \
+    xfonts-scalable \
+    fonts-liberation \
+    fonts-ipafont-gothic \
+    fonts-wqy-zenhei \
+    fonts-tlwg-loma-otf \
+    fonts-freefont-ttf
+```
+
 ## Option 2: Using Docker (Coming Soon)
 
 Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
diff --git a/setup.py b/setup.py
index d8ad2cd3..bbc03026 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ class PostInstallCommand(install):
     def run(self):
         install.run(self)
         install_playwright()
-        run_migration()
+        # NOTE: migration now runs lazily via AsyncDatabaseManager.initialize() on first DB use
 
 setup(
     name="Crawl4AI",