New async database manager and migration support

- Introduced AsyncDatabaseManager for async DB management.
- Added migration feature to transition to file-based storage.
- Enhanced web crawler with improved caching logic.
- Updated requirements and setup for async processing.
2024-11-16 14:54:41 +08:00
parent ae7ebc0bd8
commit d0014c6793
8 changed files with 685 additions and 119 deletions
--- a/crawl4ai/migrations.py
+++ b/crawl4ai/migrations.py
@@ -0,0 +1,152 @@
+import os
+import asyncio
+import logging
+from pathlib import Path
+import aiosqlite
+from typing import Optional
+import xxhash
+import aiofiles
+import shutil
+import time
+from datetime import datetime
+
+# NOTE: this runs at import time. basicConfig() installs a root handler at
+# INFO level when the root logger has none yet, so merely importing this
+# module can change the host application's logging output.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by every function and method below.
+logger = logging.getLogger(__name__)
+
+class DatabaseMigration:
+    """Move crawled content out of SQLite columns into content-addressed files.
+
+    Each content column of ``crawled_data`` is written to a file named after
+    the xxh64 hex digest of its text, inside a per-type directory located next
+    to the database file, and the column is then overwritten with that digest.
+    """
+
+    # An xxh64 hex digest is 64 bits wide: 16 lowercase hexadecimal characters.
+    _DIGEST_LENGTH = 16
+    _HEX_DIGITS = frozenset("0123456789abcdef")
+
+    def __init__(self, db_path: str):
+        """Remember *db_path* and create the content directories beside it."""
+        self.db_path = db_path
+        self.content_paths = self._ensure_content_dirs(os.path.dirname(db_path))
+
+    def _ensure_content_dirs(self, base_path: str) -> dict:
+        """Create one directory per content type under *base_path*.
+
+        Returns:
+            Mapping of content-type key (``'html'``, ``'cleaned'``,
+            ``'markdown'``, ``'extracted'``, ``'screenshots'``) to the
+            directory path created for it.
+        """
+        dirs = {
+            'html': 'html_content',
+            'cleaned': 'cleaned_html',
+            'markdown': 'markdown_content',
+            'extracted': 'extracted_content',
+            'screenshots': 'screenshots',
+        }
+        content_paths = {}
+        for key, dirname in dirs.items():
+            path = os.path.join(base_path, dirname)
+            os.makedirs(path, exist_ok=True)
+            content_paths[key] = path
+        return content_paths
+
+    def _generate_content_hash(self, content: str) -> str:
+        """Return the xxh64 hex digest of *content* encoded as UTF-8."""
+        hasher = xxhash.xxh64()
+        hasher.update(content.encode())
+        return hasher.hexdigest()
+
+    def _is_stored_hash(self, value, content_type: str) -> bool:
+        """Tell whether *value* is the digest of a file already stored.
+
+        True only when *value* has the exact shape of an xxh64 hex digest AND
+        a file with that name exists in the directory for *content_type*.
+        """
+        if not isinstance(value, str) or len(value) != self._DIGEST_LENGTH:
+            return False
+        if not self._HEX_DIGITS.issuperset(value):
+            return False
+        candidate = os.path.join(self.content_paths[content_type], value)
+        return os.path.isfile(candidate)
+
+    async def _store_content(self, content: str, content_type: str) -> str:
+        """Write *content* to its content-addressed file and return the digest.
+
+        Args:
+            content: Text to store. Empty or ``None`` is not stored.
+            content_type: Key into ``self.content_paths`` selecting the
+                target directory.
+
+        Returns:
+            The hex digest naming the stored file, or ``""`` when there was
+            nothing to store.
+        """
+        if not content:
+            return ""
+
+        # A second migration run reads back the digests written by the first.
+        # Hashing a digest again would repoint the row at a file that merely
+        # contains the old digest, so values that already reference a stored
+        # file are passed through untouched.
+        if self._is_stored_hash(content, content_type):
+            return content
+
+        content_hash = self._generate_content_hash(content)
+        file_path = os.path.join(self.content_paths[content_type], content_hash)
+
+        if not os.path.exists(file_path):
+            # Write to a sibling temp file and rename it into place, so an
+            # interrupted write never leaves a truncated file under the final
+            # name (which the existence check above would then trust).
+            tmp_path = f"{file_path}.tmp"
+            async with aiofiles.open(tmp_path, 'w', encoding='utf-8') as f:
+                await f.write(content)
+            os.replace(tmp_path, file_path)
+
+        return content_hash
+
+    async def migrate_database(self):
+        """Migrate existing database to file-based storage.
+
+        Reads every row of ``crawled_data``, stores each content column as a
+        file, and replaces the column value with the file's digest. Rows are
+        matched by ``url``. The only commit happens after the last row, so no
+        UPDATE is committed unless every row was processed.
+
+        Raises:
+            Exception: Any error raised while reading, storing or updating is
+                logged and re-raised unchanged.
+        """
+        logger.info("Starting database migration...")
+
+        try:
+            async with aiosqlite.connect(self.db_path) as db:
+                # Load all rows up front; the cursor is closed before any
+                # UPDATE is issued on the same connection.
+                async with db.execute(
+                    '''SELECT url, html, cleaned_html, markdown, 
+                       extracted_content, screenshot FROM crawled_data'''
+                ) as cursor:
+                    rows = await cursor.fetchall()
+
+                migrated_count = 0
+                for row in rows:
+                    url, html, cleaned_html, markdown, extracted_content, screenshot = row
+
+                    # Store content in files and get hashes
+                    html_hash = await self._store_content(html, 'html')
+                    cleaned_hash = await self._store_content(cleaned_html, 'cleaned')
+                    markdown_hash = await self._store_content(markdown, 'markdown')
+                    extracted_hash = await self._store_content(extracted_content, 'extracted')
+                    screenshot_hash = await self._store_content(screenshot, 'screenshots')
+
+                    # Update database with hashes
+                    await db.execute('''
+                        UPDATE crawled_data 
+                        SET html = ?, 
+                            cleaned_html = ?,
+                            markdown = ?,
+                            extracted_content = ?,
+                            screenshot = ?
+                        WHERE url = ?
+                    ''', (html_hash, cleaned_hash, markdown_hash,
+                          extracted_hash, screenshot_hash, url))
+
+                    migrated_count += 1
+                    if migrated_count % 100 == 0:
+                        logger.info(f"Migrated {migrated_count} records...")
+
+                await db.commit()
+                logger.info(f"Migration completed. {migrated_count} records processed.")
+
+        except Exception as e:
+            logger.error(f"Migration failed: {e}")
+            raise
+
+async def backup_database(db_path: str) -> Optional[str]:
+    """Copy the database file to a timestamped sibling path.
+
+    The backup is named ``<db_path>.backup_<YYYYmmdd_HHMMSS>``. Only the file
+    at *db_path* itself is copied.
+
+    Args:
+        db_path: Path of the database file to back up.
+
+    Returns:
+        Path of the backup file, or ``None`` when *db_path* does not exist.
+
+    Raises:
+        Exception: Any error raised while copying is logged and re-raised.
+    """
+    if not os.path.exists(db_path):
+        logger.info("No existing database found. Skipping backup.")
+        return None
+
+    # Create backup with timestamp
+    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+    backup_path = f"{db_path}.backup_{timestamp}"
+
+    try:
+        # Wait for any potential write operations to finish
+        await asyncio.sleep(1)
+
+        # shutil.copy2 is a blocking call; run it in the default executor so
+        # the event loop stays responsive while a large database is copied.
+        loop = asyncio.get_running_loop()
+        await loop.run_in_executor(None, shutil.copy2, db_path, backup_path)
+        logger.info(f"Database backup created at: {backup_path}")
+        return backup_path
+    except Exception as e:
+        logger.error(f"Backup failed: {e}")
+        raise
+    
+async def run_migration(db_path: Optional[str] = None):
+    """Back up the database, then migrate it to file-based storage.
+
+    Args:
+        db_path: Database file to migrate. When ``None``, the file
+            ``.crawl4ai/crawl4ai.db`` under the user's home directory is used.
+
+    Nothing is migrated when the database file is missing or when the backup
+    step does not return a backup path.
+    """
+    target_db = db_path
+    if target_db is None:
+        target_db = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
+
+    if os.path.exists(target_db):
+        # Never touch the data before a backup copy has been made.
+        backup_path = await backup_database(target_db)
+        if backup_path:
+            await DatabaseMigration(target_db).migrate_database()
+        return
+
+    logger.info("No existing database found. Skipping migration.")
+    
+def main():
+    """Parse the command line and run the migration to completion.
+
+    Accepts an optional ``--db-path`` argument; when it is omitted,
+    ``run_migration`` receives ``None`` and picks its own default path.
+    """
+    from argparse import ArgumentParser
+
+    cli = ArgumentParser(description='Migrate Crawl4AI database to file-based storage')
+    cli.add_argument('--db-path', help='Custom database path')
+    options = cli.parse_args()
+
+    asyncio.run(run_migration(options.db_path))
+
+# Run the migration CLI only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()