Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -1,27 +1,29 @@
-import os, sys
+import os
 from pathlib import Path
 import aiosqlite
 import asyncio
-from typing import Optional, Tuple, Dict
+from typing import Optional, Dict
 from contextlib import asynccontextmanager
 import logging
 import json  # Added for serialization/deserialization
 from .utils import ensure_content_dirs, generate_content_hash
 from .models import CrawlResult, MarkdownGenerationResult
-import xxhash
 import aiofiles
-from .config import NEED_MIGRATION
 from .version_manager import VersionManager
 from .async_logger import AsyncLogger
 from .utils import get_error_context, create_box_message
+
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
+base_directory = DB_PATH = os.path.join(
+    os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
+)
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(base_directory, "crawl4ai.db")

+
 class AsyncDatabaseManager:
    def __init__(self, pool_size: int = 10, max_retries: int = 3):
        self.db_path = DB_PATH
@@ -32,28 +34,27 @@ class AsyncDatabaseManager:
        self.pool_lock = asyncio.Lock()
        self.init_lock = asyncio.Lock()
        self.connection_semaphore = asyncio.Semaphore(pool_size)
-        self._initialized = False  
+        self._initialized = False
        self.version_manager = VersionManager()
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
            verbose=False,
-            tag_width=10
+            tag_width=10,
        )
-        
-        
+
    async def initialize(self):
        """Initialize the database and connection pool"""
        try:
            self.logger.info("Initializing database", tag="INIT")
            # Ensure the database file exists
            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
-            
+
            # Check if version update is needed
            needs_update = self.version_manager.needs_update()
-            
+
            # Always ensure base table exists
            await self.ainit_db()
-            
+
            # Verify the table exists
            async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
                async with db.execute(
@@ -62,33 +63,37 @@ class AsyncDatabaseManager:
                    result = await cursor.fetchone()
                    if not result:
                        raise Exception("crawled_data table was not created")
-            
+
            # If version changed or fresh install, run updates
            if needs_update:
                self.logger.info("New version detected, running updates", tag="INIT")
                await self.update_db_schema()
-                from .migrations import run_migration  # Import here to avoid circular imports
+                from .migrations import (
+                    run_migration,
+                )  # Import here to avoid circular imports
+
                await run_migration()
                self.version_manager.update_version()  # Update stored version after successful migration
-                self.logger.success("Version update completed successfully", tag="COMPLETE")
+                self.logger.success(
+                    "Version update completed successfully", tag="COMPLETE"
+                )
            else:
-                self.logger.success("Database initialization completed successfully", tag="COMPLETE")
+                self.logger.success(
+                    "Database initialization completed successfully", tag="COMPLETE"
+                )

-                
        except Exception as e:
            self.logger.error(
                message="Database initialization error: {error}",
                tag="ERROR",
-                params={"error": str(e)}
+                params={"error": str(e)},
            )
            self.logger.info(
-                message="Database will be initialized on first use",
-                tag="INIT"
+                message="Database will be initialized on first use", tag="INIT"
            )
-                        
+
            raise

-            
    async def cleanup(self):
        """Cleanup connections when shutting down"""
        async with self.pool_lock:
@@ -107,6 +112,7 @@ class AsyncDatabaseManager:
                        self._initialized = True
                    except Exception as e:
                        import sys
+
                        error_context = get_error_context(sys.exc_info())
                        self.logger.error(
                            message="Database initialization failed:\n{error}\n\nContext:\n{context}\n\nTraceback:\n{traceback}",
@@ -115,41 +121,52 @@ class AsyncDatabaseManager:
                            params={
                                "error": str(e),
                                "context": error_context["code_context"],
-                                "traceback": error_context["full_traceback"]
-                            }
+                                "traceback": error_context["full_traceback"],
+                            },
                        )
                        raise

        await self.connection_semaphore.acquire()
        task_id = id(asyncio.current_task())
-        
+
        try:
            async with self.pool_lock:
                if task_id not in self.connection_pool:
                    try:
-                        conn = await aiosqlite.connect(
-                            self.db_path,
-                            timeout=30.0
-                        )
-                        await conn.execute('PRAGMA journal_mode = WAL')
-                        await conn.execute('PRAGMA busy_timeout = 5000')
-                        
+                        conn = await aiosqlite.connect(self.db_path, timeout=30.0)
+                        await conn.execute("PRAGMA journal_mode = WAL")
+                        await conn.execute("PRAGMA busy_timeout = 5000")
+
                        # Verify database structure
-                        async with conn.execute("PRAGMA table_info(crawled_data)") as cursor:
+                        async with conn.execute(
+                            "PRAGMA table_info(crawled_data)"
+                        ) as cursor:
                            columns = await cursor.fetchall()
                            column_names = [col[1] for col in columns]
                            expected_columns = {
-                                'url', 'html', 'cleaned_html', 'markdown', 'extracted_content',
-                                'success', 'media', 'links', 'metadata', 'screenshot',
-                                'response_headers', 'downloaded_files'
+                                "url",
+                                "html",
+                                "cleaned_html",
+                                "markdown",
+                                "extracted_content",
+                                "success",
+                                "media",
+                                "links",
+                                "metadata",
+                                "screenshot",
+                                "response_headers",
+                                "downloaded_files",
                            }
                            missing_columns = expected_columns - set(column_names)
                            if missing_columns:
-                                raise ValueError(f"Database missing columns: {missing_columns}")
-                        
+                                raise ValueError(
+                                    f"Database missing columns: {missing_columns}"
+                                )
+
                        self.connection_pool[task_id] = conn
                    except Exception as e:
                        import sys
+
                        error_context = get_error_context(sys.exc_info())
                        error_message = (
                            f"Unexpected error in db get_connection at line {error_context['line_no']} "
@@ -158,7 +175,7 @@ class AsyncDatabaseManager:
                            f"Code context:\n{error_context['code_context']}"
                        )
                        self.logger.error(
-                            message=create_box_message(error_message, type= "error"),
+                            message=create_box_message(error_message, type="error"),
                        )

                        raise
@@ -167,6 +184,7 @@ class AsyncDatabaseManager:

        except Exception as e:
            import sys
+
            error_context = get_error_context(sys.exc_info())
            error_message = (
                f"Unexpected error in db get_connection at line {error_context['line_no']} "
@@ -175,7 +193,7 @@ class AsyncDatabaseManager:
                f"Code context:\n{error_context['code_context']}"
            )
            self.logger.error(
-                message=create_box_message(error_message, type= "error"),
+                message=create_box_message(error_message, type="error"),
            )
            raise
        finally:
@@ -185,7 +203,6 @@ class AsyncDatabaseManager:
                    del self.connection_pool[task_id]
            self.connection_semaphore.release()

-
    async def execute_with_retry(self, operation, *args):
        """Execute database operations with retry logic"""
        for attempt in range(self.max_retries):
@@ -200,18 +217,16 @@ class AsyncDatabaseManager:
                        message="Operation failed after {retries} attempts: {error}",
                        tag="ERROR",
                        force_verbose=True,
-                        params={
-                            "retries": self.max_retries,
-                            "error": str(e)
-                        }
-                    )                    
+                        params={"retries": self.max_retries, "error": str(e)},
+                    )
                    raise
                await asyncio.sleep(1 * (attempt + 1))  # Exponential backoff

    async def ainit_db(self):
        """Initialize database schema"""
        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
-            await db.execute('''
+            await db.execute(
+                """
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
                    html TEXT,
@@ -226,21 +241,27 @@ class AsyncDatabaseManager:
                    response_headers TEXT DEFAULT "{}",
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
-            ''')
+            """
+            )
            await db.commit()

-        
-
    async def update_db_schema(self):
        """Update database schema if needed"""
        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            cursor = await db.execute("PRAGMA table_info(crawled_data)")
            columns = await cursor.fetchall()
            column_names = [column[1] for column in columns]
-            
+
            # List of new columns to add
-            new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
-            
+            new_columns = [
+                "media",
+                "links",
+                "metadata",
+                "screenshot",
+                "response_headers",
+                "downloaded_files",
+            ]
+
            for column in new_columns:
                if column not in column_names:
                    await self.aalter_db_add_column(column, db)
@@ -248,75 +269,91 @@ class AsyncDatabaseManager:

    async def aalter_db_add_column(self, new_column: str, db):
        """Add new column to the database"""
-        if new_column == 'response_headers':
-            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
+        if new_column == "response_headers":
+            await db.execute(
+                f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"'
+            )
        else:
-            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+            await db.execute(
+                f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""'
+            )
        self.logger.info(
            message="Added column '{column}' to the database",
            tag="INIT",
-            params={"column": new_column}
-        )        
-
+            params={"column": new_column},
+        )

    async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
        """Retrieve cached URL data as CrawlResult"""
+
        async def _get(db):
            async with db.execute(
-                'SELECT * FROM crawled_data WHERE url = ?', (url,)
+                "SELECT * FROM crawled_data WHERE url = ?", (url,)
            ) as cursor:
                row = await cursor.fetchone()
                if not row:
                    return None
-                    
+
                # Get column names
                columns = [description[0] for description in cursor.description]
                # Create dict from row data
                row_dict = dict(zip(columns, row))
-                
+
                # Load content from files using stored hashes
                content_fields = {
-                    'html': row_dict['html'],
-                    'cleaned_html': row_dict['cleaned_html'],
-                    'markdown': row_dict['markdown'],
-                    'extracted_content': row_dict['extracted_content'],
-                    'screenshot': row_dict['screenshot'],
-                    'screenshots': row_dict['screenshot'],
+                    "html": row_dict["html"],
+                    "cleaned_html": row_dict["cleaned_html"],
+                    "markdown": row_dict["markdown"],
+                    "extracted_content": row_dict["extracted_content"],
+                    "screenshot": row_dict["screenshot"],
+                    "screenshots": row_dict["screenshot"],
                }
-                
+
                for field, hash_value in content_fields.items():
                    if hash_value:
                        content = await self._load_content(
-                            hash_value, 
-                            field.split('_')[0]  # Get content type from field name
+                            hash_value,
+                            field.split("_")[0],  # Get content type from field name
                        )
                        row_dict[field] = content or ""
                    else:
                        row_dict[field] = ""

                # Parse JSON fields
-                json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
+                json_fields = [
+                    "media",
+                    "links",
+                    "metadata",
+                    "response_headers",
+                    "markdown",
+                ]
                for field in json_fields:
                    try:
-                        row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
+                        row_dict[field] = (
+                            json.loads(row_dict[field]) if row_dict[field] else {}
+                        )
                    except json.JSONDecodeError:
                        row_dict[field] = {}

-                if isinstance(row_dict['markdown'], Dict):
-                    row_dict['markdown_v2'] = row_dict['markdown']
-                    if row_dict['markdown'].get('raw_markdown'):
-                        row_dict['markdown'] = row_dict['markdown']['raw_markdown']
-                
+                if isinstance(row_dict["markdown"], Dict):
+                    row_dict["markdown_v2"] = row_dict["markdown"]
+                    if row_dict["markdown"].get("raw_markdown"):
+                        row_dict["markdown"] = row_dict["markdown"]["raw_markdown"]
+
                # Parse downloaded_files
                try:
-                    row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
+                    row_dict["downloaded_files"] = (
+                        json.loads(row_dict["downloaded_files"])
+                        if row_dict["downloaded_files"]
+                        else []
+                    )
                except json.JSONDecodeError:
-                    row_dict['downloaded_files'] = []
+                    row_dict["downloaded_files"] = []

                # Remove any fields not in CrawlResult model
                valid_fields = CrawlResult.__annotations__.keys()
                filtered_dict = {k: v for k, v in row_dict.items() if k in valid_fields}
-                
+
                return CrawlResult(**filtered_dict)

        try:
@@ -326,7 +363,7 @@ class AsyncDatabaseManager:
                message="Error retrieving cached URL: {error}",
                tag="ERROR",
                force_verbose=True,
-                params={"error": str(e)}
+                params={"error": str(e)},
            )
            return None

@@ -334,37 +371,52 @@ class AsyncDatabaseManager:
        """Cache CrawlResult data"""
        # Store content files and get hashes
        content_map = {
-            'html': (result.html, 'html'),
-            'cleaned_html': (result.cleaned_html or "", 'cleaned'),
-            'markdown': None,
-            'extracted_content': (result.extracted_content or "", 'extracted'),
-            'screenshot': (result.screenshot or "", 'screenshots')
+            "html": (result.html, "html"),
+            "cleaned_html": (result.cleaned_html or "", "cleaned"),
+            "markdown": None,
+            "extracted_content": (result.extracted_content or "", "extracted"),
+            "screenshot": (result.screenshot or "", "screenshots"),
        }

        try:
            if isinstance(result.markdown, MarkdownGenerationResult):
-                content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
-            elif hasattr(result, 'markdown_v2'):
-                content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
+                content_map["markdown"] = (
+                    result.markdown.model_dump_json(),
+                    "markdown",
+                )
+            elif hasattr(result, "markdown_v2"):
+                content_map["markdown"] = (
+                    result.markdown_v2.model_dump_json(),
+                    "markdown",
+                )
            elif isinstance(result.markdown, str):
                markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
-                content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
+                content_map["markdown"] = (
+                    markdown_result.model_dump_json(),
+                    "markdown",
+                )
            else:
-                content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
+                content_map["markdown"] = (
+                    MarkdownGenerationResult().model_dump_json(),
+                    "markdown",
+                )
        except Exception as e:
            self.logger.warning(
-                message=f"Error processing markdown content: {str(e)}",
-                tag="WARNING"
+                message=f"Error processing markdown content: {str(e)}", tag="WARNING"
            )
            # Fallback to empty markdown result
-            content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
-        
+            content_map["markdown"] = (
+                MarkdownGenerationResult().model_dump_json(),
+                "markdown",
+            )
+
        content_hashes = {}
        for field, (content, content_type) in content_map.items():
            content_hashes[field] = await self._store_content(content, content_type)

        async def _cache(db):
-            await db.execute('''
+            await db.execute(
+                """
                INSERT INTO crawled_data (
                    url, html, cleaned_html, markdown,
                    extracted_content, success, media, links, metadata,
@@ -383,20 +435,22 @@ class AsyncDatabaseManager:
                    screenshot = excluded.screenshot,
                    response_headers = excluded.response_headers,
                    downloaded_files = excluded.downloaded_files
-            ''', (
-                result.url,
-                content_hashes['html'],
-                content_hashes['cleaned_html'],
-                content_hashes['markdown'],
-                content_hashes['extracted_content'],
-                result.success,
-                json.dumps(result.media),
-                json.dumps(result.links),
-                json.dumps(result.metadata or {}),
-                content_hashes['screenshot'],
-                json.dumps(result.response_headers or {}),
-                json.dumps(result.downloaded_files or [])
-            ))
+            """,
+                (
+                    result.url,
+                    content_hashes["html"],
+                    content_hashes["cleaned_html"],
+                    content_hashes["markdown"],
+                    content_hashes["extracted_content"],
+                    result.success,
+                    json.dumps(result.media),
+                    json.dumps(result.links),
+                    json.dumps(result.metadata or {}),
+                    content_hashes["screenshot"],
+                    json.dumps(result.response_headers or {}),
+                    json.dumps(result.downloaded_files or []),
+                ),
+            )

        try:
            await self.execute_with_retry(_cache)
@@ -405,14 +459,14 @@ class AsyncDatabaseManager:
                message="Error caching URL: {error}",
                tag="ERROR",
                force_verbose=True,
-                params={"error": str(e)}
+                params={"error": str(e)},
            )
-            

    async def aget_total_count(self) -> int:
        """Get total number of cached URLs"""
+
        async def _count(db):
-            async with db.execute('SELECT COUNT(*) FROM crawled_data') as cursor:
+            async with db.execute("SELECT COUNT(*) FROM crawled_data") as cursor:
                result = await cursor.fetchone()
                return result[0] if result else 0

@@ -423,14 +477,15 @@ class AsyncDatabaseManager:
                message="Error getting total count: {error}",
                tag="ERROR",
                force_verbose=True,
-                params={"error": str(e)}
+                params={"error": str(e)},
            )
            return 0

    async def aclear_db(self):
        """Clear all data from the database"""
+
        async def _clear(db):
-            await db.execute('DELETE FROM crawled_data')
+            await db.execute("DELETE FROM crawled_data")

        try:
            await self.execute_with_retry(_clear)
@@ -439,13 +494,14 @@ class AsyncDatabaseManager:
                message="Error clearing database: {error}",
                tag="ERROR",
                force_verbose=True,
-                params={"error": str(e)}
+                params={"error": str(e)},
            )

    async def aflush_db(self):
        """Drop the entire table"""
+
        async def _flush(db):
-            await db.execute('DROP TABLE IF EXISTS crawled_data')
+            await db.execute("DROP TABLE IF EXISTS crawled_data")

        try:
            await self.execute_with_retry(_flush)
@@ -454,42 +510,44 @@ class AsyncDatabaseManager:
                message="Error flushing database: {error}",
                tag="ERROR",
                force_verbose=True,
-                params={"error": str(e)}
+                params={"error": str(e)},
            )
-            
-                
+
    async def _store_content(self, content: str, content_type: str) -> str:
        """Store content in filesystem and return hash"""
        if not content:
            return ""
-            
+
        content_hash = generate_content_hash(content)
        file_path = os.path.join(self.content_paths[content_type], content_hash)
-        
+
        # Only write if file doesn't exist
        if not os.path.exists(file_path):
-            async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
+            async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
                await f.write(content)
-                
+
        return content_hash

-    async def _load_content(self, content_hash: str, content_type: str) -> Optional[str]:
+    async def _load_content(
+        self, content_hash: str, content_type: str
+    ) -> Optional[str]:
        """Load content from filesystem by hash"""
        if not content_hash:
            return None
-            
+
        file_path = os.path.join(self.content_paths[content_type], content_hash)
        try:
-            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
+            async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
                return await f.read()
        except:
            self.logger.error(
                message="Failed to load content: {file_path}",
                tag="ERROR",
                force_verbose=True,
-                params={"file_path": file_path}
+                params={"file_path": file_path},
            )
            return None

+
 # Create a singleton instance
 async_db_manager = AsyncDatabaseManager()