feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile

feat(requirements): update requirements.txt to include snowballstemmer
fix(version_manager): correct version parsing to use __version__.__version__
feat(main): introduce chunking strategy and content filter in CrawlRequest model
feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance
feat(logger): implement new async logger engine replacing print statements throughout library
fix(database): resolve version-related deadlock and circular lock issues in database operations
docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose
This commit is contained in:
UncleCode
2024-11-18 21:00:06 +08:00
parent 152ac35bc2
commit 852729ff38
15 changed files with 952 additions and 232 deletions

View File

@@ -12,10 +12,12 @@ import xxhash
import aiofiles
from .config import NEED_MIGRATION
from .version_manager import VersionManager
from .async_logger import AsyncLogger
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
base_directory = Path.home()
DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
@@ -28,15 +30,21 @@ class AsyncDatabaseManager:
self.max_retries = max_retries
self.connection_pool: Dict[int, aiosqlite.Connection] = {}
self.pool_lock = asyncio.Lock()
self.init_lock = asyncio.Lock()
self.connection_semaphore = asyncio.Semaphore(pool_size)
self._initialized = False
self.version_manager = VersionManager()
self.logger = AsyncLogger(
log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
verbose=False,
tag_width=10
)
async def initialize(self):
"""Initialize the database and connection pool"""
try:
logger.info("Initializing database...")
self.logger.info("Initializing database", tag="INIT")
# Ensure the database file exists
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
@@ -47,31 +55,39 @@ class AsyncDatabaseManager:
await self.ainit_db()
# Verify the table exists
async def verify_table(db):
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
async with db.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
) as cursor:
result = await cursor.fetchone()
if not result:
raise Exception("crawled_data table was not created")
await self.execute_with_retry(verify_table)
# If version changed or fresh install, run updates
if needs_update:
logger.info("New version detected, running updates...")
self.logger.info("New version detected, running updates", tag="INIT")
await self.update_db_schema()
from .migrations import run_migration # Import here to avoid circular imports
await run_migration()
self.version_manager.update_version() # Update stored version after successful migration
logger.info("Version update completed successfully")
self.logger.success("Version update completed successfully", tag="COMPLETE")
else:
logger.info("Database initialization completed successfully")
self.logger.success("Database initialization completed successfully", tag="COMPLETE")
except Exception as e:
logger.error(f"Database initialization error: {e}")
logger.info("Database will be initialized on first use")
self.logger.error(
message="Database initialization error: {error}",
tag="ERROR",
params={"error": str(e)}
)
self.logger.info(
message="Database will be initialized on first use",
tag="INIT"
)
raise
async def cleanup(self):
"""Cleanup connections when shutting down"""
@@ -84,34 +100,41 @@ class AsyncDatabaseManager:
async def get_connection(self):
"""Connection pool manager"""
if not self._initialized:
async with self.pool_lock: # Prevent multiple simultaneous initializations
if not self._initialized: # Double-check after acquiring lock
# Use an asyncio.Lock to ensure only one initialization occurs
async with self.init_lock:
if not self._initialized:
await self.initialize()
self._initialized = True
async with self.connection_semaphore:
task_id = id(asyncio.current_task())
try:
async with self.pool_lock:
if task_id not in self.connection_pool:
conn = await aiosqlite.connect(
self.db_path,
timeout=30.0
)
await conn.execute('PRAGMA journal_mode = WAL')
await conn.execute('PRAGMA busy_timeout = 5000')
self.connection_pool[task_id] = conn
yield self.connection_pool[task_id]
except Exception as e:
logger.error(f"Connection error: {e}")
raise
finally:
async with self.pool_lock:
if task_id in self.connection_pool:
await self.connection_pool[task_id].close()
del self.connection_pool[task_id]
await self.connection_semaphore.acquire()
task_id = id(asyncio.current_task())
try:
async with self.pool_lock:
if task_id not in self.connection_pool:
conn = await aiosqlite.connect(
self.db_path,
timeout=30.0
)
await conn.execute('PRAGMA journal_mode = WAL')
await conn.execute('PRAGMA busy_timeout = 5000')
self.connection_pool[task_id] = conn
yield self.connection_pool[task_id]
except Exception as e:
self.logger.error(
message="Connection error: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
raise
finally:
async with self.pool_lock:
if task_id in self.connection_pool:
await self.connection_pool[task_id].close()
del self.connection_pool[task_id]
self.connection_semaphore.release()
async def execute_with_retry(self, operation, *args):
@@ -124,13 +147,21 @@ class AsyncDatabaseManager:
return result
except Exception as e:
if attempt == self.max_retries - 1:
logger.error(f"Operation failed after {self.max_retries} attempts: {e}")
self.logger.error(
message="Operation failed after {retries} attempts: {error}",
tag="ERROR",
force_verbose=True,
params={
"retries": self.max_retries,
"error": str(e)
}
)
raise
await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
async def ainit_db(self):
"""Initialize database schema"""
async def _init(db):
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
await db.execute('''
CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY,
@@ -147,36 +178,37 @@ class AsyncDatabaseManager:
downloaded_files TEXT DEFAULT "{}" -- New column added
)
''')
await self.execute_with_retry(_init)
await db.commit()
async def update_db_schema(self):
"""Update database schema if needed"""
async def _check_columns(db):
async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
cursor = await db.execute("PRAGMA table_info(crawled_data)")
columns = await cursor.fetchall()
return [column[1] for column in columns]
column_names = [column[1] for column in columns]
# List of new columns to add
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column, db)
await db.commit()
column_names = await self.execute_with_retry(_check_columns)
# List of new columns to add
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column)
async def aalter_db_add_column(self, new_column: str):
async def aalter_db_add_column(self, new_column: str, db):
"""Add new column to the database"""
async def _alter(db):
if new_column == 'response_headers':
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
else:
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
logger.info(f"Added column '{new_column}' to the database.")
if new_column == 'response_headers':
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
else:
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
self.logger.info(
message="Added column '{column}' to the database",
tag="INIT",
params={"column": new_column}
)
await self.execute_with_retry(_alter)
async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
"""Retrieve cached URL data as CrawlResult"""
@@ -235,7 +267,12 @@ class AsyncDatabaseManager:
try:
return await self.execute_with_retry(_get)
except Exception as e:
logger.error(f"Error retrieving cached URL: {e}")
self.logger.error(
message="Error retrieving cached URL: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
return None
async def acache_url(self, result: CrawlResult):
@@ -291,7 +328,13 @@ class AsyncDatabaseManager:
try:
await self.execute_with_retry(_cache)
except Exception as e:
logger.error(f"Error caching URL: {e}")
self.logger.error(
message="Error caching URL: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def aget_total_count(self) -> int:
"""Get total number of cached URLs"""
@@ -303,7 +346,12 @@ class AsyncDatabaseManager:
try:
return await self.execute_with_retry(_count)
except Exception as e:
logger.error(f"Error getting total count: {e}")
self.logger.error(
message="Error getting total count: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
return 0
async def aclear_db(self):
@@ -314,7 +362,12 @@ class AsyncDatabaseManager:
try:
await self.execute_with_retry(_clear)
except Exception as e:
logger.error(f"Error clearing database: {e}")
self.logger.error(
message="Error clearing database: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def aflush_db(self):
"""Drop the entire table"""
@@ -324,7 +377,12 @@ class AsyncDatabaseManager:
try:
await self.execute_with_retry(_flush)
except Exception as e:
logger.error(f"Error flushing database: {e}")
self.logger.error(
message="Error flushing database: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def _store_content(self, content: str, content_type: str) -> str:
@@ -352,7 +410,12 @@ class AsyncDatabaseManager:
async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
return await f.read()
except:
logger.error(f"Failed to load content: {file_path}")
self.logger.error(
message="Failed to load content: {file_path}",
tag="ERROR",
force_verbose=True,
params={"file_path": file_path}
)
return None
# Create a singleton instance