feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile

feat(requirements): update requirements.txt to include snowballstemmer
fix(version_manager): correct version parsing to use __version__.__version__
feat(main): introduce chunking strategy and content filter in CrawlRequest model
feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance
feat(logger): implement new async logger engine replacing print statements throughout library
fix(database): resolve version-related deadlock and circular lock issues in database operations
docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose
2024-11-18 21:00:06 +08:00
parent 152ac35bc2
commit 852729ff38
15 changed files with 952 additions and 232 deletions
--- a/12
+++ b/12
@@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*
-# GPU support if enabled
+# GPU support if enabled and architecture is supported
-RUN if [ "$ENABLE_GPU" = "true" ] ; then \
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
+        apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
+        nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
+        && rm -rf /var/lib/apt/lists/* ; \
    else \
        echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \
    fi
 # Create and set working directory
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -35,13 +35,15 @@ stealth_config = StealthConfig(
 class ManagedBrowser:
-    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False):
+    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None):
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = 9222
        self.logger = logger
        self.shutting_down = False
    async def start(self) -> str:
        """
@@ -76,15 +78,38 @@ class ManagedBrowser:
    async def _monitor_browser_process(self):
        """Monitor the browser process for unexpected termination."""
        if self.browser_process:
-            stdout, stderr = await asyncio.gather(
+            try:
-                asyncio.to_thread(self.browser_process.stdout.read),
+                stdout, stderr = await asyncio.gather(
-                asyncio.to_thread(self.browser_process.stderr.read)
+                    asyncio.to_thread(self.browser_process.stdout.read),
-            )
+                    asyncio.to_thread(self.browser_process.stderr.read)
-            if self.browser_process.poll() is not None:
+                )
-                print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}")
+                
-                print(f"STDOUT: {stdout.decode()}")
+                # Check shutting_down flag BEFORE logging anything
-                print(f"STDERR: {stderr.decode()}")
+                if self.browser_process.poll() is not None:
-                await self.cleanup()
+                    if not self.shutting_down:
                        self.logger.error(
                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                            tag="ERROR",
                            params={
                                "code": self.browser_process.returncode,
                                "stdout": stdout.decode(),
                                "stderr": stderr.decode()
                            }
                        )                
                        await self.cleanup()
                    else:
                        self.logger.info(
                            message="Browser process terminated normally | Code: {code}",
                            tag="INFO",
                            params={"code": self.browser_process.returncode}
                        )
            except Exception as e:
                if not self.shutting_down:
                    self.logger.error(
                        message="Error monitoring browser process: {error}",
                        tag="ERROR",
                        params={"error": str(e)}
                    )
    def _get_browser_path(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
@@ -134,20 +159,39 @@ class ManagedBrowser:
    async def cleanup(self):
        """Cleanup browser process and temporary directory"""
        # Set shutting_down flag BEFORE any termination actions
        self.shutting_down = True
        if self.browser_process:
            try:
                self.browser_process.terminate()
-                await asyncio.sleep(1)
+                # Wait for process to end gracefully
                for _ in range(10):  # 10 attempts, 100ms each
                    if self.browser_process.poll() is not None:
                        break
                    await asyncio.sleep(0.1)
                # Force kill if still running
                if self.browser_process.poll() is None:
                    self.browser_process.kill()
                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect
            except Exception as e:
-                print(f"Error terminating browser: {e}")
+                self.logger.error(
                    message="Error terminating browser: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )
        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
-                print(f"Error removing temporary directory: {e}")
+                self.logger.error(
                    message="Error removing temporary directory: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )
 class AsyncCrawlerStrategy(ABC):
@@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC):
        pass
 class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
-    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
+    def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs):
        self.logger = logger
        self.use_cached_html = use_cached_html
        self.user_agent = kwargs.get(
            "user_agent",
@@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                self.managed_browser = ManagedBrowser(
                    browser_type=self.browser_type,
                    user_data_dir=self.user_data_dir,
-                    headless=self.headless
+                    headless=self.headless,
                    logger=self.logger
                )
                cdp_url = await self.managed_browser.start()
                self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
@@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                # Add extra args if provided
                if self.extra_args:
                    browser_args["args"].extend(self.extra_args)
                # Add downloads path if downloads are enabled
                if self.accept_downloads:
                    browser_args["downloads_path"] = self.downloads_path
                # Add proxy settings if a proxy is specified
                if self.proxy:
@@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            self.browser = None
        if self.managed_browser:
            await asyncio.sleep(0.5)
            await self.managed_browser.cleanup()
            self.managed_browser = None
@@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        }}
                    """)
                else:
-                    print(f"Warning: Could not access content frame for iframe {i}")
+                    # print(f"Warning: Could not access content frame for iframe {i}")
                    self.logger.warning(
                        message="Could not access content frame for iframe {index}",
                        tag="SCRAPE",
                        params={"index": i}
                    )                    
            except Exception as e:
-                print(f"Error processing iframe {i}: {str(e)}")
+                self.logger.error(
                    message="Error processing iframe {index}: {error}",
                    tag="ERROR",
                    params={"index": i, "error": str(e)}
                )                
                # print(f"Error processing iframe {i}: {str(e)}")
        # Return the page object
        return page  
@@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    context = await self.browser.new_context(
                        user_agent=self.user_agent,
                        viewport={"width": 1920, "height": 1080},
-                        proxy={"server": self.proxy} if self.proxy else None
+                        proxy={"server": self.proxy} if self.proxy else None,
                        accept_downloads=self.accept_downloads,
                    )
                    await context.set_extra_http_headers(self.headers)
@@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            suggested_filename = download.suggested_filename
            download_path = os.path.join(self.downloads_path, suggested_filename)
-            if self.verbose:
+            self.logger.info(
-                print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}")
+                message="Downloading {filename} to {path}",
                tag="FETCH",
                params={"filename": suggested_filename, "path": download_path}
            )
            start_time = time.perf_counter()
            await download.save_as(download_path)
            end_time = time.perf_counter()
            self._downloaded_files.append(download_path)
-            
+
-            if self.verbose:
+            self.logger.success(
-                print(f"[LOG] ✅ Downloaded {suggested_filename} successfully")
+                message="Downloaded {filename} successfully",
                tag="COMPLETE",
                params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"}
            )            
        except Exception as e:
-            if self.verbose:
+            self.logger.error(
-                print(f"[ERROR] Failed to handle download: {str(e)}")
+                message="Failed to handle download: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            # if self.verbose:
            #     print(f"[ERROR] Failed to handle download: {str(e)}")
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
@@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            await page.evaluate(remove_overlays_js)
            await page.wait_for_timeout(500)  # Wait for any animations to complete
        except Exception as e:
-            if self.verbose:
+            self.logger.warning(
-                print(f"Warning: Failed to remove overlay elements: {str(e)}")
+                message="Failed to remove overlay elements: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )            
            # if self.verbose:
            #     print(f"Warning: Failed to remove overlay elements: {str(e)}")
    async def take_screenshot(self, page: Page) -> str:
        """
@@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
-            print(error_message)
+            self.logger.error(
                message="Screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )
            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
@@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
-            print(error_message)
+            # print(error_message)
            self.logger.error(
                message="Screenshot failed: {error}",
                tag="ERROR",
                params={"error": error_message}
            )            
            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -12,10 +12,12 @@ import xxhash
 import aiofiles
 from .config import NEED_MIGRATION
 from .version_manager import VersionManager
 from .async_logger import AsyncLogger
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 base_directory = Path.home()
 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
@@ -28,15 +30,21 @@ class AsyncDatabaseManager:
        self.max_retries = max_retries
        self.connection_pool: Dict[int, aiosqlite.Connection] = {}
        self.pool_lock = asyncio.Lock()
        self.init_lock = asyncio.Lock()
        self.connection_semaphore = asyncio.Semaphore(pool_size)
        self._initialized = False  
        self.version_manager = VersionManager()
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
            verbose=False,
            tag_width=10
        )
    async def initialize(self):
        """Initialize the database and connection pool"""
        try:
-            logger.info("Initializing database...")
+            self.logger.info("Initializing database", tag="INIT")
            # Ensure the database file exists
            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
@@ -47,31 +55,39 @@ class AsyncDatabaseManager:
            await self.ainit_db()
            # Verify the table exists
-            async def verify_table(db):
+            async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
                async with db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
                ) as cursor:
                    result = await cursor.fetchone()
                    if not result:
                        raise Exception("crawled_data table was not created")
            await self.execute_with_retry(verify_table)
            # If version changed or fresh install, run updates
            if needs_update:
-                logger.info("New version detected, running updates...")
+                self.logger.info("New version detected, running updates", tag="INIT")
                await self.update_db_schema()
                from .migrations import run_migration  # Import here to avoid circular imports
                await run_migration()
                self.version_manager.update_version()  # Update stored version after successful migration
-                logger.info("Version update completed successfully")
+                self.logger.success("Version update completed successfully", tag="COMPLETE")
            else:
-                logger.info("Database initialization completed successfully")
+                self.logger.success("Database initialization completed successfully", tag="COMPLETE")
        except Exception as e:
-            logger.error(f"Database initialization error: {e}")
+            self.logger.error(
-            logger.info("Database will be initialized on first use")
+                message="Database initialization error: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            self.logger.info(
                message="Database will be initialized on first use",
                tag="INIT"
            )
            raise
    async def cleanup(self):
        """Cleanup connections when shutting down"""
@@ -84,34 +100,41 @@ class AsyncDatabaseManager:
    async def get_connection(self):
        """Connection pool manager"""
        if not self._initialized:
-            async with self.pool_lock:  # Prevent multiple simultaneous initializations
+            # Use an asyncio.Lock to ensure only one initialization occurs
-                if not self._initialized:  # Double-check after acquiring lock
+            async with self.init_lock:
                if not self._initialized:
                    await self.initialize()
                    self._initialized = True
-        async with self.connection_semaphore:
+        await self.connection_semaphore.acquire()
-            task_id = id(asyncio.current_task())
+        task_id = id(asyncio.current_task())
-            try:
+        try:
-                async with self.pool_lock:
+            async with self.pool_lock:
-                    if task_id not in self.connection_pool:
+                if task_id not in self.connection_pool:
-                        conn = await aiosqlite.connect(
+                    conn = await aiosqlite.connect(
-                            self.db_path,
+                        self.db_path,
-                            timeout=30.0
+                        timeout=30.0
-                        )
+                    )
-                        await conn.execute('PRAGMA journal_mode = WAL')
+                    await conn.execute('PRAGMA journal_mode = WAL')
-                        await conn.execute('PRAGMA busy_timeout = 5000')
+                    await conn.execute('PRAGMA busy_timeout = 5000')
-                        self.connection_pool[task_id] = conn
+                    self.connection_pool[task_id] = conn
-                    
+
-                yield self.connection_pool[task_id]
+            yield self.connection_pool[task_id]
-                
+
-            except Exception as e:
+        except Exception as e:
-                logger.error(f"Connection error: {e}")
+            self.logger.error(
-                raise
+                message="Connection error: {error}",
-            finally:
+                tag="ERROR",
-                async with self.pool_lock:
+                force_verbose=True,
-                    if task_id in self.connection_pool:
+                params={"error": str(e)}
-                        await self.connection_pool[task_id].close()
+            )
-                        del self.connection_pool[task_id]
+            raise
        finally:
            async with self.pool_lock:
                if task_id in self.connection_pool:
                    await self.connection_pool[task_id].close()
                    del self.connection_pool[task_id]
            self.connection_semaphore.release()
    async def execute_with_retry(self, operation, *args):
@@ -124,13 +147,21 @@ class AsyncDatabaseManager:
                    return result
            except Exception as e:
                if attempt == self.max_retries - 1:
-                    logger.error(f"Operation failed after {self.max_retries} attempts: {e}")
+                    self.logger.error(
                        message="Operation failed after {retries} attempts: {error}",
                        tag="ERROR",
                        force_verbose=True,
                        params={
                            "retries": self.max_retries,
                            "error": str(e)
                        }
                    )                    
                    raise
                await asyncio.sleep(1 * (attempt + 1))  # Linear backoff: wait 1s, 2s, 3s, ... between attempts
    async def ainit_db(self):
        """Initialize database schema"""
-        async def _init(db):
+        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            await db.execute('''
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
@@ -147,36 +178,37 @@ class AsyncDatabaseManager:
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
            ''')
-        
+            await db.commit()
-        await self.execute_with_retry(_init)
+
    async def update_db_schema(self):
        """Update database schema if needed"""
-        async def _check_columns(db):
+        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            cursor = await db.execute("PRAGMA table_info(crawled_data)")
            columns = await cursor.fetchall()
-            return [column[1] for column in columns]
+            column_names = [column[1] for column in columns]
            # List of new columns to add
            new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
            for column in new_columns:
                if column not in column_names:
                    await self.aalter_db_add_column(column, db)
            await db.commit()
-        column_names = await self.execute_with_retry(_check_columns)
+    async def aalter_db_add_column(self, new_column: str, db):
        # List of new columns to add
        new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
        for column in new_columns:
            if column not in column_names:
                await self.aalter_db_add_column(column)
    async def aalter_db_add_column(self, new_column: str):
        """Add new column to the database"""
-        async def _alter(db):
+        if new_column == 'response_headers':
-            if new_column == 'response_headers':
+            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
-                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
+        else:
-            else:
+            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
-                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+        self.logger.info(
-            logger.info(f"Added column '{new_column}' to the database.")
+            message="Added column '{column}' to the database",
            tag="INIT",
            params={"column": new_column}
        )        
        await self.execute_with_retry(_alter)
    async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
        """Retrieve cached URL data as CrawlResult"""
@@ -235,7 +267,12 @@ class AsyncDatabaseManager:
        try:
            return await self.execute_with_retry(_get)
        except Exception as e:
-            logger.error(f"Error retrieving cached URL: {e}")
+            self.logger.error(
                message="Error retrieving cached URL: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
            return None
    async def acache_url(self, result: CrawlResult):
@@ -291,7 +328,13 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_cache)
        except Exception as e:
-            logger.error(f"Error caching URL: {e}")
+            self.logger.error(
                message="Error caching URL: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
    async def aget_total_count(self) -> int:
        """Get total number of cached URLs"""
@@ -303,7 +346,12 @@ class AsyncDatabaseManager:
        try:
            return await self.execute_with_retry(_count)
        except Exception as e:
-            logger.error(f"Error getting total count: {e}")
+            self.logger.error(
                message="Error getting total count: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
            return 0
    async def aclear_db(self):
@@ -314,7 +362,12 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_clear)
        except Exception as e:
-            logger.error(f"Error clearing database: {e}")
+            self.logger.error(
                message="Error clearing database: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
    async def aflush_db(self):
        """Drop the entire table"""
@@ -324,7 +377,12 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_flush)
        except Exception as e:
-            logger.error(f"Error flushing database: {e}")
+            self.logger.error(
                message="Error flushing database: {error}",
                tag="ERROR",
                force_verbose=True,
                params={"error": str(e)}
            )
    async def _store_content(self, content: str, content_type: str) -> str:
@@ -352,7 +410,12 @@ class AsyncDatabaseManager:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                return await f.read()
        except:
-            logger.error(f"Failed to load content: {file_path}")
+            self.logger.error(
                message="Failed to load content: {file_path}",
                tag="ERROR",
                force_verbose=True,
                params={"file_path": file_path}
            )
            return None
 # Create a singleton instance
--- a/crawl4ai/async_logger.py
+++ b/crawl4ai/async_logger.py
@@ -0,0 +1,231 @@
 import os
 import re
 import string
 import time
 from datetime import datetime
 from enum import Enum
 from typing import Optional, Dict, Any, Union

 from colorama import Fore, Back, Style, init
 class LogLevel(Enum):
    """Severity levels for log messages; a larger value means a more severe level."""
    # Members are numbered 1..5 in ascending order of severity.
    DEBUG, INFO, SUCCESS, WARNING, ERROR = range(1, 6)
 class AsyncLogger:
    """
    Logger with colored console output and optional file logging.

    Messages are ``str.format`` templates filled in from ``params``; individual
    parameters can be highlighted with their own color. Despite the name, every
    method here is synchronous: console output goes through ``print`` and file
    output through a blocking ``open``/``write`` per message.
    """

    DEFAULT_ICONS = {
        'INIT': '→',
        'READY': '✓',
        'FETCH': '↓',
        'SCRAPE': '◆',
        'EXTRACT': '■',
        'COMPLETE': '●',
        'ERROR': '×',
        'DEBUG': '⋯',
        'INFO': 'ℹ',
        'WARNING': '⚠',
    }

    DEFAULT_COLORS = {
        LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
        LogLevel.INFO: Fore.CYAN,
        LogLevel.SUCCESS: Fore.GREEN,
        LogLevel.WARNING: Fore.YELLOW,
        LogLevel.ERROR: Fore.RED,
    }

    # Matches ANSI SGR (color/style) escape sequences, e.g. ESC[31m or ESC[0m.
    _ANSI_ESCAPE = re.compile(r"\x1b\[[0-9;]*m")

    # Used only to split message templates into literal text and fields.
    _FORMATTER = string.Formatter()

    def __init__(
        self,
        log_file: Optional[str] = None,
        log_level: LogLevel = LogLevel.INFO,
        tag_width: int = 10,
        icons: Optional[Dict[str, str]] = None,
        colors: Optional[Dict[LogLevel, str]] = None,
        verbose: bool = True
    ):
        """
        Initialize the logger.

        Args:
            log_file: Optional file path for logging
            log_level: Minimum log level to display
            tag_width: Width for tag formatting
            icons: Custom icons for different tags
            colors: Custom colors for different log levels
            verbose: Whether to output to console
        """
        init()  # Initialize colorama
        self.log_file = log_file
        self.log_level = log_level
        self.tag_width = tag_width
        self.icons = icons or self.DEFAULT_ICONS
        self.colors = colors or self.DEFAULT_COLORS
        self.verbose = verbose

        # Create log file directory if needed
        if log_file:
            os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)

    def _format_tag(self, tag: str) -> str:
        """Return ``[tag]`` right-padded with dots to ``tag_width`` characters."""
        return f"[{tag}]".ljust(self.tag_width, ".")

    def _get_icon(self, tag: str) -> str:
        """
        Return the icon for ``tag``, falling back to the info icon.

        A custom ``icons`` mapping is not required to define ``'INFO'``; the
        built-in info icon is used when it does not.
        """
        fallback = self.icons.get('INFO', self.DEFAULT_ICONS['INFO'])
        return self.icons.get(tag, fallback)

    def _write_to_file(self, message: str):
        """Append ``message`` to the log file (if configured), timestamped and without ANSI codes."""
        if not self.log_file:
            return
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        # Strip every SGR sequence, not just the Fore palette, so custom
        # Back/Style colors never end up in the file.
        clean_message = self._ANSI_ESCAPE.sub('', message)
        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(f"[{timestamp}] {clean_message}\n")

    def _render_template(
        self,
        message: str,
        params: Dict[str, Any],
        colors: Optional[Dict[str, str]] = None,
    ) -> str:
        """
        Fill ``message`` from ``params``, coloring the fields named in ``colors``.

        Each replacement field is formatted on its own, so the color wraps the
        text exactly as it is rendered (after any format spec or conversion)
        and only at the position of that field.

        Raises:
            KeyError: A field refers to a name missing from ``params``.
            IndexError: The template uses positional fields such as ``{}``.
            ValueError: The template is malformed.
        """
        pieces = []
        for literal, field_name, format_spec, conversion in self._FORMATTER.parse(message):
            pieces.append(literal)
            if field_name is None:
                # Trailing literal text with no field after it.
                continue
            # Rebuild the single field and let str.format resolve it, which
            # also handles nested specs such as "{url:.{url_length}}".
            field = "{" + field_name
            if conversion:
                field += "!" + conversion
            if format_spec:
                field += ":" + format_spec
            field += "}"
            rendered = field.format(**params)
            # "obj.attr" / "seq[0]" style fields are colored by their root name.
            root_key = re.split(r"[.\[]", field_name, maxsplit=1)[0]
            highlight = colors.get(root_key) if colors else None
            if highlight:
                rendered = f"{highlight}{rendered}{Style.RESET_ALL}"
            pieces.append(rendered)
        return "".join(pieces)

    def _log(
        self,
        level: LogLevel,
        message: str,
        tag: str,
        params: Optional[Dict[str, Any]] = None,
        colors: Optional[Dict[str, str]] = None,
        base_color: Optional[str] = None,
        **kwargs
    ):
        """
        Core logging method that handles message formatting and output.

        Messages below ``self.log_level`` are dropped. A template that cannot
        be formatted never raises to the caller; a "LOGGING ERROR" line is
        emitted at ERROR level instead.

        Args:
            level: Log level for this message
            message: Message template string
            tag: Tag for the message
            params: Parameters to format into the message
            colors: Color overrides for specific parameters
            base_color: Base color for the entire message
            **kwargs: ``force_verbose=True`` prints to the console even when
                the logger was created with ``verbose=False``.
        """
        if level.value < self.log_level.value:
            return

        # Format the message with parameters if provided
        if params:
            try:
                formatted_message = self._render_template(message, params, colors)
            except KeyError as e:
                formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
                level = LogLevel.ERROR
            except (IndexError, ValueError) as e:
                formatted_message = f"LOGGING ERROR: Invalid message template ({e})"
                level = LogLevel.ERROR
        else:
            formatted_message = message

        # Construct the full log line; a custom color map may omit levels.
        color = base_color or self.colors.get(level, self.DEFAULT_COLORS[level])
        log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"

        # Output to console if verbose
        if self.verbose or kwargs.get("force_verbose", False):
            print(log_line)

        # Write to file if configured
        self._write_to_file(log_line)

    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
        """Log a debug message."""
        self._log(LogLevel.DEBUG, message, tag, **kwargs)

    def info(self, message: str, tag: str = "INFO", **kwargs):
        """Log an info message."""
        self._log(LogLevel.INFO, message, tag, **kwargs)

    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
        """Log a success message."""
        self._log(LogLevel.SUCCESS, message, tag, **kwargs)

    def warning(self, message: str, tag: str = "WARNING", **kwargs):
        """Log a warning message."""
        self._log(LogLevel.WARNING, message, tag, **kwargs)

    def error(self, message: str, tag: str = "ERROR", **kwargs):
        """Log an error message."""
        self._log(LogLevel.ERROR, message, tag, **kwargs)

    def url_status(
        self,
        url: str,
        success: bool,
        timing: float,
        tag: str = "FETCH",
        url_length: int = 50
    ):
        """
        Convenience method for logging URL fetch status.

        Logged at SUCCESS level when ``success`` is true, otherwise at ERROR.
        The URL is cut to ``url_length`` characters and always followed by a
        literal "...".

        Args:
            url: The URL being processed
            success: Whether the operation was successful
            timing: Time taken for the operation
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
        self._log(
            level=LogLevel.SUCCESS if success else LogLevel.ERROR,
            message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
            tag=tag,
            params={
                "url": url,
                "url_length": url_length,
                "status": success,
                "timing": timing
            },
            colors={
                "status": Fore.GREEN if success else Fore.RED,
                "timing": Fore.YELLOW
            }
        )

    def error_status(
        self,
        url: str,
        error: str,
        tag: str = "ERROR",
        url_length: int = 50
    ):
        """
        Convenience method for logging error status.

        The URL is cut to ``url_length`` characters and always followed by a
        literal "...".

        Args:
            url: The URL being processed
            error: Error message
            tag: Tag for the message
            url_length: Maximum length for URL in log
        """
        self._log(
            level=LogLevel.ERROR,
            message="{url:.{url_length}}... | Error: {error}",
            tag=tag,
            params={
                "url": url,
                "url_length": url_length,
                "error": error
            }
        )
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -15,6 +15,7 @@ from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
 from .content_scrapping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .config import (
    MIN_WORD_THRESHOLD, 
@@ -74,19 +75,29 @@ class AsyncWebCrawler:
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
        """
-        init()
+        # init()
-        self.log_width = 10  # Width of "[COMPLETE]" 
+        # self.log_width = 10  # Width of "[COMPLETE]" 
-        self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
+        # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
-        self.log_icons = {
+        # self.log_icons = {
-            'INIT': '→',      # Alternative: '▶' or '►'
+        #     'INIT': '→',      # Alternative: '▶' or '►'
-            'READY': '✓',     # Alternative: '√'
+        #     'READY': '✓',     # Alternative: '√'
-            'FETCH': '↓',     # Alternative: '▼'
+        #     'FETCH': '↓',     # Alternative: '▼'
-            'SCRAPE': '◆',    # Alternative: '♦'
+        #     'SCRAPE': '◆',    # Alternative: '♦'
-            'EXTRACT': '■',    # Alternative: '□'
+        #     'EXTRACT': '■',    # Alternative: '□'
-            'COMPLETE': '●',   # Alternative: '○'
+        #     'COMPLETE': '●',   # Alternative: '○'
-            'ERROR': '×' 
+        #     'ERROR': '×' 
-        }        
+        # }        
-        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs)
+        self.verbose = kwargs.get("verbose", False)
        self.logger = AsyncLogger(
            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
            verbose=self.verbose,
            tag_width=10
        )
        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
            logger = self.logger,
            **kwargs
        )
        # Handle deprecated parameter
        if always_by_pass_cache is not None:
@@ -118,12 +129,13 @@ class AsyncWebCrawler:
    async def awarmup(self):
        """Initialize the crawler with warm-up sequence."""
-        if self.verbose:
+        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
-            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
+        # if self.verbose:
-            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
+        #     print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
        #     print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
        self.ready = True
-        if self.verbose:
+        # if self.verbose:
-            print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")
+        #     print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")
    async def arun(
        self,
@@ -234,8 +246,14 @@ class AsyncWebCrawler:
                    screenshot_data = cached_result.screenshot
                    if not screenshot_data:
                        cached_result = None
-                if verbose:
+                # if verbose:
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
+                #     print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
                self.logger.url_status(
                        url=cache_context.display_url,
                        success=bool(html),
                        timing=time.perf_counter() - start_time,
                        tag="FETCH"
                    )                    
            # Fetch fresh content if needed
@@ -252,8 +270,14 @@ class AsyncWebCrawler:
                html = sanitize_input_encode(async_response.html)
                screenshot_data = async_response.screenshot
                t2 = time.perf_counter()
-                if verbose:
+                self.logger.url_status(
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
+                    url=cache_context.display_url,
                    success=bool(html),
                    timing=t2 - t1,
                    tag="FETCH"
                )
                # if verbose:
                #     print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
            # Process the HTML content
            crawl_result = await self.aprocess_html(
@@ -287,9 +311,21 @@ class AsyncWebCrawler:
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
-            if verbose:
+            # if verbose:
-                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
+            #     print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
-
+            self.logger.success(
                    message="{url:.50}... | Status: {status} | Total: {timing}",
                    tag="COMPLETE",
                    params={
                        "url": cache_context.display_url,
                        "status": crawl_result.success,
                        "timing": f"{time.perf_counter() - start_time:.2f}s"
                    },
                    colors={
                        "status": Fore.GREEN if crawl_result.success else Fore.RED,
                        "timing": Fore.YELLOW
                    }
                )
            # Update cache if appropriate
            if cache_context.should_write() and not bool(cached_result):
@@ -300,7 +336,12 @@ class AsyncWebCrawler:
        except Exception as e:
            if not hasattr(e, "msg"):
                e.msg = str(e)
-            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
+            # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
            self.logger.error_status(
                url=cache_context.display_url,
                error=e.msg,
                tag="ERROR"
            )            
            return CrawlResult(
                url=url, 
                html="", 
@@ -362,7 +403,12 @@ class AsyncWebCrawler:
            domain = urlparse(url).netloc
            current_time = time.time()
-            print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
+            # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
            self.logger.debug(
                message="Started task for {url:.50}...",
                tag="PARALLEL",
                params={"url": url}
            )            
            # Get delay settings from kwargs or use defaults
            mean_delay = kwargs.get('mean_delay', 0.1)  # 0.1 seconds default mean delay
@@ -394,12 +440,26 @@ class AsyncWebCrawler:
                )
        # Print start message
-        print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
+        # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
        self.logger.info(
            message="Starting concurrent crawling for {count} URLs...",
            tag="INIT",
            params={"count": len(urls)}
        )        
        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()
-        print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
+        # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
        self.logger.success(
            message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL,
            tag="COMPLETE",
            params={
                "count": len(urls),
                "timing": f"{end_time - start_time:.2f}s"
            },
            colors={"timing": Fore.YELLOW}
        )        
        return [result if not isinstance(result, Exception) else str(result) for result in results]
@@ -451,9 +511,16 @@ class AsyncWebCrawler:
        links = result.get("links", [])
        metadata = result.get("metadata", {})
-        if verbose:
+        # if verbose:
-            print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
+        #     print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
-
+        self.logger.info(
            message="Processed {url:.50}... | Time: {timing}ms",
            tag="SCRAPE",
            params={
                "url": _url,
                "timing": int((time.perf_counter() - t1) * 1000)
            }
        )
        if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
@@ -467,8 +534,17 @@ class AsyncWebCrawler:
                sections = chunking_strategy.chunk(markdown)
                extracted_content = extraction_strategy.run(url, sections)
                extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
-            if verbose:
+            # if verbose:
-                print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
+                # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
            self.logger.info(
                message="Completed for {url:.50}... | Time: {timing}s",
                tag="EXTRACT",
                params={
                    "url": _url,
                    "timing": time.perf_counter() - t1
                }
            )
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag
 from .utils import clean_tokens
 from abc import ABC, abstractmethod
 from snowballstemmer import stemmer
 # from nltk.stem import PorterStemmer
 # ps = PorterStemmer()
 class RelevantContentFilter(ABC):
    def __init__(self, user_query: str = None):
        self.user_query = user_query
@@ -252,7 +256,7 @@ class RelevantContentFilter(ABC):
            return str(tag)  # Fallback to original if anything fails
 class BM25ContentFilter(RelevantContentFilter):
-    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0):
+    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.priority_tags = {
@@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter):
            'pre': 1.5,
            'th': 1.5,  # Table headers
        }
        self.stemmer = stemmer(language)
    def filter_content(self, html: str) -> List[str]:
        """Implements content filtering using BM25 algorithm with priority tag handling"""
@@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter):
        if not candidates:
            return []
-        # Split into priority and regular candidates
+        # Tokenize corpus
-        priority_candidates = []
+        # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
-        regular_candidates = []
+        # tokenized_query = query.lower().split()
        # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] 
        #                 for _, chunk, _, _ in candidates]
        # tokenized_query = [ps.stem(word) for word in query.lower().split()]        
-        for index, chunk, tag_type, tag in candidates:
+        tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] 
-            if tag.name in self.priority_tags:
+                   for _, chunk, _, _ in candidates]
-                priority_candidates.append((index, chunk, tag_type, tag))
+        tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]
            else:
                regular_candidates.append((index, chunk, tag_type, tag))
        # Process regular content with BM25
        tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates]
        tokenized_query = query.lower().split()
        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
        tokenized_query = clean_tokens(tokenized_query)
-        
+
        bm25 = BM25Okapi(tokenized_corpus)
        scores = bm25.get_scores(tokenized_query)
-        # Score and boost regular candidates
+        # Adjust scores with tag weights
-        scored_candidates = [
+        adjusted_candidates = []
-            (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag)
+        for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
-            for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates)
+            tag_weight = self.priority_tags.get(tag.name, 1.0)
            adjusted_score = score * tag_weight
            adjusted_candidates.append((adjusted_score, index, chunk, tag))
        # Filter candidates by threshold
        selected_candidates = [
            (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
            if adjusted_score >= self.bm25_threshold
        ]
        scored_candidates.sort(key=lambda x: x[0], reverse=True)
        # Process scored candidates
        selected_tags = set()
        selected_candidates = []
        # First add all priority candidates
        for index, chunk, tag_type, tag in priority_candidates:
            tag_id = id(tag)
            if tag_id not in selected_tags:
                selected_candidates.append((index, chunk, tag))
                selected_tags.add(tag_id)
        # Then add scored regular candidates that meet threshold
        for score, index, chunk, tag_type, tag in scored_candidates:
            if score < self.bm25_threshold:
                continue
            tag_id = id(tag)
            if tag_id not in selected_tags:
                selected_candidates.append((index, chunk, tag))
                selected_tags.add(tag_id)
        if not selected_candidates:
            return []
-        # Sort by original document order
+        # Sort selected candidates by original document order
        selected_candidates.sort(key=lambda x: x[0])
        return [self.clean_element(tag) for _, _, tag in selected_candidates]
        return [self.clean_element(tag) for _, _, tag in selected_candidates]
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC):
        pass
 class WebScrapingStrategy(ContentScrapingStrategy):
    def __init__(self, logger=None):
        """Store the optional logger that ``_log`` dispatches to.

        Args:
            logger: Object exposing level-named methods (e.g. ``error``) that
                accept ``message`` and ``tag`` keyword arguments, or ``None``
                to disable logging from this strategy.
        """
        self.logger = logger
    def _log(self, level, message, tag="SCRAPE", **kwargs):
        """Forward a message to the configured logger, doing nothing when there is none.

        Args:
            level: Name of the logger method to call (for example ``'error'``)
            message: Message text or template passed as ``message=``
            tag: Tag passed as ``tag=``
            **kwargs: Extra keyword arguments passed through to the logger method
        """
        sink = self.logger
        if not sink:
            # No logger configured: logging is silently skipped.
            return
        emit = getattr(sink, level)
        emit(message=message, tag=tag, **kwargs)
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        """Scrape ``html`` for ``url`` via ``_get_content_of_website_optimized`` with ``is_async=False``.

        Args:
            url: URL the HTML belongs to
            html: HTML content to process
            **kwargs: Options forwarded unchanged to the underlying implementation

        Returns:
            Whatever ``_get_content_of_website_optimized`` returns for these arguments.
        """
        return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)
@@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        try:
            meta = extract_metadata("", soup)
        except Exception as e:
-            print('Error extracting metadata:', str(e))
+            self._log('error', 
                message="Error extracting metadata: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )            
            # print('Error extracting metadata:', str(e))
            meta = {}
@@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                try:
                    remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
                except Exception as e:
-                    print('Error removing unwanted attributes:', str(e))
+                    # print('Error removing unwanted attributes:', str(e))
-                
+                    self._log('error',
-
+                        message="Error removing unwanted attributes: {error}",
                        tag="SCRAPE",
                        params={"error": str(e)}
                    )
                # Process children
                for child in list(element.children):
                    if isinstance(child, NavigableString) and not isinstance(child, Comment):
@@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                return keep_element
            except Exception as e:
-                print('Error processing element:', str(e))
+                # print('Error processing element:', str(e))
                self._log('error',
                    message="Error processing element: {error}",
                    tag="SCRAPE",
                    params={"error": str(e)}
                )                
                return False
        process_element(body)
@@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            str_body = body.encode_contents().decode('utf-8')
            print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
-
+            self._log('error',
                message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
                tag="SCRAPE"
            )
        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')
@@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            h.update_params(**kwargs.get('html2text', {}))            
            markdown = h.handle(cleaned_html)
        except Exception as e:
            if not h:
                h = CustomHTML2Text()
            self._log('error',
                message="Error converting HTML to markdown: {error}",
                tag="SCRAPE",
                params={"error": str(e)}
            )
            markdown = h.handle(sanitize_html(cleaned_html))
        markdown = markdown.replace('    ```', '```')
--- a/crawl4ai/version_manager.py
+++ b/crawl4ai/version_manager.py
@@ -20,11 +20,11 @@ class VersionManager:
    def update_version(self):
        """Update the version file to current library version"""
-        self.version_file.write_text(__version__)
+        self.version_file.write_text(__version__.__version__)
    def needs_update(self):
        """Check if database needs update based on version"""
        installed = self.get_installed_version()
-        current = version.parse(__version__)
+        current = version.parse(__version__.__version__)
        return installed is None or installed < current
--- a/docker-compose.hub.yml
+++ b/docker-compose.hub.yml
@@ -0,0 +1,27 @@
 # Runs Crawl4AI from the pre-built Docker Hub image (no local build).
 # Usage: docker-compose -f docker-compose.hub.yml up -d
 services:
  crawl4ai:
    image: unclecode/crawl4ai:basic  # Pull image from Docker Hub
    ports:
      - "11235:11235"  # FastAPI server
      - "8000:8000"    # Alternative port
      - "9222:9222"    # Browser debugging
      - "8080:8080"    # Additional port
    environment:
      # ${VAR:-} forwards the host's value, or an empty string when the variable is unset.
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}          # Optional OpenAI API key
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}          # Optional Claude API key
    volumes:
      - /dev/shm:/dev/shm  # Shared memory for browser operations
    deploy:
      resources:
        # Memory ceiling and guaranteed minimum for the container.
        limits:
          memory: 4G
        reservations:
          memory: 1G
    restart: unless-stopped
    healthcheck:
      # Healthy when the /health endpoint on the API port answers successfully.
      # NOTE(review): assumes curl is installed in the image - confirm.
      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -0,0 +1,33 @@
 # Builds the Crawl4AI image from the local Dockerfile and runs it.
 # Usage: docker-compose -f docker-compose.local.yml up -d
 services:
  crawl4ai:
    build:
      context: .
      dockerfile: Dockerfile
      args:
        # Build args are strings. Quote them so YAML does not reinterpret them:
        # an unquoted 3.10 resolves to the float 3.1, and an unquoted false to a
        # boolean (the Dockerfile compares ENABLE_GPU against the string "true").
        PYTHON_VERSION: "3.10"
        INSTALL_TYPE: all
        ENABLE_GPU: "false"
    ports:
      - "11235:11235"  # FastAPI server
      - "8000:8000"    # Alternative port
      - "9222:9222"    # Browser debugging
      - "8080:8080"    # Additional port
    environment:
      # ${VAR:-} forwards the host's value, or an empty string when the variable is unset.
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}          # Optional OpenAI API key
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}          # Optional Claude API key
    volumes:
      - /dev/shm:/dev/shm  # Shared memory for browser operations
    deploy:
      resources:
        # Memory ceiling and guaranteed minimum for the container.
        limits:
          memory: 4G
        reservations:
          memory: 1G
    restart: unless-stopped
    healthcheck:
      # Healthy when the /health endpoint on the API port answers successfully.
      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,3 @@
 version: '3.8'
 services:
  crawl4ai:
    build:
@@ -9,15 +7,18 @@ services:
        PYTHON_VERSION: 3.10
        INSTALL_TYPE: all
        ENABLE_GPU: false
    profiles: ["local"]
    ports:
-      - "11235:11235"  # FastAPI server
+      - "11235:11235"
-      - "8000:8000"    # Alternative port
+      - "8000:8000"
-      - "9222:9222"    # Browser debugging
+      - "9222:9222"
-      - "8080:8080"    # Additional port
+      - "8080:8080"
    environment:
-      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
+      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
    volumes:
-      - /dev/shm:/dev/shm  # Shared memory for browser operations
+      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
@@ -30,4 +31,32 @@ services:
      interval: 30s
      timeout: 10s
      retries: 3
-      start_period: 40s
+      start_period: 40s
  crawl4ai-hub:
    image: unclecode/crawl4ai:basic
    profiles: ["hub"]
    ports:
      - "11235:11235"
      - "8000:8000"
      - "9222:9222"
      - "8080:8080"
    environment:
      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
    volumes:
      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
          memory: 4G
        reservations:
          memory: 1G
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,9 +1,16 @@
 import os, sys
 # append the parent directory to the sys.path
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
 parent_parent_dir = os.path.dirname(parent_dir)
 sys.path.append(parent_parent_dir)
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 __data__ = os.path.join(__location__, "__data")
 import asyncio
 import os
 from pathlib import Path
 import aiohttp
 import json
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.content_filter_strategy import BM25ContentFilter
 # 1. File Download Processing Example
@@ -32,7 +39,8 @@ async def download_example():
                console.log('No .exe download link found');
            }
            """,
-            wait_for=5  # Wait 5 seconds to ensure download starts
+            delay_before_return_html=1,  # Wait 1 second to ensure download starts
            cache_mode=CacheMode.BYPASS
        )
        if result.downloaded_files:
@@ -50,22 +58,32 @@ async def content_filtering_example():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create filter with custom query for OpenAI's blog
        content_filter = BM25ContentFilter(
-            user_query="AI language models research innovation",
+            # user_query="Investment and fundraising",
            # user_query="Robotic",
            bm25_threshold=1.0
        )
        result = await crawler.arun(
-            url="https://openai.com/blog",
+            url="https://techcrunch.com/",
-            content_filter=content_filter
+            content_filter=content_filter,
            cache_mode=CacheMode.BYPASS
        )
-        print(f"Filtered content: {result.extracted_content}")
+        print(f"Filtered content: {len(result.fit_markdown)}")
        print(f"Filtered content: {result.fit_markdown}")
        # Save html 
        with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
            f.write(result.fit_html)
        with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
            f.write(result.fit_markdown)
 # 3. Local File and Raw HTML Processing Example
 async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
-    sample_file = "sample.html"
+    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write("""
        <html><body>
@@ -112,21 +130,18 @@ async def browser_management_example():
        headless=False,
        verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://crawl4ai.com",
            # session_id="persistent_session_1",
            cache_mode=CacheMode.BYPASS
        )        
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
-            session_id="persistent_session_1",
+            # session_id="persistent_session_1",
-            js_code="""
+            cache_mode=CacheMode.BYPASS
            // Custom JavaScript to execute on GitHub's trending page
            const repos = document.querySelectorAll('article.Box-row');
            const data = Array.from(repos).map(repo => ({
                name: repo.querySelector('h2')?.textContent?.trim(),
                description: repo.querySelector('p')?.textContent?.trim(),
                language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim()
            }));
            console.log('Trending repositories:', JSON.stringify(data, null, 2));
            """
        )
        print("\nBrowser session result:", result.success)
@@ -136,6 +151,8 @@ async def browser_management_example():
 # 5. API Usage Example
 async def api_example():
    """Example of using the new API endpoints"""
    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
    headers = {'Authorization': f'Bearer {api_token}'}    
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
@@ -143,52 +160,78 @@ async def api_example():
            "extraction_config": {
                "type": "json_css",
                "params": {
-                    "selectors": {
+                    "schema": {
-                        "titles": ".title a",
+                        "name": "Hacker News Articles",
-                        "scores": ".score",
+                        "baseSelector": ".athing",
-                        "comments": ".comment-tree"
+                        "fields": [
                            {
                                "name": "title",
                                "selector": ".title a",
                                "type": "text"
                            },
                            {
                                "name": "score",
                                "selector": ".score",
                                "type": "text"
                            },
                            {
                                "name": "url",
                                "selector": ".title a",
                                "type": "attribute",
                                "attribute": "href"
                            }
                        ]
                    }
                }
            },
            "crawler_params": {
                "headless": True,
-                "use_managed_browser": True
+                # "use_managed_browser": True
            },
-            "screenshot": True,
+            "cache_mode": "bypass",
-            "magic": True
+            # "screenshot": True,
            # "magic": True
        }
        async with session.post(
            "http://localhost:11235/crawl",
-            json=crawl_request
+            json=crawl_request,
            headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]
            # Check task status
-            async with session.get(
+            while True:
-                f"http://localhost:11235/task/{task_id}"
+                async with session.get(
-            ) as status_response:
+                    f"http://localhost:11235/task/{task_id}",
-                result = await status_response.json()
+                    headers=headers
-                print(f"Task result: {result}")
+                ) as status_response:
                    result = await status_response.json()
                    print(f"Task result: {result}")
                    if result["status"] == "completed":
                        break
                    else:
                        await asyncio.sleep(1)
 # Main execution
 async def main():
-    print("Running Crawl4AI feature examples...")
+    # print("Running Crawl4AI feature examples...")
-    print("\n1. Running Download Example:")
+    # print("\n1. Running Download Example:")
    await download_example()
-    print("\n2. Running Content Filtering Example:")
+    # print("\n2. Running Content Filtering Example:")
    await content_filtering_example()
-    print("\n3. Running Local and Raw HTML Example:")
+    # print("\n3. Running Local and Raw HTML Example:")
    await local_and_raw_html_example()
-    print("\n4. Running Browser Management Example:")
+    # print("\n4. Running Browser Management Example:")
    await browser_management_example()
-    print("\n5. Running API Example:")
+    # print("\n5. Running API Example:")
    await api_example()
 if __name__ == "__main__":
--- a/docs/md_v2/basic/docker-deploymeny.md
+++ b/docs/md_v2/basic/docker-deploymeny.md
@@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic
 docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
 ```
 ## Running with Docker Compose 🐳
 ### Use Docker Compose (From Local Dockerfile or Docker Hub)
 Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
 ### **Option 1: Using Docker Compose to Build Locally**
 If you want to build the image locally, use the provided `docker-compose.local.yml` file.
 ```bash
 docker-compose -f docker-compose.local.yml up -d
 ```
 This will:
 1. Build the Docker image from the provided `Dockerfile`.
 2. Start the container and expose it on `http://localhost:11235`.
 ---
 ### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
 If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
 ```bash
 docker-compose -f docker-compose.hub.yml up -d
 ```
 This will:
 1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
 2. Start the container and expose it on `http://localhost:11235`.
 ---
 ### **Stopping the Running Services**
 To stop the services started via Docker Compose, you can use:
 ```bash
 docker-compose -f docker-compose.local.yml down
 # OR
 docker-compose -f docker-compose.hub.yml down
 ```
 If the containers don’t stop and the application is still running, check the running containers:
 ```bash
 docker ps
 ```
 Find the `CONTAINER ID` of the running service and stop it manually:
 ```bash
 docker stop <CONTAINER_ID>
 ```
 ---
 ### **Debugging with Docker Compose**
 - **Check Logs**: To view the container logs:
  ```bash
  docker-compose -f docker-compose.local.yml logs -f
  ```
 - **Remove Orphaned Containers**: If the service is still running unexpectedly:
  ```bash
  docker-compose -f docker-compose.local.yml down --remove-orphans
  ```
 - **Manually Remove Network**: If the network is still in use:
  ```bash
  docker network ls
  docker network rm crawl4ai_default
  ```
 ---
 ### Why Use Docker Compose?
 Docker Compose is the recommended way to deploy Crawl4AI because:
 1. It simplifies multi-container setups.
 2. It allows you to define environment variables, resources, and ports in a single file.
 3. It makes it easier to switch between local development and production-ready images.
 For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
 ## API Security 🔒
 ### Understanding CRAWL4AI_API_TOKEN
--- a/main.py
+++ b/main.py
@@ -26,6 +26,7 @@ from enum import Enum
 from dataclasses import dataclass
 import json
 from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
 from crawl4ai.config import MIN_WORD_THRESHOLD
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    CosineStrategy,
@@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel):
    type: CrawlerType
    params: Dict[str, Any] = {}
 class ChunkingStrategy(BaseModel):
    type: str
    params: Dict[str, Any] = {}
 class ContentFilter(BaseModel):
    type: str = "bm25"
    params: Dict[str, Any] = {}
 class CrawlRequest(BaseModel):
    urls: Union[HttpUrl, List[HttpUrl]]
    word_count_threshold: int = MIN_WORD_THRESHOLD
    extraction_config: Optional[ExtractionConfig] = None
-    crawler_params: Dict[str, Any] = {}
+    chunking_strategy: Optional[ChunkingStrategy] = None
-    priority: int = Field(default=5, ge=1, le=10)
+    content_filter: Optional[ContentFilter] = None
    ttl: Optional[int] = 3600
    js_code: Optional[List[str]] = None
    wait_for: Optional[str] = None
    css_selector: Optional[str] = None
@@ -66,7 +75,10 @@ class CrawlRequest(BaseModel):
    magic: bool = False
    extra: Optional[Dict[str, Any]] = {}
    session_id: Optional[str] = None
-    cache_mode: Optional[CacheMode] = None
+    cache_mode: Optional[CacheMode] = CacheMode.ENABLED
    priority: int = Field(default=5, ge=1, le=10)
    ttl: Optional[int] = 3600    
    crawler_params: Dict[str, Any] = {}
@dataclass
 class TaskInfo:
@@ -280,6 +292,7 @@ class CrawlerService:
                    if isinstance(request.urls, list):
                        results = await crawler.arun_many(
                            urls=[str(url) for url in request.urls],
                            word_count_threshold=MIN_WORD_THRESHOLD,
                            extraction_strategy=extraction_strategy,
                            js_code=request.js_code,
                            wait_for=request.wait_for,
@@ -287,6 +300,7 @@ class CrawlerService:
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
                            cache_mode=request.cache_mode,
                            **request.extra,
                        )
                    else:
@@ -299,6 +313,7 @@ class CrawlerService:
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
                            cache_mode=request.cache_mode,
                            **request.extra,
                        )
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0
 xxhash~=3.4
 rank-bm25~=0.2
 aiofiles~=24.0
-colorama~=0.4
+colorama~=0.4
 snowballstemmer~=2.2