feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile

feat(requirements): update requirements.txt to include snowballstemmer fix(version_manager): correct version parsing to use __version__.__version__ feat(main): introduce chunking strategy and content filter in CrawlRequest model feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance feat(logger): implement new async logger engine replacing print statements throughout library fix(database): resolve version-related deadlock and circular lock issues in database operations docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose
2024-11-18 21:00:06 +08:00
parent 152ac35bc2
commit 852729ff38
15 changed files with 952 additions and 232 deletions
--- a/12
+++ b/12
@@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*

-# GPU support if enabled
-RUN if [ "$ENABLE_GPU" = "true" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
+# GPU support if enabled and architecture is supported
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \
+        apt-get update && apt-get install -y --no-install-recommends \
+        nvidia-cuda-toolkit \
+        && rm -rf /var/lib/apt/lists/* ; \
+    else \
+        echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \
    fi

 # Create and set working directory
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -35,13 +35,15 @@ stealth_config = StealthConfig(


 class ManagedBrowser:
-    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False):
+    def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None):
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = 9222
+        self.logger = logger
+        self.shutting_down = False

    async def start(self) -> str:
        """
@@ -76,15 +78,38 @@ class ManagedBrowser:
    async def _monitor_browser_process(self):
        """Monitor the browser process for unexpected termination."""
        if self.browser_process:
-            stdout, stderr = await asyncio.gather(
-                asyncio.to_thread(self.browser_process.stdout.read),
-                asyncio.to_thread(self.browser_process.stderr.read)
-            )
-            if self.browser_process.poll() is not None:
-                print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}")
-                print(f"STDOUT: {stdout.decode()}")
-                print(f"STDERR: {stderr.decode()}")
-                await self.cleanup()
+            try:
+                stdout, stderr = await asyncio.gather(
+                    asyncio.to_thread(self.browser_process.stdout.read),
+                    asyncio.to_thread(self.browser_process.stderr.read)
+                )
+                
+                # Check shutting_down flag BEFORE logging anything
+                if self.browser_process.poll() is not None:
+                    if not self.shutting_down:
+                        self.logger.error(
+                            message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
+                            tag="ERROR",
+                            params={
+                                "code": self.browser_process.returncode,
+                                "stdout": stdout.decode(),
+                                "stderr": stderr.decode()
+                            }
+                        )                
+                        await self.cleanup()
+                    else:
+                        self.logger.info(
+                            message="Browser process terminated normally | Code: {code}",
+                            tag="INFO",
+                            params={"code": self.browser_process.returncode}
+                        )
+            except Exception as e:
+                if not self.shutting_down:
+                    self.logger.error(
+                        message="Error monitoring browser process: {error}",
+                        tag="ERROR",
+                        params={"error": str(e)}
+                    )

    def _get_browser_path(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
@@ -134,20 +159,39 @@ class ManagedBrowser:

    async def cleanup(self):
        """Cleanup browser process and temporary directory"""
+        # Set shutting_down flag BEFORE any termination actions
+        self.shutting_down = True
+        
        if self.browser_process:
            try:
                self.browser_process.terminate()
-                await asyncio.sleep(1)
+                # Wait for process to end gracefully
+                for _ in range(10):  # 10 attempts, 100ms each
+                    if self.browser_process.poll() is not None:
+                        break
+                    await asyncio.sleep(0.1)
+                
+                # Force kill if still running
                if self.browser_process.poll() is None:
                    self.browser_process.kill()
+                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect
+                    
            except Exception as e:
-                print(f"Error terminating browser: {e}")
+                self.logger.error(
+                    message="Error terminating browser: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)}
+                )

        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
-                print(f"Error removing temporary directory: {e}")
+                self.logger.error(
+                    message="Error removing temporary directory: {error}",
+                    tag="ERROR",
+                    params={"error": str(e)}
+                )


 class AsyncCrawlerStrategy(ABC):
@@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC):
        pass

 class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
-    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
+    def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs):
+        self.logger = logger
        self.use_cached_html = use_cached_html
        self.user_agent = kwargs.get(
            "user_agent",
@@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                self.managed_browser = ManagedBrowser(
                    browser_type=self.browser_type,
                    user_data_dir=self.user_data_dir,
-                    headless=self.headless
+                    headless=self.headless,
+                    logger=self.logger
                )
                cdp_url = await self.managed_browser.start()
                self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
@@ -283,6 +329,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                if self.extra_args:
                    browser_args["args"].extend(self.extra_args)
                    
+                # Add downloads path if downloads are enabled
+                if self.accept_downloads:
+                    browser_args["downloads_path"] = self.downloads_path
+                
                # Add proxy settings if a proxy is specified
                if self.proxy:
                    proxy_settings = ProxySettings(server=self.proxy)
@@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            self.browser = None
            
        if self.managed_browser:
+            await asyncio.sleep(0.5)
            await self.managed_browser.cleanup()
            self.managed_browser = None
            
@@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        }}
                    """)
                else:
-                    print(f"Warning: Could not access content frame for iframe {i}")
+                    # print(f"Warning: Could not access content frame for iframe {i}")
+                    self.logger.warning(
+                        message="Could not access content frame for iframe {index}",
+                        tag="SCRAPE",
+                        params={"index": i}
+                    )                    
            except Exception as e:
-                print(f"Error processing iframe {i}: {str(e)}")
+                self.logger.error(
+                    message="Error processing iframe {index}: {error}",
+                    tag="ERROR",
+                    params={"index": i, "error": str(e)}
+                )                
+                # print(f"Error processing iframe {i}: {str(e)}")

        # Return the page object
        return page  
@@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    context = await self.browser.new_context(
                        user_agent=self.user_agent,
                        viewport={"width": 1920, "height": 1080},
-                        proxy={"server": self.proxy} if self.proxy else None
+                        proxy={"server": self.proxy} if self.proxy else None,
+                        accept_downloads=self.accept_downloads,
                    )
                    await context.set_extra_http_headers(self.headers)
                
@@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            suggested_filename = download.suggested_filename
            download_path = os.path.join(self.downloads_path, suggested_filename)
            
-            if self.verbose:
-                print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}")
+            self.logger.info(
+                message="Downloading {filename} to {path}",
+                tag="FETCH",
+                params={"filename": suggested_filename, "path": download_path}
+            )
                
+            start_time = time.perf_counter()
            await download.save_as(download_path)
+            end_time = time.perf_counter()
            self._downloaded_files.append(download_path)

-            if self.verbose:
-                print(f"[LOG] ✅ Downloaded {suggested_filename} successfully")
+            self.logger.success(
+                message="Downloaded {filename} successfully",
+                tag="COMPLETE",
+                params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"}
+            )            
        except Exception as e:
-            if self.verbose:
-                print(f"[ERROR] Failed to handle download: {str(e)}")
+            self.logger.error(
+                message="Failed to handle download: {error}",
+                tag="ERROR",
+                params={"error": str(e)}
+            )
+            
+            # if self.verbose:
+            #     print(f"[ERROR] Failed to handle download: {str(e)}")
    
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
@@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            await page.evaluate(remove_overlays_js)
            await page.wait_for_timeout(500)  # Wait for any animations to complete
        except Exception as e:
-            if self.verbose:
-                print(f"Warning: Failed to remove overlay elements: {str(e)}")
+            self.logger.warning(
+                message="Failed to remove overlay elements: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)}
+            )            
+            # if self.verbose:
+            #     print(f"Warning: Failed to remove overlay elements: {str(e)}")

    async def take_screenshot(self, page: Page) -> str:
        """
@@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
-            print(error_message)
+            self.logger.error(
+                message="Screenshot failed: {error}",
+                tag="ERROR",
+                params={"error": error_message}
+            )
+            

            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
@@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            return base64.b64encode(screenshot).decode('utf-8')
        except Exception as e:
            error_message = f"Failed to take screenshot: {str(e)}"
-            print(error_message)
+            # print(error_message)
+            self.logger.error(
+                message="Screenshot failed: {error}",
+                tag="ERROR",
+                params={"error": error_message}
+            )            

            # Generate an error image
            img = Image.new('RGB', (800, 600), color='black')
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -12,10 +12,12 @@ import xxhash
 import aiofiles
 from .config import NEED_MIGRATION
 from .version_manager import VersionManager
+from .async_logger import AsyncLogger
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+base_directory = Path.home()
 DB_PATH = os.path.join(Path.home(), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
@@ -28,15 +30,21 @@ class AsyncDatabaseManager:
        self.max_retries = max_retries
        self.connection_pool: Dict[int, aiosqlite.Connection] = {}
        self.pool_lock = asyncio.Lock()
+        self.init_lock = asyncio.Lock()
        self.connection_semaphore = asyncio.Semaphore(pool_size)
        self._initialized = False  
        self.version_manager = VersionManager()
+        self.logger = AsyncLogger(
+            log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
+            verbose=False,
+            tag_width=10
+        )
        
        
    async def initialize(self):
        """Initialize the database and connection pool"""
        try:
-            logger.info("Initializing database...")
+            self.logger.info("Initializing database", tag="INIT")
            # Ensure the database file exists
            os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
            
@@ -47,7 +55,7 @@ class AsyncDatabaseManager:
            await self.ainit_db()
            
            # Verify the table exists
-            async def verify_table(db):
+            async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
                async with db.execute(
                    "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
                ) as cursor:
@@ -55,24 +63,32 @@ class AsyncDatabaseManager:
                    if not result:
                        raise Exception("crawled_data table was not created")
            
-            await self.execute_with_retry(verify_table)
-            
            # If version changed or fresh install, run updates
            if needs_update:
-                logger.info("New version detected, running updates...")
+                self.logger.info("New version detected, running updates", tag="INIT")
                await self.update_db_schema()
                from .migrations import run_migration  # Import here to avoid circular imports
                await run_migration()
                self.version_manager.update_version()  # Update stored version after successful migration
-                logger.info("Version update completed successfully")
+                self.logger.success("Version update completed successfully", tag="COMPLETE")
            else:
-                logger.info("Database initialization completed successfully")
+                self.logger.success("Database initialization completed successfully", tag="COMPLETE")
+
                
        except Exception as e:
-            logger.error(f"Database initialization error: {e}")
-            logger.info("Database will be initialized on first use")
+            self.logger.error(
+                message="Database initialization error: {error}",
+                tag="ERROR",
+                params={"error": str(e)}
+            )
+            self.logger.info(
+                message="Database will be initialized on first use",
+                tag="INIT"
+            )
+                        
            raise

+            
    async def cleanup(self):
        """Cleanup connections when shutting down"""
        async with self.pool_lock:
@@ -84,34 +100,41 @@ class AsyncDatabaseManager:
    async def get_connection(self):
        """Connection pool manager"""
        if not self._initialized:
-            async with self.pool_lock:  # Prevent multiple simultaneous initializations
-                if not self._initialized:  # Double-check after acquiring lock
+            # Use an asyncio.Lock to ensure only one initialization occurs
+            async with self.init_lock:
+                if not self._initialized:
                    await self.initialize()
                    self._initialized = True

-        async with self.connection_semaphore:
-            task_id = id(asyncio.current_task())
-            try:
-                async with self.pool_lock:
-                    if task_id not in self.connection_pool:
-                        conn = await aiosqlite.connect(
-                            self.db_path,
-                            timeout=30.0
-                        )
-                        await conn.execute('PRAGMA journal_mode = WAL')
-                        await conn.execute('PRAGMA busy_timeout = 5000')
-                        self.connection_pool[task_id] = conn
+        await self.connection_semaphore.acquire()
+        task_id = id(asyncio.current_task())
+        try:
+            async with self.pool_lock:
+                if task_id not in self.connection_pool:
+                    conn = await aiosqlite.connect(
+                        self.db_path,
+                        timeout=30.0
+                    )
+                    await conn.execute('PRAGMA journal_mode = WAL')
+                    await conn.execute('PRAGMA busy_timeout = 5000')
+                    self.connection_pool[task_id] = conn

-                yield self.connection_pool[task_id]
+            yield self.connection_pool[task_id]

-            except Exception as e:
-                logger.error(f"Connection error: {e}")
-                raise
-            finally:
-                async with self.pool_lock:
-                    if task_id in self.connection_pool:
-                        await self.connection_pool[task_id].close()
-                        del self.connection_pool[task_id]
+        except Exception as e:
+            self.logger.error(
+                message="Connection error: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )
+            raise
+        finally:
+            async with self.pool_lock:
+                if task_id in self.connection_pool:
+                    await self.connection_pool[task_id].close()
+                    del self.connection_pool[task_id]
+            self.connection_semaphore.release()


    async def execute_with_retry(self, operation, *args):
@@ -124,13 +147,21 @@ class AsyncDatabaseManager:
                    return result
            except Exception as e:
                if attempt == self.max_retries - 1:
-                    logger.error(f"Operation failed after {self.max_retries} attempts: {e}")
+                    self.logger.error(
+                        message="Operation failed after {retries} attempts: {error}",
+                        tag="ERROR",
+                        force_verbose=True,
+                        params={
+                            "retries": self.max_retries,
+                            "error": str(e)
+                        }
+                    )                    
                    raise
                await asyncio.sleep(1 * (attempt + 1))  # Exponential backoff

    async def ainit_db(self):
        """Initialize database schema"""
-        async def _init(db):
+        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            await db.execute('''
                CREATE TABLE IF NOT EXISTS crawled_data (
                    url TEXT PRIMARY KEY,
@@ -147,36 +178,37 @@ class AsyncDatabaseManager:
                    downloaded_files TEXT DEFAULT "{}"  -- New column added
                )
            ''')
+            await db.commit()

-        await self.execute_with_retry(_init)
        

    async def update_db_schema(self):
        """Update database schema if needed"""
-        async def _check_columns(db):
+        async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
            cursor = await db.execute("PRAGMA table_info(crawled_data)")
            columns = await cursor.fetchall()
-            return [column[1] for column in columns]
+            column_names = [column[1] for column in columns]
            
-        column_names = await self.execute_with_retry(_check_columns)
+            # List of new columns to add
+            new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
            
-        # List of new columns to add
-        new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
+            for column in new_columns:
+                if column not in column_names:
+                    await self.aalter_db_add_column(column, db)
+            await db.commit()

-        for column in new_columns:
-            if column not in column_names:
-                await self.aalter_db_add_column(column)
-
-    async def aalter_db_add_column(self, new_column: str):
+    async def aalter_db_add_column(self, new_column: str, db):
        """Add new column to the database"""
-        async def _alter(db):
-            if new_column == 'response_headers':
-                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
-            else:
-                await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
-            logger.info(f"Added column '{new_column}' to the database.")
+        if new_column == 'response_headers':
+            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
+        else:
+            await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
+        self.logger.info(
+            message="Added column '{column}' to the database",
+            tag="INIT",
+            params={"column": new_column}
+        )        

-        await self.execute_with_retry(_alter)

    async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
        """Retrieve cached URL data as CrawlResult"""
@@ -235,7 +267,12 @@ class AsyncDatabaseManager:
        try:
            return await self.execute_with_retry(_get)
        except Exception as e:
-            logger.error(f"Error retrieving cached URL: {e}")
+            self.logger.error(
+                message="Error retrieving cached URL: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )
            return None

    async def acache_url(self, result: CrawlResult):
@@ -291,7 +328,13 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_cache)
        except Exception as e:
-            logger.error(f"Error caching URL: {e}")
+            self.logger.error(
+                message="Error caching URL: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )
+            

    async def aget_total_count(self) -> int:
        """Get total number of cached URLs"""
@@ -303,7 +346,12 @@ class AsyncDatabaseManager:
        try:
            return await self.execute_with_retry(_count)
        except Exception as e:
-            logger.error(f"Error getting total count: {e}")
+            self.logger.error(
+                message="Error getting total count: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )
            return 0

    async def aclear_db(self):
@@ -314,7 +362,12 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_clear)
        except Exception as e:
-            logger.error(f"Error clearing database: {e}")
+            self.logger.error(
+                message="Error clearing database: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )

    async def aflush_db(self):
        """Drop the entire table"""
@@ -324,7 +377,12 @@ class AsyncDatabaseManager:
        try:
            await self.execute_with_retry(_flush)
        except Exception as e:
-            logger.error(f"Error flushing database: {e}")
+            self.logger.error(
+                message="Error flushing database: {error}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"error": str(e)}
+            )
            
                
    async def _store_content(self, content: str, content_type: str) -> str:
@@ -352,7 +410,12 @@ class AsyncDatabaseManager:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                return await f.read()
        except:
-            logger.error(f"Failed to load content: {file_path}")
+            self.logger.error(
+                message="Failed to load content: {file_path}",
+                tag="ERROR",
+                force_verbose=True,
+                params={"file_path": file_path}
+            )
            return None

 # Create a singleton instance
--- a/crawl4ai/async_logger.py
+++ b/crawl4ai/async_logger.py
@@ -0,0 +1,231 @@
+from enum import Enum
+from typing import Optional, Dict, Any, Union
+from colorama import Fore, Back, Style, init
+import time
+import os
+from datetime import datetime
+
+class LogLevel(Enum):
+    DEBUG = 1
+    INFO = 2
+    SUCCESS = 3
+    WARNING = 4
+    ERROR = 5
+
+class AsyncLogger:
+    """
+    Asynchronous logger with support for colored console output and file logging.
+    Supports templated messages with colored components.
+    """
+    
+    DEFAULT_ICONS = {
+        'INIT': '→',
+        'READY': '✓',
+        'FETCH': '↓',
+        'SCRAPE': '◆',
+        'EXTRACT': '■',
+        'COMPLETE': '●',
+        'ERROR': '×',
+        'DEBUG': '⋯',
+        'INFO': 'ℹ',
+        'WARNING': '⚠',
+    }
+
+    DEFAULT_COLORS = {
+        LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
+        LogLevel.INFO: Fore.CYAN,
+        LogLevel.SUCCESS: Fore.GREEN,
+        LogLevel.WARNING: Fore.YELLOW,
+        LogLevel.ERROR: Fore.RED,
+    }
+
+    def __init__(
+        self,
+        log_file: Optional[str] = None,
+        log_level: LogLevel = LogLevel.INFO,
+        tag_width: int = 10,
+        icons: Optional[Dict[str, str]] = None,
+        colors: Optional[Dict[LogLevel, str]] = None,
+        verbose: bool = True
+    ):
+        """
+        Initialize the logger.
+        
+        Args:
+            log_file: Optional file path for logging
+            log_level: Minimum log level to display
+            tag_width: Width for tag formatting
+            icons: Custom icons for different tags
+            colors: Custom colors for different log levels
+            verbose: Whether to output to console
+        """
+        init()  # Initialize colorama
+        self.log_file = log_file
+        self.log_level = log_level
+        self.tag_width = tag_width
+        self.icons = icons or self.DEFAULT_ICONS
+        self.colors = colors or self.DEFAULT_COLORS
+        self.verbose = verbose
+        
+        # Create log file directory if needed
+        if log_file:
+            os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
+
+    def _format_tag(self, tag: str) -> str:
+        """Format a tag with consistent width."""
+        return f"[{tag}]".ljust(self.tag_width, ".")
+
+    def _get_icon(self, tag: str) -> str:
+        """Get the icon for a tag, defaulting to info icon if not found."""
+        return self.icons.get(tag, self.icons['INFO'])
+
+    def _write_to_file(self, message: str):
+        """Write a message to the log file if configured."""
+        if self.log_file:
+            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
+            with open(self.log_file, 'a', encoding='utf-8') as f:
+                # Strip ANSI color codes for file output
+                clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '')
+                for color in vars(Fore).values():
+                    if isinstance(color, str):
+                        clean_message = clean_message.replace(color, '')
+                f.write(f"[{timestamp}] {clean_message}\n")
+
+    def _log(
+        self,
+        level: LogLevel,
+        message: str,
+        tag: str,
+        params: Optional[Dict[str, Any]] = None,
+        colors: Optional[Dict[str, str]] = None,
+        base_color: Optional[str] = None,
+        **kwargs
+    ):
+        """
+        Core logging method that handles message formatting and output.
+        
+        Args:
+            level: Log level for this message
+            message: Message template string
+            tag: Tag for the message
+            params: Parameters to format into the message
+            colors: Color overrides for specific parameters
+            base_color: Base color for the entire message
+        """
+        if level.value < self.log_level.value:
+            return
+
+        # Format the message with parameters if provided
+        if params:
+            try:
+                # First format the message with raw parameters
+                formatted_message = message.format(**params)
+                
+                # Then apply colors if specified
+                if colors:
+                    for key, color in colors.items():
+                        # Find the formatted value in the message and wrap it with color
+                        if key in params:
+                            value_str = str(params[key])
+                            formatted_message = formatted_message.replace(
+                                value_str, 
+                                f"{color}{value_str}{Style.RESET_ALL}"
+                            )
+                            
+            except KeyError as e:
+                formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
+                level = LogLevel.ERROR
+        else:
+            formatted_message = message
+
+        # Construct the full log line
+        color = base_color or self.colors[level]
+        log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
+
+        # Output to console if verbose
+        if self.verbose or kwargs.get("force_verbose", False):
+            print(log_line)
+
+        # Write to file if configured
+        self._write_to_file(log_line)
+
+    def debug(self, message: str, tag: str = "DEBUG", **kwargs):
+        """Log a debug message."""
+        self._log(LogLevel.DEBUG, message, tag, **kwargs)
+
+    def info(self, message: str, tag: str = "INFO", **kwargs):
+        """Log an info message."""
+        self._log(LogLevel.INFO, message, tag, **kwargs)
+
+    def success(self, message: str, tag: str = "SUCCESS", **kwargs):
+        """Log a success message."""
+        self._log(LogLevel.SUCCESS, message, tag, **kwargs)
+
+    def warning(self, message: str, tag: str = "WARNING", **kwargs):
+        """Log a warning message."""
+        self._log(LogLevel.WARNING, message, tag, **kwargs)
+
+    def error(self, message: str, tag: str = "ERROR", **kwargs):
+        """Log an error message."""
+        self._log(LogLevel.ERROR, message, tag, **kwargs)
+
+    def url_status(
+        self,
+        url: str,
+        success: bool,
+        timing: float,
+        tag: str = "FETCH",
+        url_length: int = 50
+    ):
+        """
+        Convenience method for logging URL fetch status.
+        
+        Args:
+            url: The URL being processed
+            success: Whether the operation was successful
+            timing: Time taken for the operation
+            tag: Tag for the message
+            url_length: Maximum length for URL in log
+        """
+        self._log(
+            level=LogLevel.SUCCESS if success else LogLevel.ERROR,
+            message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
+            tag=tag,
+            params={
+                "url": url,
+                "url_length": url_length,
+                "status": success,
+                "timing": timing
+            },
+            colors={
+                "status": Fore.GREEN if success else Fore.RED,
+                "timing": Fore.YELLOW
+            }
+        )
+
+    def error_status(
+        self,
+        url: str,
+        error: str,
+        tag: str = "ERROR",
+        url_length: int = 50
+    ):
+        """
+        Convenience method for logging error status.
+        
+        Args:
+            url: The URL being processed
+            error: Error message
+            tag: Tag for the message
+            url_length: Maximum length for URL in log
+        """
+        self._log(
+            level=LogLevel.ERROR,
+            message="{url:.{url_length}}... | Error: {error}",
+            tag=tag,
+            params={
+                "url": url,
+                "url_length": url_length,
+                "error": error
+            }
+        )
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -15,6 +15,7 @@ from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
 from .content_scrapping_strategy import WebScrapingStrategy
+from .async_logger import AsyncLogger

 from .config import (
    MIN_WORD_THRESHOLD, 
@@ -74,19 +75,29 @@ class AsyncWebCrawler:
            always_by_pass_cache: Deprecated, use always_bypass_cache instead
            base_directory: Base directory for storing cache
        """
-        init()
-        self.log_width = 10  # Width of "[COMPLETE]" 
-        self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
-        self.log_icons = {
-            'INIT': '→',      # Alternative: '▶' or '►'
-            'READY': '✓',     # Alternative: '√'
-            'FETCH': '↓',     # Alternative: '▼'
-            'SCRAPE': '◆',    # Alternative: '♦'
-            'EXTRACT': '■',    # Alternative: '□'
-            'COMPLETE': '●',   # Alternative: '○'
-            'ERROR': '×' 
-        }        
-        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs)
+        # init()
+        # self.log_width = 10  # Width of "[COMPLETE]" 
+        # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
+        # self.log_icons = {
+        #     'INIT': '→',      # Alternative: '▶' or '►'
+        #     'READY': '✓',     # Alternative: '√'
+        #     'FETCH': '↓',     # Alternative: '▼'
+        #     'SCRAPE': '◆',    # Alternative: '♦'
+        #     'EXTRACT': '■',    # Alternative: '□'
+        #     'COMPLETE': '●',   # Alternative: '○'
+        #     'ERROR': '×' 
+        # }        
+        self.verbose = kwargs.get("verbose", False)
+        self.logger = AsyncLogger(
+            log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
+            verbose=self.verbose,
+            tag_width=10
+        )
+        
+        self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
+            logger = self.logger,
+            **kwargs
+        )
        
        # Handle deprecated parameter
        if always_by_pass_cache is not None:
@@ -118,12 +129,13 @@ class AsyncWebCrawler:

    async def awarmup(self):
        """Initialize the crawler with warm-up sequence."""
-        if self.verbose:
-            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
-            print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
+        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
+        # if self.verbose:
+        #     print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
+        #     print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
        self.ready = True
-        if self.verbose:
-            print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")
+        # if self.verbose:
+        #     print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")

    async def arun(
        self,
@@ -234,8 +246,14 @@ class AsyncWebCrawler:
                    screenshot_data = cached_result.screenshot
                    if not screenshot_data:
                        cached_result = None
-                if verbose:
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
+                # if verbose:
+                #     print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
+                self.logger.url_status(
+                        url=cache_context.display_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - start_time,
+                        tag="FETCH"
+                    )                    


            # Fetch fresh content if needed
@@ -252,8 +270,14 @@ class AsyncWebCrawler:
                html = sanitize_input_encode(async_response.html)
                screenshot_data = async_response.screenshot
                t2 = time.perf_counter()
-                if verbose:
-                    print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
+                self.logger.url_status(
+                    url=cache_context.display_url,
+                    success=bool(html),
+                    timing=t2 - t1,
+                    tag="FETCH"
+                )
+                # if verbose:
+                #     print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")

            # Process the HTML content
            crawl_result = await self.aprocess_html(
@@ -287,9 +311,21 @@ class AsyncWebCrawler:
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)

-            if verbose:
-                print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
-
+            # if verbose:
+            #     print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
+            self.logger.success(
+                    message="{url:.50}... | Status: {status} | Total: {timing}",
+                    tag="COMPLETE",
+                    params={
+                        "url": cache_context.display_url,
+                        "status": crawl_result.success,
+                        "timing": f"{time.perf_counter() - start_time:.2f}s"
+                    },
+                    colors={
+                        "status": Fore.GREEN if crawl_result.success else Fore.RED,
+                        "timing": Fore.YELLOW
+                    }
+                )

            # Update cache if appropriate
            if cache_context.should_write() and not bool(cached_result):
@@ -300,7 +336,12 @@ class AsyncWebCrawler:
        except Exception as e:
            if not hasattr(e, "msg"):
                e.msg = str(e)
-            print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
+            # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
+            self.logger.error_status(
+                url=cache_context.display_url,
+                error=e.msg,
+                tag="ERROR"
+            )            
            return CrawlResult(
                url=url, 
                html="", 
@@ -362,7 +403,12 @@ class AsyncWebCrawler:
            domain = urlparse(url).netloc
            current_time = time.time()
            
-            print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
+            # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
+            self.logger.debug(
+                message="Started task for {url:.50}...",
+                tag="PARALLEL",
+                params={"url": url}
+            )            
            
            # Get delay settings from kwargs or use defaults
            mean_delay = kwargs.get('mean_delay', 0.1)  # 0.5 seconds default mean delay
@@ -394,12 +440,26 @@ class AsyncWebCrawler:
                )

        # Print start message
-        print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
+        # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
+        self.logger.info(
+            message="Starting concurrent crawling for {count} URLs...",
+            tag="INIT",
+            params={"count": len(urls)}
+        )        
        start_time = time.perf_counter()
        tasks = [crawl_with_semaphore(url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        end_time = time.perf_counter()
-        print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
+        # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
+        self.logger.success(
+            message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL,
+            tag="COMPLETE",
+            params={
+                "count": len(urls),
+                "timing": f"{end_time - start_time:.2f}s"
+            },
+            colors={"timing": Fore.YELLOW}
+        )        
        return [result if not isinstance(result, Exception) else str(result) for result in results]


@@ -451,9 +511,16 @@ class AsyncWebCrawler:
        links = result.get("links", [])
        metadata = result.get("metadata", {})
        
-        if verbose:
-            print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
-
+        # if verbose:
+        #     print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
+        self.logger.info(
+            message="Processed {url:.50}... | Time: {timing}ms",
+            tag="SCRAPE",
+            params={
+                "url": _url,
+                "timing": int((time.perf_counter() - t1) * 1000)
+            }
+        )


        if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
@@ -467,8 +534,17 @@ class AsyncWebCrawler:
                sections = chunking_strategy.chunk(markdown)
                extracted_content = extraction_strategy.run(url, sections)
                extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
-            if verbose:
-                print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
+            # if verbose:
+                # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
+            self.logger.info(
+                message="Completed for {url:.50}... | Time: {timing}s",
+                tag="EXTRACT",
+                params={
+                    "url": _url,
+                    "timing": time.perf_counter() - t1
+                }
+            )
+        

                

--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag
 from .utils import clean_tokens
 from abc import ABC, abstractmethod

+from snowballstemmer import stemmer
+
+# from nltk.stem import PorterStemmer
+# ps = PorterStemmer()
 class RelevantContentFilter(ABC):
    def __init__(self, user_query: str = None):
        self.user_query = user_query
@@ -252,7 +256,7 @@ class RelevantContentFilter(ABC):
            return str(tag)  # Fallback to original if anything fails

 class BM25ContentFilter(RelevantContentFilter):
-    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0):
+    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.priority_tags = {
@@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter):
            'pre': 1.5,
            'th': 1.5,  # Table headers
        }
+        self.stemmer = stemmer(language)

    def filter_content(self, html: str) -> List[str]:
        """Implements content filtering using BM25 algorithm with priority tag handling"""
@@ -282,19 +287,17 @@ class BM25ContentFilter(RelevantContentFilter):
        if not candidates:
            return []

-        # Split into priority and regular candidates
-        priority_candidates = []
-        regular_candidates = []
+        # Tokenize corpus
+        # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
+        # tokenized_query = query.lower().split()
                
-        for index, chunk, tag_type, tag in candidates:
-            if tag.name in self.priority_tags:
-                priority_candidates.append((index, chunk, tag_type, tag))
-            else:
-                regular_candidates.append((index, chunk, tag_type, tag))
+        # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] 
+        #                 for _, chunk, _, _ in candidates]
+        # tokenized_query = [ps.stem(word) for word in query.lower().split()]        
        
-        # Process regular content with BM25
-        tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates]
-        tokenized_query = query.lower().split()
+        tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] 
+                   for _, chunk, _, _ in candidates]
+        tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]

        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
@@ -303,37 +306,23 @@ class BM25ContentFilter(RelevantContentFilter):
        bm25 = BM25Okapi(tokenized_corpus)
        scores = bm25.get_scores(tokenized_query)

-        # Score and boost regular candidates
-        scored_candidates = [
-            (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag)
-            for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates)
+        # Adjust scores with tag weights
+        adjusted_candidates = []
+        for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
+            tag_weight = self.priority_tags.get(tag.name, 1.0)
+            adjusted_score = score * tag_weight
+            adjusted_candidates.append((adjusted_score, index, chunk, tag))
+
+        # Filter candidates by threshold
+        selected_candidates = [
+            (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
+            if adjusted_score >= self.bm25_threshold
        ]
-        scored_candidates.sort(key=lambda x: x[0], reverse=True)
-
-        # Process scored candidates
-        selected_tags = set()
-        selected_candidates = []
-
-        # First add all priority candidates
-        for index, chunk, tag_type, tag in priority_candidates:
-            tag_id = id(tag)
-            if tag_id not in selected_tags:
-                selected_candidates.append((index, chunk, tag))
-                selected_tags.add(tag_id)
-
-        # Then add scored regular candidates that meet threshold
-        for score, index, chunk, tag_type, tag in scored_candidates:
-            if score < self.bm25_threshold:
-                continue
-            tag_id = id(tag)
-            if tag_id not in selected_tags:
-                selected_candidates.append((index, chunk, tag))
-                selected_tags.add(tag_id)

        if not selected_candidates:
            return []

-        # Sort by original document order
+        # Sort selected candidates by original document order
        selected_candidates.sort(key=lambda x: x[0])
-        return [self.clean_element(tag) for _, _, tag in selected_candidates]

+        return [self.clean_element(tag) for _, _, tag in selected_candidates]
--- a/crawl4ai/content_scrapping_strategy.py
+++ b/crawl4ai/content_scrapping_strategy.py
@@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC):
        pass

 class WebScrapingStrategy(ContentScrapingStrategy):
+    def __init__(self, logger=None):
+        self.logger = logger
+
+    def _log(self, level, message, tag="SCRAPE", **kwargs):
+        """Helper method to safely use logger."""
+        if self.logger:
+            log_method = getattr(self.logger, level)
+            log_method(message=message, tag=tag, **kwargs)
+                
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
        return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)

@@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        try:
            meta = extract_metadata("", soup)
        except Exception as e:
-            print('Error extracting metadata:', str(e))
+            self._log('error', 
+                message="Error extracting metadata: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)}
+            )            
+            # print('Error extracting metadata:', str(e))
            meta = {}
        
        
@@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                try:
                    remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
                except Exception as e:
-                    print('Error removing unwanted attributes:', str(e))
-                
-
+                    # print('Error removing unwanted attributes:', str(e))
+                    self._log('error',
+                        message="Error removing unwanted attributes: {error}",
+                        tag="SCRAPE",
+                        params={"error": str(e)}
+                    )
                # Process children
                for child in list(element.children):
                    if isinstance(child, NavigableString) and not isinstance(child, Comment):
@@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):

                return keep_element
            except Exception as e:
-                print('Error processing element:', str(e))
+                # print('Error processing element:', str(e))
+                self._log('error',
+                    message="Error processing element: {error}",
+                    tag="SCRAPE",
+                    params={"error": str(e)}
+                )                
                return False
       
        process_element(body)
@@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            str_body = body.encode_contents().decode('utf-8')
            
            print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
-
+            self._log('error',
+                message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
+                tag="SCRAPE"
+            )

        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

@@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            h.update_params(**kwargs.get('html2text', {}))            
            markdown = h.handle(cleaned_html)
        except Exception as e:
+            if not h:
+                h = CustomHTML2Text()
+            self._log('error',
+                message="Error converting HTML to markdown: {error}",
+                tag="SCRAPE",
+                params={"error": str(e)}
+            )
            markdown = h.handle(sanitize_html(cleaned_html))
        markdown = markdown.replace('    ```', '```')

--- a/crawl4ai/version_manager.py
+++ b/crawl4ai/version_manager.py
@@ -20,11 +20,11 @@ class VersionManager:
            
    def update_version(self):
        """Update the version file to current library version"""
-        self.version_file.write_text(__version__)
+        self.version_file.write_text(__version__.__version__)
        
    def needs_update(self):
        """Check if database needs update based on version"""
        installed = self.get_installed_version()
-        current = version.parse(__version__)
+        current = version.parse(__version__.__version__)
        return installed is None or installed < current

--- a/docker-compose.hub.yml
+++ b/docker-compose.hub.yml
@@ -0,0 +1,27 @@
+services:
+  crawl4ai:
+    image: unclecode/crawl4ai:basic  # Pull image from Docker Hub
+    ports:
+      - "11235:11235"  # FastAPI server
+      - "8000:8000"    # Alternative port
+      - "9222:9222"    # Browser debugging
+      - "8080:8080"    # Additional port
+    environment:
+      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}          # Optional OpenAI API key
+      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}          # Optional Claude API key
+    volumes:
+      - /dev/shm:/dev/shm  # Shared memory for browser operations
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 1G
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
--- a/docker-compose.local.yml
+++ b/docker-compose.local.yml
@@ -0,0 +1,33 @@
+services:
+  crawl4ai:
+    build:
+      context: .
+      dockerfile: Dockerfile
+      args:
+        PYTHON_VERSION: 3.10
+        INSTALL_TYPE: all
+        ENABLE_GPU: false
+    ports:
+      - "11235:11235"  # FastAPI server
+      - "8000:8000"    # Alternative port
+      - "9222:9222"    # Browser debugging
+      - "8080:8080"    # Additional port
+    environment:
+      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}          # Optional OpenAI API key
+      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}          # Optional Claude API key
+    volumes:
+      - /dev/shm:/dev/shm  # Shared memory for browser operations
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 1G
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,3 @@
-version: '3.8'
-
 services:
  crawl4ai:
    build:
@@ -9,15 +7,46 @@ services:
        PYTHON_VERSION: 3.10
        INSTALL_TYPE: all
        ENABLE_GPU: false
+    profiles: ["local"]
    ports:
-      - "11235:11235"  # FastAPI server
-      - "8000:8000"    # Alternative port
-      - "9222:9222"    # Browser debugging
-      - "8080:8080"    # Additional port
+      - "11235:11235"
+      - "8000:8000"
+      - "9222:9222"
+      - "8080:8080"
    environment:
-      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}  # Optional API token
+      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
    volumes:
-      - /dev/shm:/dev/shm  # Shared memory for browser operations
+      - /dev/shm:/dev/shm
+    deploy:
+      resources:
+        limits:
+          memory: 4G
+        reservations:
+          memory: 1G
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
+
+  crawl4ai-hub:
+    image: unclecode/crawl4ai:basic
+    profiles: ["hub"]
+    ports:
+      - "11235:11235"
+      - "8000:8000"
+      - "9222:9222"
+      - "8080:8080"
+    environment:
+      - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
+      - OPENAI_API_KEY=${OPENAI_API_KEY:-}
+      - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
+    volumes:
+      - /dev/shm:/dev/shm
    deploy:
      resources:
        limits:
--- a/docs/examples/v0.3.74.overview.py
+++ b/docs/examples/v0.3.74.overview.py
@@ -1,9 +1,16 @@
+import os, sys
+# append the parent directory to the sys.path
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+parent_parent_dir = os.path.dirname(parent_dir)
+sys.path.append(parent_parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+__data__ = os.path.join(__location__, "__data")
 import asyncio
-import os
 from pathlib import Path
 import aiohttp
 import json
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.content_filter_strategy import BM25ContentFilter

 # 1. File Download Processing Example
@@ -32,7 +39,8 @@ async def download_example():
                console.log('No .exe download link found');
            }
            """,
-            wait_for=5  # Wait 5 seconds to ensure download starts
+            delay_before_return_html=1,  # Wait 5 seconds to ensure download starts
+            cache_mode=CacheMode.BYPASS
        )
        
        if result.downloaded_files:
@@ -50,22 +58,32 @@ async def content_filtering_example():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create filter with custom query for OpenAI's blog
        content_filter = BM25ContentFilter(
-            user_query="AI language models research innovation",
+            # user_query="Investment and fundraising",
+            # user_query="Robotic",
            bm25_threshold=1.0
        )
        
        result = await crawler.arun(
-            url="https://openai.com/blog",
-            content_filter=content_filter
+            url="https://techcrunch.com/",
+            content_filter=content_filter,
+            cache_mode=CacheMode.BYPASS
        )
        
-        print(f"Filtered content: {result.extracted_content}")
+        print(f"Filtered content: {len(result.fit_markdown)}")
+        print(f"Filtered content: {result.fit_markdown}")
+        
+        # Save html 
+        with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
+            f.write(result.fit_html)
+        
+        with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
+            f.write(result.fit_markdown)

 # 3. Local File and Raw HTML Processing Example
 async def local_and_raw_html_example():
    """Example of processing local files and raw HTML"""
    # Create a sample HTML file
-    sample_file = "sample.html"
+    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w") as f:
        f.write("""
        <html><body>
@@ -112,21 +130,18 @@ async def browser_management_example():
        headless=False,
        verbose=True
    ) as crawler:
+
+        result = await crawler.arun(
+            url="https://crawl4ai.com",
+            # session_id="persistent_session_1",
+            cache_mode=CacheMode.BYPASS
+        )        
        # Use GitHub as an example - it's a good test for browser management
        # because it requires proper browser handling
        result = await crawler.arun(
            url="https://github.com/trending",
-            session_id="persistent_session_1",
-            js_code="""
-            // Custom JavaScript to execute on GitHub's trending page
-            const repos = document.querySelectorAll('article.Box-row');
-            const data = Array.from(repos).map(repo => ({
-                name: repo.querySelector('h2')?.textContent?.trim(),
-                description: repo.querySelector('p')?.textContent?.trim(),
-                language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim()
-            }));
-            console.log('Trending repositories:', JSON.stringify(data, null, 2));
-            """
+            # session_id="persistent_session_1",
+            cache_mode=CacheMode.BYPASS
        )
        
        print("\nBrowser session result:", result.success)
@@ -136,6 +151,8 @@ async def browser_management_example():
 # 5. API Usage Example
 async def api_example():
    """Example of using the new API endpoints"""
+    api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
+    headers = {'Authorization': f'Bearer {api_token}'}    
    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        crawl_request = {
@@ -143,52 +160,78 @@ async def api_example():
            "extraction_config": {
                "type": "json_css",
                "params": {
-                    "selectors": {
-                        "titles": ".title a",
-                        "scores": ".score",
-                        "comments": ".comment-tree"
+                    "schema": {
+                        "name": "Hacker News Articles",
+                        "baseSelector": ".athing",
+                        "fields": [
+                            {
+                                "name": "title",
+                                "selector": ".title a",
+                                "type": "text"
+                            },
+                            {
+                                "name": "score",
+                                "selector": ".score",
+                                "type": "text"
+                            },
+                            {
+                                "name": "url",
+                                "selector": ".title a",
+                                "type": "attribute",
+                                "attribute": "href"
+                            }
+                        ]
                    }
                }
            },
            "crawler_params": {
                "headless": True,
-                "use_managed_browser": True
+                # "use_managed_browser": True
            },
-            "screenshot": True,
-            "magic": True
+            "cache_mode": "bypass",
+            # "screenshot": True,
+            # "magic": True
        }
        
        async with session.post(
            "http://localhost:11235/crawl",
-            json=crawl_request
+            json=crawl_request,
+            headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]
            
            # Check task status
-            async with session.get(
-                f"http://localhost:11235/task/{task_id}"
-            ) as status_response:
-                result = await status_response.json()
-                print(f"Task result: {result}")
+            while True:
+                async with session.get(
+                    f"http://localhost:11235/task/{task_id}",
+                    headers=headers
+                ) as status_response:
+                    result = await status_response.json()
+                    print(f"Task result: {result}")
+                    
+                    if result["status"] == "completed":
+                        break
+                    else:
+                        await asyncio.sleep(1)

 # Main execution
 async def main():
-    print("Running Crawl4AI feature examples...")
+    # print("Running Crawl4AI feature examples...")
    
-    print("\n1. Running Download Example:")
+    # print("\n1. Running Download Example:")
    await download_example()
    
-    print("\n2. Running Content Filtering Example:")
+    # print("\n2. Running Content Filtering Example:")
    await content_filtering_example()
    
-    print("\n3. Running Local and Raw HTML Example:")
+    # print("\n3. Running Local and Raw HTML Example:")
    await local_and_raw_html_example()
    
-    print("\n4. Running Browser Management Example:")
+    # print("\n4. Running Browser Management Example:")
    await browser_management_example()
    
-    print("\n5. Running API Example:")
+    # print("\n5. Running API Example:")
    await api_example()

 if __name__ == "__main__":
--- a/docs/md_v2/basic/docker-deploymeny.md
+++ b/docs/md_v2/basic/docker-deploymeny.md
@@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic
 docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
 ```

+## Running with Docker Compose 🐳
+
+### Use Docker Compose (From Local Dockerfile or Docker Hub)
+
+Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
+
+### **Option 1: Using Docker Compose to Build Locally**
+If you want to build the image locally, use the provided `docker-compose.local.yml` file.
+
+```bash
+docker-compose -f docker-compose.local.yml up -d
+```
+
+This will:
+1. Build the Docker image from the provided `Dockerfile`.
+2. Start the container and expose it on `http://localhost:11235`.
+
+---
+
+### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
+If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
+
+```bash
+docker-compose -f docker-compose.hub.yml up -d
+```
+
+This will:
+1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
+2. Start the container and expose it on `http://localhost:11235`.
+
+---
+
+### **Stopping the Running Services**
+
+To stop the services started via Docker Compose, you can use:
+
+```bash
+docker-compose -f docker-compose.local.yml down
+# OR
+docker-compose -f docker-compose.hub.yml down
+```
+
+If the containers don’t stop and the application is still running, check the running containers:
+
+```bash
+docker ps
+```
+
+Find the `CONTAINER ID` of the running service and stop it forcefully:
+
+```bash
+docker stop <CONTAINER_ID>
+```
+
+---
+
+### **Debugging with Docker Compose**
+
+- **Check Logs**: To view the container logs:
+  ```bash
+  docker-compose -f docker-compose.local.yml logs -f
+  ```
+
+- **Remove Orphaned Containers**: If the service is still running unexpectedly:
+  ```bash
+  docker-compose -f docker-compose.local.yml down --remove-orphans
+  ```
+
+- **Manually Remove Network**: If the network is still in use:
+  ```bash
+  docker network ls
+  docker network rm crawl4ai_default
+  ```
+
+---
+
+### Why Use Docker Compose?
+
+Docker Compose is the recommended way to deploy Crawl4AI because:
+1. It simplifies multi-container setups.
+2. Allows you to define environment variables, resources, and ports in a single file.
+3. Makes it easier to switch between local development and production-ready images.
+
+For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
+
+
+
+
 ## API Security 🔒

 ### Understanding CRAWL4AI_API_TOKEN
--- a/main.py
+++ b/main.py
@@ -26,6 +26,7 @@ from enum import Enum
 from dataclasses import dataclass
 import json
 from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
+from crawl4ai.config import MIN_WORD_THRESHOLD
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    CosineStrategy,
@@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel):
    type: CrawlerType
    params: Dict[str, Any] = {}

+class ChunkingStrategy(BaseModel):
+    type: str
+    params: Dict[str, Any] = {}
+
+class ContentFilter(BaseModel):
+    type: str = "bm25"
+    params: Dict[str, Any] = {}
+
 class CrawlRequest(BaseModel):
    urls: Union[HttpUrl, List[HttpUrl]]
+    word_count_threshold: int = MIN_WORD_THRESHOLD
    extraction_config: Optional[ExtractionConfig] = None
-    crawler_params: Dict[str, Any] = {}
-    priority: int = Field(default=5, ge=1, le=10)
-    ttl: Optional[int] = 3600
+    chunking_strategy: Optional[ChunkingStrategy] = None
+    content_filter: Optional[ContentFilter] = None
    js_code: Optional[List[str]] = None
    wait_for: Optional[str] = None
    css_selector: Optional[str] = None
@@ -66,7 +75,10 @@ class CrawlRequest(BaseModel):
    magic: bool = False
    extra: Optional[Dict[str, Any]] = {}
    session_id: Optional[str] = None
-    cache_mode: Optional[CacheMode] = None
+    cache_mode: Optional[CacheMode] = CacheMode.ENABLED
+    priority: int = Field(default=5, ge=1, le=10)
+    ttl: Optional[int] = 3600    
+    crawler_params: Dict[str, Any] = {}

@dataclass
 class TaskInfo:
@@ -280,6 +292,7 @@ class CrawlerService:
                    if isinstance(request.urls, list):
                        results = await crawler.arun_many(
                            urls=[str(url) for url in request.urls],
+                            word_count_threshold=MIN_WORD_THRESHOLD,
                            extraction_strategy=extraction_strategy,
                            js_code=request.js_code,
                            wait_for=request.wait_for,
@@ -287,6 +300,7 @@ class CrawlerService:
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
+                            cache_mode=request.cache_mode,
                            **request.extra,
                        )
                    else:
@@ -299,6 +313,7 @@ class CrawlerService:
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
+                            cache_mode=request.cache_mode,
                            **request.extra,
                        )

--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ xxhash~=3.4
 rank-bm25~=0.2
 aiofiles~=24.0
 colorama~=0.4
+snowballstemmer~=2.2