From 852729ff380f0568d6874bc960606ba3cce0e935 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 18 Nov 2024 21:00:06 +0800 Subject: [PATCH] feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile feat(requirements): update requirements.txt to include snowballstemmer fix(version_manager): correct version parsing to use __version__.__version__ feat(main): introduce chunking strategy and content filter in CrawlRequest model feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance feat(logger): implement new async logger engine replacing print statements throughout library fix(database): resolve version-related deadlock and circular lock issues in database operations docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose --- Dockerfile | 12 +- crawl4ai/async_crawler_strategy.py | 149 ++++++++++++---- crawl4ai/async_database.py | 189 +++++++++++++------- crawl4ai/async_logger.py | 231 +++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 144 +++++++++++---- crawl4ai/content_filter_strategy.py | 71 ++++---- crawl4ai/content_scrapping_strategy.py | 44 ++++- crawl4ai/version_manager.py | 4 +- docker-compose.hub.yml | 27 +++ docker-compose.local.yml | 33 ++++ docker-compose.yml | 47 ++++- docs/examples/v0.3.74.overview.py | 119 +++++++++---- docs/md_v2/basic/docker-deploymeny.md | 88 ++++++++++ main.py | 23 ++- requirements.txt | 3 +- 15 files changed, 952 insertions(+), 232 deletions(-) create mode 100644 crawl4ai/async_logger.py create mode 100644 docker-compose.hub.yml create mode 100644 docker-compose.local.yml diff --git a/Dockerfile b/Dockerfile index aac2280a..bd71deae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libatspi2.0-0 \ && rm -rf /var/lib/apt/lists/* -# GPU support if enabled -RUN if [ "$ENABLE_GPU" = "true" ] ; then \ - 
apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ + else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ fi # Create and set working directory diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 90d5cbe8..a6ba8e50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,15 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None self.debugging_port = 9222 + self.logger = logger + self.shutting_down = False async def start(self) -> str: """ @@ -76,15 +78,38 @@ class ManagedBrowser: async def _monitor_browser_process(self): """Monitor the browser process for unexpected termination.""" if self.browser_process: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - if self.browser_process.poll() is not None: - print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") - print(f"STDOUT: {stdout.decode()}") - print(f"STDERR: {stderr.decode()}") - await self.cleanup() + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + 
asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" @@ -134,20 +159,39 @@ class ManagedBrowser: async def cleanup(self): """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + if self.browser_process: try: self.browser_process.terminate() - await asyncio.sleep(1) + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + except Exception as e: - print(f"Error terminating browser: {e}") + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: - print(f"Error removing temporary directory: {e}") + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + 
params={"error": str(e)} + ) class AsyncCrawlerStrategy(ABC): @@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.logger = logger self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", @@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.managed_browser = ManagedBrowser( browser_type=self.browser_type, user_data_dir=self.user_data_dir, - headless=self.headless + headless=self.headless, + logger=self.logger ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) @@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Add extra args if provided if self.extra_args: browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path # Add proxy settings if a proxy is specified if self.proxy: @@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser = None if self.managed_browser: + await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None @@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - print(f"Warning: Could not access content frame for iframe {i}") + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing 
iframe {i}: {str(e)}") # Return the page object return page @@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, ) await context.set_extra_http_headers(self.headers) @@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) - if self.verbose: - print(f"[LOG] πŸ“₯ Downloading {suggested_filename} to {download_path}") + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + start_time = time.perf_counter() await download.save_as(download_path) + end_time = time.perf_counter() self._downloaded_files.append(download_path) - - if self.verbose: - print(f"[LOG] βœ… Downloaded {suggested_filename} successfully") + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) except Exception as e: - if self.verbose: - print(f"[ERROR] Failed to handle download: {str(e)}") + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: + # print(f"[ERROR] Failed to handle download: {str(e)}") async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed @@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete except 
Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: """ @@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Generate an error image img = Image.new('RGB', (800, 600), color='black') @@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) # Generate an error image img = Image.new('RGB', (800, 600), color='black') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 7809dfe1..19160b6e 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -12,10 +12,12 @@ import xxhash import aiofiles from .config import NEED_MIGRATION from .version_manager import VersionManager +from .async_logger import AsyncLogger # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +base_directory = Path.home() DB_PATH = os.path.join(Path.home(), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") @@ -28,15 +30,21 @@ class AsyncDatabaseManager: self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = 
asyncio.Lock() + self.init_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) self._initialized = False self.version_manager = VersionManager() + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) async def initialize(self): """Initialize the database and connection pool""" try: - logger.info("Initializing database...") + self.logger.info("Initializing database", tag="INIT") # Ensure the database file exists os.makedirs(os.path.dirname(self.db_path), exist_ok=True) @@ -47,31 +55,39 @@ class AsyncDatabaseManager: await self.ainit_db() # Verify the table exists - async def verify_table(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: async with db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" ) as cursor: result = await cursor.fetchone() if not result: raise Exception("crawled_data table was not created") - - await self.execute_with_retry(verify_table) # If version changed or fresh install, run updates if needs_update: - logger.info("New version detected, running updates...") + self.logger.info("New version detected, running updates", tag="INIT") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() self.version_manager.update_version() # Update stored version after successful migration - logger.info("Version update completed successfully") + self.logger.success("Version update completed successfully", tag="COMPLETE") else: - logger.info("Database initialization completed successfully") + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + except Exception as e: - logger.error(f"Database initialization error: {e}") - logger.info("Database will be initialized on first use") + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + 
) + self.logger.info( + message="Database will be initialized on first use", + tag="INIT" + ) + raise + async def cleanup(self): """Cleanup connections when shutting down""" @@ -84,34 +100,41 @@ class AsyncDatabaseManager: async def get_connection(self): """Connection pool manager""" if not self._initialized: - async with self.pool_lock: # Prevent multiple simultaneous initializations - if not self._initialized: # Double-check after acquiring lock + # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: await self.initialize() self._initialized = True - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + self.logger.error( + message="Connection error: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + 
self.connection_semaphore.release() async def execute_with_retry(self, operation, *args): @@ -124,13 +147,21 @@ class AsyncDatabaseManager: return result except Exception as e: if attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) raise await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff async def ainit_db(self): """Initialize database schema""" - async def _init(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: await db.execute(''' CREATE TABLE IF NOT EXISTS crawled_data ( url TEXT PRIMARY KEY, @@ -147,36 +178,37 @@ class AsyncDatabaseManager: downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') - - await self.execute_with_retry(_init) + await db.commit() + async def update_db_schema(self): """Update database schema if needed""" - async def _check_columns(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: cursor = await db.execute("PRAGMA table_info(crawled_data)") columns = await cursor.fetchall() - return [column[1] for column in columns] + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): + async def aalter_db_add_column(self, new_column: str, db): 
"""Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) - await self.execute_with_retry(_alter) async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: """Retrieve cached URL data as CrawlResult""" @@ -235,7 +267,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_get) except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return None async def acache_url(self, result: CrawlResult): @@ -291,7 +328,13 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_cache) except Exception as e: - logger.error(f"Error caching URL: {e}") + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + async def aget_total_count(self) -> int: """Get total number of cached URLs""" @@ -303,7 +346,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_count) except Exception as e: - logger.error(f"Error getting total count: {e}") + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return 0 async def aclear_db(self): @@ -314,7 +362,12 @@ class AsyncDatabaseManager: try: await 
self.execute_with_retry(_clear) except Exception as e: - logger.error(f"Error clearing database: {e}") + self.logger.error( + message="Error clearing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def aflush_db(self): """Drop the entire table""" @@ -324,7 +377,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_flush) except Exception as e: - logger.error(f"Error flushing database: {e}") + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def _store_content(self, content: str, content_type: str) -> str: @@ -352,7 +410,12 @@ class AsyncDatabaseManager: async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: return await f.read() except: - logger.error(f"Failed to load content: {file_path}") + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) return None # Create a singleton instance diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 00000000..220edd11 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. 
+ """ + + DEFAULT_ICONS = { + 'INIT': 'β†’', + 'READY': 'βœ“', + 'FETCH': '↓', + 'SCRAPE': 'β—†', + 'EXTRACT': 'β– ', + 'COMPLETE': '●', + 'ERROR': 'Γ—', + 'DEBUG': 'β‹―', + 'INFO': 'β„Ή', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.INFO, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. + + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if 
isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, 
**kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... 
| Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 79a17ac4..5fe7822c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,6 +15,7 @@ from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger from .config import ( MIN_WORD_THRESHOLD, @@ -74,19 +75,29 @@ class AsyncWebCrawler: always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache """ - init() - self.log_width = 10 # Width of "[COMPLETE]" - self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") - self.log_icons = { - 'INIT': 'β†’', # Alternative: 'β–Ά' or 'β–Ί' - 'READY': 'βœ“', # Alternative: '√' - 'FETCH': '↓', # Alternative: 'β–Ό' - 'SCRAPE': 'β—†', # Alternative: '♦' - 'EXTRACT': 'β– ', # Alternative: 'β–‘' - 'COMPLETE': '●', # Alternative: 'β—‹' - 'ERROR': 'Γ—' - } - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + # init() + # self.log_width = 10 # Width of "[COMPLETE]" + # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + # self.log_icons = { + # 'INIT': 'β†’', # Alternative: 'β–Ά' or 'β–Ί' + # 'READY': 'βœ“', # Alternative: '√' + # 'FETCH': '↓', # Alternative: 'β–Ό' + # 'SCRAPE': 'β—†', # Alternative: '♦' + # 'EXTRACT': 'β– ', # Alternative: 'β–‘' + # 'COMPLETE': '●', # Alternative: 'β—‹' + # 'ERROR': 'Γ—' + # } + self.verbose = kwargs.get("verbose", False) + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.verbose, + tag_width=10 + ) + + self.crawler_strategy = 
crawler_strategy or AsyncPlaywrightCrawlerStrategy( + logger = self.logger, + **kwargs + ) # Handle deprecated parameter if always_by_pass_cache is not None: @@ -118,12 +129,13 @@ class AsyncWebCrawler: async def awarmup(self): """Initialize the crawler with warm-up sequence.""" - if self.verbose: - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + # if self.verbose: + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - if self.verbose: - print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") + # if self.verbose: + # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -234,8 +246,14 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) # Fetch fresh 
content if needed @@ -252,8 +270,14 @@ class AsyncWebCrawler: html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -287,9 +311,21 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) - if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - + # if verbose: + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) # Update cache if appropriate if cache_context.should_write() and not bool(cached_result): @@ -300,7 +336,12 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + self.logger.error_status( + url=cache_context.display_url, + error=e.msg, + tag="ERROR" + ) return CrawlResult( url=url, html="", @@ -362,7 +403,12 @@ class AsyncWebCrawler: domain = urlparse(url).netloc current_time = time.time() - print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) # Get delay settings from kwargs or use defaults mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay @@ -394,12 +440,26 @@ class AsyncWebCrawler: ) # Print start message - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + 
params={"count": len(urls)} + ) start_time = time.perf_counter() tasks = [crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) end_time = time.perf_counter() - print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={"timing": Fore.YELLOW} + ) return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -451,9 +511,16 @@ class AsyncWebCrawler: links = result.get("links", []) metadata = result.get("metadata", {}) - if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") - + # if verbose: + # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + self.logger.info( + message="Processed {url:.50}... 
| Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): @@ -467,8 +534,17 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + # if verbose: + # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + self.logger.info( + message="Completed for {url:.50}... | Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88375da9..88216f7f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag from .utils import clean_tokens from abc import ABC, abstractmethod +from snowballstemmer import stemmer + +# from nltk.stem import PorterStemmer +# ps = PorterStemmer() class RelevantContentFilter(ABC): def __init__(self, user_query: str = None): self.user_query = user_query @@ -252,7 +256,7 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): - def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): 
super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter): 'pre': 1.5, 'th': 1.5, # Table headers } + self.stemmer = stemmer(language) def filter_content(self, html: str) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" @@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter): if not candidates: return [] - # Split into priority and regular candidates - priority_candidates = [] - regular_candidates = [] + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] - for index, chunk, tag_type, tag in candidates: - if tag.name in self.priority_tags: - priority_candidates.append((index, chunk, tag_type, tag)) - else: - regular_candidates.append((index, chunk, tag_type, tag)) + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] - # Process regular content with BM25 - tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] - tokenized_query = query.lower().split() - # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) - + bm25 = BM25Okapi(tokenized_corpus) scores = bm25.get_scores(tokenized_query) - # Score and boost regular candidates - scored_candidates = [ - (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) - for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + # Adjust scores with tag weights + 
adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold ] - scored_candidates.sort(key=lambda x: x[0], reverse=True) - - # Process scored candidates - selected_tags = set() - selected_candidates = [] - - # First add all priority candidates - for index, chunk, tag_type, tag in priority_candidates: - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) - - # Then add scored regular candidates that meet threshold - for score, index, chunk, tag_type, tag in scored_candidates: - if score < self.bm25_threshold: - continue - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) if not selected_candidates: return [] - # Sort by original document order + # Sort selected candidates by original document order selected_candidates.sort(key=lambda x: x[0]) - return [self.clean_element(tag) for _, _, tag in selected_candidates] + return [self.clean_element(tag) for _, _, tag in selected_candidates] diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index d16b0680..0f470671 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + 
log_method(message=message, tag=tag, **kwargs) + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: meta = extract_metadata("", soup) except Exception as e: - print('Error extracting metadata:', str(e)) + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # print('Error extracting metadata:', str(e)) meta = {} @@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) except Exception as e: - print('Error removing unwanted attributes:', str(e)) - - + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) # Process children for child in list(element.children): if isinstance(child, NavigableString) and not isinstance(child, Comment): @@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): return keep_element except Exception as e: - print('Error processing element:', str(e)) + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) return False process_element(body) @@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. 
Check the markdown for further details.", + tag="SCRAPE" + ) cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') @@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): h.update_params(**kwargs.get('html2text', {})) markdown = h.handle(cleaned_html) except Exception as e: + if not h: + h = CustomHTML2Text() + self._log('error', + message="Error converting HTML to markdown: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py index 07e0c0e9..8ae2de2e 100644 --- a/crawl4ai/version_manager.py +++ b/crawl4ai/version_manager.py @@ -20,11 +20,11 @@ class VersionManager: def update_version(self): """Update the version file to current library version""" - self.version_file.write_text(__version__) + self.version_file.write_text(__version__.__version__) def needs_update(self): """Check if database needs update based on version""" installed = self.get_installed_version() - current = version.parse(__version__) + current = version.parse(__version__.__version__) return installed is None or installed < current diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml new file mode 100644 index 00000000..9bcfa982 --- /dev/null +++ b/docker-compose.hub.yml @@ -0,0 +1,27 @@ +services: + crawl4ai: + image: unclecode/crawl4ai:basic # Pull image from Docker Hub + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + 
healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 00000000..7dc41b47 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,33 @@ +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ef0dc9e4..1097ef11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: crawl4ai: build: @@ -9,15 +7,18 @@ services: PYTHON_VERSION: 3.10 INSTALL_TYPE: all ENABLE_GPU: false + profiles: ["local"] ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations + - /dev/shm:/dev/shm deploy: resources: limits: 
@@ -30,4 +31,32 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s \ No newline at end of file + start_period: 40s + + crawl4ai-hub: + image: unclecode/crawl4ai:basic + profiles: ["hub"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index ec3a7d73..00296740 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -1,9 +1,16 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") import asyncio -import os from pathlib import Path import aiohttp import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.content_filter_strategy import BM25ContentFilter # 1. 
File Download Processing Example @@ -32,7 +39,8 @@ async def download_example(): console.log('No .exe download link found'); } """, - wait_for=5 # Wait 5 seconds to ensure download starts + delay_before_return_html=1, # Wait 5 seconds to ensure download starts + cache_mode=CacheMode.BYPASS ) if result.downloaded_files: @@ -50,22 +58,32 @@ async def content_filtering_example(): async with AsyncWebCrawler(verbose=True) as crawler: # Create filter with custom query for OpenAI's blog content_filter = BM25ContentFilter( - user_query="AI language models research innovation", + # user_query="Investment and fundraising", + # user_query="Robotic", bm25_threshold=1.0 ) result = await crawler.arun( - url="https://openai.com/blog", - content_filter=content_filter + url="https://techcrunch.com/", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS ) - print(f"Filtered content: {result.extracted_content}") + print(f"Filtered content: {len(result.fit_markdown)}") + print(f"Filtered content: {result.fit_markdown}") + + # Save html + with open(os.path.join(__data__, "techcrunch.html"), "w") as f: + f.write(result.fit_html) + + with open(os.path.join(__data__, "filtered_content.md"), "w") as f: + f.write(result.fit_markdown) # 3. 
Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file - sample_file = "sample.html" + sample_file = os.path.join(__data__, "sample.html") with open(sample_file, "w") as f: f.write(""" @@ -112,21 +130,18 @@ async def browser_management_example(): headless=False, verbose=True ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) # Use GitHub as an example - it's a good test for browser management # because it requires proper browser handling result = await crawler.arun( url="https://github.com/trending", - session_id="persistent_session_1", - js_code=""" - // Custom JavaScript to execute on GitHub's trending page - const repos = document.querySelectorAll('article.Box-row'); - const data = Array.from(repos).map(repo => ({ - name: repo.querySelector('h2')?.textContent?.trim(), - description: repo.querySelector('p')?.textContent?.trim(), - language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() - })); - console.log('Trending repositories:', JSON.stringify(data, null, 2)); - """ + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS ) print("\nBrowser session result:", result.success) @@ -136,6 +151,8 @@ async def browser_management_example(): # 5. 
API Usage Example async def api_example(): """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} async with aiohttp.ClientSession() as session: # Submit crawl job crawl_request = { @@ -143,52 +160,78 @@ async def api_example(): "extraction_config": { "type": "json_css", "params": { - "selectors": { - "titles": ".title a", - "scores": ".score", - "comments": ".comment-tree" + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] } } }, "crawler_params": { "headless": True, - "use_managed_browser": True + # "use_managed_browser": True }, - "screenshot": True, - "magic": True + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True } async with session.post( "http://localhost:11235/crawl", - json=crawl_request + json=crawl_request, + headers=headers ) as response: task_data = await response.json() task_id = task_data["task_id"] # Check task status - async with session.get( - f"http://localhost:11235/task/{task_id}" - ) as status_response: - result = await status_response.json() - print(f"Task result: {result}") + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + + if result["status"] == "completed": + break + else: + await asyncio.sleep(1) # Main execution async def main(): - print("Running Crawl4AI feature examples...") + # print("Running Crawl4AI feature examples...") - print("\n1. Running Download Example:") + # print("\n1. Running Download Example:") await download_example() - print("\n2. 
Running Content Filtering Example:") + # print("\n2. Running Content Filtering Example:") await content_filtering_example() - print("\n3. Running Local and Raw HTML Example:") + # print("\n3. Running Local and Raw HTML Example:") await local_and_raw_html_example() - print("\n4. Running Browser Management Example:") + # print("\n4. Running Browser Management Example:") await browser_management_example() - print("\n5. Running API Example:") + # print("\n5. Running API Example:") await api_example() if __name__ == "__main__": diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 30555708..87e468aa 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic ``` +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
+
+---
+
+### **Stopping the Running Services**
+
+To stop the services started via Docker Compose, you can use:
+
+```bash
+docker-compose -f docker-compose.local.yml down
+# OR
+docker-compose -f docker-compose.hub.yml down
+```
+
+If the containers don’t stop and the application is still running, check the running containers:
+
+```bash
+docker ps
+```
+
+Find the `CONTAINER ID` of the running service and stop it forcefully:
+
+```bash
+docker stop <CONTAINER_ID>
+```
+
+---
+
+### **Debugging with Docker Compose**
+
+- **Check Logs**: To view the container logs:
+  ```bash
+  docker-compose -f docker-compose.local.yml logs -f
+  ```
+
+- **Remove Orphaned Containers**: If the service is still running unexpectedly:
+  ```bash
+  docker-compose -f docker-compose.local.yml down --remove-orphans
+  ```
+
+- **Manually Remove Network**: If the network is still in use:
+  ```bash
+  docker network ls
+  docker network rm crawl4ai_default
+  ```
+
+---
+
+### Why Use Docker Compose?
+
+Docker Compose is the recommended way to deploy Crawl4AI because:
+1. It simplifies multi-container setups.
+2. Allows you to define environment variables, resources, and ports in a single file.
+3. Makes it easier to switch between local development and production-ready images.
+
+For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
+ + + + ## API Security πŸ”’ ### Understanding CRAWL4AI_API_TOKEN diff --git a/main.py b/main.py index ee5f7fc6..6d217410 100644 --- a/main.py +++ b/main.py @@ -26,6 +26,7 @@ from enum import Enum from dataclasses import dataclass import json from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode +from crawl4ai.config import MIN_WORD_THRESHOLD from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel): type: CrawlerType params: Dict[str, Any] = {} +class ChunkingStrategy(BaseModel): + type: str + params: Dict[str, Any] = {} + +class ContentFilter(BaseModel): + type: str = "bm25" + params: Dict[str, Any] = {} + class CrawlRequest(BaseModel): urls: Union[HttpUrl, List[HttpUrl]] + word_count_threshold: int = MIN_WORD_THRESHOLD extraction_config: Optional[ExtractionConfig] = None - crawler_params: Dict[str, Any] = {} - priority: int = Field(default=5, ge=1, le=10) - ttl: Optional[int] = 3600 + chunking_strategy: Optional[ChunkingStrategy] = None + content_filter: Optional[ContentFilter] = None js_code: Optional[List[str]] = None wait_for: Optional[str] = None css_selector: Optional[str] = None @@ -66,7 +75,10 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None - cache_mode: Optional[CacheMode] = None + cache_mode: Optional[CacheMode] = CacheMode.ENABLED + priority: int = Field(default=5, ge=1, le=10) + ttl: Optional[int] = 3600 + crawler_params: Dict[str, Any] = {} @dataclass class TaskInfo: @@ -280,6 +292,7 @@ class CrawlerService: if isinstance(request.urls, list): results = await crawler.arun_many( urls=[str(url) for url in request.urls], + word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=extraction_strategy, js_code=request.js_code, wait_for=request.wait_for, @@ -287,6 +300,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + 
cache_mode=request.cache_mode, **request.extra, ) else: @@ -299,6 +313,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) diff --git a/requirements.txt b/requirements.txt index e6294cc5..ed259ac9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 aiofiles~=24.0 -colorama~=0.4 \ No newline at end of file +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file