From 852729ff380f0568d6874bc960606ba3cce0e935 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 18 Nov 2024 21:00:06 +0800 Subject: [PATCH] feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile feat(requirements): update requirements.txt to include snowballstemmer fix(version_manager): correct version parsing to use __version__.__version__ feat(main): introduce chunking strategy and content filter in CrawlRequest model feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance feat(logger): implement new async logger engine replacing print statements throughout library fix(database): resolve version-related deadlock and circular lock issues in database operations docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose --- Dockerfile | 12 +- crawl4ai/async_crawler_strategy.py | 149 ++++++++++++---- crawl4ai/async_database.py | 189 +++++++++++++------- crawl4ai/async_logger.py | 231 +++++++++++++++++++++++++ crawl4ai/async_webcrawler.py | 144 +++++++++++---- crawl4ai/content_filter_strategy.py | 71 ++++---- crawl4ai/content_scrapping_strategy.py | 44 ++++- crawl4ai/version_manager.py | 4 +- docker-compose.hub.yml | 27 +++ docker-compose.local.yml | 33 ++++ docker-compose.yml | 47 ++++- docs/examples/v0.3.74.overview.py | 119 +++++++++---- docs/md_v2/basic/docker-deploymeny.md | 88 ++++++++++ main.py | 23 ++- requirements.txt | 3 +- 15 files changed, 952 insertions(+), 232 deletions(-) create mode 100644 crawl4ai/async_logger.py create mode 100644 docker-compose.hub.yml create mode 100644 docker-compose.local.yml diff --git a/Dockerfile b/Dockerfile index aac2280a..bd71deae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libatspi2.0-0 \ && rm -rf /var/lib/apt/lists/* -# GPU support if enabled -RUN if [ "$ENABLE_GPU" = "true" ] ; then \ - 
apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ +# GPU support if enabled and architecture is supported +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ + else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ fi # Create and set working directory diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 90d5cbe8..a6ba8e50 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -35,13 +35,15 @@ stealth_config = StealthConfig( class ManagedBrowser: - def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): + def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None): self.browser_type = browser_type self.user_data_dir = user_data_dir self.headless = headless self.browser_process = None self.temp_dir = None self.debugging_port = 9222 + self.logger = logger + self.shutting_down = False async def start(self) -> str: """ @@ -76,15 +78,38 @@ class ManagedBrowser: async def _monitor_browser_process(self): """Monitor the browser process for unexpected termination.""" if self.browser_process: - stdout, stderr = await asyncio.gather( - asyncio.to_thread(self.browser_process.stdout.read), - asyncio.to_thread(self.browser_process.stderr.read) - ) - if self.browser_process.poll() is not None: - print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}") - print(f"STDOUT: {stdout.decode()}") - print(f"STDERR: {stderr.decode()}") - await self.cleanup() + try: + stdout, stderr = await asyncio.gather( + asyncio.to_thread(self.browser_process.stdout.read), + 
asyncio.to_thread(self.browser_process.stderr.read) + ) + + # Check shutting_down flag BEFORE logging anything + if self.browser_process.poll() is not None: + if not self.shutting_down: + self.logger.error( + message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", + tag="ERROR", + params={ + "code": self.browser_process.returncode, + "stdout": stdout.decode(), + "stderr": stderr.decode() + } + ) + await self.cleanup() + else: + self.logger.info( + message="Browser process terminated normally | Code: {code}", + tag="INFO", + params={"code": self.browser_process.returncode} + ) + except Exception as e: + if not self.shutting_down: + self.logger.error( + message="Error monitoring browser process: {error}", + tag="ERROR", + params={"error": str(e)} + ) def _get_browser_path(self) -> str: """Returns the browser executable path based on OS and browser type""" @@ -134,20 +159,39 @@ class ManagedBrowser: async def cleanup(self): """Cleanup browser process and temporary directory""" + # Set shutting_down flag BEFORE any termination actions + self.shutting_down = True + if self.browser_process: try: self.browser_process.terminate() - await asyncio.sleep(1) + # Wait for process to end gracefully + for _ in range(10): # 10 attempts, 100ms each + if self.browser_process.poll() is not None: + break + await asyncio.sleep(0.1) + + # Force kill if still running if self.browser_process.poll() is None: self.browser_process.kill() + await asyncio.sleep(0.1) # Brief wait for kill to take effect + except Exception as e: - print(f"Error terminating browser: {e}") + self.logger.error( + message="Error terminating browser: {error}", + tag="ERROR", + params={"error": str(e)} + ) if self.temp_dir and os.path.exists(self.temp_dir): try: shutil.rmtree(self.temp_dir) except Exception as e: - print(f"Error removing temporary directory: {e}") + self.logger.error( + message="Error removing temporary directory: {error}", + tag="ERROR", + 
params={"error": str(e)} + ) class AsyncCrawlerStrategy(ABC): @@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC): pass class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): - def __init__(self, use_cached_html=False, js_code=None, **kwargs): + def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs): + self.logger = logger self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", @@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.managed_browser = ManagedBrowser( browser_type=self.browser_type, user_data_dir=self.user_data_dir, - headless=self.headless + headless=self.headless, + logger=self.logger ) cdp_url = await self.managed_browser.start() self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) @@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Add extra args if provided if self.extra_args: browser_args["args"].extend(self.extra_args) + + # Add downloads path if downloads are enabled + if self.accept_downloads: + browser_args["downloads_path"] = self.downloads_path # Add proxy settings if a proxy is specified if self.proxy: @@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.browser = None if self.managed_browser: + await asyncio.sleep(0.5) await self.managed_browser.cleanup() self.managed_browser = None @@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): }} """) else: - print(f"Warning: Could not access content frame for iframe {i}") + # print(f"Warning: Could not access content frame for iframe {i}") + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i} + ) except Exception as e: - print(f"Error processing iframe {i}: {str(e)}") + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)} + ) + # print(f"Error processing 
iframe {i}: {str(e)}") # Return the page object return page @@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): context = await self.browser.new_context( user_agent=self.user_agent, viewport={"width": 1920, "height": 1080}, - proxy={"server": self.proxy} if self.proxy else None + proxy={"server": self.proxy} if self.proxy else None, + accept_downloads=self.accept_downloads, ) await context.set_extra_http_headers(self.headers) @@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): suggested_filename = download.suggested_filename download_path = os.path.join(self.downloads_path, suggested_filename) - if self.verbose: - print(f"[LOG] πŸ“₯ Downloading {suggested_filename} to {download_path}") + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path} + ) + start_time = time.perf_counter() await download.save_as(download_path) + end_time = time.perf_counter() self._downloaded_files.append(download_path) - - if self.verbose: - print(f"[LOG] βœ… Downloaded {suggested_filename} successfully") + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"} + ) except Exception as e: - if self.verbose: - print(f"[ERROR] Failed to handle download: {str(e)}") + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)} + ) + + # if self.verbose: + # print(f"[ERROR] Failed to handle download: {str(e)}") async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed @@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.evaluate(remove_overlays_js) await page.wait_for_timeout(500) # Wait for any animations to complete except 
Exception as e: - if self.verbose: - print(f"Warning: Failed to remove overlay elements: {str(e)}") + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # if self.verbose: + # print(f"Warning: Failed to remove overlay elements: {str(e)}") async def take_screenshot(self, page: Page) -> str: """ @@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) + # Generate an error image img = Image.new('RGB', (800, 600), color='black') @@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return base64.b64encode(screenshot).decode('utf-8') except Exception as e: error_message = f"Failed to take screenshot: {str(e)}" - print(error_message) + # print(error_message) + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message} + ) # Generate an error image img = Image.new('RGB', (800, 600), color='black') diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 7809dfe1..19160b6e 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -12,10 +12,12 @@ import xxhash import aiofiles from .config import NEED_MIGRATION from .version_manager import VersionManager +from .async_logger import AsyncLogger # Set up logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +base_directory = Path.home() DB_PATH = os.path.join(Path.home(), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") @@ -28,15 +30,21 @@ class AsyncDatabaseManager: self.max_retries = max_retries self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.pool_lock = 
asyncio.Lock() + self.init_lock = asyncio.Lock() self.connection_semaphore = asyncio.Semaphore(pool_size) self._initialized = False self.version_manager = VersionManager() + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"), + verbose=False, + tag_width=10 + ) async def initialize(self): """Initialize the database and connection pool""" try: - logger.info("Initializing database...") + self.logger.info("Initializing database", tag="INIT") # Ensure the database file exists os.makedirs(os.path.dirname(self.db_path), exist_ok=True) @@ -47,31 +55,39 @@ class AsyncDatabaseManager: await self.ainit_db() # Verify the table exists - async def verify_table(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: async with db.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" ) as cursor: result = await cursor.fetchone() if not result: raise Exception("crawled_data table was not created") - - await self.execute_with_retry(verify_table) # If version changed or fresh install, run updates if needs_update: - logger.info("New version detected, running updates...") + self.logger.info("New version detected, running updates", tag="INIT") await self.update_db_schema() from .migrations import run_migration # Import here to avoid circular imports await run_migration() self.version_manager.update_version() # Update stored version after successful migration - logger.info("Version update completed successfully") + self.logger.success("Version update completed successfully", tag="COMPLETE") else: - logger.info("Database initialization completed successfully") + self.logger.success("Database initialization completed successfully", tag="COMPLETE") + except Exception as e: - logger.error(f"Database initialization error: {e}") - logger.info("Database will be initialized on first use") + self.logger.error( + message="Database initialization error: {error}", + tag="ERROR", + params={"error": str(e)} + 
) + self.logger.info( + message="Database will be initialized on first use", + tag="INIT" + ) + raise + async def cleanup(self): """Cleanup connections when shutting down""" @@ -84,34 +100,41 @@ class AsyncDatabaseManager: async def get_connection(self): """Connection pool manager""" if not self._initialized: - async with self.pool_lock: # Prevent multiple simultaneous initializations - if not self._initialized: # Double-check after acquiring lock + # Use an asyncio.Lock to ensure only one initialization occurs + async with self.init_lock: + if not self._initialized: await self.initialize() self._initialized = True - async with self.connection_semaphore: - task_id = id(asyncio.current_task()) - try: - async with self.pool_lock: - if task_id not in self.connection_pool: - conn = await aiosqlite.connect( - self.db_path, - timeout=30.0 - ) - await conn.execute('PRAGMA journal_mode = WAL') - await conn.execute('PRAGMA busy_timeout = 5000') - self.connection_pool[task_id] = conn - - yield self.connection_pool[task_id] - - except Exception as e: - logger.error(f"Connection error: {e}") - raise - finally: - async with self.pool_lock: - if task_id in self.connection_pool: - await self.connection_pool[task_id].close() - del self.connection_pool[task_id] + await self.connection_semaphore.acquire() + task_id = id(asyncio.current_task()) + try: + async with self.pool_lock: + if task_id not in self.connection_pool: + conn = await aiosqlite.connect( + self.db_path, + timeout=30.0 + ) + await conn.execute('PRAGMA journal_mode = WAL') + await conn.execute('PRAGMA busy_timeout = 5000') + self.connection_pool[task_id] = conn + + yield self.connection_pool[task_id] + + except Exception as e: + self.logger.error( + message="Connection error: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + raise + finally: + async with self.pool_lock: + if task_id in self.connection_pool: + await self.connection_pool[task_id].close() + del self.connection_pool[task_id] + 
self.connection_semaphore.release() async def execute_with_retry(self, operation, *args): @@ -124,13 +147,21 @@ class AsyncDatabaseManager: return result except Exception as e: if attempt == self.max_retries - 1: - logger.error(f"Operation failed after {self.max_retries} attempts: {e}") + self.logger.error( + message="Operation failed after {retries} attempts: {error}", + tag="ERROR", + force_verbose=True, + params={ + "retries": self.max_retries, + "error": str(e) + } + ) raise await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff async def ainit_db(self): """Initialize database schema""" - async def _init(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: await db.execute(''' CREATE TABLE IF NOT EXISTS crawled_data ( url TEXT PRIMARY KEY, @@ -147,36 +178,37 @@ class AsyncDatabaseManager: downloaded_files TEXT DEFAULT "{}" -- New column added ) ''') - - await self.execute_with_retry(_init) + await db.commit() + async def update_db_schema(self): """Update database schema if needed""" - async def _check_columns(db): + async with aiosqlite.connect(self.db_path, timeout=30.0) as db: cursor = await db.execute("PRAGMA table_info(crawled_data)") columns = await cursor.fetchall() - return [column[1] for column in columns] + column_names = [column[1] for column in columns] + + # List of new columns to add + new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] + + for column in new_columns: + if column not in column_names: + await self.aalter_db_add_column(column, db) + await db.commit() - column_names = await self.execute_with_retry(_check_columns) - - # List of new columns to add - new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files'] - - for column in new_columns: - if column not in column_names: - await self.aalter_db_add_column(column) - - async def aalter_db_add_column(self, new_column: str): + async def aalter_db_add_column(self, new_column: str, db): 
"""Add new column to the database""" - async def _alter(db): - if new_column == 'response_headers': - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') - else: - await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') - logger.info(f"Added column '{new_column}' to the database.") + if new_column == 'response_headers': + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') + else: + await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') + self.logger.info( + message="Added column '{column}' to the database", + tag="INIT", + params={"column": new_column} + ) - await self.execute_with_retry(_alter) async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: """Retrieve cached URL data as CrawlResult""" @@ -235,7 +267,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_get) except Exception as e: - logger.error(f"Error retrieving cached URL: {e}") + self.logger.error( + message="Error retrieving cached URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return None async def acache_url(self, result: CrawlResult): @@ -291,7 +328,13 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_cache) except Exception as e: - logger.error(f"Error caching URL: {e}") + self.logger.error( + message="Error caching URL: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) + async def aget_total_count(self) -> int: """Get total number of cached URLs""" @@ -303,7 +346,12 @@ class AsyncDatabaseManager: try: return await self.execute_with_retry(_count) except Exception as e: - logger.error(f"Error getting total count: {e}") + self.logger.error( + message="Error getting total count: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) return 0 async def aclear_db(self): @@ -314,7 +362,12 @@ class AsyncDatabaseManager: try: await 
self.execute_with_retry(_clear) except Exception as e: - logger.error(f"Error clearing database: {e}") + self.logger.error( + message="Error clearing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def aflush_db(self): """Drop the entire table""" @@ -324,7 +377,12 @@ class AsyncDatabaseManager: try: await self.execute_with_retry(_flush) except Exception as e: - logger.error(f"Error flushing database: {e}") + self.logger.error( + message="Error flushing database: {error}", + tag="ERROR", + force_verbose=True, + params={"error": str(e)} + ) async def _store_content(self, content: str, content_type: str) -> str: @@ -352,7 +410,12 @@ class AsyncDatabaseManager: async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: return await f.read() except: - logger.error(f"Failed to load content: {file_path}") + self.logger.error( + message="Failed to load content: {file_path}", + tag="ERROR", + force_verbose=True, + params={"file_path": file_path} + ) return None # Create a singleton instance diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py new file mode 100644 index 00000000..220edd11 --- /dev/null +++ b/crawl4ai/async_logger.py @@ -0,0 +1,231 @@ +from enum import Enum +from typing import Optional, Dict, Any, Union +from colorama import Fore, Back, Style, init +import time +import os +from datetime import datetime + +class LogLevel(Enum): + DEBUG = 1 + INFO = 2 + SUCCESS = 3 + WARNING = 4 + ERROR = 5 + +class AsyncLogger: + """ + Asynchronous logger with support for colored console output and file logging. + Supports templated messages with colored components. 
+ """ + + DEFAULT_ICONS = { + 'INIT': 'β†’', + 'READY': 'βœ“', + 'FETCH': '↓', + 'SCRAPE': 'β—†', + 'EXTRACT': 'β– ', + 'COMPLETE': '●', + 'ERROR': 'Γ—', + 'DEBUG': 'β‹―', + 'INFO': 'β„Ή', + 'WARNING': '⚠', + } + + DEFAULT_COLORS = { + LogLevel.DEBUG: Fore.LIGHTBLACK_EX, + LogLevel.INFO: Fore.CYAN, + LogLevel.SUCCESS: Fore.GREEN, + LogLevel.WARNING: Fore.YELLOW, + LogLevel.ERROR: Fore.RED, + } + + def __init__( + self, + log_file: Optional[str] = None, + log_level: LogLevel = LogLevel.INFO, + tag_width: int = 10, + icons: Optional[Dict[str, str]] = None, + colors: Optional[Dict[LogLevel, str]] = None, + verbose: bool = True + ): + """ + Initialize the logger. + + Args: + log_file: Optional file path for logging + log_level: Minimum log level to display + tag_width: Width for tag formatting + icons: Custom icons for different tags + colors: Custom colors for different log levels + verbose: Whether to output to console + """ + init() # Initialize colorama + self.log_file = log_file + self.log_level = log_level + self.tag_width = tag_width + self.icons = icons or self.DEFAULT_ICONS + self.colors = colors or self.DEFAULT_COLORS + self.verbose = verbose + + # Create log file directory if needed + if log_file: + os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True) + + def _format_tag(self, tag: str) -> str: + """Format a tag with consistent width.""" + return f"[{tag}]".ljust(self.tag_width, ".") + + def _get_icon(self, tag: str) -> str: + """Get the icon for a tag, defaulting to info icon if not found.""" + return self.icons.get(tag, self.icons['INFO']) + + def _write_to_file(self, message: str): + """Write a message to the log file if configured.""" + if self.log_file: + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + with open(self.log_file, 'a', encoding='utf-8') as f: + # Strip ANSI color codes for file output + clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '') + for color in vars(Fore).values(): + if 
isinstance(color, str): + clean_message = clean_message.replace(color, '') + f.write(f"[{timestamp}] {clean_message}\n") + + def _log( + self, + level: LogLevel, + message: str, + tag: str, + params: Optional[Dict[str, Any]] = None, + colors: Optional[Dict[str, str]] = None, + base_color: Optional[str] = None, + **kwargs + ): + """ + Core logging method that handles message formatting and output. + + Args: + level: Log level for this message + message: Message template string + tag: Tag for the message + params: Parameters to format into the message + colors: Color overrides for specific parameters + base_color: Base color for the entire message + """ + if level.value < self.log_level.value: + return + + # Format the message with parameters if provided + if params: + try: + # First format the message with raw parameters + formatted_message = message.format(**params) + + # Then apply colors if specified + if colors: + for key, color in colors.items(): + # Find the formatted value in the message and wrap it with color + if key in params: + value_str = str(params[key]) + formatted_message = formatted_message.replace( + value_str, + f"{color}{value_str}{Style.RESET_ALL}" + ) + + except KeyError as e: + formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template" + level = LogLevel.ERROR + else: + formatted_message = message + + # Construct the full log line + color = base_color or self.colors[level] + log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + + # Output to console if verbose + if self.verbose or kwargs.get("force_verbose", False): + print(log_line) + + # Write to file if configured + self._write_to_file(log_line) + + def debug(self, message: str, tag: str = "DEBUG", **kwargs): + """Log a debug message.""" + self._log(LogLevel.DEBUG, message, tag, **kwargs) + + def info(self, message: str, tag: str = "INFO", **kwargs): + """Log an info message.""" + self._log(LogLevel.INFO, message, tag, 
**kwargs) + + def success(self, message: str, tag: str = "SUCCESS", **kwargs): + """Log a success message.""" + self._log(LogLevel.SUCCESS, message, tag, **kwargs) + + def warning(self, message: str, tag: str = "WARNING", **kwargs): + """Log a warning message.""" + self._log(LogLevel.WARNING, message, tag, **kwargs) + + def error(self, message: str, tag: str = "ERROR", **kwargs): + """Log an error message.""" + self._log(LogLevel.ERROR, message, tag, **kwargs) + + def url_status( + self, + url: str, + success: bool, + timing: float, + tag: str = "FETCH", + url_length: int = 50 + ): + """ + Convenience method for logging URL fetch status. + + Args: + url: The URL being processed + success: Whether the operation was successful + timing: Time taken for the operation + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.SUCCESS if success else LogLevel.ERROR, + message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "status": success, + "timing": timing + }, + colors={ + "status": Fore.GREEN if success else Fore.RED, + "timing": Fore.YELLOW + } + ) + + def error_status( + self, + url: str, + error: str, + tag: str = "ERROR", + url_length: int = 50 + ): + """ + Convenience method for logging error status. + + Args: + url: The URL being processed + error: Error message + tag: Tag for the message + url_length: Maximum length for URL in log + """ + self._log( + level=LogLevel.ERROR, + message="{url:.{url_length}}... 
| Error: {error}", + tag=tag, + params={ + "url": url, + "url_length": url_length, + "error": error + } + ) \ No newline at end of file diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 79a17ac4..5fe7822c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -15,6 +15,7 @@ from .extraction_strategy import * from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .content_scrapping_strategy import WebScrapingStrategy +from .async_logger import AsyncLogger from .config import ( MIN_WORD_THRESHOLD, @@ -74,19 +75,29 @@ class AsyncWebCrawler: always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache """ - init() - self.log_width = 10 # Width of "[COMPLETE]" - self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") - self.log_icons = { - 'INIT': 'β†’', # Alternative: 'β–Ά' or 'β–Ί' - 'READY': 'βœ“', # Alternative: '√' - 'FETCH': '↓', # Alternative: 'β–Ό' - 'SCRAPE': 'β—†', # Alternative: '♦' - 'EXTRACT': 'β– ', # Alternative: 'β–‘' - 'COMPLETE': '●', # Alternative: 'β—‹' - 'ERROR': 'Γ—' - } - self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) + # init() + # self.log_width = 10 # Width of "[COMPLETE]" + # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") + # self.log_icons = { + # 'INIT': 'β†’', # Alternative: 'β–Ά' or 'β–Ί' + # 'READY': 'βœ“', # Alternative: '√' + # 'FETCH': '↓', # Alternative: 'β–Ό' + # 'SCRAPE': 'β—†', # Alternative: '♦' + # 'EXTRACT': 'β– ', # Alternative: 'β–‘' + # 'COMPLETE': '●', # Alternative: 'β—‹' + # 'ERROR': 'Γ—' + # } + self.verbose = kwargs.get("verbose", False) + self.logger = AsyncLogger( + log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), + verbose=self.verbose, + tag_width=10 + ) + + self.crawler_strategy = 
crawler_strategy or AsyncPlaywrightCrawlerStrategy( + logger = self.logger, + **kwargs + ) # Handle deprecated parameter if always_by_pass_cache is not None: @@ -118,12 +129,13 @@ class AsyncWebCrawler: async def awarmup(self): """Initialize the crawler with warm-up sequence.""" - if self.verbose: - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") + self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") + # if self.verbose: + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") self.ready = True - if self.verbose: - print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") + # if self.verbose: + # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") async def arun( self, @@ -234,8 +246,14 @@ class AsyncWebCrawler: screenshot_data = cached_result.screenshot if not screenshot_data: cached_result = None - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=time.perf_counter() - start_time, + tag="FETCH" + ) # Fetch fresh 
content if needed @@ -252,8 +270,14 @@ class AsyncWebCrawler: html = sanitize_input_encode(async_response.html) screenshot_data = async_response.screenshot t2 = time.perf_counter() - if verbose: - print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") + self.logger.url_status( + url=cache_context.display_url, + success=bool(html), + timing=t2 - t1, + tag="FETCH" + ) + # if verbose: + # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") # Process the HTML content crawl_result = await self.aprocess_html( @@ -287,9 +311,21 @@ class AsyncWebCrawler: crawl_result.success = bool(html) crawl_result.session_id = kwargs.get("session_id", None) - if verbose: - print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") - + # if verbose: + # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="{url:.50}... 
| Status: {status} | Total: {timing}", + tag="COMPLETE", + params={ + "url": cache_context.display_url, + "status": crawl_result.success, + "timing": f"{time.perf_counter() - start_time:.2f}s" + }, + colors={ + "status": Fore.GREEN if crawl_result.success else Fore.RED, + "timing": Fore.YELLOW + } + ) # Update cache if appropriate if cache_context.should_write() and not bool(cached_result): @@ -300,7 +336,12 @@ class AsyncWebCrawler: except Exception as e: if not hasattr(e, "msg"): e.msg = str(e) - print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") + self.logger.error_status( + url=cache_context.display_url, + error=e.msg, + tag="ERROR" + ) return CrawlResult( url=url, html="", @@ -362,7 +403,12 @@ class AsyncWebCrawler: domain = urlparse(url).netloc current_time = time.time() - print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") + self.logger.debug( + message="Started task for {url:.50}...", + tag="PARALLEL", + params={"url": url} + ) # Get delay settings from kwargs or use defaults mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay @@ -394,12 +440,26 @@ class AsyncWebCrawler: ) # Print start message - print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") + self.logger.info( + message="Starting concurrent crawling for {count} URLs...", + tag="INIT", + 
params={"count": len(urls)} + ) start_time = time.perf_counter() tasks = [crawl_with_semaphore(url) for url in urls] results = await asyncio.gather(*tasks, return_exceptions=True) end_time = time.perf_counter() - print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") + self.logger.success( + message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL, + tag="COMPLETE", + params={ + "count": len(urls), + "timing": f"{end_time - start_time:.2f}s" + }, + colors={"timing": Fore.YELLOW} + ) return [result if not isinstance(result, Exception) else str(result) for result in results] @@ -451,9 +511,16 @@ class AsyncWebCrawler: links = result.get("links", []) metadata = result.get("metadata", {}) - if verbose: - print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") - + # if verbose: + # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") + self.logger.info( + message="Processed {url:.50}... 
| Time: {timing}ms", + tag="SCRAPE", + params={ + "url": _url, + "timing": int((time.perf_counter() - t1) * 1000) + } + ) if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): @@ -467,8 +534,17 @@ class AsyncWebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) - if verbose: - print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + # if verbose: + # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") + self.logger.info( + message="Completed for {url:.50}... | Time: {timing}s", + tag="EXTRACT", + params={ + "url": _url, + "timing": time.perf_counter() - t1 + } + ) + diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 88375da9..88216f7f 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag from .utils import clean_tokens from abc import ABC, abstractmethod +from snowballstemmer import stemmer + +# from nltk.stem import PorterStemmer +# ps = PorterStemmer() class RelevantContentFilter(ABC): def __init__(self, user_query: str = None): self.user_query = user_query @@ -252,7 +256,7 @@ class RelevantContentFilter(ABC): return str(tag) # Fallback to original if anything fails class BM25ContentFilter(RelevantContentFilter): - def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): + def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'): 
super().__init__(user_query=user_query) self.bm25_threshold = bm25_threshold self.priority_tags = { @@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter): 'pre': 1.5, 'th': 1.5, # Table headers } + self.stemmer = stemmer(language) def filter_content(self, html: str) -> List[str]: """Implements content filtering using BM25 algorithm with priority tag handling""" @@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter): if not candidates: return [] - # Split into priority and regular candidates - priority_candidates = [] - regular_candidates = [] + # Tokenize corpus + # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates] + # tokenized_query = query.lower().split() + + # tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()] + # for _, chunk, _, _ in candidates] + # tokenized_query = [ps.stem(word) for word in query.lower().split()] - for index, chunk, tag_type, tag in candidates: - if tag.name in self.priority_tags: - priority_candidates.append((index, chunk, tag_type, tag)) - else: - regular_candidates.append((index, chunk, tag_type, tag)) + tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()] + for _, chunk, _, _ in candidates] + tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()] - # Process regular content with BM25 - tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates] - tokenized_query = query.lower().split() - # Clean from stop words and noise tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_query = clean_tokens(tokenized_query) - + bm25 = BM25Okapi(tokenized_corpus) scores = bm25.get_scores(tokenized_query) - # Score and boost regular candidates - scored_candidates = [ - (score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) - for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) + # Adjust scores with tag weights + 
adjusted_candidates = [] + for score, (index, chunk, tag_type, tag) in zip(scores, candidates): + tag_weight = self.priority_tags.get(tag.name, 1.0) + adjusted_score = score * tag_weight + adjusted_candidates.append((adjusted_score, index, chunk, tag)) + + # Filter candidates by threshold + selected_candidates = [ + (index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates + if adjusted_score >= self.bm25_threshold ] - scored_candidates.sort(key=lambda x: x[0], reverse=True) - - # Process scored candidates - selected_tags = set() - selected_candidates = [] - - # First add all priority candidates - for index, chunk, tag_type, tag in priority_candidates: - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) - - # Then add scored regular candidates that meet threshold - for score, index, chunk, tag_type, tag in scored_candidates: - if score < self.bm25_threshold: - continue - tag_id = id(tag) - if tag_id not in selected_tags: - selected_candidates.append((index, chunk, tag)) - selected_tags.add(tag_id) if not selected_candidates: return [] - # Sort by original document order + # Sort selected candidates by original document order selected_candidates.sort(key=lambda x: x[0]) - return [self.clean_element(tag) for _, _, tag in selected_candidates] + return [self.clean_element(tag) for _, _, tag in selected_candidates] diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index d16b0680..0f470671 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC): pass class WebScrapingStrategy(ContentScrapingStrategy): + def __init__(self, logger=None): + self.logger = logger + + def _log(self, level, message, tag="SCRAPE", **kwargs): + """Helper method to safely use logger.""" + if self.logger: + log_method = getattr(self.logger, level) + 
log_method(message=message, tag=tag, **kwargs) + def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) @@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: meta = extract_metadata("", soup) except Exception as e: - print('Error extracting metadata:', str(e)) + self._log('error', + message="Error extracting metadata: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) + # print('Error extracting metadata:', str(e)) meta = {} @@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) except Exception as e: - print('Error removing unwanted attributes:', str(e)) - - + # print('Error removing unwanted attributes:', str(e)) + self._log('error', + message="Error removing unwanted attributes: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) # Process children for child in list(element.children): if isinstance(child, NavigableString) and not isinstance(child, Comment): @@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy): return keep_element except Exception as e: - print('Error processing element:', str(e)) + # print('Error processing element:', str(e)) + self._log('error', + message="Error processing element: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) return False process_element(body) @@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = body.encode_contents().decode('utf-8') print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") - + self._log('error', + message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. 
Check the markdown for further details.", + tag="SCRAPE" + ) cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') @@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy): h.update_params(**kwargs.get('html2text', {})) markdown = h.handle(cleaned_html) except Exception as e: + if not h: + h = CustomHTML2Text() + self._log('error', + message="Error converting HTML to markdown: {error}", + tag="SCRAPE", + params={"error": str(e)} + ) markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') diff --git a/crawl4ai/version_manager.py b/crawl4ai/version_manager.py index 07e0c0e9..8ae2de2e 100644 --- a/crawl4ai/version_manager.py +++ b/crawl4ai/version_manager.py @@ -20,11 +20,11 @@ class VersionManager: def update_version(self): """Update the version file to current library version""" - self.version_file.write_text(__version__) + self.version_file.write_text(__version__.__version__) def needs_update(self): """Check if database needs update based on version""" installed = self.get_installed_version() - current = version.parse(__version__) + current = version.parse(__version__.__version__) return installed is None or installed < current diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml new file mode 100644 index 00000000..9bcfa982 --- /dev/null +++ b/docker-compose.hub.yml @@ -0,0 +1,27 @@ +services: + crawl4ai: + image: unclecode/crawl4ai:basic # Pull image from Docker Hub + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + 
healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml new file mode 100644 index 00000000..7dc41b47 --- /dev/null +++ b/docker-compose.local.yml @@ -0,0 +1,33 @@ +services: + crawl4ai: + build: + context: . + dockerfile: Dockerfile + args: + PYTHON_VERSION: 3.10 + INSTALL_TYPE: all + ENABLE_GPU: false + ports: + - "11235:11235" # FastAPI server + - "8000:8000" # Alternative port + - "9222:9222" # Browser debugging + - "8080:8080" # Additional port + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key + volumes: + - /dev/shm:/dev/shm # Shared memory for browser operations + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index ef0dc9e4..1097ef11 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: crawl4ai: build: @@ -9,15 +7,18 @@ services: PYTHON_VERSION: 3.10 INSTALL_TYPE: all ENABLE_GPU: false + profiles: ["local"] ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations + - /dev/shm:/dev/shm deploy: resources: limits: 
@@ -30,4 +31,32 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s \ No newline at end of file + start_period: 40s + + crawl4ai-hub: + image: unclecode/crawl4ai:basic + profiles: ["hub"] + ports: + - "11235:11235" + - "8000:8000" + - "9222:9222" + - "8080:8080" + environment: + - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} + - OPENAI_API_KEY=${OPENAI_API_KEY:-} + - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} + volumes: + - /dev/shm:/dev/shm + deploy: + resources: + limits: + memory: 4G + reservations: + memory: 1G + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:11235/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s diff --git a/docs/examples/v0.3.74.overview.py b/docs/examples/v0.3.74.overview.py index ec3a7d73..00296740 100644 --- a/docs/examples/v0.3.74.overview.py +++ b/docs/examples/v0.3.74.overview.py @@ -1,9 +1,16 @@ +import os, sys +# append the parent directory to the sys.path +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) +parent_parent_dir = os.path.dirname(parent_dir) +sys.path.append(parent_parent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +__data__ = os.path.join(__location__, "__data") import asyncio -import os from pathlib import Path import aiohttp import json -from crawl4ai import AsyncWebCrawler +from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai.content_filter_strategy import BM25ContentFilter # 1. 
File Download Processing Example @@ -32,7 +39,8 @@ async def download_example(): console.log('No .exe download link found'); } """, - wait_for=5 # Wait 5 seconds to ensure download starts + delay_before_return_html=1, # Wait 5 seconds to ensure download starts + cache_mode=CacheMode.BYPASS ) if result.downloaded_files: @@ -50,22 +58,32 @@ async def content_filtering_example(): async with AsyncWebCrawler(verbose=True) as crawler: # Create filter with custom query for OpenAI's blog content_filter = BM25ContentFilter( - user_query="AI language models research innovation", + # user_query="Investment and fundraising", + # user_query="Robotic", bm25_threshold=1.0 ) result = await crawler.arun( - url="https://openai.com/blog", - content_filter=content_filter + url="https://techcrunch.com/", + content_filter=content_filter, + cache_mode=CacheMode.BYPASS ) - print(f"Filtered content: {result.extracted_content}") + print(f"Filtered content: {len(result.fit_markdown)}") + print(f"Filtered content: {result.fit_markdown}") + + # Save html + with open(os.path.join(__data__, "techcrunch.html"), "w") as f: + f.write(result.fit_html) + + with open(os.path.join(__data__, "filtered_content.md"), "w") as f: + f.write(result.fit_markdown) # 3. 
Local File and Raw HTML Processing Example async def local_and_raw_html_example(): """Example of processing local files and raw HTML""" # Create a sample HTML file - sample_file = "sample.html" + sample_file = os.path.join(__data__, "sample.html") with open(sample_file, "w") as f: f.write(""" @@ -112,21 +130,18 @@ async def browser_management_example(): headless=False, verbose=True ) as crawler: + + result = await crawler.arun( + url="https://crawl4ai.com", + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS + ) # Use GitHub as an example - it's a good test for browser management # because it requires proper browser handling result = await crawler.arun( url="https://github.com/trending", - session_id="persistent_session_1", - js_code=""" - // Custom JavaScript to execute on GitHub's trending page - const repos = document.querySelectorAll('article.Box-row'); - const data = Array.from(repos).map(repo => ({ - name: repo.querySelector('h2')?.textContent?.trim(), - description: repo.querySelector('p')?.textContent?.trim(), - language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim() - })); - console.log('Trending repositories:', JSON.stringify(data, null, 2)); - """ + # session_id="persistent_session_1", + cache_mode=CacheMode.BYPASS ) print("\nBrowser session result:", result.success) @@ -136,6 +151,8 @@ async def browser_management_example(): # 5. 
API Usage Example async def api_example(): """Example of using the new API endpoints""" + api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code" + headers = {'Authorization': f'Bearer {api_token}'} async with aiohttp.ClientSession() as session: # Submit crawl job crawl_request = { @@ -143,52 +160,78 @@ async def api_example(): "extraction_config": { "type": "json_css", "params": { - "selectors": { - "titles": ".title a", - "scores": ".score", - "comments": ".comment-tree" + "schema": { + "name": "Hacker News Articles", + "baseSelector": ".athing", + "fields": [ + { + "name": "title", + "selector": ".title a", + "type": "text" + }, + { + "name": "score", + "selector": ".score", + "type": "text" + }, + { + "name": "url", + "selector": ".title a", + "type": "attribute", + "attribute": "href" + } + ] } } }, "crawler_params": { "headless": True, - "use_managed_browser": True + # "use_managed_browser": True }, - "screenshot": True, - "magic": True + "cache_mode": "bypass", + # "screenshot": True, + # "magic": True } async with session.post( "http://localhost:11235/crawl", - json=crawl_request + json=crawl_request, + headers=headers ) as response: task_data = await response.json() task_id = task_data["task_id"] # Check task status - async with session.get( - f"http://localhost:11235/task/{task_id}" - ) as status_response: - result = await status_response.json() - print(f"Task result: {result}") + while True: + async with session.get( + f"http://localhost:11235/task/{task_id}", + headers=headers + ) as status_response: + result = await status_response.json() + print(f"Task result: {result}") + + if result["status"] == "completed": + break + else: + await asyncio.sleep(1) # Main execution async def main(): - print("Running Crawl4AI feature examples...") + # print("Running Crawl4AI feature examples...") - print("\n1. Running Download Example:") + # print("\n1. Running Download Example:") await download_example() - print("\n2. 
Running Content Filtering Example:") + # print("\n2. Running Content Filtering Example:") await content_filtering_example() - print("\n3. Running Local and Raw HTML Example:") + # print("\n3. Running Local and Raw HTML Example:") await local_and_raw_html_example() - print("\n4. Running Browser Management Example:") + # print("\n4. Running Browser Management Example:") await browser_management_example() - print("\n5. Running API Example:") + # print("\n5. Running API Example:") await api_example() if __name__ == "__main__": diff --git a/docs/md_v2/basic/docker-deploymeny.md b/docs/md_v2/basic/docker-deploymeny.md index 30555708..87e468aa 100644 --- a/docs/md_v2/basic/docker-deploymeny.md +++ b/docs/md_v2/basic/docker-deploymeny.md @@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic ``` +## Running with Docker Compose 🐳 + +### Use Docker Compose (From Local Dockerfile or Docker Hub) + +Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub. + +### **Option 1: Using Docker Compose to Build Locally** +If you want to build the image locally, use the provided `docker-compose.local.yml` file. + +```bash +docker-compose -f docker-compose.local.yml up -d +``` + +This will: +1. Build the Docker image from the provided `Dockerfile`. +2. Start the container and expose it on `http://localhost:11235`. + +--- + +### **Option 2: Using Docker Compose with Pre-Built Image from Hub** +If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file. + +```bash +docker-compose -f docker-compose.hub.yml up -d +``` + +This will: +1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration). +2. Start the container and expose it on `http://localhost:11235`. 
+
+---
+
+### **Stopping the Running Services**
+
+To stop the services started via Docker Compose, you can use:
+
+```bash
+docker-compose -f docker-compose.local.yml down
+# OR
+docker-compose -f docker-compose.hub.yml down
+```
+
+If the containers don’t stop and the application is still running, check the running containers:
+
+```bash
+docker ps
+```
+
+Find the `CONTAINER ID` of the running service and stop it forcefully:
+
+```bash
+docker stop <CONTAINER_ID>
+```
+
+---
+
+### **Debugging with Docker Compose**
+
+- **Check Logs**: To view the container logs:
+  ```bash
+  docker-compose -f docker-compose.local.yml logs -f
+  ```
+
+- **Remove Orphaned Containers**: If the service is still running unexpectedly:
+  ```bash
+  docker-compose -f docker-compose.local.yml down --remove-orphans
+  ```
+
+- **Manually Remove Network**: If the network is still in use:
+  ```bash
+  docker network ls
+  docker network rm crawl4ai_default
+  ```
+
+---
+
+### Why Use Docker Compose?
+
+Docker Compose is the recommended way to deploy Crawl4AI because:
+1. It simplifies multi-container setups.
+2. Allows you to define environment variables, resources, and ports in a single file.
+3. Makes it easier to switch between local development and production-ready images.
+
+For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
+ + + + ## API Security πŸ”’ ### Understanding CRAWL4AI_API_TOKEN diff --git a/main.py b/main.py index ee5f7fc6..6d217410 100644 --- a/main.py +++ b/main.py @@ -26,6 +26,7 @@ from enum import Enum from dataclasses import dataclass import json from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode +from crawl4ai.config import MIN_WORD_THRESHOLD from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, CosineStrategy, @@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel): type: CrawlerType params: Dict[str, Any] = {} +class ChunkingStrategy(BaseModel): + type: str + params: Dict[str, Any] = {} + +class ContentFilter(BaseModel): + type: str = "bm25" + params: Dict[str, Any] = {} + class CrawlRequest(BaseModel): urls: Union[HttpUrl, List[HttpUrl]] + word_count_threshold: int = MIN_WORD_THRESHOLD extraction_config: Optional[ExtractionConfig] = None - crawler_params: Dict[str, Any] = {} - priority: int = Field(default=5, ge=1, le=10) - ttl: Optional[int] = 3600 + chunking_strategy: Optional[ChunkingStrategy] = None + content_filter: Optional[ContentFilter] = None js_code: Optional[List[str]] = None wait_for: Optional[str] = None css_selector: Optional[str] = None @@ -66,7 +75,10 @@ class CrawlRequest(BaseModel): magic: bool = False extra: Optional[Dict[str, Any]] = {} session_id: Optional[str] = None - cache_mode: Optional[CacheMode] = None + cache_mode: Optional[CacheMode] = CacheMode.ENABLED + priority: int = Field(default=5, ge=1, le=10) + ttl: Optional[int] = 3600 + crawler_params: Dict[str, Any] = {} @dataclass class TaskInfo: @@ -280,6 +292,7 @@ class CrawlerService: if isinstance(request.urls, list): results = await crawler.arun_many( urls=[str(url) for url in request.urls], + word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=extraction_strategy, js_code=request.js_code, wait_for=request.wait_for, @@ -287,6 +300,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + 
cache_mode=request.cache_mode, **request.extra, ) else: @@ -299,6 +313,7 @@ class CrawlerService: screenshot=request.screenshot, magic=request.magic, session_id=request.session_id, + cache_mode=request.cache_mode, **request.extra, ) diff --git a/requirements.txt b/requirements.txt index e6294cc5..ed259ac9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0 xxhash~=3.4 rank-bm25~=0.2 aiofiles~=24.0 -colorama~=0.4 \ No newline at end of file +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file