feat(docker): add Docker Compose configurations for local and hub deployment; enhance GPU support checks in Dockerfile

feat(requirements): update requirements.txt to include snowballstemmer
fix(version_manager): correct version parsing to use __version__.__version__
feat(main): introduce chunking strategy and content filter in CrawlRequest model
feat(content_filter): enhance BM25 algorithm with priority tag scoring for improved content relevance
feat(logger): implement new async logger engine replacing print statements throughout library
fix(database): resolve version-related deadlock and circular lock issues in database operations
docs(docker): expand Docker deployment documentation with usage instructions for Docker Compose
This commit is contained in:
UncleCode
2024-11-18 21:00:06 +08:00
parent 152ac35bc2
commit 852729ff38
15 changed files with 952 additions and 232 deletions

View File

@@ -62,11 +62,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libatspi2.0-0 \ libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# GPU support if enabled # GPU support if enabled and architecture is supported
RUN if [ "$ENABLE_GPU" = "true" ] ; then \ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \ nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* ; \ && rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \
fi fi
# Create and set working directory # Create and set working directory

View File

@@ -35,13 +35,15 @@ stealth_config = StealthConfig(
class ManagedBrowser: class ManagedBrowser:
def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False): def __init__(self, browser_type: str = "chromium", user_data_dir: Optional[str] = None, headless: bool = False, logger = None):
self.browser_type = browser_type self.browser_type = browser_type
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
self.headless = headless self.headless = headless
self.browser_process = None self.browser_process = None
self.temp_dir = None self.temp_dir = None
self.debugging_port = 9222 self.debugging_port = 9222
self.logger = logger
self.shutting_down = False
async def start(self) -> str: async def start(self) -> str:
""" """
@@ -76,15 +78,38 @@ class ManagedBrowser:
async def _monitor_browser_process(self): async def _monitor_browser_process(self):
"""Monitor the browser process for unexpected termination.""" """Monitor the browser process for unexpected termination."""
if self.browser_process: if self.browser_process:
stdout, stderr = await asyncio.gather( try:
asyncio.to_thread(self.browser_process.stdout.read), stdout, stderr = await asyncio.gather(
asyncio.to_thread(self.browser_process.stderr.read) asyncio.to_thread(self.browser_process.stdout.read),
) asyncio.to_thread(self.browser_process.stderr.read)
if self.browser_process.poll() is not None: )
print(f"Browser process terminated unexpectedly with code {self.browser_process.returncode}")
print(f"STDOUT: {stdout.decode()}") # Check shutting_down flag BEFORE logging anything
print(f"STDERR: {stderr.decode()}") if self.browser_process.poll() is not None:
await self.cleanup() if not self.shutting_down:
self.logger.error(
message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
tag="ERROR",
params={
"code": self.browser_process.returncode,
"stdout": stdout.decode(),
"stderr": stderr.decode()
}
)
await self.cleanup()
else:
self.logger.info(
message="Browser process terminated normally | Code: {code}",
tag="INFO",
params={"code": self.browser_process.returncode}
)
except Exception as e:
if not self.shutting_down:
self.logger.error(
message="Error monitoring browser process: {error}",
tag="ERROR",
params={"error": str(e)}
)
def _get_browser_path(self) -> str: def _get_browser_path(self) -> str:
"""Returns the browser executable path based on OS and browser type""" """Returns the browser executable path based on OS and browser type"""
@@ -134,20 +159,39 @@ class ManagedBrowser:
async def cleanup(self): async def cleanup(self):
"""Cleanup browser process and temporary directory""" """Cleanup browser process and temporary directory"""
# Set shutting_down flag BEFORE any termination actions
self.shutting_down = True
if self.browser_process: if self.browser_process:
try: try:
self.browser_process.terminate() self.browser_process.terminate()
await asyncio.sleep(1) # Wait for process to end gracefully
for _ in range(10): # 10 attempts, 100ms each
if self.browser_process.poll() is not None:
break
await asyncio.sleep(0.1)
# Force kill if still running
if self.browser_process.poll() is None: if self.browser_process.poll() is None:
self.browser_process.kill() self.browser_process.kill()
await asyncio.sleep(0.1) # Brief wait for kill to take effect
except Exception as e: except Exception as e:
print(f"Error terminating browser: {e}") self.logger.error(
message="Error terminating browser: {error}",
tag="ERROR",
params={"error": str(e)}
)
if self.temp_dir and os.path.exists(self.temp_dir): if self.temp_dir and os.path.exists(self.temp_dir):
try: try:
shutil.rmtree(self.temp_dir) shutil.rmtree(self.temp_dir)
except Exception as e: except Exception as e:
print(f"Error removing temporary directory: {e}") self.logger.error(
message="Error removing temporary directory: {error}",
tag="ERROR",
params={"error": str(e)}
)
class AsyncCrawlerStrategy(ABC): class AsyncCrawlerStrategy(ABC):
@@ -172,7 +216,8 @@ class AsyncCrawlerStrategy(ABC):
pass pass
class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
def __init__(self, use_cached_html=False, js_code=None, **kwargs): def __init__(self, use_cached_html=False, js_code=None, logger = None, **kwargs):
self.logger = logger
self.use_cached_html = use_cached_html self.use_cached_html = use_cached_html
self.user_agent = kwargs.get( self.user_agent = kwargs.get(
"user_agent", "user_agent",
@@ -231,7 +276,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.managed_browser = ManagedBrowser( self.managed_browser = ManagedBrowser(
browser_type=self.browser_type, browser_type=self.browser_type,
user_data_dir=self.user_data_dir, user_data_dir=self.user_data_dir,
headless=self.headless headless=self.headless,
logger=self.logger
) )
cdp_url = await self.managed_browser.start() cdp_url = await self.managed_browser.start()
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
@@ -282,6 +328,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Add extra args if provided # Add extra args if provided
if self.extra_args: if self.extra_args:
browser_args["args"].extend(self.extra_args) browser_args["args"].extend(self.extra_args)
# Add downloads path if downloads are enabled
if self.accept_downloads:
browser_args["downloads_path"] = self.downloads_path
# Add proxy settings if a proxy is specified # Add proxy settings if a proxy is specified
if self.proxy: if self.proxy:
@@ -344,6 +394,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.browser = None self.browser = None
if self.managed_browser: if self.managed_browser:
await asyncio.sleep(0.5)
await self.managed_browser.cleanup() await self.managed_browser.cleanup()
self.managed_browser = None self.managed_browser = None
@@ -491,9 +542,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}} }}
""") """)
else: else:
print(f"Warning: Could not access content frame for iframe {i}") # print(f"Warning: Could not access content frame for iframe {i}")
self.logger.warning(
message="Could not access content frame for iframe {index}",
tag="SCRAPE",
params={"index": i}
)
except Exception as e: except Exception as e:
print(f"Error processing iframe {i}: {str(e)}") self.logger.error(
message="Error processing iframe {index}: {error}",
tag="ERROR",
params={"index": i, "error": str(e)}
)
# print(f"Error processing iframe {i}: {str(e)}")
# Return the page object # Return the page object
return page return page
@@ -620,7 +681,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
context = await self.browser.new_context( context = await self.browser.new_context(
user_agent=self.user_agent, user_agent=self.user_agent,
viewport={"width": 1920, "height": 1080}, viewport={"width": 1920, "height": 1080},
proxy={"server": self.proxy} if self.proxy else None proxy={"server": self.proxy} if self.proxy else None,
accept_downloads=self.accept_downloads,
) )
await context.set_extra_http_headers(self.headers) await context.set_extra_http_headers(self.headers)
@@ -917,17 +979,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
suggested_filename = download.suggested_filename suggested_filename = download.suggested_filename
download_path = os.path.join(self.downloads_path, suggested_filename) download_path = os.path.join(self.downloads_path, suggested_filename)
if self.verbose: self.logger.info(
print(f"[LOG] 📥 Downloading {suggested_filename} to {download_path}") message="Downloading {filename} to {path}",
tag="FETCH",
params={"filename": suggested_filename, "path": download_path}
)
start_time = time.perf_counter()
await download.save_as(download_path) await download.save_as(download_path)
end_time = time.perf_counter()
self._downloaded_files.append(download_path) self._downloaded_files.append(download_path)
if self.verbose: self.logger.success(
print(f"[LOG] ✅ Downloaded {suggested_filename} successfully") message="Downloaded {filename} successfully",
tag="COMPLETE",
params={"filename": suggested_filename, "path": download_path, "duration": f"{end_time - start_time:.2f}s"}
)
except Exception as e: except Exception as e:
if self.verbose: self.logger.error(
print(f"[ERROR] Failed to handle download: {str(e)}") message="Failed to handle download: {error}",
tag="ERROR",
params={"error": str(e)}
)
# if self.verbose:
# print(f"[ERROR] Failed to handle download: {str(e)}")
async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
@@ -1070,8 +1146,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await page.evaluate(remove_overlays_js) await page.evaluate(remove_overlays_js)
await page.wait_for_timeout(500) # Wait for any animations to complete await page.wait_for_timeout(500) # Wait for any animations to complete
except Exception as e: except Exception as e:
if self.verbose: self.logger.warning(
print(f"Warning: Failed to remove overlay elements: {str(e)}") message="Failed to remove overlay elements: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
# if self.verbose:
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
async def take_screenshot(self, page: Page) -> str: async def take_screenshot(self, page: Page) -> str:
""" """
@@ -1089,7 +1170,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return base64.b64encode(screenshot).decode('utf-8') return base64.b64encode(screenshot).decode('utf-8')
except Exception as e: except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}" error_message = f"Failed to take screenshot: {str(e)}"
print(error_message) self.logger.error(
message="Screenshot failed: {error}",
tag="ERROR",
params={"error": error_message}
)
# Generate an error image # Generate an error image
img = Image.new('RGB', (800, 600), color='black') img = Image.new('RGB', (800, 600), color='black')
@@ -1123,7 +1209,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return base64.b64encode(screenshot).decode('utf-8') return base64.b64encode(screenshot).decode('utf-8')
except Exception as e: except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}" error_message = f"Failed to take screenshot: {str(e)}"
print(error_message) # print(error_message)
self.logger.error(
message="Screenshot failed: {error}",
tag="ERROR",
params={"error": error_message}
)
# Generate an error image # Generate an error image
img = Image.new('RGB', (800, 600), color='black') img = Image.new('RGB', (800, 600), color='black')

View File

@@ -12,10 +12,12 @@ import xxhash
import aiofiles import aiofiles
from .config import NEED_MIGRATION from .config import NEED_MIGRATION
from .version_manager import VersionManager from .version_manager import VersionManager
from .async_logger import AsyncLogger
# Set up logging # Set up logging
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
base_directory = Path.home()
DB_PATH = os.path.join(Path.home(), ".crawl4ai") DB_PATH = os.path.join(Path.home(), ".crawl4ai")
os.makedirs(DB_PATH, exist_ok=True) os.makedirs(DB_PATH, exist_ok=True)
DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
@@ -28,15 +30,21 @@ class AsyncDatabaseManager:
self.max_retries = max_retries self.max_retries = max_retries
self.connection_pool: Dict[int, aiosqlite.Connection] = {} self.connection_pool: Dict[int, aiosqlite.Connection] = {}
self.pool_lock = asyncio.Lock() self.pool_lock = asyncio.Lock()
self.init_lock = asyncio.Lock()
self.connection_semaphore = asyncio.Semaphore(pool_size) self.connection_semaphore = asyncio.Semaphore(pool_size)
self._initialized = False self._initialized = False
self.version_manager = VersionManager() self.version_manager = VersionManager()
self.logger = AsyncLogger(
log_file=os.path.join(base_directory, ".crawl4ai", "crawler_db.log"),
verbose=False,
tag_width=10
)
async def initialize(self): async def initialize(self):
"""Initialize the database and connection pool""" """Initialize the database and connection pool"""
try: try:
logger.info("Initializing database...") self.logger.info("Initializing database", tag="INIT")
# Ensure the database file exists # Ensure the database file exists
os.makedirs(os.path.dirname(self.db_path), exist_ok=True) os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
@@ -47,31 +55,39 @@ class AsyncDatabaseManager:
await self.ainit_db() await self.ainit_db()
# Verify the table exists # Verify the table exists
async def verify_table(db): async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
async with db.execute( async with db.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'" "SELECT name FROM sqlite_master WHERE type='table' AND name='crawled_data'"
) as cursor: ) as cursor:
result = await cursor.fetchone() result = await cursor.fetchone()
if not result: if not result:
raise Exception("crawled_data table was not created") raise Exception("crawled_data table was not created")
await self.execute_with_retry(verify_table)
# If version changed or fresh install, run updates # If version changed or fresh install, run updates
if needs_update: if needs_update:
logger.info("New version detected, running updates...") self.logger.info("New version detected, running updates", tag="INIT")
await self.update_db_schema() await self.update_db_schema()
from .migrations import run_migration # Import here to avoid circular imports from .migrations import run_migration # Import here to avoid circular imports
await run_migration() await run_migration()
self.version_manager.update_version() # Update stored version after successful migration self.version_manager.update_version() # Update stored version after successful migration
logger.info("Version update completed successfully") self.logger.success("Version update completed successfully", tag="COMPLETE")
else: else:
logger.info("Database initialization completed successfully") self.logger.success("Database initialization completed successfully", tag="COMPLETE")
except Exception as e: except Exception as e:
logger.error(f"Database initialization error: {e}") self.logger.error(
logger.info("Database will be initialized on first use") message="Database initialization error: {error}",
tag="ERROR",
params={"error": str(e)}
)
self.logger.info(
message="Database will be initialized on first use",
tag="INIT"
)
raise raise
async def cleanup(self): async def cleanup(self):
"""Cleanup connections when shutting down""" """Cleanup connections when shutting down"""
@@ -84,34 +100,41 @@ class AsyncDatabaseManager:
async def get_connection(self): async def get_connection(self):
"""Connection pool manager""" """Connection pool manager"""
if not self._initialized: if not self._initialized:
async with self.pool_lock: # Prevent multiple simultaneous initializations # Use an asyncio.Lock to ensure only one initialization occurs
if not self._initialized: # Double-check after acquiring lock async with self.init_lock:
if not self._initialized:
await self.initialize() await self.initialize()
self._initialized = True self._initialized = True
async with self.connection_semaphore: await self.connection_semaphore.acquire()
task_id = id(asyncio.current_task()) task_id = id(asyncio.current_task())
try: try:
async with self.pool_lock: async with self.pool_lock:
if task_id not in self.connection_pool: if task_id not in self.connection_pool:
conn = await aiosqlite.connect( conn = await aiosqlite.connect(
self.db_path, self.db_path,
timeout=30.0 timeout=30.0
) )
await conn.execute('PRAGMA journal_mode = WAL') await conn.execute('PRAGMA journal_mode = WAL')
await conn.execute('PRAGMA busy_timeout = 5000') await conn.execute('PRAGMA busy_timeout = 5000')
self.connection_pool[task_id] = conn self.connection_pool[task_id] = conn
yield self.connection_pool[task_id] yield self.connection_pool[task_id]
except Exception as e: except Exception as e:
logger.error(f"Connection error: {e}") self.logger.error(
raise message="Connection error: {error}",
finally: tag="ERROR",
async with self.pool_lock: force_verbose=True,
if task_id in self.connection_pool: params={"error": str(e)}
await self.connection_pool[task_id].close() )
del self.connection_pool[task_id] raise
finally:
async with self.pool_lock:
if task_id in self.connection_pool:
await self.connection_pool[task_id].close()
del self.connection_pool[task_id]
self.connection_semaphore.release()
async def execute_with_retry(self, operation, *args): async def execute_with_retry(self, operation, *args):
@@ -124,13 +147,21 @@ class AsyncDatabaseManager:
return result return result
except Exception as e: except Exception as e:
if attempt == self.max_retries - 1: if attempt == self.max_retries - 1:
logger.error(f"Operation failed after {self.max_retries} attempts: {e}") self.logger.error(
message="Operation failed after {retries} attempts: {error}",
tag="ERROR",
force_verbose=True,
params={
"retries": self.max_retries,
"error": str(e)
}
)
raise raise
await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff await asyncio.sleep(1 * (attempt + 1)) # Exponential backoff
async def ainit_db(self): async def ainit_db(self):
"""Initialize database schema""" """Initialize database schema"""
async def _init(db): async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
await db.execute(''' await db.execute('''
CREATE TABLE IF NOT EXISTS crawled_data ( CREATE TABLE IF NOT EXISTS crawled_data (
url TEXT PRIMARY KEY, url TEXT PRIMARY KEY,
@@ -147,36 +178,37 @@ class AsyncDatabaseManager:
downloaded_files TEXT DEFAULT "{}" -- New column added downloaded_files TEXT DEFAULT "{}" -- New column added
) )
''') ''')
await db.commit()
await self.execute_with_retry(_init)
async def update_db_schema(self): async def update_db_schema(self):
"""Update database schema if needed""" """Update database schema if needed"""
async def _check_columns(db): async with aiosqlite.connect(self.db_path, timeout=30.0) as db:
cursor = await db.execute("PRAGMA table_info(crawled_data)") cursor = await db.execute("PRAGMA table_info(crawled_data)")
columns = await cursor.fetchall() columns = await cursor.fetchall()
return [column[1] for column in columns] column_names = [column[1] for column in columns]
# List of new columns to add
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column, db)
await db.commit()
column_names = await self.execute_with_retry(_check_columns) async def aalter_db_add_column(self, new_column: str, db):
# List of new columns to add
new_columns = ['media', 'links', 'metadata', 'screenshot', 'response_headers', 'downloaded_files']
for column in new_columns:
if column not in column_names:
await self.aalter_db_add_column(column)
async def aalter_db_add_column(self, new_column: str):
"""Add new column to the database""" """Add new column to the database"""
async def _alter(db): if new_column == 'response_headers':
if new_column == 'response_headers': await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"')
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT "{{}}"') else:
else: await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""')
await db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') self.logger.info(
logger.info(f"Added column '{new_column}' to the database.") message="Added column '{column}' to the database",
tag="INIT",
params={"column": new_column}
)
await self.execute_with_retry(_alter)
async def aget_cached_url(self, url: str) -> Optional[CrawlResult]: async def aget_cached_url(self, url: str) -> Optional[CrawlResult]:
"""Retrieve cached URL data as CrawlResult""" """Retrieve cached URL data as CrawlResult"""
@@ -235,7 +267,12 @@ class AsyncDatabaseManager:
try: try:
return await self.execute_with_retry(_get) return await self.execute_with_retry(_get)
except Exception as e: except Exception as e:
logger.error(f"Error retrieving cached URL: {e}") self.logger.error(
message="Error retrieving cached URL: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
return None return None
async def acache_url(self, result: CrawlResult): async def acache_url(self, result: CrawlResult):
@@ -291,7 +328,13 @@ class AsyncDatabaseManager:
try: try:
await self.execute_with_retry(_cache) await self.execute_with_retry(_cache)
except Exception as e: except Exception as e:
logger.error(f"Error caching URL: {e}") self.logger.error(
message="Error caching URL: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def aget_total_count(self) -> int: async def aget_total_count(self) -> int:
"""Get total number of cached URLs""" """Get total number of cached URLs"""
@@ -303,7 +346,12 @@ class AsyncDatabaseManager:
try: try:
return await self.execute_with_retry(_count) return await self.execute_with_retry(_count)
except Exception as e: except Exception as e:
logger.error(f"Error getting total count: {e}") self.logger.error(
message="Error getting total count: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
return 0 return 0
async def aclear_db(self): async def aclear_db(self):
@@ -314,7 +362,12 @@ class AsyncDatabaseManager:
try: try:
await self.execute_with_retry(_clear) await self.execute_with_retry(_clear)
except Exception as e: except Exception as e:
logger.error(f"Error clearing database: {e}") self.logger.error(
message="Error clearing database: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def aflush_db(self): async def aflush_db(self):
"""Drop the entire table""" """Drop the entire table"""
@@ -324,7 +377,12 @@ class AsyncDatabaseManager:
try: try:
await self.execute_with_retry(_flush) await self.execute_with_retry(_flush)
except Exception as e: except Exception as e:
logger.error(f"Error flushing database: {e}") self.logger.error(
message="Error flushing database: {error}",
tag="ERROR",
force_verbose=True,
params={"error": str(e)}
)
async def _store_content(self, content: str, content_type: str) -> str: async def _store_content(self, content: str, content_type: str) -> str:
@@ -352,7 +410,12 @@ class AsyncDatabaseManager:
async with aiofiles.open(file_path, 'r', encoding='utf-8') as f: async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
return await f.read() return await f.read()
except: except:
logger.error(f"Failed to load content: {file_path}") self.logger.error(
message="Failed to load content: {file_path}",
tag="ERROR",
force_verbose=True,
params={"file_path": file_path}
)
return None return None
# Create a singleton instance # Create a singleton instance

231
crawl4ai/async_logger.py Normal file
View File

@@ -0,0 +1,231 @@
from enum import Enum
from typing import Optional, Dict, Any, Union
from colorama import Fore, Back, Style, init
import time
import os
from datetime import datetime
class LogLevel(Enum):
DEBUG = 1
INFO = 2
SUCCESS = 3
WARNING = 4
ERROR = 5
class AsyncLogger:
"""
Asynchronous logger with support for colored console output and file logging.
Supports templated messages with colored components.
"""
DEFAULT_ICONS = {
'INIT': '',
'READY': '',
'FETCH': '',
'SCRAPE': '',
'EXTRACT': '',
'COMPLETE': '',
'ERROR': '×',
'DEBUG': '',
'INFO': '',
'WARNING': '',
}
DEFAULT_COLORS = {
LogLevel.DEBUG: Fore.LIGHTBLACK_EX,
LogLevel.INFO: Fore.CYAN,
LogLevel.SUCCESS: Fore.GREEN,
LogLevel.WARNING: Fore.YELLOW,
LogLevel.ERROR: Fore.RED,
}
def __init__(
self,
log_file: Optional[str] = None,
log_level: LogLevel = LogLevel.INFO,
tag_width: int = 10,
icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, str]] = None,
verbose: bool = True
):
"""
Initialize the logger.
Args:
log_file: Optional file path for logging
log_level: Minimum log level to display
tag_width: Width for tag formatting
icons: Custom icons for different tags
colors: Custom colors for different log levels
verbose: Whether to output to console
"""
init() # Initialize colorama
self.log_file = log_file
self.log_level = log_level
self.tag_width = tag_width
self.icons = icons or self.DEFAULT_ICONS
self.colors = colors or self.DEFAULT_COLORS
self.verbose = verbose
# Create log file directory if needed
if log_file:
os.makedirs(os.path.dirname(os.path.abspath(log_file)), exist_ok=True)
def _format_tag(self, tag: str) -> str:
"""Format a tag with consistent width."""
return f"[{tag}]".ljust(self.tag_width, ".")
def _get_icon(self, tag: str) -> str:
"""Get the icon for a tag, defaulting to info icon if not found."""
return self.icons.get(tag, self.icons['INFO'])
def _write_to_file(self, message: str):
"""Write a message to the log file if configured."""
if self.log_file:
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
with open(self.log_file, 'a', encoding='utf-8') as f:
# Strip ANSI color codes for file output
clean_message = message.replace(Fore.RESET, '').replace(Style.RESET_ALL, '')
for color in vars(Fore).values():
if isinstance(color, str):
clean_message = clean_message.replace(color, '')
f.write(f"[{timestamp}] {clean_message}\n")
def _log(
self,
level: LogLevel,
message: str,
tag: str,
params: Optional[Dict[str, Any]] = None,
colors: Optional[Dict[str, str]] = None,
base_color: Optional[str] = None,
**kwargs
):
"""
Core logging method that handles message formatting and output.
Args:
level: Log level for this message
message: Message template string
tag: Tag for the message
params: Parameters to format into the message
colors: Color overrides for specific parameters
base_color: Base color for the entire message
"""
if level.value < self.log_level.value:
return
# Format the message with parameters if provided
if params:
try:
# First format the message with raw parameters
formatted_message = message.format(**params)
# Then apply colors if specified
if colors:
for key, color in colors.items():
# Find the formatted value in the message and wrap it with color
if key in params:
value_str = str(params[key])
formatted_message = formatted_message.replace(
value_str,
f"{color}{value_str}{Style.RESET_ALL}"
)
except KeyError as e:
formatted_message = f"LOGGING ERROR: Missing parameter {e} in message template"
level = LogLevel.ERROR
else:
formatted_message = message
# Construct the full log line
color = base_color or self.colors[level]
log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}"
# Output to console if verbose
if self.verbose or kwargs.get("force_verbose", False):
print(log_line)
# Write to file if configured
self._write_to_file(log_line)
def debug(self, message: str, tag: str = "DEBUG", **kwargs):
"""Log a debug message."""
self._log(LogLevel.DEBUG, message, tag, **kwargs)
def info(self, message: str, tag: str = "INFO", **kwargs):
"""Log an info message."""
self._log(LogLevel.INFO, message, tag, **kwargs)
def success(self, message: str, tag: str = "SUCCESS", **kwargs):
"""Log a success message."""
self._log(LogLevel.SUCCESS, message, tag, **kwargs)
def warning(self, message: str, tag: str = "WARNING", **kwargs):
"""Log a warning message."""
self._log(LogLevel.WARNING, message, tag, **kwargs)
def error(self, message: str, tag: str = "ERROR", **kwargs):
"""Log an error message."""
self._log(LogLevel.ERROR, message, tag, **kwargs)
def url_status(
self,
url: str,
success: bool,
timing: float,
tag: str = "FETCH",
url_length: int = 50
):
"""
Convenience method for logging URL fetch status.
Args:
url: The URL being processed
success: Whether the operation was successful
timing: Time taken for the operation
tag: Tag for the message
url_length: Maximum length for URL in log
"""
self._log(
level=LogLevel.SUCCESS if success else LogLevel.ERROR,
message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s",
tag=tag,
params={
"url": url,
"url_length": url_length,
"status": success,
"timing": timing
},
colors={
"status": Fore.GREEN if success else Fore.RED,
"timing": Fore.YELLOW
}
)
def error_status(
self,
url: str,
error: str,
tag: str = "ERROR",
url_length: int = 50
):
"""
Convenience method for logging error status.
Args:
url: The URL being processed
error: Error message
tag: Tag for the message
url_length: Maximum length for URL in log
"""
self._log(
level=LogLevel.ERROR,
message="{url:.{url_length}}... | Error: {error}",
tag=tag,
params={
"url": url,
"url_length": url_length,
"error": error
}
)

View File

@@ -15,6 +15,7 @@ from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .content_scrapping_strategy import WebScrapingStrategy from .content_scrapping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .config import ( from .config import (
MIN_WORD_THRESHOLD, MIN_WORD_THRESHOLD,
@@ -74,19 +75,29 @@ class AsyncWebCrawler:
always_by_pass_cache: Deprecated, use always_bypass_cache instead always_by_pass_cache: Deprecated, use always_bypass_cache instead
base_directory: Base directory for storing cache base_directory: Base directory for storing cache
""" """
init() # init()
self.log_width = 10 # Width of "[COMPLETE]" # self.log_width = 10 # Width of "[COMPLETE]"
self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
self.log_icons = { # self.log_icons = {
'INIT': '', # Alternative: '▶' or '►' # 'INIT': '→', # Alternative: '▶' or '►'
'READY': '', # Alternative: '√' # 'READY': '✓', # Alternative: '√'
'FETCH': '', # Alternative: '▼' # 'FETCH': '↓', # Alternative: '▼'
'SCRAPE': '', # Alternative: '♦' # 'SCRAPE': '◆', # Alternative: '♦'
'EXTRACT': '', # Alternative: '□' # 'EXTRACT': '■', # Alternative: '□'
'COMPLETE': '', # Alternative: '○' # 'COMPLETE': '●', # Alternative: '○'
'ERROR': '×' # 'ERROR': '×'
} # }
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(**kwargs) self.verbose = kwargs.get("verbose", False)
self.logger = AsyncLogger(
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
verbose=self.verbose,
tag_width=10
)
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
logger = self.logger,
**kwargs
)
# Handle deprecated parameter # Handle deprecated parameter
if always_by_pass_cache is not None: if always_by_pass_cache is not None:
@@ -118,12 +129,13 @@ class AsyncWebCrawler:
async def awarmup(self): async def awarmup(self):
"""Initialize the crawler with warm-up sequence.""" """Initialize the crawler with warm-up sequence."""
if self.verbose: self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}") # if self.verbose:
print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}") # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Crawl4AI {crawl4ai_version}{Style.RESET_ALL}")
# print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Warming up AsyncWebCrawler{Style.RESET_ALL}")
self.ready = True self.ready = True
if self.verbose: # if self.verbose:
print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}") # print(f"{Fore.GREEN}{self.tag_format('READY')} {self.log_icons['READY']} AsyncWebCrawler initialized{Style.RESET_ALL}")
async def arun( async def arun(
self, self,
@@ -234,8 +246,14 @@ class AsyncWebCrawler:
screenshot_data = cached_result.screenshot screenshot_data = cached_result.screenshot
if not screenshot_data: if not screenshot_data:
cached_result = None cached_result = None
if verbose: # if verbose:
print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s") # print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
self.logger.url_status(
url=cache_context.display_url,
success=bool(html),
timing=time.perf_counter() - start_time,
tag="FETCH"
)
# Fetch fresh content if needed # Fetch fresh content if needed
@@ -252,8 +270,14 @@ class AsyncWebCrawler:
html = sanitize_input_encode(async_response.html) html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot screenshot_data = async_response.screenshot
t2 = time.perf_counter() t2 = time.perf_counter()
if verbose: self.logger.url_status(
print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s") url=cache_context.display_url,
success=bool(html),
timing=t2 - t1,
tag="FETCH"
)
# if verbose:
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Live fetch for {cache_context.display_url}... | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {t2 - t1:.2f}s")
# Process the HTML content # Process the HTML content
crawl_result = await self.aprocess_html( crawl_result = await self.aprocess_html(
@@ -287,9 +311,21 @@ class AsyncWebCrawler:
crawl_result.success = bool(html) crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None) crawl_result.session_id = kwargs.get("session_id", None)
if verbose: # if verbose:
print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}") # print(f"{Fore.GREEN}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | Status: {Fore.GREEN if crawl_result.success else Fore.RED}{crawl_result.success} | {Fore.YELLOW}Total: {time.perf_counter() - start_time:.2f}s{Style.RESET_ALL}")
self.logger.success(
message="{url:.50}... | Status: {status} | Total: {timing}",
tag="COMPLETE",
params={
"url": cache_context.display_url,
"status": crawl_result.success,
"timing": f"{time.perf_counter() - start_time:.2f}s"
},
colors={
"status": Fore.GREEN if crawl_result.success else Fore.RED,
"timing": Fore.YELLOW
}
)
# Update cache if appropriate # Update cache if appropriate
if cache_context.should_write() and not bool(cached_result): if cache_context.should_write() and not bool(cached_result):
@@ -300,7 +336,12 @@ class AsyncWebCrawler:
except Exception as e: except Exception as e:
if not hasattr(e, "msg"): if not hasattr(e, "msg"):
e.msg = str(e) e.msg = str(e)
print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}") # print(f"{Fore.RED}{self.tag_format('ERROR')} {self.log_icons['ERROR']} Failed to crawl {cache_context.display_url[:URL_LOG_SHORTEN_LENGTH]}... | {e.msg}{Style.RESET_ALL}")
self.logger.error_status(
url=cache_context.display_url,
error=e.msg,
tag="ERROR"
)
return CrawlResult( return CrawlResult(
url=url, url=url,
html="", html="",
@@ -362,7 +403,12 @@ class AsyncWebCrawler:
domain = urlparse(url).netloc domain = urlparse(url).netloc
current_time = time.time() current_time = time.time()
print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}") # print(f"{Fore.LIGHTBLACK_EX}{self.tag_format('PARALLEL')} Started task for {url[:50]}...{Style.RESET_ALL}")
self.logger.debug(
message="Started task for {url:.50}...",
tag="PARALLEL",
params={"url": url}
)
# Get delay settings from kwargs or use defaults # Get delay settings from kwargs or use defaults
mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay mean_delay = kwargs.get('mean_delay', 0.1) # 0.5 seconds default mean delay
@@ -394,12 +440,26 @@ class AsyncWebCrawler:
) )
# Print start message # Print start message
print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}") # print(f"{Fore.CYAN}{self.tag_format('INIT')} {self.log_icons['INIT']} Starting concurrent crawling for {len(urls)} URLs...{Style.RESET_ALL}")
self.logger.info(
message="Starting concurrent crawling for {count} URLs...",
tag="INIT",
params={"count": len(urls)}
)
start_time = time.perf_counter() start_time = time.perf_counter()
tasks = [crawl_with_semaphore(url) for url in urls] tasks = [crawl_with_semaphore(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True) results = await asyncio.gather(*tasks, return_exceptions=True)
end_time = time.perf_counter() end_time = time.perf_counter()
print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}") # print(f"{Fore.YELLOW}{self.tag_format('COMPLETE')} {self.log_icons['COMPLETE']} Concurrent crawling completed for {len(urls)} URLs | Total time: {end_time - start_time:.2f}s{Style.RESET_ALL}")
self.logger.success(
message="Concurrent crawling completed for {count} URLs | " + Fore.YELLOW + " Total time: {timing}" + Style.RESET_ALL,
tag="COMPLETE",
params={
"count": len(urls),
"timing": f"{end_time - start_time:.2f}s"
},
colors={"timing": Fore.YELLOW}
)
return [result if not isinstance(result, Exception) else str(result) for result in results] return [result if not isinstance(result, Exception) else str(result) for result in results]
@@ -451,9 +511,16 @@ class AsyncWebCrawler:
links = result.get("links", []) links = result.get("links", [])
metadata = result.get("metadata", {}) metadata = result.get("metadata", {})
if verbose: # if verbose:
print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms") # print(f"{Fore.MAGENTA}{self.tag_format('SCRAPE')} {self.log_icons['SCRAPE']} Processed {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {int((time.perf_counter() - t1) * 1000)}ms")
self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms",
tag="SCRAPE",
params={
"url": _url,
"timing": int((time.perf_counter() - t1) * 1000)
}
)
if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy): if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
@@ -467,8 +534,17 @@ class AsyncWebCrawler:
sections = chunking_strategy.chunk(markdown) sections = chunking_strategy.chunk(markdown)
extracted_content = extraction_strategy.run(url, sections) extracted_content = extraction_strategy.run(url, sections)
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
if verbose: # if verbose:
print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}") # print(f"{Fore.YELLOW}{self.tag_format('EXTRACT')} {self.log_icons['EXTRACT']} Completed for {_url[:URL_LOG_SHORTEN_LENGTH]}...{Style.RESET_ALL} | Time: {time.perf_counter() - t1:.2f}s{Style.RESET_ALL}")
self.logger.info(
message="Completed for {url:.50}... | Time: {timing}s",
tag="EXTRACT",
params={
"url": _url,
"timing": time.perf_counter() - t1
}
)

View File

@@ -8,6 +8,10 @@ from bs4 import BeautifulSoup, NavigableString, Tag
from .utils import clean_tokens from .utils import clean_tokens
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from snowballstemmer import stemmer
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()
class RelevantContentFilter(ABC): class RelevantContentFilter(ABC):
def __init__(self, user_query: str = None): def __init__(self, user_query: str = None):
self.user_query = user_query self.user_query = user_query
@@ -252,7 +256,7 @@ class RelevantContentFilter(ABC):
return str(tag) # Fallback to original if anything fails return str(tag) # Fallback to original if anything fails
class BM25ContentFilter(RelevantContentFilter): class BM25ContentFilter(RelevantContentFilter):
def __init__(self, user_query: str = None, bm25_threshold: float = 1.0): def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
super().__init__(user_query=user_query) super().__init__(user_query=user_query)
self.bm25_threshold = bm25_threshold self.bm25_threshold = bm25_threshold
self.priority_tags = { self.priority_tags = {
@@ -268,6 +272,7 @@ class BM25ContentFilter(RelevantContentFilter):
'pre': 1.5, 'pre': 1.5,
'th': 1.5, # Table headers 'th': 1.5, # Table headers
} }
self.stemmer = stemmer(language)
def filter_content(self, html: str) -> List[str]: def filter_content(self, html: str) -> List[str]:
"""Implements content filtering using BM25 algorithm with priority tag handling""" """Implements content filtering using BM25 algorithm with priority tag handling"""
@@ -282,58 +287,42 @@ class BM25ContentFilter(RelevantContentFilter):
if not candidates: if not candidates:
return [] return []
# Split into priority and regular candidates # Tokenize corpus
priority_candidates = [] # tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in candidates]
regular_candidates = [] # tokenized_query = query.lower().split()
# tokenized_corpus = [[ps.stem(word) for word in chunk.lower().split()]
# for _, chunk, _, _ in candidates]
# tokenized_query = [ps.stem(word) for word in query.lower().split()]
for index, chunk, tag_type, tag in candidates: tokenized_corpus = [[self.stemmer.stemWord(word) for word in chunk.lower().split()]
if tag.name in self.priority_tags: for _, chunk, _, _ in candidates]
priority_candidates.append((index, chunk, tag_type, tag)) tokenized_query = [self.stemmer.stemWord(word) for word in query.lower().split()]
else:
regular_candidates.append((index, chunk, tag_type, tag))
# Process regular content with BM25
tokenized_corpus = [chunk.lower().split() for _, chunk, _, _ in regular_candidates]
tokenized_query = query.lower().split()
# Clean from stop words and noise # Clean from stop words and noise
tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus] tokenized_corpus = [clean_tokens(tokens) for tokens in tokenized_corpus]
tokenized_query = clean_tokens(tokenized_query) tokenized_query = clean_tokens(tokenized_query)
bm25 = BM25Okapi(tokenized_corpus) bm25 = BM25Okapi(tokenized_corpus)
scores = bm25.get_scores(tokenized_query) scores = bm25.get_scores(tokenized_query)
# Score and boost regular candidates # Adjust scores with tag weights
scored_candidates = [ adjusted_candidates = []
(score * self.priority_tags.get(tag.name, 1.0), index, chunk, tag_type, tag) for score, (index, chunk, tag_type, tag) in zip(scores, candidates):
for score, (index, chunk, tag_type, tag) in zip(scores, regular_candidates) tag_weight = self.priority_tags.get(tag.name, 1.0)
adjusted_score = score * tag_weight
adjusted_candidates.append((adjusted_score, index, chunk, tag))
# Filter candidates by threshold
selected_candidates = [
(index, chunk, tag) for adjusted_score, index, chunk, tag in adjusted_candidates
if adjusted_score >= self.bm25_threshold
] ]
scored_candidates.sort(key=lambda x: x[0], reverse=True)
# Process scored candidates
selected_tags = set()
selected_candidates = []
# First add all priority candidates
for index, chunk, tag_type, tag in priority_candidates:
tag_id = id(tag)
if tag_id not in selected_tags:
selected_candidates.append((index, chunk, tag))
selected_tags.add(tag_id)
# Then add scored regular candidates that meet threshold
for score, index, chunk, tag_type, tag in scored_candidates:
if score < self.bm25_threshold:
continue
tag_id = id(tag)
if tag_id not in selected_tags:
selected_candidates.append((index, chunk, tag))
selected_tags.add(tag_id)
if not selected_candidates: if not selected_candidates:
return [] return []
# Sort by original document order # Sort selected candidates by original document order
selected_candidates.sort(key=lambda x: x[0]) selected_candidates.sort(key=lambda x: x[0])
return [self.clean_element(tag) for _, _, tag in selected_candidates]
return [self.clean_element(tag) for _, _, tag in selected_candidates]

View File

@@ -149,6 +149,15 @@ class ContentScrapingStrategy(ABC):
pass pass
class WebScrapingStrategy(ContentScrapingStrategy): class WebScrapingStrategy(ContentScrapingStrategy):
def __init__(self, logger=None):
self.logger = logger
def _log(self, level, message, tag="SCRAPE", **kwargs):
"""Helper method to safely use logger."""
if self.logger:
log_method = getattr(self.logger, level)
log_method(message=message, tag=tag, **kwargs)
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs) return self._get_content_of_website_optimized(url, html, is_async=False, **kwargs)
@@ -167,7 +176,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
try: try:
meta = extract_metadata("", soup) meta = extract_metadata("", soup)
except Exception as e: except Exception as e:
print('Error extracting metadata:', str(e)) self._log('error',
message="Error extracting metadata: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
# print('Error extracting metadata:', str(e))
meta = {} meta = {}
@@ -430,9 +444,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
try: try:
remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False)) remove_unwanted_attributes(element, IMPORTANT_ATTRS, kwargs.get('keep_data_attributes', False))
except Exception as e: except Exception as e:
print('Error removing unwanted attributes:', str(e)) # print('Error removing unwanted attributes:', str(e))
self._log('error',
message="Error removing unwanted attributes: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
# Process children # Process children
for child in list(element.children): for child in list(element.children):
if isinstance(child, NavigableString) and not isinstance(child, Comment): if isinstance(child, NavigableString) and not isinstance(child, Comment):
@@ -453,7 +470,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return keep_element return keep_element
except Exception as e: except Exception as e:
print('Error processing element:', str(e)) # print('Error processing element:', str(e))
self._log('error',
message="Error processing element: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
return False return False
process_element(body) process_element(body)
@@ -516,7 +538,10 @@ class WebScrapingStrategy(ContentScrapingStrategy):
str_body = body.encode_contents().decode('utf-8') str_body = body.encode_contents().decode('utf-8')
print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.") print(f"[LOG] 😧 Error: After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.")
self._log('error',
message="After processing the crawled HTML and removing irrelevant tags, nothing was left in the page. Check the markdown for further details.",
tag="SCRAPE"
)
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
@@ -525,6 +550,13 @@ class WebScrapingStrategy(ContentScrapingStrategy):
h.update_params(**kwargs.get('html2text', {})) h.update_params(**kwargs.get('html2text', {}))
markdown = h.handle(cleaned_html) markdown = h.handle(cleaned_html)
except Exception as e: except Exception as e:
if not h:
h = CustomHTML2Text()
self._log('error',
message="Error converting HTML to markdown: {error}",
tag="SCRAPE",
params={"error": str(e)}
)
markdown = h.handle(sanitize_html(cleaned_html)) markdown = h.handle(sanitize_html(cleaned_html))
markdown = markdown.replace(' ```', '```') markdown = markdown.replace(' ```', '```')

View File

@@ -20,11 +20,11 @@ class VersionManager:
def update_version(self): def update_version(self):
"""Update the version file to current library version""" """Update the version file to current library version"""
self.version_file.write_text(__version__) self.version_file.write_text(__version__.__version__)
def needs_update(self): def needs_update(self):
"""Check if database needs update based on version""" """Check if database needs update based on version"""
installed = self.get_installed_version() installed = self.get_installed_version()
current = version.parse(__version__) current = version.parse(__version__.__version__)
return installed is None or installed < current return installed is None or installed < current

27
docker-compose.hub.yml Normal file
View File

@@ -0,0 +1,27 @@
services:
crawl4ai:
image: unclecode/crawl4ai:basic # Pull image from Docker Hub
ports:
- "11235:11235" # FastAPI server
- "8000:8000" # Alternative port
- "9222:9222" # Browser debugging
- "8080:8080" # Additional port
environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token
- OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key
volumes:
- /dev/shm:/dev/shm # Shared memory for browser operations
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s

33
docker-compose.local.yml Normal file
View File

@@ -0,0 +1,33 @@
services:
crawl4ai:
build:
context: .
dockerfile: Dockerfile
args:
PYTHON_VERSION: 3.10
INSTALL_TYPE: all
ENABLE_GPU: false
ports:
- "11235:11235" # FastAPI server
- "8000:8000" # Alternative port
- "9222:9222" # Browser debugging
- "8080:8080" # Additional port
environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token
- OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key
volumes:
- /dev/shm:/dev/shm # Shared memory for browser operations
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s

View File

@@ -1,5 +1,3 @@
version: '3.8'
services: services:
crawl4ai: crawl4ai:
build: build:
@@ -9,15 +7,18 @@ services:
PYTHON_VERSION: 3.10 PYTHON_VERSION: 3.10
INSTALL_TYPE: all INSTALL_TYPE: all
ENABLE_GPU: false ENABLE_GPU: false
profiles: ["local"]
ports: ports:
- "11235:11235" # FastAPI server - "11235:11235"
- "8000:8000" # Alternative port - "8000:8000"
- "9222:9222" # Browser debugging - "9222:9222"
- "8080:8080" # Additional port - "8080:8080"
environment: environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
volumes: volumes:
- /dev/shm:/dev/shm # Shared memory for browser operations - /dev/shm:/dev/shm
deploy: deploy:
resources: resources:
limits: limits:
@@ -30,4 +31,32 @@ services:
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s
crawl4ai-hub:
image: unclecode/crawl4ai:basic
profiles: ["hub"]
ports:
- "11235:11235"
- "8000:8000"
- "9222:9222"
- "8080:8080"
environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
volumes:
- /dev/shm:/dev/shm
deploy:
resources:
limits:
memory: 4G
reservations:
memory: 1G
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s

View File

@@ -1,9 +1,16 @@
import os, sys
# append the parent directory to the sys.path
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
parent_parent_dir = os.path.dirname(parent_dir)
sys.path.append(parent_parent_dir)
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
__data__ = os.path.join(__location__, "__data")
import asyncio import asyncio
import os
from pathlib import Path from pathlib import Path
import aiohttp import aiohttp
import json import json
from crawl4ai import AsyncWebCrawler from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter from crawl4ai.content_filter_strategy import BM25ContentFilter
# 1. File Download Processing Example # 1. File Download Processing Example
@@ -32,7 +39,8 @@ async def download_example():
console.log('No .exe download link found'); console.log('No .exe download link found');
} }
""", """,
wait_for=5 # Wait 5 seconds to ensure download starts delay_before_return_html=1, # Wait 5 seconds to ensure download starts
cache_mode=CacheMode.BYPASS
) )
if result.downloaded_files: if result.downloaded_files:
@@ -50,22 +58,32 @@ async def content_filtering_example():
async with AsyncWebCrawler(verbose=True) as crawler: async with AsyncWebCrawler(verbose=True) as crawler:
# Create filter with custom query for OpenAI's blog # Create filter with custom query for OpenAI's blog
content_filter = BM25ContentFilter( content_filter = BM25ContentFilter(
user_query="AI language models research innovation", # user_query="Investment and fundraising",
# user_query="Robotic",
bm25_threshold=1.0 bm25_threshold=1.0
) )
result = await crawler.arun( result = await crawler.arun(
url="https://openai.com/blog", url="https://techcrunch.com/",
content_filter=content_filter content_filter=content_filter,
cache_mode=CacheMode.BYPASS
) )
print(f"Filtered content: {result.extracted_content}") print(f"Filtered content: {len(result.fit_markdown)}")
print(f"Filtered content: {result.fit_markdown}")
# Save html
with open(os.path.join(__data__, "techcrunch.html"), "w") as f:
f.write(result.fit_html)
with open(os.path.join(__data__, "filtered_content.md"), "w") as f:
f.write(result.fit_markdown)
# 3. Local File and Raw HTML Processing Example # 3. Local File and Raw HTML Processing Example
async def local_and_raw_html_example(): async def local_and_raw_html_example():
"""Example of processing local files and raw HTML""" """Example of processing local files and raw HTML"""
# Create a sample HTML file # Create a sample HTML file
sample_file = "sample.html" sample_file = os.path.join(__data__, "sample.html")
with open(sample_file, "w") as f: with open(sample_file, "w") as f:
f.write(""" f.write("""
<html><body> <html><body>
@@ -112,21 +130,18 @@ async def browser_management_example():
headless=False, headless=False,
verbose=True verbose=True
) as crawler: ) as crawler:
result = await crawler.arun(
url="https://crawl4ai.com",
# session_id="persistent_session_1",
cache_mode=CacheMode.BYPASS
)
# Use GitHub as an example - it's a good test for browser management # Use GitHub as an example - it's a good test for browser management
# because it requires proper browser handling # because it requires proper browser handling
result = await crawler.arun( result = await crawler.arun(
url="https://github.com/trending", url="https://github.com/trending",
session_id="persistent_session_1", # session_id="persistent_session_1",
js_code=""" cache_mode=CacheMode.BYPASS
// Custom JavaScript to execute on GitHub's trending page
const repos = document.querySelectorAll('article.Box-row');
const data = Array.from(repos).map(repo => ({
name: repo.querySelector('h2')?.textContent?.trim(),
description: repo.querySelector('p')?.textContent?.trim(),
language: repo.querySelector('[itemprop="programmingLanguage"]')?.textContent?.trim()
}));
console.log('Trending repositories:', JSON.stringify(data, null, 2));
"""
) )
print("\nBrowser session result:", result.success) print("\nBrowser session result:", result.success)
@@ -136,6 +151,8 @@ async def browser_management_example():
# 5. API Usage Example # 5. API Usage Example
async def api_example(): async def api_example():
"""Example of using the new API endpoints""" """Example of using the new API endpoints"""
api_token = os.getenv('CRAWL4AI_API_TOKEN') or "test_api_code"
headers = {'Authorization': f'Bearer {api_token}'}
async with aiohttp.ClientSession() as session: async with aiohttp.ClientSession() as session:
# Submit crawl job # Submit crawl job
crawl_request = { crawl_request = {
@@ -143,52 +160,78 @@ async def api_example():
"extraction_config": { "extraction_config": {
"type": "json_css", "type": "json_css",
"params": { "params": {
"selectors": { "schema": {
"titles": ".title a", "name": "Hacker News Articles",
"scores": ".score", "baseSelector": ".athing",
"comments": ".comment-tree" "fields": [
{
"name": "title",
"selector": ".title a",
"type": "text"
},
{
"name": "score",
"selector": ".score",
"type": "text"
},
{
"name": "url",
"selector": ".title a",
"type": "attribute",
"attribute": "href"
}
]
} }
} }
}, },
"crawler_params": { "crawler_params": {
"headless": True, "headless": True,
"use_managed_browser": True # "use_managed_browser": True
}, },
"screenshot": True, "cache_mode": "bypass",
"magic": True # "screenshot": True,
# "magic": True
} }
async with session.post( async with session.post(
"http://localhost:11235/crawl", "http://localhost:11235/crawl",
json=crawl_request json=crawl_request,
headers=headers
) as response: ) as response:
task_data = await response.json() task_data = await response.json()
task_id = task_data["task_id"] task_id = task_data["task_id"]
# Check task status # Check task status
async with session.get( while True:
f"http://localhost:11235/task/{task_id}" async with session.get(
) as status_response: f"http://localhost:11235/task/{task_id}",
result = await status_response.json() headers=headers
print(f"Task result: {result}") ) as status_response:
result = await status_response.json()
print(f"Task result: {result}")
if result["status"] == "completed":
break
else:
await asyncio.sleep(1)
# Main execution # Main execution
async def main(): async def main():
print("Running Crawl4AI feature examples...") # print("Running Crawl4AI feature examples...")
print("\n1. Running Download Example:") # print("\n1. Running Download Example:")
await download_example() await download_example()
print("\n2. Running Content Filtering Example:") # print("\n2. Running Content Filtering Example:")
await content_filtering_example() await content_filtering_example()
print("\n3. Running Local and Raw HTML Example:") # print("\n3. Running Local and Raw HTML Example:")
await local_and_raw_html_example() await local_and_raw_html_example()
print("\n4. Running Browser Management Example:") # print("\n4. Running Browser Management Example:")
await browser_management_example() await browser_management_example()
print("\n5. Running API Example:") # print("\n5. Running API Example:")
await api_example() await api_example()
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -15,6 +15,94 @@ docker run -p 11235:11235 unclecode/crawl4ai:basic
docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
``` ```
## Running with Docker Compose 🐳
### Use Docker Compose (From Local Dockerfile or Docker Hub)
Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
### **Option 1: Using Docker Compose to Build Locally**
If you want to build the image locally, use the provided `docker-compose.local.yml` file.
```bash
docker-compose -f docker-compose.local.yml up -d
```
This will:
1. Build the Docker image from the provided `Dockerfile`.
2. Start the container and expose it on `http://localhost:11235`.
---
### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
```bash
docker-compose -f docker-compose.hub.yml up -d
```
This will:
1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
2. Start the container and expose it on `http://localhost:11235`.
---
### **Stopping the Running Services**
To stop the services started via Docker Compose, you can use:
```bash
docker-compose -f docker-compose.local.yml down
# OR
docker-compose -f docker-compose.hub.yml down
```
If the containers dont stop and the application is still running, check the running containers:
```bash
docker ps
```
Find the `CONTAINER ID` of the running service and stop it forcefully:
```bash
docker stop <CONTAINER_ID>
```
---
### **Debugging with Docker Compose**
- **Check Logs**: To view the container logs:
```bash
docker-compose -f docker-compose.local.yml logs -f
```
- **Remove Orphaned Containers**: If the service is still running unexpectedly:
```bash
docker-compose -f docker-compose.local.yml down --remove-orphans
```
- **Manually Remove Network**: If the network is still in use:
```bash
docker network ls
docker network rm crawl4ai_default
```
---
### Why Use Docker Compose?
Docker Compose is the recommended way to deploy Crawl4AI because:
1. It simplifies multi-container setups.
2. Allows you to define environment variables, resources, and ports in a single file.
3. Makes it easier to switch between local development and production-ready images.
For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
## API Security 🔒 ## API Security 🔒
### Understanding CRAWL4AI_API_TOKEN ### Understanding CRAWL4AI_API_TOKEN

23
main.py
View File

@@ -26,6 +26,7 @@ from enum import Enum
from dataclasses import dataclass from dataclasses import dataclass
import json import json
from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
from crawl4ai.config import MIN_WORD_THRESHOLD
from crawl4ai.extraction_strategy import ( from crawl4ai.extraction_strategy import (
LLMExtractionStrategy, LLMExtractionStrategy,
CosineStrategy, CosineStrategy,
@@ -53,12 +54,20 @@ class ExtractionConfig(BaseModel):
type: CrawlerType type: CrawlerType
params: Dict[str, Any] = {} params: Dict[str, Any] = {}
class ChunkingStrategy(BaseModel):
type: str
params: Dict[str, Any] = {}
class ContentFilter(BaseModel):
type: str = "bm25"
params: Dict[str, Any] = {}
class CrawlRequest(BaseModel): class CrawlRequest(BaseModel):
urls: Union[HttpUrl, List[HttpUrl]] urls: Union[HttpUrl, List[HttpUrl]]
word_count_threshold: int = MIN_WORD_THRESHOLD
extraction_config: Optional[ExtractionConfig] = None extraction_config: Optional[ExtractionConfig] = None
crawler_params: Dict[str, Any] = {} chunking_strategy: Optional[ChunkingStrategy] = None
priority: int = Field(default=5, ge=1, le=10) content_filter: Optional[ContentFilter] = None
ttl: Optional[int] = 3600
js_code: Optional[List[str]] = None js_code: Optional[List[str]] = None
wait_for: Optional[str] = None wait_for: Optional[str] = None
css_selector: Optional[str] = None css_selector: Optional[str] = None
@@ -66,7 +75,10 @@ class CrawlRequest(BaseModel):
magic: bool = False magic: bool = False
extra: Optional[Dict[str, Any]] = {} extra: Optional[Dict[str, Any]] = {}
session_id: Optional[str] = None session_id: Optional[str] = None
cache_mode: Optional[CacheMode] = None cache_mode: Optional[CacheMode] = CacheMode.ENABLED
priority: int = Field(default=5, ge=1, le=10)
ttl: Optional[int] = 3600
crawler_params: Dict[str, Any] = {}
@dataclass @dataclass
class TaskInfo: class TaskInfo:
@@ -280,6 +292,7 @@ class CrawlerService:
if isinstance(request.urls, list): if isinstance(request.urls, list):
results = await crawler.arun_many( results = await crawler.arun_many(
urls=[str(url) for url in request.urls], urls=[str(url) for url in request.urls],
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
js_code=request.js_code, js_code=request.js_code,
wait_for=request.wait_for, wait_for=request.wait_for,
@@ -287,6 +300,7 @@ class CrawlerService:
screenshot=request.screenshot, screenshot=request.screenshot,
magic=request.magic, magic=request.magic,
session_id=request.session_id, session_id=request.session_id,
cache_mode=request.cache_mode,
**request.extra, **request.extra,
) )
else: else:
@@ -299,6 +313,7 @@ class CrawlerService:
screenshot=request.screenshot, screenshot=request.screenshot,
magic=request.magic, magic=request.magic,
session_id=request.session_id, session_id=request.session_id,
cache_mode=request.cache_mode,
**request.extra, **request.extra,
) )

View File

@@ -12,4 +12,5 @@ tf-playwright-stealth~=1.0
xxhash~=3.4 xxhash~=3.4
rank-bm25~=0.2 rank-bm25~=0.2
aiofiles~=24.0 aiofiles~=24.0
colorama~=0.4 colorama~=0.4
snowballstemmer~=2.2