feat(browser): add BrowserHub for centralized browser management and resource sharing
This commit is contained in:
183
crawl4ai/browser/browser_hub.py
Normal file
183
crawl4ai/browser/browser_hub.py
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
# browser_hub.py
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from .manager import BrowserManager, UnavailableBehavior
|
||||||
|
from ..async_configs import BrowserConfig
|
||||||
|
from ..async_logger import AsyncLogger
|
||||||
|
|
||||||
|
class BrowserHub:
    """
    Manages Browser-Hub instances for sharing across multiple pipelines.

    This class provides centralized management for browser resources, allowing
    multiple pipelines to share browser instances efficiently, connect to
    existing browser hubs, or create new ones with custom configurations.

    All state lives on the class itself: hubs are stored in a class-level
    registry and every public entry point is a classmethod, so the class is
    used without being instantiated.
    """

    # Registry of live hubs. Keys are one of: "connection:<connection_info>",
    # an explicit hub_id, "config:<sha256 of the config>", or "default".
    _instances: Dict[str, BrowserManager] = {}

    # Serializes registry lookups, hub creation and shutdown.
    # NOTE(review): the lock is created once at class-definition time and is
    # shared by every caller; asyncio locks are not meant to be shared across
    # event loops -- confirm all callers run on the same loop.
    _lock = asyncio.Lock()

    @classmethod
    async def get_or_create_hub(
        cls,
        config: Optional[BrowserConfig] = None,
        hub_id: Optional[str] = None,
        connection_info: Optional[str] = None,
        logger: Optional[AsyncLogger] = None,
        max_browsers_per_config: int = 10,
        max_pages_per_browser: int = 5,
        initial_pool_size: int = 1,
        page_configs: Optional[list] = None
    ) -> BrowserManager:
        """
        Get an existing Browser-Hub or create a new one based on parameters.

        The lookup is resolved in this order of precedence:

        1. ``connection_info`` is truthy: the hub registered under
           ``"connection:<connection_info>"`` (created on first use).
        2. ``config`` is truthy: the hub registered under ``hub_id`` if one
           was given, otherwise under a hash of the configuration.
        3. Otherwise: the shared ``"default"`` hub.

        A hub that already exists under the resolved key is returned as-is;
        the pool-sizing arguments only take effect when a hub is created.

        Args:
            config: Browser configuration for new hub
            hub_id: Identifier for the hub instance. Only consulted when
                ``config`` is provided; it is ignored for connection-based
                and default hubs.
            connection_info: Connection string for existing hub
            logger: Logger for recording events and errors
            max_browsers_per_config: Maximum browsers per configuration
            max_pages_per_browser: Maximum pages per browser. Forwarded to
                the creation helpers, which currently do not pass it on to
                ``BrowserManager``.
            initial_pool_size: Initial number of browsers to create
            page_configs: Optional configurations for pre-warming pages.
                Only used when ``config`` is provided.

        Returns:
            BrowserManager: The requested browser manager instance
        """
        # The lock is held across creation so that concurrent callers asking
        # for the same key never build two hubs for it.
        async with cls._lock:
            # Scenario 3: Use existing hub via connection info
            if connection_info:
                instance_key = f"connection:{connection_info}"
                if instance_key not in cls._instances:
                    cls._instances[instance_key] = await cls._connect_to_browser_hub(
                        connection_info, logger
                    )
                return cls._instances[instance_key]

            # Scenario 2: Custom configured hub
            if config:
                config_hash = cls._hash_config(config)
                # An explicit hub_id wins over the derived configuration hash.
                instance_key = hub_id or f"config:{config_hash}"
                if instance_key not in cls._instances:
                    cls._instances[instance_key] = await cls._create_browser_hub(
                        config,
                        logger,
                        max_browsers_per_config,
                        max_pages_per_browser,
                        initial_pool_size,
                        page_configs
                    )
                return cls._instances[instance_key]

            # Scenario 1: Default hub
            instance_key = "default"
            if instance_key not in cls._instances:
                cls._instances[instance_key] = await cls._create_default_browser_hub(
                    logger,
                    max_browsers_per_config,
                    max_pages_per_browser,
                    initial_pool_size
                )
            return cls._instances[instance_key]

    @classmethod
    async def _create_browser_hub(
        cls,
        config: BrowserConfig,
        logger: Optional[AsyncLogger],
        max_browsers_per_config: int,
        max_pages_per_browser: int,
        initial_pool_size: int,
        page_configs: Optional[list]
    ) -> BrowserManager:
        """
        Create a new browser hub with the specified configuration.

        Builds a ``BrowserManager`` with ``UnavailableBehavior.ON_DEMAND`` and
        initializes its pool before returning it.

        Args:
            config: Browser configuration used for the manager and its pool
            logger: Logger handed to the manager (may be None)
            max_browsers_per_config: Upper bound passed to the manager
            max_pages_per_browser: Accepted for signature symmetry; not
                passed to ``BrowserManager`` by this method.
            initial_pool_size: Number of browsers per config to pre-create
            page_configs: Optional page configurations passed to the pool
                initialization

        Returns:
            BrowserManager: The manager after ``initialize_pool`` has completed
        """
        manager = BrowserManager(
            browser_config=config,
            logger=logger,
            unavailable_behavior=UnavailableBehavior.ON_DEMAND,
            max_browsers_per_config=max_browsers_per_config
        )

        # Initialize the pool
        await manager.initialize_pool(
            browser_configs=[config] if config else None,
            browsers_per_config=initial_pool_size,
            page_configs=page_configs
        )

        return manager

    @classmethod
    async def _create_default_browser_hub(
        cls,
        logger: Optional[AsyncLogger],
        max_browsers_per_config: int,
        max_pages_per_browser: int,
        initial_pool_size: int
    ) -> BrowserManager:
        """
        Create a default browser hub with standard settings.

        The default hub uses ``BrowserConfig(headless=True)`` and no page
        pre-warming configuration.

        Args:
            logger: Logger handed to the manager (may be None)
            max_browsers_per_config: Upper bound passed to the manager
            max_pages_per_browser: Forwarded to ``_create_browser_hub``
            initial_pool_size: Number of browsers to pre-create

        Returns:
            BrowserManager: The initialized default manager
        """
        config = BrowserConfig(headless=True)
        return await cls._create_browser_hub(
            config,
            logger,
            max_browsers_per_config,
            max_pages_per_browser,
            initial_pool_size,
            None
        )

    @classmethod
    async def _connect_to_browser_hub(
        cls,
        connection_info: str,
        logger: Optional[AsyncLogger]
    ) -> BrowserManager:
        """
        Connect to an existing browser hub.

        Note: This is a placeholder for future remote connection functionality.
        Currently creates a local instance.

        ``connection_info`` is not used to establish any connection here; a
        default local hub is created with fixed pool limits (10 browsers per
        config, 5 pages per browser, initial pool of 1).

        Args:
            connection_info: Connection string for the hub (currently unused)
            logger: Logger used to report the fallback (may be None)

        Returns:
            BrowserManager: A locally created default manager
        """
        if logger:
            logger.info(
                message="Remote browser hub connections not yet implemented. Creating local instance.",
                tag="BROWSER_HUB"
            )
        # For now, create a default local instance
        return await cls._create_default_browser_hub(
            logger,
            max_browsers_per_config=10,
            max_pages_per_browser=5,
            initial_pool_size=1
        )

    @classmethod
    def _hash_config(cls, config: BrowserConfig) -> str:
        """
        Create a hash of the browser configuration for identification.

        Args:
            config: Configuration whose instance attributes are hashed

        Returns:
            str: Hex-encoded SHA-256 digest of the canonical JSON form of the
            configuration's non-callable attributes
        """
        # Convert config to dictionary, excluding any callable objects
        config_dict = {
            key: value
            for key, value in config.__dict__.items()
            if not callable(value)
        }

        # Convert to canonical JSON string. Values that JSON cannot encode
        # are stringified via default=str.
        # NOTE(review): attributes whose str() is not stable would make equal
        # configs hash differently -- confirm BrowserConfig holds plain data.
        config_json = json.dumps(config_dict, sort_keys=True, default=str)

        # Hash the JSON
        return hashlib.sha256(config_json.encode()).hexdigest()

    @classmethod
    async def shutdown_all(cls):
        """
        Close all browser hub instances and clear the registry.

        Every registered hub is closed concurrently. The registry is always
        emptied, even when closing a hub fails or the call is cancelled, so
        later calls to ``get_or_create_hub`` never hand out a hub that has
        already been shut down.

        Raises:
            BaseException: The first error raised by any ``hub.close()`` is
                re-raised after all close attempts have finished and the
                registry has been cleared.
        """
        async with cls._lock:
            hubs = list(cls._instances.values())
            try:
                # return_exceptions=True lets every close() run to completion
                # instead of abandoning the rest on the first failure.
                results = await asyncio.gather(
                    *(hub.close() for hub in hubs),
                    return_exceptions=True
                )
            finally:
                cls._instances.clear()

            # Surface the first failure so callers still see close() errors.
            for result in results:
                if isinstance(result, BaseException):
                    raise result
|
||||||
@@ -1,55 +0,0 @@
|
|||||||
FROM ubuntu:22.04

# Runtime libraries and helper tools needed by Chromium.
# The package list is kept in alphabetical order; the apt cache is removed in
# the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    fonts-liberation \
    gnupg \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libglib2.0-0 \
    libgtk-3-0 \
    libnspr4 \
    libnss3 \
    libpango-1.0-0 \
    libx11-6 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    socat \
    wget \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# Chromium itself, plus the extra ffmpeg codecs package.
# NOTE(review): verify that the chromium-browser package on ubuntu:22.04
# provides a usable /usr/bin/chromium-browser inside a container.
RUN apt-get update && \
    apt-get install -y \
    chromium-browser \
    chromium-codecs-ffmpeg-extra \
    && rm -rf /var/lib/apt/lists/*

# Expose Chromium under the google-chrome name for tools that expect it.
RUN ln -s /usr/bin/chromium-browser /usr/bin/google-chrome

# World-writable data directory.
RUN mkdir -p /data && chmod 777 /data

# Install the startup script and use it as the container entry point.
COPY start.sh /start.sh
RUN chmod +x /start.sh

ENTRYPOINT ["/start.sh"]
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
FROM ubuntu:22.04

# Install dependencies with comprehensive Chromium support.
# curl is installed here because the Chrome installation step below downloads
# the Google signing key with it; the ubuntu:22.04 base image does not ship
# curl, so without it that step fails with "curl: not found".
RUN apt-get update && apt-get install -y --no-install-recommends \
    wget \
    curl \
    gnupg \
    ca-certificates \
    fonts-liberation \
    # Sound support
    libasound2 \
    # Accessibility support
    libatspi2.0-0 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    # Graphics and rendering
    libdrm2 \
    libgbm1 \
    libgtk-3-0 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    # X11 and window system
    libx11-6 \
    libxcb1 \
    libxkbcommon0 \
    # Text and internationalization
    libpango-1.0-0 \
    libcairo2 \
    # Printing support
    libcups2 \
    # System libraries
    libdbus-1-3 \
    libnss3 \
    libnspr4 \
    libglib2.0-0 \
    # Utilities
    xdg-utils \
    socat \
    # Process management
    procps \
    # Clean up
    && rm -rf /var/lib/apt/lists/*

# Install Chrome (new method): register Google's signing key in a dedicated
# keyring, add the Chrome apt repository pinned to that key, then install the
# stable channel and drop the apt cache in the same layer.
RUN curl -fsSL https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /usr/share/keyrings/googlechrome-linux-keyring.gpg && \
    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/googlechrome-linux-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main" | tee /etc/apt/sources.list.d/google-chrome.list && \
    apt-get update && \
    apt-get install -y google-chrome-stable && \
    rm -rf /var/lib/apt/lists/*

# Create data directory for user data
RUN mkdir -p /data && chmod 777 /data

# Keep container running without starting Chrome
CMD ["tail", "-f", "/dev/null"]
|
|
||||||
@@ -756,7 +756,6 @@ class BrowserManager:
|
|||||||
|
|
||||||
return status
|
return status
|
||||||
|
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
"""Start at least one browser instance in the pool.
|
"""Start at least one browser instance in the pool.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user