1054 lines
40 KiB
Python
1054 lines
40 KiB
Python
import asyncio
|
|
import time
|
|
from typing import List, Optional
|
|
import os
|
|
import sys
|
|
import shutil
|
|
import tempfile
|
|
import psutil
|
|
import signal
|
|
import subprocess
|
|
import shlex
|
|
from playwright.async_api import BrowserContext
|
|
import hashlib
|
|
from .js_snippet import load_js_script
|
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
|
from playwright_stealth import StealthConfig
|
|
from .utils import get_chromium_path
|
|
|
|
# Shared playwright-stealth configuration: every evasion is enabled so pages
# cannot fingerprint the automation via navigator/webdriver/chrome probes.
stealth_config = StealthConfig(
    webdriver=True,
    chrome_app=True,
    chrome_csi=True,
    chrome_load_times=True,
    chrome_runtime=True,
    navigator_languages=True,
    navigator_plugins=True,
    navigator_permissions=True,
    webgl_vendor=True,
    outerdimensions=True,
    navigator_hardware_concurrency=True,
    media_codecs=True,
)
|
|
|
|
# Chromium switches that turn off background/auxiliary services; appended to
# the launch args when BrowserConfig.light_mode is enabled to cut memory/CPU.
BROWSER_DISABLE_OPTIONS = [
    "--disable-background-networking",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-sync",
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--password-store=basic",
    "--use-mock-keychain",
]
|
|
|
|
|
|
class ManagedBrowser:
    """
    Manages the browser process and context. This class allows to connect to the browser using CDP protocol.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
            Default: "chromium".
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
            temporary directory may be used. Default: None.
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
            Default: True.
        browser_process (subprocess.Popen): The process object for the browser.
        temp_dir (str): Temporary directory for user data if not provided.
        debugging_port (int): Port for debugging the browser.
        host (str): Host for debugging the browser.

    Methods:
        start(): Starts the browser process and returns the CDP endpoint URL.
        _get_browser_path(): Returns the browser executable path based on OS and browser type.
        _get_browser_args(): Returns browser-specific command line arguments.
        cleanup(): Terminates the browser process and removes the temporary directory.
        create_profile(): Static method to create a user profile by launching a browser for user interaction.
    """

    @staticmethod
    def build_browser_flags(config: BrowserConfig) -> List[str]:
        """Return the common Chromium CLI flags derived from *config*.

        Args:
            config: Browser configuration; light_mode/text_mode/proxy settings
                are consulted.

        Returns:
            De-duplicated list of CLI flags (first occurrence wins, order kept).
        """
        # NOTE(review): both --window-position=0,0 and --window-position=400,0
        # are listed; dedupe keeps both (values differ) and Chromium honors the
        # later one — confirm which position is intended.
        flags = [
            "--disable-gpu",
            "--disable-gpu-compositing",
            "--disable-software-rasterizer",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-infobars",
            "--window-position=0,0",
            "--ignore-certificate-errors",
            "--ignore-certificate-errors-spki-list",
            "--disable-blink-features=AutomationControlled",
            "--window-position=400,0",
            "--disable-renderer-backgrounding",
            "--disable-ipc-flooding-protection",
            "--force-color-profile=srgb",
            "--mute-audio",
            "--disable-background-timer-throttling",
        ]
        if config.light_mode:
            flags.extend(BROWSER_DISABLE_OPTIONS)
        if config.text_mode:
            # Text mode strips images/fonts/JS for fast text-only crawling.
            flags.extend([
                "--blink-settings=imagesEnabled=false",
                "--disable-remote-fonts",
                "--disable-images",
                "--disable-javascript",
                "--disable-software-rasterizer",
                "--disable-dev-shm-usage",
            ])
        # proxy support: a plain proxy URL wins over the structured proxy_config
        if config.proxy:
            flags.append(f"--proxy-server={config.proxy}")
        elif config.proxy_config:
            creds = ""
            if config.proxy_config.username and config.proxy_config.password:
                creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
            flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
        # dedupe while preserving insertion order
        return list(dict.fromkeys(flags))

    browser_type: str
    user_data_dir: str
    headless: bool
    browser_process: subprocess.Popen
    temp_dir: str
    debugging_port: int
    host: str

    def __init__(
        self,
        browser_type: str = "chromium",
        user_data_dir: Optional[str] = None,
        headless: bool = False,
        logger=None,
        host: str = "localhost",
        debugging_port: int = 9222,
        cdp_url: Optional[str] = None,
        browser_config: Optional[BrowserConfig] = None,
    ):
        """
        Initialize the ManagedBrowser instance.

        Args:
            browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                Default: "chromium".
            user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                temporary directory may be used. Default: None.
            headless (bool): Whether to run the browser in headless mode (no visible GUI).
                Default: True.
            logger (logging.Logger): Logger instance for logging messages. Default: None.
            host (str): Host for debugging the browser. Default: "localhost".
            debugging_port (int): Port for debugging the browser. Default: 9222.
            cdp_url (str or None): CDP URL to connect to the browser. Default: None.
            browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
                When provided, it takes precedence over the individual keyword arguments.
        """
        # FIX: previously the individual kwargs were ignored and browser_config
        # was dereferenced unconditionally, so the documented default of None
        # raised AttributeError. Fall back to the explicit kwargs when no
        # config object is supplied; behavior with a config is unchanged.
        if browser_config is not None:
            self.browser_type = browser_config.browser_type
            self.user_data_dir = browser_config.user_data_dir
            self.headless = browser_config.headless
            self.debugging_port = browser_config.debugging_port
            self.host = browser_config.host
            self.cdp_url = browser_config.cdp_url
        else:
            self.browser_type = browser_type
            self.user_data_dir = user_data_dir
            self.headless = headless
            self.debugging_port = debugging_port
            self.host = host
            self.cdp_url = cdp_url
        self.browser_process = None
        self.temp_dir = None
        self.logger = logger
        self.shutting_down = False
        self.browser_config = browser_config

    async def start(self) -> str:
        """
        Starts the browser process or returns CDP endpoint URL.
        If cdp_url is provided, returns it directly.
        If user_data_dir is not provided for local browser, creates a temporary directory.

        Returns:
            str: CDP endpoint URL

        Raises:
            Exception: If the browser process fails to start.
        """
        # If CDP URL provided, just return it — nothing to launch locally.
        if self.cdp_url:
            return self.cdp_url

        # Create temp dir if needed (cleaned up in cleanup()).
        if not self.user_data_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
            self.user_data_dir = self.temp_dir

        # Get browser path and args based on OS and browser type
        args = await self._get_browser_args()

        if self.browser_config and self.browser_config.extra_args:
            args.extend(self.browser_config.extra_args)

        # ── make sure no old Chromium instance is owning the same port/profile ──
        try:
            if sys.platform == "win32":
                if psutil is None:
                    raise RuntimeError("psutil not available, cannot clean old browser")
                # Kill only processes that match BOTH our port and our profile,
                # so unrelated Chrome windows are left alone.
                for p in psutil.process_iter(["pid", "name", "cmdline"]):
                    cl = " ".join(p.info.get("cmdline") or [])
                    if (
                        f"--remote-debugging-port={self.debugging_port}" in cl
                        and f"--user-data-dir={self.user_data_dir}" in cl
                    ):
                        p.kill()
                        p.wait(timeout=5)
            else:  # macOS / Linux
                # kill any process listening on the same debugging port
                pids = (
                    subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}"))
                    .decode()
                    .strip()
                    .splitlines()
                )
                for pid in pids:
                    try:
                        os.kill(int(pid), signal.SIGTERM)
                    except ProcessLookupError:
                        pass

            # remove Chromium singleton locks, or new launch exits with
            # "Opening in existing browser session."
            for f in ("SingletonLock", "SingletonSocket", "SingletonCookie"):
                fp = os.path.join(self.user_data_dir, f)
                if os.path.exists(fp):
                    os.remove(fp)
        except Exception as _e:
            # non-fatal — we'll try to start anyway, but log what happened
            if self.logger:
                self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")

        # Start browser process
        try:
            # Use DETACHED_PROCESS flag on Windows to fully detach the process
            # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
            if sys.platform == "win32":
                self.browser_process = subprocess.Popen(
                    args,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
                )
            else:
                self.browser_process = subprocess.Popen(
                    args,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    preexec_fn=os.setpgrp  # Start in a new process group
                )

            # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
            await asyncio.sleep(0.5)  # Give browser time to start
            await self._initial_startup_check()
            await asyncio.sleep(2)  # Give browser time to start
            return f"http://{self.host}:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}")

    async def _initial_startup_check(self):
        """
        Perform a quick check to make sure the browser started successfully.
        This only runs once at startup rather than continuously monitoring.
        """
        if not self.browser_process:
            return

        # Check that process started without immediate termination
        await asyncio.sleep(0.5)
        if self.browser_process.poll() is not None:
            # Process already terminated — collect whatever output it left.
            stdout, stderr = b"", b""
            try:
                stdout, stderr = self.browser_process.communicate(timeout=0.5)
            except subprocess.TimeoutExpired:
                pass

            if self.logger:
                self.logger.error(
                    message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                    tag="ERROR",
                    params={
                        "code": self.browser_process.returncode,
                        "stdout": stdout.decode() if stdout else "",
                        "stderr": stderr.decode() if stderr else "",
                    },
                )

    async def _monitor_browser_process(self):
        """
        Monitor the browser process for unexpected termination.

        How it works:
        1. Read stdout and stderr from the browser process.
        2. If the process has terminated, log the error message and terminate the browser.
        3. If the shutting_down flag is set, log the normal termination message.
        4. If any other error occurs, log the error message.

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process.
        """
        if self.browser_process:
            try:
                # Blocking reads are pushed to threads so the event loop stays free.
                stdout, stderr = await asyncio.gather(
                    asyncio.to_thread(self.browser_process.stdout.read),
                    asyncio.to_thread(self.browser_process.stderr.read),
                )

                # Check shutting_down flag BEFORE logging anything
                if self.browser_process.poll() is not None:
                    if not self.shutting_down:
                        if self.logger:
                            self.logger.error(
                                message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                                tag="ERROR",
                                params={
                                    "code": self.browser_process.returncode,
                                    "stdout": stdout.decode(),
                                    "stderr": stderr.decode(),
                                },
                            )
                        await self.cleanup()
                    else:
                        if self.logger:
                            self.logger.info(
                                message="Browser process terminated normally | Code: {code}",
                                tag="INFO",
                                params={"code": self.browser_process.returncode},
                            )
            except Exception as e:
                if not self.shutting_down:
                    if self.logger:
                        self.logger.error(
                            message="Error monitoring browser process: {error}",
                            tag="ERROR",
                            params={"error": str(e)},
                        )

    def _get_browser_path_WIP(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
        if sys.platform == "darwin":  # macOS
            paths = {
                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
            }
        elif sys.platform == "win32":  # Windows
            paths = {
                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
                "webkit": None,  # WebKit not supported on Windows
            }
        else:  # Linux
            paths = {
                "chromium": "google-chrome",
                "firefox": "firefox",
                "webkit": None,  # WebKit not supported on Linux
            }

        return paths.get(self.browser_type)

    async def _get_browser_path(self) -> str:
        """Resolve the browser executable via the shared chromium-path helper."""
        browser_path = await get_chromium_path(self.browser_type)
        return browser_path

    async def _get_browser_args(self) -> List[str]:
        """Returns full CLI args for launching the browser.

        Raises:
            NotImplementedError: For browser types other than chromium/firefox.
        """
        base = [await self._get_browser_path()]
        if self.browser_type == "chromium":
            flags = [
                f"--remote-debugging-port={self.debugging_port}",
                f"--user-data-dir={self.user_data_dir}",
            ]
            if self.headless:
                flags.append("--headless=new")
            # merge common launch flags (only when a config object exists,
            # since build_browser_flags reads config attributes)
            if self.browser_config:
                flags.extend(self.build_browser_flags(self.browser_config))
        elif self.browser_type == "firefox":
            flags = [
                "--remote-debugging-port",
                str(self.debugging_port),
                "--profile",
                self.user_data_dir,
            ]
            if self.headless:
                flags.append("--headless")
        else:
            raise NotImplementedError(f"Browser type {self.browser_type} not supported")
        return base + flags

    async def cleanup(self):
        """Cleanup browser process and temporary directory"""
        # Set shutting_down flag BEFORE any termination actions
        self.shutting_down = True

        if self.browser_process:
            try:
                # For builtin browsers that should persist, we should check if it's a detached process
                # Only terminate if we have proper control over the process.
                # FIX: poll() returns the exit code once terminated; the old
                # `not poll()` treated exit code 0 as "still running" and
                # terminate()d an already-dead process. `is None` is the
                # documented "still running" check.
                if self.browser_process.poll() is None:
                    # Process is still running
                    self.browser_process.terminate()
                    # Wait for process to end gracefully
                    for _ in range(10):  # 10 attempts, 100ms each
                        if self.browser_process.poll() is not None:
                            break
                        await asyncio.sleep(0.1)

                    # Force kill if still running
                    if self.browser_process.poll() is None:
                        if sys.platform == "win32":
                            # On Windows we might need taskkill for detached processes
                            try:
                                subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
                            except Exception:
                                self.browser_process.kill()
                        else:
                            self.browser_process.kill()
                        await asyncio.sleep(0.1)  # Brief wait for kill to take effect

            except Exception as e:
                if self.logger:
                    self.logger.error(
                        message="Error terminating browser: {error}",
                        tag="ERROR",
                        params={"error": str(e)},
                    )

        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
                if self.logger:
                    self.logger.error(
                        message="Error removing temporary directory: {error}",
                        tag="ERROR",
                        params={"error": str(e)},
                    )

    # These methods have been moved to BrowserProfiler class
    @staticmethod
    async def create_profile(browser_config=None, profile_name=None, logger=None):
        """
        This method has been moved to the BrowserProfiler class.

        Creates a browser profile by launching a browser for interactive user setup
        and waits until the user closes it. The profile is stored in a directory that
        can be used later with BrowserConfig.user_data_dir.

        Please use BrowserProfiler.create_profile() instead.

        Example:
            ```python
            from crawl4ai.browser_profiler import BrowserProfiler

            profiler = BrowserProfiler()
            profile_path = await profiler.create_profile(profile_name="my-login-profile")
            ```
        """
        from .browser_profiler import BrowserProfiler

        # Create a BrowserProfiler instance and delegate to it
        profiler = BrowserProfiler(logger=logger)
        return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config)

    @staticmethod
    def list_profiles():
        """
        This method has been moved to the BrowserProfiler class.

        Lists all available browser profiles in the Crawl4AI profiles directory.

        Please use BrowserProfiler.list_profiles() instead.

        Example:
            ```python
            from crawl4ai.browser_profiler import BrowserProfiler

            profiler = BrowserProfiler()
            profiles = profiler.list_profiles()
            ```
        """
        from .browser_profiler import BrowserProfiler

        # Create a BrowserProfiler instance and delegate to it
        profiler = BrowserProfiler()
        return profiler.list_profiles()

    @staticmethod
    def delete_profile(profile_name_or_path):
        """
        This method has been moved to the BrowserProfiler class.

        Delete a browser profile by name or path.

        Please use BrowserProfiler.delete_profile() instead.

        Example:
            ```python
            from crawl4ai.browser_profiler import BrowserProfiler

            profiler = BrowserProfiler()
            success = profiler.delete_profile("my-profile")
            ```
        """
        from .browser_profiler import BrowserProfiler

        # Create a BrowserProfiler instance and delegate to it
        profiler = BrowserProfiler()
        return profiler.delete_profile(profile_name_or_path)
|
|
|
|
|
|
|
|
|
|
class BrowserManager:
    """
    Manages the browser instance and context.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser (Browser): The browser instance
        default_context (BrowserContext): The default browser context
        managed_browser (ManagedBrowser): The managed browser instance
        playwright (Playwright): The Playwright instance
        sessions (dict): Dictionary to store session information, keyed by
            session_id with values of (context, page, last_used_timestamp)
        session_ttl (int): Session timeout in seconds
    """

    # Class-level cache slot for a shared Playwright driver instance.
    _playwright_instance = None

    @classmethod
    async def get_playwright(cls):
        # NOTE(review): the cached _playwright_instance is overwritten on every
        # call and never consulted, so each call starts a fresh driver —
        # confirm whether reuse of the cached instance was intended.
        from playwright.async_api import async_playwright
        cls._playwright_instance = await async_playwright().start()
        return cls._playwright_instance

    def __init__(self, browser_config: BrowserConfig, logger=None):
        """
        Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config (BrowserConfig): Configuration object containing all browser settings
            logger: Logger instance for recording events and errors
        """
        self.config: BrowserConfig = browser_config
        self.logger = logger

        # Browser state — populated lazily in start().
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # Session management
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

        # Keep track of contexts by a "config signature," so each unique config reuses a single context
        self.contexts_by_config = {}
        self._contexts_lock = asyncio.Lock()

        # Initialize ManagedBrowser if needed (launched later by start()).
        if self.config.use_managed_browser:
            self.managed_browser = ManagedBrowser(
                browser_type=self.config.browser_type,
                user_data_dir=self.config.user_data_dir,
                headless=self.config.headless,
                logger=self.logger,
                debugging_port=self.config.debugging_port,
                cdp_url=self.config.cdp_url,
                browser_config=self.config,
            )

    async def start(self):
        """
        Start the browser instance and set up the default context.

        How it works:
        1. Check if Playwright is already initialized.
        2. If not, initialize Playwright.
        3. If managed browser is used, start it and connect to the CDP endpoint.
        4. If managed browser is not used, launch the browser and set up the default context.

        Note: This method should be called in a separate task to avoid blocking the main event loop.
        """
        # Restarting? Tear down the previous instance first.
        if self.playwright is not None:
            await self.close()

        from playwright.async_api import async_playwright

        self.playwright = await async_playwright().start()

        if self.config.cdp_url or self.config.use_managed_browser:
            # An explicit cdp_url implies managed-browser mode from here on.
            self.config.use_managed_browser = True
            cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
            self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
            # Reuse the browser's existing context when one is already open.
            contexts = self.browser.contexts
            if contexts:
                self.default_context = contexts[0]
            else:
                self.default_context = await self.create_browser_context()
            await self.setup_context(self.default_context)
        else:
            browser_args = self._build_browser_args()

            # Launch appropriate browser type
            if self.config.browser_type == "firefox":
                self.browser = await self.playwright.firefox.launch(**browser_args)
            elif self.config.browser_type == "webkit":
                self.browser = await self.playwright.webkit.launch(**browser_args)
            else:
                self.browser = await self.playwright.chromium.launch(**browser_args)

            # NOTE(review): default_context is set to the Browser object here,
            # not a BrowserContext — confirm downstream callers expect this.
            self.default_context = self.browser

    def _build_browser_args(self) -> dict:
        """Build browser launch arguments from config."""
        args = [
            "--disable-gpu",
            "--disable-gpu-compositing",
            "--disable-software-rasterizer",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-infobars",
            "--window-position=0,0",
            "--ignore-certificate-errors",
            "--ignore-certificate-errors-spki-list",
            "--disable-blink-features=AutomationControlled",
            "--window-position=400,0",
            "--disable-renderer-backgrounding",
            "--disable-ipc-flooding-protection",
            "--force-color-profile=srgb",
            "--mute-audio",
            "--disable-background-timer-throttling",
            # "--single-process",
            f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
        ]

        if self.config.light_mode:
            args.extend(BROWSER_DISABLE_OPTIONS)

        if self.config.text_mode:
            args.extend(
                [
                    "--blink-settings=imagesEnabled=false",
                    "--disable-remote-fonts",
                    "--disable-images",
                    "--disable-javascript",
                    "--disable-software-rasterizer",
                    "--disable-dev-shm-usage",
                ]
            )

        if self.config.extra_args:
            args.extend(self.config.extra_args)

        # Deduplicate args (first occurrence wins, order preserved)
        args = list(dict.fromkeys(args))

        browser_args = {"headless": self.config.headless, "args": args}

        if self.config.chrome_channel:
            browser_args["channel"] = self.config.chrome_channel

        if self.config.accept_downloads:
            browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
                os.getcwd(), "downloads"
            )
            os.makedirs(browser_args["downloads_path"], exist_ok=True)

        if self.config.proxy or self.config.proxy_config:
            from playwright.async_api import ProxySettings

            # Plain proxy URL takes precedence over the structured proxy_config.
            proxy_settings = (
                ProxySettings(server=self.config.proxy)
                if self.config.proxy
                else ProxySettings(
                    server=self.config.proxy_config.server,
                    username=self.config.proxy_config.username,
                    password=self.config.proxy_config.password,
                )
            )
            browser_args["proxy"] = proxy_settings

        return browser_args

    async def setup_context(
        self,
        context: BrowserContext,
        crawlerRunConfig: CrawlerRunConfig = None,
        is_default=False,
    ):
        """
        Set up a browser context with the configured options.

        How it works:
        1. Set extra HTTP headers if provided.
        2. Add cookies if provided.
        3. Load storage state if provided.
        4. Accept downloads if enabled.
        5. Set default timeouts for navigation and download.
        6. Set user agent if provided.
        7. Set browser hints if provided.

        Args:
            context (BrowserContext): The browser context to set up
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
            is_default (bool): Flag indicating if this is the default context
        Returns:
            None
        """
        if self.config.headers:
            await context.set_extra_http_headers(self.config.headers)

        if self.config.cookies:
            await context.add_cookies(self.config.cookies)

        if self.config.storage_state:
            # NOTE(review): context.storage_state(path=None) *reads* the current
            # storage state rather than applying self.config.storage_state —
            # confirm this call has the intended effect.
            await context.storage_state(path=None)

        if self.config.accept_downloads:
            context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
            context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
            if self.config.downloads_path:
                # HACK: pokes Playwright's private _impl_obj options; may break
                # across Playwright versions.
                context._impl_obj._options["accept_downloads"] = True
                context._impl_obj._options[
                    "downloads_path"
                ] = self.config.downloads_path

        # Handle user agent and browser hints
        if self.config.user_agent:
            combined_headers = {
                "User-Agent": self.config.user_agent,
                "sec-ch-ua": self.config.browser_hint,
            }
            # assumes self.config.headers is always a dict (never None) —
            # TODO confirm BrowserConfig guarantees this default
            combined_headers.update(self.config.headers)
            await context.set_extra_http_headers(combined_headers)

        # Add default cookie so sites see cookies as enabled.
        await context.add_cookies(
            [
                {
                    "name": "cookiesEnabled",
                    "value": "true",
                    "url": crawlerRunConfig.url
                    if crawlerRunConfig and crawlerRunConfig.url
                    else "https://crawl4ai.com/",
                }
            ]
        )

        # Handle navigator overrides (anti-bot-detection JS injected per page)
        if crawlerRunConfig:
            if (
                crawlerRunConfig.override_navigator
                or crawlerRunConfig.simulate_user
                or crawlerRunConfig.magic
            ):
                await context.add_init_script(load_js_script("navigator_overrider"))

    async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
        """
        Creates and returns a new browser context with configured settings.
        Applies text-only mode settings if text_mode is enabled in config.

        Returns:
            Context: Browser context object with the specified configurations
        """
        # Base settings: an explicit User-Agent header overrides config.user_agent
        user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
        viewport_settings = {
            "width": self.config.viewport_width,
            "height": self.config.viewport_height,
        }
        proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

        # Resource extensions aborted at the network layer in text mode.
        blocked_extensions = [
            # Images
            "jpg",
            "jpeg",
            "png",
            "gif",
            "webp",
            "svg",
            "ico",
            "bmp",
            "tiff",
            "psd",
            # Fonts
            "woff",
            "woff2",
            "ttf",
            "otf",
            "eot",
            # Styles
            # 'css', 'less', 'scss', 'sass',
            # Media
            "mp4",
            "webm",
            "ogg",
            "avi",
            "mov",
            "wmv",
            "flv",
            "m4v",
            "mp3",
            "wav",
            "aac",
            "m4a",
            "opus",
            "flac",
            # Documents
            "pdf",
            "doc",
            "docx",
            "xls",
            "xlsx",
            "ppt",
            "pptx",
            # Archives
            "zip",
            "rar",
            "7z",
            "tar",
            "gz",
            # Scripts and data
            "xml",
            "swf",
            "wasm",
        ]

        # Common context settings
        context_settings = {
            "user_agent": user_agent,
            "viewport": viewport_settings,
            "proxy": proxy_settings,
            "accept_downloads": self.config.accept_downloads,
            "storage_state": self.config.storage_state,
            "ignore_https_errors": self.config.ignore_https_errors,
            "device_scale_factor": 1.0,
            "java_script_enabled": self.config.java_script_enabled,
        }

        if crawlerRunConfig:
            # Check if there is value for crawlerRunConfig.proxy_config set add that to context
            if crawlerRunConfig.proxy_config:
                proxy_settings = {
                    "server": crawlerRunConfig.proxy_config.server,
                }
                if crawlerRunConfig.proxy_config.username:
                    proxy_settings.update({
                        "username": crawlerRunConfig.proxy_config.username,
                        "password": crawlerRunConfig.proxy_config.password,
                    })
                context_settings["proxy"] = proxy_settings

        if self.config.text_mode:
            text_mode_settings = {
                "has_touch": False,
                "is_mobile": False,
            }
            # Update context settings with text mode settings
            context_settings.update(text_mode_settings)

        # inject locale / tz / geo if user provided them
        if crawlerRunConfig:
            if crawlerRunConfig.locale:
                context_settings["locale"] = crawlerRunConfig.locale
            if crawlerRunConfig.timezone_id:
                context_settings["timezone_id"] = crawlerRunConfig.timezone_id
            if crawlerRunConfig.geolocation:
                context_settings["geolocation"] = {
                    "latitude": crawlerRunConfig.geolocation.latitude,
                    "longitude": crawlerRunConfig.geolocation.longitude,
                    "accuracy": crawlerRunConfig.geolocation.accuracy,
                }
                # ensure geolocation permission
                perms = context_settings.get("permissions", [])
                perms.append("geolocation")
                context_settings["permissions"] = perms

        # Create and return the context with all settings
        context = await self.browser.new_context(**context_settings)

        # Apply text mode settings if enabled
        if self.config.text_mode:
            # Create and apply route patterns for each extension
            for ext in blocked_extensions:
                await context.route(f"**/*.{ext}", lambda route: route.abort())
        return context

    def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
        """
        Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
        then returns a hash of the sorted JSON. This yields a stable signature
        that identifies configurations requiring a unique browser context.
        """
        import json

        config_dict = crawlerRunConfig.__dict__.copy()
        # Exclude items that do not affect browser-level setup.
        # Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
        ephemeral_keys = [
            "session_id",
            "js_code",
            "scraping_strategy",
            "extraction_strategy",
            "chunking_strategy",
            "cache_mode",
            "content_filter",
            "semaphore_count",
            "url"
        ]

        # Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
        # and should cause a new context to be created if they change

        for key in ephemeral_keys:
            if key in config_dict:
                del config_dict[key]
        # Convert to canonical JSON string (default=str makes non-JSON types stable)
        signature_json = json.dumps(config_dict, sort_keys=True, default=str)

        # Hash the JSON so we get a compact, unique string
        signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
        return signature_hash

    async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
        """
        Get a page for the given session ID, creating a new one if needed.

        Args:
            crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings

        Returns:
            (page, context): The Page and its BrowserContext
        """
        self._cleanup_expired_sessions()

        # If a session_id is provided and we already have it, reuse that page + context
        if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
            context, page, _ = self.sessions[crawlerRunConfig.session_id]
            # Update last-used timestamp
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
            return page, context

        # If using a managed browser, just grab the shared default_context
        if self.config.use_managed_browser:
            context = self.default_context
            pages = context.pages
            # Prefer a tab already showing the target URL; otherwise reuse the
            # first tab or open a new one.
            page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
            if not page:
                if pages:
                    page = context.pages[0]
                else:
                    page = await context.new_page()
        else:
            # Otherwise, check if we have an existing context for this config
            config_signature = self._make_config_signature(crawlerRunConfig)

            # Lock prevents two concurrent crawls creating duplicate contexts
            # for the same signature.
            async with self._contexts_lock:
                if config_signature in self.contexts_by_config:
                    context = self.contexts_by_config[config_signature]
                else:
                    # Create and setup a new context
                    context = await self.create_browser_context(crawlerRunConfig)
                    await self.setup_context(context, crawlerRunConfig)
                    self.contexts_by_config[config_signature] = context

            # Create a new page from the chosen context
            page = await context.new_page()

        # If a session_id is specified, store this session so we can reuse later
        if crawlerRunConfig.session_id:
            self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())

        return page, context

    async def kill_session(self, session_id: str):
        """
        Kill a browser session and clean up resources.

        Args:
            session_id (str): The session ID to kill.
        """
        if session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            # Managed-browser contexts are shared; only close per-session
            # contexts we own.
            if not self.config.use_managed_browser:
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL (fire-and-forget tasks)."""
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))

    async def close(self):
        """Close all browser resources and clean up."""
        # NOTE(review): when connected to an external browser via cdp_url we
        # deliberately touch nothing — but this also leaves self.playwright
        # running; confirm that is intended.
        if self.config.cdp_url:
            return

        if self.config.sleep_on_close:
            await asyncio.sleep(0.5)

        session_ids = list(self.sessions.keys())
        for session_id in session_ids:
            await self.kill_session(session_id)

        # Now close all contexts we created. This reclaims memory from ephemeral contexts.
        for ctx in self.contexts_by_config.values():
            try:
                await ctx.close()
            except Exception as e:
                # assumes self.logger is set whenever close() is reached —
                # TODO confirm; a None logger would raise here
                self.logger.error(
                    message="Error closing context: {error}",
                    tag="ERROR",
                    params={"error": str(e)}
                )
        self.contexts_by_config.clear()

        if self.browser:
            await self.browser.close()
            self.browser = None

        if self.managed_browser:
            await asyncio.sleep(0.5)
            await self.managed_browser.cleanup()
            self.managed_browser = None

        if self.playwright:
            await self.playwright.stop()
            self.playwright = None
|