feat(browser): improve browser profile management and cleanup
Enhance browser profile handling with better process cleanup and documentation:
- Add process cleanup for existing Chromium instances on Windows/Unix
- Fix profile creation by passing complete browser config
- Add comprehensive documentation for browser and CLI components
- Add initial profile creation test
- Bump version to 0.6.3

This change improves reliability when managing browser profiles and provides better documentation for developers.
This commit is contained in:
@@ -1,3 +1,3 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.6.2"
|
__version__ = "0.6.3"
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,10 @@ import os
|
|||||||
import sys
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import psutil
|
||||||
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import shlex
|
||||||
from playwright.async_api import BrowserContext
|
from playwright.async_api import BrowserContext
|
||||||
import hashlib
|
import hashlib
|
||||||
from .js_snippet import load_js_script
|
from .js_snippet import load_js_script
|
||||||
@@ -193,6 +196,45 @@ class ManagedBrowser:
|
|||||||
|
|
||||||
if self.browser_config.extra_args:
|
if self.browser_config.extra_args:
|
||||||
args.extend(self.browser_config.extra_args)
|
args.extend(self.browser_config.extra_args)
|
||||||
|
|
||||||
|
|
||||||
|
# ── make sure no old Chromium instance is owning the same port/profile ──
|
||||||
|
try:
|
||||||
|
if sys.platform == "win32":
|
||||||
|
if psutil is None:
|
||||||
|
raise RuntimeError("psutil not available, cannot clean old browser")
|
||||||
|
for p in psutil.process_iter(["pid", "name", "cmdline"]):
|
||||||
|
cl = " ".join(p.info.get("cmdline") or [])
|
||||||
|
if (
|
||||||
|
f"--remote-debugging-port={self.debugging_port}" in cl
|
||||||
|
and f"--user-data-dir={self.user_data_dir}" in cl
|
||||||
|
):
|
||||||
|
p.kill()
|
||||||
|
p.wait(timeout=5)
|
||||||
|
else: # macOS / Linux
|
||||||
|
# kill any process listening on the same debugging port
|
||||||
|
pids = (
|
||||||
|
subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}"))
|
||||||
|
.decode()
|
||||||
|
.strip()
|
||||||
|
.splitlines()
|
||||||
|
)
|
||||||
|
for pid in pids:
|
||||||
|
try:
|
||||||
|
os.kill(int(pid), signal.SIGTERM)
|
||||||
|
except ProcessLookupError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# remove Chromium singleton locks, or new launch exits with
|
||||||
|
# “Opening in existing browser session.”
|
||||||
|
for f in ("SingletonLock", "SingletonSocket", "SingletonCookie"):
|
||||||
|
fp = os.path.join(self.user_data_dir, f)
|
||||||
|
if os.path.exists(fp):
|
||||||
|
os.remove(fp)
|
||||||
|
except Exception as _e:
|
||||||
|
# non-fatal — we'll try to start anyway, but log what happened
|
||||||
|
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
|
||||||
|
|
||||||
|
|
||||||
# Start browser process
|
# Start browser process
|
||||||
try:
|
try:
|
||||||
@@ -922,7 +964,7 @@ class BrowserManager:
|
|||||||
pages = context.pages
|
pages = context.pages
|
||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
if not page:
|
if not page:
|
||||||
page = await context.new_page()
|
page = context.pages[0] # await context.new_page()
|
||||||
else:
|
else:
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|||||||
@@ -140,13 +140,17 @@ class BrowserProfiler:
|
|||||||
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
|
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
|
||||||
self.logger.info(f"{border}\n", tag="PROFILE")
|
self.logger.info(f"{border}\n", tag="PROFILE")
|
||||||
|
|
||||||
|
browser_config.headless = False
|
||||||
|
browser_config.user_data_dir = profile_path
|
||||||
|
|
||||||
|
|
||||||
# Create managed browser instance
|
# Create managed browser instance
|
||||||
managed_browser = ManagedBrowser(
|
managed_browser = ManagedBrowser(
|
||||||
browser_type=browser_config.browser_type,
|
browser_config=browser_config,
|
||||||
user_data_dir=profile_path,
|
# user_data_dir=profile_path,
|
||||||
headless=False, # Must be visible
|
# headless=False, # Must be visible
|
||||||
logger=self.logger,
|
logger=self.logger,
|
||||||
debugging_port=browser_config.debugging_port
|
# debugging_port=browser_config.debugging_port
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set up signal handlers to ensure cleanup on interrupt
|
# Set up signal handlers to ensure cleanup on interrupt
|
||||||
@@ -972,3 +976,30 @@ class BrowserProfiler:
|
|||||||
'info': browser_info
|
'info': browser_info
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Example usage
|
||||||
|
profiler = BrowserProfiler()
|
||||||
|
|
||||||
|
# Create a new profile
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
home_dir = Path.home()
|
||||||
|
profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Launch a standalone browser
|
||||||
|
asyncio.run(profiler.launch_standalone_browser())
|
||||||
|
|
||||||
|
# List profiles
|
||||||
|
profiles = profiler.list_profiles()
|
||||||
|
for profile in profiles:
|
||||||
|
print(f"Profile: {profile['name']}, Path: {profile['path']}")
|
||||||
|
|
||||||
|
# Delete a profile
|
||||||
|
success = profiler.delete_profile("my-profile")
|
||||||
|
if success:
|
||||||
|
print("Profile deleted successfully")
|
||||||
|
else:
|
||||||
|
print("Failed to delete profile")
|
||||||
51
docs/codebase/browser.md
Normal file
51
docs/codebase/browser.md
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
### browser_manager.py
|
||||||
|
|
||||||
|
| Function | What it does |
|
||||||
|
|---|---|
|
||||||
|
| `ManagedBrowser.build_browser_flags` | Returns baseline Chromium CLI flags, disables GPU and sandbox, plugs locale, timezone, stealth tweaks, and any extras from `BrowserConfig`. |
|
||||||
|
| `ManagedBrowser.__init__` | Stores config and logger, creates temp dir, preps internal state. |
|
||||||
|
| `ManagedBrowser.start` | Spawns or connects to the Chromium process, returns its CDP endpoint plus the `subprocess.Popen` handle. |
|
||||||
|
| `ManagedBrowser._initial_startup_check` | Pings the CDP endpoint once to be sure the browser is alive, raises if not. |
|
||||||
|
| `ManagedBrowser._monitor_browser_process` | Async-loops on the subprocess, logs exits or crashes, restarts if policy allows. |
|
||||||
|
| `ManagedBrowser._get_browser_path_WIP` | Old helper that maps OS + browser type to an executable path. |
|
||||||
|
| `ManagedBrowser._get_browser_path` | Current helper, checks env vars, Playwright cache, and OS defaults for the real executable. |
|
||||||
|
| `ManagedBrowser._get_browser_args` | Builds the final CLI arg list by merging user flags, stealth flags, and defaults. |
|
||||||
|
| `ManagedBrowser.cleanup` | Terminates the browser, stops monitors, deletes the temp dir. |
|
||||||
|
| `ManagedBrowser.create_profile` | Opens a visible browser so a human can log in, then zips the resulting user-data-dir to `~/.crawl4ai/profiles/<name>`. |
|
||||||
|
| `ManagedBrowser.list_profiles` | Thin wrapper, now forwarded to `BrowserProfiler.list_profiles()`. |
|
||||||
|
| `ManagedBrowser.delete_profile` | Thin wrapper, now forwarded to `BrowserProfiler.delete_profile()`. |
|
||||||
|
| `BrowserManager.__init__` | Holds the global Playwright instance, browser handle, config signature cache, session map, and logger. |
|
||||||
|
| `BrowserManager.start` | Boots the underlying `ManagedBrowser`, then spins up the default Playwright browser context with stealth patches. |
|
||||||
|
| `BrowserManager._build_browser_args` | Translates `CrawlerRunConfig` (proxy, UA, timezone, headless flag, etc.) into Playwright `launch_args`. |
|
||||||
|
| `BrowserManager.setup_context` | Applies locale, geolocation, permissions, cookies, and UA overrides on a fresh context. |
|
||||||
|
| `BrowserManager.create_browser_context` | Internal helper that actually calls `browser.new_context(**options)` after running `setup_context`. |
|
||||||
|
| `BrowserManager._make_config_signature` | Hashes the non-ephemeral parts of `CrawlerRunConfig` so contexts can be reused safely. |
|
||||||
|
| `BrowserManager.get_page` | Returns a ready `Page` for a given session id, reusing an existing one or creating a new context/page, injects helper scripts, updates `last_used`. |
|
||||||
|
| `BrowserManager.kill_session` | Force-closes a context/page for a session and removes it from the session map. |
|
||||||
|
| `BrowserManager._cleanup_expired_sessions` | Periodic sweep that drops sessions idle longer than `ttl_seconds`. |
|
||||||
|
| `BrowserManager.close` | Gracefully shuts down all contexts, the browser, Playwright, and background tasks. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### browser_profiler.py
|
||||||
|
|
||||||
|
| Function | What it does |
|
||||||
|
|---|---|
|
||||||
|
| `BrowserProfiler.__init__` | Sets up profile folder paths, async logger, and signal handlers. |
|
||||||
|
| `BrowserProfiler.create_profile` | Launches a visible browser with a new user-data-dir for manual login, on exit compresses and stores it as a named profile. |
|
||||||
|
| `BrowserProfiler.cleanup_handler` | General SIGTERM/SIGINT cleanup wrapper that kills child processes. |
|
||||||
|
| `BrowserProfiler.sigint_handler` | Handles Ctrl-C during an interactive session, makes sure the browser shuts down cleanly. |
|
||||||
|
| `BrowserProfiler.listen_for_quit_command` | Async REPL that exits when the user types `q`. |
|
||||||
|
| `BrowserProfiler.list_profiles` | Enumerates `~/.crawl4ai/profiles`, prints profile name, browser type, size, and last modified. |
|
||||||
|
| `BrowserProfiler.get_profile_path` | Returns the absolute path of a profile given its name, or `None` if missing. |
|
||||||
|
| `BrowserProfiler.delete_profile` | Removes a profile folder or a direct path from disk, with optional confirmation prompt. |
|
||||||
|
| `BrowserProfiler.interactive_manager` | Text UI loop for listing, creating, deleting, or launching profiles. |
|
||||||
|
| `BrowserProfiler.launch_standalone_browser` | Starts a non-headless Chromium with remote debugging enabled and keeps it alive for manual tests. |
|
||||||
|
| `BrowserProfiler.get_cdp_json` | Pulls `/json/version` from a CDP endpoint and returns the parsed JSON. |
|
||||||
|
| `BrowserProfiler.launch_builtin_browser` | Spawns a headless Chromium in the background, saves `{wsEndpoint, pid, started_at}` to `~/.crawl4ai/builtin_browser.json`. |
|
||||||
|
| `BrowserProfiler.get_builtin_browser_info` | Reads that JSON file, verifies the PID, and returns browser status info. |
|
||||||
|
| `BrowserProfiler._is_browser_running` | Cross-platform helper that checks if a PID is still alive. |
|
||||||
|
| `BrowserProfiler.kill_builtin_browser` | Terminates the background builtin browser and removes its status file. |
|
||||||
|
| `BrowserProfiler.get_builtin_browser_status` | Returns `{running: bool, wsEndpoint, pid, started_at}` for quick health checks. |
|
||||||
|
|
||||||
|
Let me know what you want to tweak or dive into next.
|
||||||
40
docs/codebase/cli.md
Normal file
40
docs/codebase/cli.md
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
### `cli.py` command surface
|
||||||
|
|
||||||
|
| Command | Inputs / flags | What it does |
|
||||||
|
|---|---|---|
|
||||||
|
| **profiles** | *(none)* | Opens the interactive profile manager, lets you list, create, delete saved browser profiles that live in `~/.crawl4ai/profiles`. |
|
||||||
|
| **browser status** | – | Prints whether the always-on *builtin* browser is running, shows its CDP URL, PID, start time. |
|
||||||
|
| **browser stop** | – | Kills the builtin browser and deletes its status file. |
|
||||||
|
| **browser view** | `--url, -u` URL *(optional)* | Pops a visible window of the builtin browser, navigates to `URL` or `about:blank`. |
|
||||||
|
| **config list** | – | Dumps every global setting, showing current value, default, and description. |
|
||||||
|
| **config get** | `key` | Prints the value of a single setting, falls back to default if unset. |
|
||||||
|
| **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). |
|
||||||
|
| **examples** | – | Just spits out real-world CLI usage samples. |
|
||||||
|
| **crawl** | `url` *(positional)*<br>`--browser-config,-B` path<br>`--crawler-config,-C` path<br>`--filter-config,-f` path<br>`--extraction-config,-e` path<br>`--json-extract,-j` [desc]\*<br>`--schema,-s` path<br>`--browser,-b` k=v list<br>`--crawler,-c` k=v list<br>`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)*<br>`--output-file,-O` path<br>`--bypass-cache,-b` *(flag, default true — note: the short flag `-b` is also used by `--browser`)*<br>`--question,-q` str<br>`--verbose,-v` *(flag)*<br>`--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. |
|
||||||
|
| **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When the first argument is not a known sub-command, it falls through to *crawl*. |
|
||||||
|
|
||||||
|
\* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions.
|
||||||
|
|
||||||
|
> Quick mental model
|
||||||
|
> `profiles` = manage identities,
|
||||||
|
> `browser ...` = control long-running headless Chrome that all crawls can piggy-back on,
|
||||||
|
> `crawl` = do the actual work,
|
||||||
|
> `config` = tweak global defaults,
|
||||||
|
> everything else is sugar.
|
||||||
|
|
||||||
|
### Quick-fire “profile” usage cheatsheet
|
||||||
|
|
||||||
|
| Scenario | Command (copy-paste ready) | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| **Launch interactive Profile Manager UI** | `crwl profiles` | Opens TUI with options: 1 List, 2 Create, 3 Delete, 4 Use-to-crawl, 5 Exit. |
|
||||||
|
| **Create a fresh profile** | `crwl profiles` → choose **2** → name it → browser opens → log in → press **q** in terminal | Saves to `~/.crawl4ai/profiles/<name>`. |
|
||||||
|
| **List saved profiles** | `crwl profiles` → choose **1** | Shows name, browser type, size, last-modified. |
|
||||||
|
| **Delete a profile** | `crwl profiles` → choose **3** → pick the profile index → confirm | Removes the folder. |
|
||||||
|
| **Crawl with a profile (default alias)** | `crwl https://site.com/dashboard -p my-profile` | Keeps login cookies, sets `use_managed_browser=true` under the hood. |
|
||||||
|
| **Crawl + verbose JSON output** | `crwl https://site.com -p my-profile -o json -v` | Any other `crawl` flags work the same. |
|
||||||
|
| **Crawl with extra browser tweaks** | `crwl https://site.com -p my-profile -b "headless=true,viewport_width=1680"` | CLI overrides go on top of the profile. |
|
||||||
|
| **Same but via explicit sub-command** | `crwl crawl https://site.com -p my-profile` | Identical to default alias. |
|
||||||
|
| **Use profile from inside Profile Manager** | `crwl profiles` → choose **4** → pick profile → enter URL → follow prompts | Handy when demo-ing to non-CLI folks. |
|
||||||
|
| **One-off crawl with a profile folder path (no name lookup)** | `crwl https://site.com -b "user_data_dir=$HOME/.crawl4ai/profiles/my-profile,use_managed_browser=true"` | Bypasses registry, useful for CI scripts. |
|
||||||
|
| **Launch a dev browser on CDP port with the same identity** | `crwl cdp -d $HOME/.crawl4ai/profiles/my-profile -P 9223` | Lets Puppeteer/Playwright attach for debugging. |
|
||||||
|
|
||||||
32
tests/profiler/test_crteate_profile.py
Normal file
32
tests/profiler/test_crteate_profile.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
from crawl4ai import BrowserProfiler
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Example usage
|
||||||
|
profiler = BrowserProfiler()
|
||||||
|
|
||||||
|
# Create a new profile
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
home_dir = Path.home()
|
||||||
|
profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile")))
|
||||||
|
|
||||||
|
print(f"Profile created at: {profile_path}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# # Launch a standalone browser
|
||||||
|
# asyncio.run(profiler.launch_standalone_browser())
|
||||||
|
|
||||||
|
# # List profiles
|
||||||
|
# profiles = profiler.list_profiles()
|
||||||
|
# for profile in profiles:
|
||||||
|
# print(f"Profile: {profile['name']}, Path: {profile['path']}")
|
||||||
|
|
||||||
|
# # Delete a profile
|
||||||
|
# success = profiler.delete_profile("my-profile")
|
||||||
|
# if success:
|
||||||
|
# print("Profile deleted successfully")
|
||||||
|
# else:
|
||||||
|
# print("Failed to delete profile")
|
||||||
Reference in New Issue
Block a user