refactor(browser): improve browser path management

Implement more robust browser executable path handling using playwright's built-in browser management. This change:
- Adds async browser path resolution
- Implements path caching in the home folder
- Removes hardcoded browser paths
- Adds httpx dependency
- Removes obsolete test result files

This change makes the browser path resolution more reliable across different platforms and environments.
This commit is contained in:
UncleCode
2025-01-17 22:14:37 +08:00
parent ece9202b61
commit 2d6b19e1a2
7 changed files with 74 additions and 74 deletions

View File

@@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig from playwright_stealth import StealthConfig
from .ssl_certificate import SSLCertificate from .ssl_certificate import SSLCertificate
from .utils import get_home_folder, get_chromium_path
stealth_config = StealthConfig( stealth_config = StealthConfig(
webdriver=True, webdriver=True,
@@ -139,7 +140,7 @@ class ManagedBrowser:
# Get browser path and args based on OS and browser type # Get browser path and args based on OS and browser type
# browser_path = self._get_browser_path() # browser_path = self._get_browser_path()
args = self._get_browser_args() args = await self._get_browser_args()
# Start browser process # Start browser process
try: try:
@@ -200,7 +201,7 @@ class ManagedBrowser:
params={"error": str(e)}, params={"error": str(e)},
) )
def _get_browser_path(self) -> str: def _get_browser_path_WIP(self) -> str:
"""Returns the browser executable path based on OS and browser type""" """Returns the browser executable path based on OS and browser type"""
if sys.platform == "darwin": # macOS if sys.platform == "darwin": # macOS
paths = { paths = {
@@ -223,9 +224,13 @@ class ManagedBrowser:
return paths.get(self.browser_type) return paths.get(self.browser_type)
def _get_browser_args(self) -> List[str]: async def _get_browser_path(self) -> str:
browser_path = await get_chromium_path(self.browser_type)
return browser_path
async def _get_browser_args(self) -> List[str]:
"""Returns browser-specific command line arguments""" """Returns browser-specific command line arguments"""
base_args = [self._get_browser_path()] base_args = [await self._get_browser_path()]
if self.browser_type == "chromium": if self.browser_type == "chromium":
args = [ args = [

View File

@@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message
# Set up logging # Set up logging
# logging.basicConfig(level=logging.INFO) # logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__) # logger = logging.getLogger(__name__)
# logger.setLevel(logging.INFO)
base_directory = DB_PATH = os.path.join( base_directory = DB_PATH = os.path.join(
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"

View File

@@ -209,6 +209,58 @@ def get_home_folder():
os.makedirs(f"{home_folder}/models", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True)
return home_folder return home_folder
async def get_chromium_path(browser_type) -> str:
    """Return the browser executable path reported by Playwright.

    Despite the name, this resolves the executable for any of the supported
    browser types ("chromium", "firefox" or "webkit"). The resolved path is
    cached in ``<home_folder>/<browser_type>.path`` so later calls can skip
    starting Playwright.

    Args:
        browser_type: One of "chromium", "firefox" or "webkit".

    Returns:
        str: Path to the browser executable.

    Raises:
        RuntimeError: If the browser type is unsupported or Playwright
            reports no executable path for it.
    """
    supported_types = ("chromium", "firefox", "webkit")
    if browser_type not in supported_types:
        # Report the value the caller actually passed, not a lookup result.
        raise RuntimeError(f"Unsupported browser type: {browser_type}")

    # Reuse a previously saved path for this browser type when it is usable.
    path_file = os.path.join(get_home_folder(), f"{browser_type}.path")
    try:
        with open(path_file, "r") as f:
            cached_path = f.read().strip()
    except FileNotFoundError:
        cached_path = ""
    # An empty or stale cache entry (e.g. the browser was reinstalled in a
    # different location) is ignored and the path is resolved again below.
    if cached_path and os.path.exists(cached_path):
        return cached_path

    # Imported lazily so Playwright is only loaded when the cache misses.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser_path = getattr(p, browser_type).executable_path

    if not browser_path:
        raise RuntimeError(f"Browser executable not found for type: {browser_type}")

    # Save the path in the crawl4ai home folder, one file per browser type.
    with open(path_file, "w") as f:
        f.write(browser_path)
    return browser_path
def beautify_html(escaped_html): def beautify_html(escaped_html):
""" """

View File

@@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import (
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class TaskStatus(str, Enum): class TaskStatus(str, Enum):

View File

@@ -36,6 +36,7 @@ dependencies = [
"aiofiles", "aiofiles",
"rich>=13.9.4", "rich>=13.9.4",
"cssselect>=1.2.0", "cssselect>=1.2.0",
"httpx==0.27.2",
] ]
classifiers = [ classifiers = [
"Development Status :: 3 - Alpha", "Development Status :: 3 - Alpha",
@@ -77,4 +78,12 @@ packages = {find = {where = ["."], include = ["crawl4ai*"]}}
crawl4ai = ["js_snippet/*.js"] crawl4ai = ["js_snippet/*.js"]
[tool.setuptools.dynamic] [tool.setuptools.dynamic]
version = {attr = "crawl4ai.__version__.__version__"} version = {attr = "crawl4ai.__version__.__version__"}
[tool.uv.sources]
crawl4ai = { workspace = true }
[dependency-groups]
dev = [
"crawl4ai",
]

View File

@@ -1,16 +0,0 @@
{
"tests": [
{
"case": "complicated_exclude_all_links",
"lxml_mode": {
"differences": {},
"execution_time": 0.0019578933715820312
},
"original_time": 0.0059909820556640625
}
],
"summary": {
"passed": 1,
"failed": 0
}
}

View File

@@ -1,52 +0,0 @@
{
"original": {
"performance": [],
"differences": []
},
"batch": {
"performance": [
{
"case": "basic",
"metrics": {
"time": 0.8874530792236328,
"memory": 98.328125
}
}
],
"differences": [
{
"case": "basic",
"differences": {
"images_count": {
"old": 50,
"new": 0,
"diff": -50
}
}
}
]
},
"lxml": {
"performance": [
{
"case": "basic",
"metrics": {
"time": 1.210719108581543,
"memory": 99.921875
}
}
],
"differences": [
{
"case": "basic",
"differences": {
"images_count": {
"old": 50,
"new": 0,
"diff": -50
}
}
}
]
}
}