refactor(browser): improve browser path management
Implement more robust browser executable path handling using Playwright's built-in browser management. This change adds async browser path resolution; implements path caching in the home folder; removes hardcoded browser paths; adds the httpx dependency; and removes obsolete test result files. Together, these changes make browser path resolution more reliable across different platforms and environments.
This commit is contained in:
@@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_logger import AsyncLogger
|
||||
from playwright_stealth import StealthConfig
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .utils import get_home_folder, get_chromium_path
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
@@ -139,7 +140,7 @@ class ManagedBrowser:
|
||||
|
||||
# Get browser path and args based on OS and browser type
|
||||
# browser_path = self._get_browser_path()
|
||||
args = self._get_browser_args()
|
||||
args = await self._get_browser_args()
|
||||
|
||||
# Start browser process
|
||||
try:
|
||||
@@ -200,7 +201,7 @@ class ManagedBrowser:
|
||||
params={"error": str(e)},
|
||||
)
|
||||
|
||||
def _get_browser_path(self) -> str:
|
||||
def _get_browser_path_WIP(self) -> str:
|
||||
"""Returns the browser executable path based on OS and browser type"""
|
||||
if sys.platform == "darwin": # macOS
|
||||
paths = {
|
||||
@@ -223,9 +224,13 @@ class ManagedBrowser:
|
||||
|
||||
return paths.get(self.browser_type)
|
||||
|
||||
def _get_browser_args(self) -> List[str]:
|
||||
async def _get_browser_path(self) -> str:
    """Return the executable path for this instance's ``browser_type``.

    Delegates to ``get_chromium_path`` and returns its result unchanged.
    """
    return await get_chromium_path(self.browser_type)
|
||||
|
||||
async def _get_browser_args(self) -> List[str]:
|
||||
"""Returns browser-specific command line arguments"""
|
||||
base_args = [self._get_browser_path()]
|
||||
base_args = [await self._get_browser_path()]
|
||||
|
||||
if self.browser_type == "chromium":
|
||||
args = [
|
||||
|
||||
@@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message
|
||||
# Set up logging
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
# logger = logging.getLogger(__name__)
|
||||
# logger.setLevel(logging.INFO)
|
||||
|
||||
base_directory = DB_PATH = os.path.join(
|
||||
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
|
||||
|
||||
@@ -209,6 +209,58 @@ def get_home_folder():
|
||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||
return home_folder
|
||||
|
||||
async def get_chromium_path(browser_type) -> str:
    """Return the executable path of a Playwright-managed browser.

    Despite the name, this resolves any of the supported browser types
    ("chromium", "firefox", "webkit"). The resolved path is cached in a
    ``<browser_type>.path`` text file inside the folder returned by
    ``get_home_folder()`` so that later calls can skip starting Playwright.

    Args:
        browser_type: One of "chromium", "firefox" or "webkit"
            (matched case-sensitively).

    Returns:
        str: Path to browser executable

    Raises:
        RuntimeError: If ``browser_type`` is not supported, or if Playwright
            reports no executable path for it.
    """
    supported_types = ("chromium", "firefox", "webkit")
    if browser_type not in supported_types:
        # Report the value the caller actually passed; it must not be
        # overwritten by a failed lookup before the message is built.
        raise RuntimeError(f"Unsupported browser type: {browser_type}")

    path_file = os.path.join(get_home_folder(), f"{browser_type}.path")

    # Reuse a previously saved path, but only if it is still usable: an
    # unreadable, empty, or stale cache file (e.g. the browser build was
    # removed or replaced) is treated as a cache miss and re-resolved below.
    try:
        with open(path_file, "r") as f:
            cached_path = f.read().strip()
    except (OSError, UnicodeDecodeError):
        cached_path = ""
    if cached_path and os.path.exists(cached_path):
        return cached_path

    # Imported lazily so Playwright is only loaded on a cache miss.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        # The Playwright object exposes one attribute per supported type
        # (p.chromium, p.firefox, p.webkit); the name was validated above.
        browser_path = getattr(p, browser_type).executable_path

    if not browser_path:
        raise RuntimeError(f"Browser executable not found for type: {browser_type}")

    # Save the path in the crawl4ai home folder, in a text file named after
    # the browser type.
    with open(path_file, "w") as f:
        f.write(browser_path)

    return browser_path
|
||||
|
||||
def beautify_html(escaped_html):
|
||||
"""
|
||||
|
||||
3
main.py
3
main.py
@@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import (
|
||||
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
class TaskStatus(str, Enum):
|
||||
|
||||
@@ -36,6 +36,7 @@ dependencies = [
|
||||
"aiofiles",
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx==0.27.2",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
@@ -78,3 +79,11 @@ crawl4ai = ["js_snippet/*.js"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "crawl4ai.__version__.__version__"}
|
||||
|
||||
[tool.uv.sources]
|
||||
crawl4ai = { workspace = true }
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"crawl4ai",
|
||||
]
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"tests": [
|
||||
{
|
||||
"case": "complicated_exclude_all_links",
|
||||
"lxml_mode": {
|
||||
"differences": {},
|
||||
"execution_time": 0.0019578933715820312
|
||||
},
|
||||
"original_time": 0.0059909820556640625
|
||||
}
|
||||
],
|
||||
"summary": {
|
||||
"passed": 1,
|
||||
"failed": 0
|
||||
}
|
||||
}
|
||||
@@ -1,52 +0,0 @@
|
||||
{
|
||||
"original": {
|
||||
"performance": [],
|
||||
"differences": []
|
||||
},
|
||||
"batch": {
|
||||
"performance": [
|
||||
{
|
||||
"case": "basic",
|
||||
"metrics": {
|
||||
"time": 0.8874530792236328,
|
||||
"memory": 98.328125
|
||||
}
|
||||
}
|
||||
],
|
||||
"differences": [
|
||||
{
|
||||
"case": "basic",
|
||||
"differences": {
|
||||
"images_count": {
|
||||
"old": 50,
|
||||
"new": 0,
|
||||
"diff": -50
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"lxml": {
|
||||
"performance": [
|
||||
{
|
||||
"case": "basic",
|
||||
"metrics": {
|
||||
"time": 1.210719108581543,
|
||||
"memory": 99.921875
|
||||
}
|
||||
}
|
||||
],
|
||||
"differences": [
|
||||
{
|
||||
"case": "basic",
|
||||
"differences": {
|
||||
"images_count": {
|
||||
"old": 50,
|
||||
"new": 0,
|
||||
"diff": -50
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user