refactor(browser): improve browser path management

Implement more robust browser executable path handling using playwright's built-in browser management. This change:
- Adds async browser path resolution
- Implements path caching in the home folder
- Removes hardcoded browser paths
- Adds httpx dependency
- Removes obsolete test result files

This change makes the browser path resolution more reliable across different platforms and environments.
2025-01-17 22:14:37 +08:00
parent ece9202b61
commit 2d6b19e1a2
7 changed files with 74 additions and 74 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
 from .async_logger import AsyncLogger
 from playwright_stealth import StealthConfig
 from .ssl_certificate import SSLCertificate
 from .utils import get_home_folder, get_chromium_path
 stealth_config = StealthConfig(
    webdriver=True,
@@ -139,7 +140,7 @@ class ManagedBrowser:
        # Get browser path and args based on OS and browser type
        # browser_path = self._get_browser_path()
-        args = self._get_browser_args()
+        args = await self._get_browser_args()
        # Start browser process
        try:
@@ -200,7 +201,7 @@ class ManagedBrowser:
                        params={"error": str(e)},
                    )
-    def _get_browser_path(self) -> str:
+    def _get_browser_path_WIP(self) -> str:
        """Returns the browser executable path based on OS and browser type"""
        if sys.platform == "darwin":  # macOS
            paths = {
@@ -223,9 +224,13 @@ class ManagedBrowser:
        return paths.get(self.browser_type)
-    def _get_browser_args(self) -> List[str]:
+    async def _get_browser_path(self) -> str:
        browser_path = await get_chromium_path(self.browser_type)
        return browser_path
    async def _get_browser_args(self) -> List[str]:
        """Returns browser-specific command line arguments"""
-        base_args = [self._get_browser_path()]
+        base_args = [await self._get_browser_path()]
        if self.browser_type == "chromium":
            args = [
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message
 # Set up logging
 # logging.basicConfig(level=logging.INFO)
 # logger = logging.getLogger(__name__)
 # logger.setLevel(logging.INFO)
 base_directory = DB_PATH = os.path.join(
    os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -209,6 +209,58 @@ def get_home_folder():
    os.makedirs(f"{home_folder}/models", exist_ok=True)
    return home_folder
 async def get_chromium_path(browser_type) -> str:
     """Return the executable path of a playwright-managed browser.

     The path is looked up through playwright's own browser management and
     cached in ``<home_folder>/<browser_type>.path`` so later calls do not
     need to start the playwright driver again.

     Args:
         browser_type: One of ``"chromium"``, ``"firefox"`` or ``"webkit"``.

     Returns:
         str: Path to browser executable

     Raises:
         RuntimeError: If the browser type is unsupported or the browser
             executable cannot be found
     """
     supported_types = ("chromium", "firefox", "webkit")
     if browser_type not in supported_types:
         # Report the value the caller actually passed in.
         raise RuntimeError(f"Unsupported browser type: {browser_type}")

     # Check if a path has already been saved for this browser type
     path_file = os.path.join(get_home_folder(), f"{browser_type}.path")
     try:
         with open(path_file, "r", encoding="utf-8") as f:
             cached_path = f.read().strip()
     except FileNotFoundError:
         cached_path = ""
     # Only trust the cache while it still points at something on disk;
     # an empty or stale entry falls through and is re-resolved below.
     if cached_path and os.path.exists(cached_path):
         return cached_path

     from playwright.async_api import async_playwright

     async with async_playwright() as p:
         # The playwright object exposes one attribute per supported browser.
         browser_path = getattr(p, browser_type).executable_path

     if not browser_path:
         raise RuntimeError(f"Browser executable not found for type: {browser_type}")

     # Save the path in the crawl4ai home folder, in a text file named
     # after the browser type
     with open(path_file, "w", encoding="utf-8") as f:
         f.write(browser_path)
     return browser_path
 def beautify_html(escaped_html):
    """
--- a/main.py
+++ b/main.py
@@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import (
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-logging.basicConfig(level=logging.INFO)
+
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 class TaskStatus(str, Enum):
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
    "aiofiles",
    "rich>=13.9.4",
    "cssselect>=1.2.0",
    "httpx==0.27.2",
 ]
 classifiers = [
    "Development Status :: 3 - Alpha",
@@ -77,4 +78,12 @@ packages = {find = {where = ["."], include = ["crawl4ai*"]}}
 crawl4ai = ["js_snippet/*.js"]
 [tool.setuptools.dynamic]
-version = {attr = "crawl4ai.__version__.__version__"}
+version = {attr = "crawl4ai.__version__.__version__"}
 [tool.uv.sources]
 crawl4ai = { workspace = true }
 [dependency-groups]
 dev = [
    "crawl4ai",
 ]
--- a/scraper_equivalence_results.json
+++ b/scraper_equivalence_results.json
@@ -1,16 +0,0 @@
 {
  "tests": [
    {
      "case": "complicated_exclude_all_links",
      "lxml_mode": {
        "differences": {},
        "execution_time": 0.0019578933715820312
      },
      "original_time": 0.0059909820556640625
    }
  ],
  "summary": {
    "passed": 1,
    "failed": 0
  }
 }
--- a/scraper_evaluation.json
+++ b/scraper_evaluation.json
@@ -1,52 +0,0 @@
 {
  "original": {
    "performance": [],
    "differences": []
  },
  "batch": {
    "performance": [
      {
        "case": "basic",
        "metrics": {
          "time": 0.8874530792236328,
          "memory": 98.328125
        }
      }
    ],
    "differences": [
      {
        "case": "basic",
        "differences": {
          "images_count": {
            "old": 50,
            "new": 0,
            "diff": -50
          }
        }
      }
    ]
  },
  "lxml": {
    "performance": [
      {
        "case": "basic",
        "metrics": {
          "time": 1.210719108581543,
          "memory": 99.921875
        }
      }
    ],
    "differences": [
      {
        "case": "basic",
        "differences": {
          "images_count": {
            "old": 50,
            "new": 0,
            "diff": -50
          }
        }
      }
    ]
  }
 }