From 2d6b19e1a25bfdf7dfeab3faff958a2d7822d3cd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 17 Jan 2025 22:14:37 +0800 Subject: [PATCH] refactor(browser): improve browser path management Implement more robust browser executable path handling using playwright's built-in browser management. This change: - Adds async browser path resolution - Implements path caching in the home folder - Removes hardcoded browser paths - Adds httpx dependency - Removes obsolete test result files This change makes the browser path resolution more reliable across different platforms and environments. --- crawl4ai/async_crawler_strategy.py | 13 +++++--- crawl4ai/async_database.py | 1 + crawl4ai/utils.py | 52 ++++++++++++++++++++++++++++++ main.py | 3 +- pyproject.toml | 11 ++++++- scraper_equivalence_results.json | 16 --------- scraper_evaluation.json | 52 ------------------------------ 7 files changed, 74 insertions(+), 74 deletions(-) delete mode 100644 scraper_equivalence_results.json delete mode 100644 scraper_evaluation.json diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0edefa73..60590035 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig from .async_logger import AsyncLogger from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate +from .utils import get_home_folder, get_chromium_path stealth_config = StealthConfig( webdriver=True, @@ -139,7 +140,7 @@ class ManagedBrowser: # Get browser path and args based on OS and browser type # browser_path = self._get_browser_path() - args = self._get_browser_args() + args = await self._get_browser_args() # Start browser process try: @@ -200,7 +201,7 @@ class ManagedBrowser: params={"error": str(e)}, ) - def _get_browser_path(self) -> str: + def _get_browser_path_WIP(self) -> str: """Returns the browser executable path based on OS and browser type""" 
if sys.platform == "darwin": # macOS paths = { @@ -223,9 +224,13 @@ class ManagedBrowser: return paths.get(self.browser_type) - def _get_browser_args(self) -> List[str]: + async def _get_browser_path(self) -> str: + browser_path = await get_chromium_path(self.browser_type) + return browser_path + + async def _get_browser_args(self) -> List[str]: """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] + base_args = [await self._get_browser_path()] if self.browser_type == "chromium": args = [ diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index ca5e6ef2..b0c20f29 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message # Set up logging # logging.basicConfig(level=logging.INFO) # logger = logging.getLogger(__name__) +# logger.setLevel(logging.INFO) base_directory = DB_PATH = os.path.join( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 63c8a092..ea1309a8 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -209,6 +209,58 @@ def get_home_folder(): os.makedirs(f"{home_folder}/models", exist_ok=True) return home_folder +async def get_chromium_path(browser_type) -> str: + """Returns the browser executable path using playwright's browser management. + + Uses playwright's built-in browser management to get the correct browser executable + path regardless of platform. This ensures we're using the same browser version + that playwright is tested with. 
+ + Returns: + str: Path to browser executable + Raises: + RuntimeError: If browser executable cannot be found + """ + browser_types = { + "chromium": "chromium", + "firefox": "firefox", + "webkit": "webkit" + } + + browser_type = browser_types.get(browser_type) + if not browser_type: + raise RuntimeError(f"Unsupported browser type: {browser_type}") + + # Check if a path has already been saved for this browser type + home_folder = get_home_folder() + path_file = os.path.join(home_folder, f"{browser_type.lower()}.path") + if os.path.exists(path_file): + with open(path_file, "r") as f: + return f.read() + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browsers = { + 'chromium': p.chromium, + 'firefox': p.firefox, + 'webkit': p.webkit + } + + if browser_type.lower() not in browsers: + raise ValueError( + f"Invalid browser type. Must be one of: {', '.join(browsers.keys())}" + ) + + # Save the path in the crawl4ai home folder + home_folder = get_home_folder() + browser_path = browsers[browser_type.lower()].executable_path + if not browser_path: + raise RuntimeError(f"Browser executable not found for type: {browser_type}") + # Save the path in a text file with browser type name + with open(os.path.join(home_folder, f"{browser_type.lower()}.path"), "w") as f: + f.write(browser_path) + + return browser_path def beautify_html(escaped_html): """ diff --git a/main.py b/main.py index 1f9e01a3..029653cd 100644 --- a/main.py +++ b/main.py @@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import ( __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class TaskStatus(str, Enum): diff --git a/pyproject.toml b/pyproject.toml index 7ca779d5..c9bd9ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", + 
"httpx==0.27.2", ] classifiers = [ "Development Status :: 3 - Alpha", @@ -77,4 +78,12 @@ packages = {find = {where = ["."], include = ["crawl4ai*"]}} crawl4ai = ["js_snippet/*.js"] [tool.setuptools.dynamic] -version = {attr = "crawl4ai.__version__.__version__"} \ No newline at end of file +version = {attr = "crawl4ai.__version__.__version__"} + +[tool.uv.sources] +crawl4ai = { workspace = true } + +[dependency-groups] +dev = [ + "crawl4ai", +] diff --git a/scraper_equivalence_results.json b/scraper_equivalence_results.json deleted file mode 100644 index 2ad1080a..00000000 --- a/scraper_equivalence_results.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "tests": [ - { - "case": "complicated_exclude_all_links", - "lxml_mode": { - "differences": {}, - "execution_time": 0.0019578933715820312 - }, - "original_time": 0.0059909820556640625 - } - ], - "summary": { - "passed": 1, - "failed": 0 - } -} \ No newline at end of file diff --git a/scraper_evaluation.json b/scraper_evaluation.json deleted file mode 100644 index 9606d906..00000000 --- a/scraper_evaluation.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "original": { - "performance": [], - "differences": [] - }, - "batch": { - "performance": [ - { - "case": "basic", - "metrics": { - "time": 0.8874530792236328, - "memory": 98.328125 - } - } - ], - "differences": [ - { - "case": "basic", - "differences": { - "images_count": { - "old": 50, - "new": 0, - "diff": -50 - } - } - } - ] - }, - "lxml": { - "performance": [ - { - "case": "basic", - "metrics": { - "time": 1.210719108581543, - "memory": 99.921875 - } - } - ], - "differences": [ - { - "case": "basic", - "differences": { - "images_count": { - "old": 50, - "new": 0, - "diff": -50 - } - } - } - ] - } -} \ No newline at end of file