From 2d6b19e1a25bfdf7dfeab3faff958a2d7822d3cd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 17 Jan 2025 22:14:37 +0800 Subject: [PATCH] refactor(browser): improve browser path management Implement more robust browser executable path handling using playwright's built-in browser management. This change: - Adds async browser path resolution - Implements path caching in the home folder - Removes hardcoded browser paths - Adds httpx dependency - Removes obsolete test result files This change makes the browser path resolution more reliable across different platforms and environments. --- crawl4ai/async_crawler_strategy.py | 13 +++++--- crawl4ai/async_database.py | 1 + crawl4ai/utils.py | 52 ++++++++++++++++++++++++++++++ main.py | 3 +- pyproject.toml | 11 ++++++- scraper_equivalence_results.json | 16 --------- scraper_evaluation.json | 52 ------------------------------ 7 files changed, 74 insertions(+), 74 deletions(-) delete mode 100644 scraper_equivalence_results.json delete mode 100644 scraper_evaluation.json diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0edefa73..60590035 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig from .async_logger import AsyncLogger from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate +from .utils import get_home_folder, get_chromium_path stealth_config = StealthConfig( webdriver=True, @@ -139,7 +140,7 @@ class ManagedBrowser: # Get browser path and args based on OS and browser type # browser_path = self._get_browser_path() - args = self._get_browser_args() + args = await self._get_browser_args() # Start browser process try: @@ -200,7 +201,7 @@ class ManagedBrowser: params={"error": str(e)}, ) - def _get_browser_path(self) -> str: + def _get_browser_path_WIP(self) -> str: """Returns the browser executable path based on OS and browser type""" 
if sys.platform == "darwin": # macOS paths = { @@ -223,9 +224,13 @@ class ManagedBrowser: return paths.get(self.browser_type) - def _get_browser_args(self) -> List[str]: + async def _get_browser_path(self) -> str: + browser_path = await get_chromium_path(self.browser_type) + return browser_path + + async def _get_browser_args(self) -> List[str]: """Returns browser-specific command line arguments""" - base_args = [self._get_browser_path()] + base_args = [await self._get_browser_path()] if self.browser_type == "chromium": args = [ diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index ca5e6ef2..b0c20f29 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message # Set up logging # logging.basicConfig(level=logging.INFO) # logger = logging.getLogger(__name__) +# logger.setLevel(logging.INFO) base_directory = DB_PATH = os.path.join( os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai" diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 63c8a092..ea1309a8 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -209,6 +209,58 @@ def get_home_folder(): os.makedirs(f"{home_folder}/models", exist_ok=True) return home_folder +async def get_chromium_path(browser_type) -> str: + """Returns the browser executable path using playwright's browser management. + + Uses playwright's built-in browser management to get the correct browser executable + path regardless of platform. This ensures we're using the same browser version + that playwright is tested with. 
+ + Returns: + str: Path to browser executable + Raises: + RuntimeError: If browser executable cannot be found + """ + browser_types = { + "chromium": "chromium", + "firefox": "firefox", + "webkit": "webkit" + } + + browser_type = browser_types.get(browser_type) + if not browser_type: + raise RuntimeError(f"Unsupported browser type: {browser_type}") + + # Check if a path has already been saved for this browser type + home_folder = get_home_folder() + path_file = os.path.join(home_folder, f"{browser_type.lower()}.path") + if os.path.exists(path_file): + with open(path_file, "r") as f: + return f.read() + + from playwright.async_api import async_playwright + async with async_playwright() as p: + browsers = { + 'chromium': p.chromium, + 'firefox': p.firefox, + 'webkit': p.webkit + } + + if browser_type.lower() not in browsers: + raise ValueError( + f"Invalid browser type. Must be one of: {', '.join(browsers.keys())}" + ) + + # Save the path in the crawl4ai home folder + home_folder = get_home_folder() + browser_path = browsers[browser_type.lower()].executable_path + if not browser_path: + raise RuntimeError(f"Browser executable not found for type: {browser_type}") + # Save the path in a text file with browser type name + with open(os.path.join(home_folder, f"{browser_type.lower()}.path"), "w") as f: + f.write(browser_path) + + return browser_path def beautify_html(escaped_html): """ diff --git a/main.py b/main.py index 1f9e01a3..029653cd 100644 --- a/main.py +++ b/main.py @@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import ( __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -logging.basicConfig(level=logging.INFO) + logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class TaskStatus(str, Enum): diff --git a/pyproject.toml b/pyproject.toml index 7ca779d5..c9bd9ad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", + 
"httpx==0.27.2", ] classifiers = [ "Development Status :: 3 - Alpha", @@ -77,4 +78,12 @@ packages = {find = {where = ["."], include = ["crawl4ai*"]}} crawl4ai = ["js_snippet/*.js"] [tool.setuptools.dynamic] -version = {attr = "crawl4ai.__version__.__version__"} \ No newline at end of file +version = {attr = "crawl4ai.__version__.__version__"} + +[tool.uv.sources] +crawl4ai = { workspace = true } + +[dependency-groups] +dev = [ + "crawl4ai", +] diff --git a/scraper_equivalence_results.json b/scraper_equivalence_results.json deleted file mode 100644 index 2ad1080a..00000000 --- a/scraper_equivalence_results.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "tests": [ - { - "case": "complicated_exclude_all_links", - "lxml_mode": { - "differences": {}, - "execution_time": 0.0019578933715820312 - }, - "original_time": 0.0059909820556640625 - } - ], - "summary": { - "passed": 1, - "failed": 0 - } -} \ No newline at end of file diff --git a/scraper_evaluation.json b/scraper_evaluation.json deleted file mode 100644 index 9606d906..00000000 --- a/scraper_evaluation.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "original": { - "performance": [], - "differences": [] - }, - "batch": { - "performance": [ - { - "case": "basic", - "metrics": { - "time": 0.8874530792236328, - "memory": 98.328125 - } - } - ], - "differences": [ - { - "case": "basic", - "differences": { - "images_count": { - "old": 50, - "new": 0, - "diff": -50 - } - } - } - ] - }, - "lxml": { - "performance": [ - { - "case": "basic", - "metrics": { - "time": 1.210719108581543, - "memory": 99.921875 - } - } - ], - "differences": [ - { - "case": "basic", - "differences": { - "images_count": { - "old": 50, - "new": 0, - "diff": -50 - } - } - } - ] - } -} \ No newline at end of file