refactor(browser): improve browser path management
Implement more robust browser executable path handling using Playwright's built-in browser management. This change: (1) adds async browser path resolution; (2) implements path caching in the home folder; (3) removes hardcoded browser paths; (4) adds the httpx dependency; and (5) removes obsolete test result files. Together, these changes make browser path resolution more reliable across different platforms and environments.
This commit is contained in:
@@ -22,6 +22,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
|
|||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from playwright_stealth import StealthConfig
|
from playwright_stealth import StealthConfig
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
|
from .utils import get_home_folder, get_chromium_path
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
stealth_config = StealthConfig(
|
||||||
webdriver=True,
|
webdriver=True,
|
||||||
@@ -139,7 +140,7 @@ class ManagedBrowser:
|
|||||||
|
|
||||||
# Get browser path and args based on OS and browser type
|
# Get browser path and args based on OS and browser type
|
||||||
# browser_path = self._get_browser_path()
|
# browser_path = self._get_browser_path()
|
||||||
args = self._get_browser_args()
|
args = await self._get_browser_args()
|
||||||
|
|
||||||
# Start browser process
|
# Start browser process
|
||||||
try:
|
try:
|
||||||
@@ -200,7 +201,7 @@ class ManagedBrowser:
|
|||||||
params={"error": str(e)},
|
params={"error": str(e)},
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_browser_path(self) -> str:
|
def _get_browser_path_WIP(self) -> str:
|
||||||
"""Returns the browser executable path based on OS and browser type"""
|
"""Returns the browser executable path based on OS and browser type"""
|
||||||
if sys.platform == "darwin": # macOS
|
if sys.platform == "darwin": # macOS
|
||||||
paths = {
|
paths = {
|
||||||
@@ -223,9 +224,13 @@ class ManagedBrowser:
|
|||||||
|
|
||||||
return paths.get(self.browser_type)
|
return paths.get(self.browser_type)
|
||||||
|
|
||||||
def _get_browser_args(self) -> List[str]:
|
async def _get_browser_path(self) -> str:
    """Return the executable path for this instance's ``browser_type``.

    Delegates to ``get_chromium_path`` and hands its result back unchanged.
    """
    return await get_chromium_path(self.browser_type)
|
||||||
|
|
||||||
|
async def _get_browser_args(self) -> List[str]:
|
||||||
"""Returns browser-specific command line arguments"""
|
"""Returns browser-specific command line arguments"""
|
||||||
base_args = [self._get_browser_path()]
|
base_args = [await self._get_browser_path()]
|
||||||
|
|
||||||
if self.browser_type == "chromium":
|
if self.browser_type == "chromium":
|
||||||
args = [
|
args = [
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from .utils import get_error_context, create_box_message
|
|||||||
# Set up logging
|
# Set up logging
|
||||||
# logging.basicConfig(level=logging.INFO)
|
# logging.basicConfig(level=logging.INFO)
|
||||||
# logger = logging.getLogger(__name__)
|
# logger = logging.getLogger(__name__)
|
||||||
|
# logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
base_directory = DB_PATH = os.path.join(
|
base_directory = DB_PATH = os.path.join(
|
||||||
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
|
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
|
||||||
|
|||||||
@@ -209,6 +209,58 @@ def get_home_folder():
|
|||||||
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
os.makedirs(f"{home_folder}/models", exist_ok=True)
|
||||||
return home_folder
|
return home_folder
|
||||||
|
|
||||||
|
async def get_chromium_path(browser_type) -> str:
    """Return the browser executable path using playwright's browser management.

    Despite the name, this resolves the executable for any of the supported
    browser types ("chromium", "firefox", "webkit"). The resolved path is
    cached in ``<home_folder>/<browser_type>.path`` so later calls can skip
    starting playwright.

    Args:
        browser_type: Name of the browser type; matched case-insensitively.

    Returns:
        str: Path to browser executable

    Raises:
        RuntimeError: If the browser type is unsupported or the browser
            executable cannot be found
    """
    supported = ("chromium", "firefox", "webkit")

    # Normalize into a separate name so the error message can still report
    # the value the caller actually passed in.
    name = browser_type.lower() if isinstance(browser_type, str) else None
    if name not in supported:
        raise RuntimeError(f"Unsupported browser type: {browser_type}")

    # Check if a path has already been saved for this browser type
    path_file = os.path.join(get_home_folder(), f"{name}.path")
    try:
        with open(path_file, "r", encoding="utf-8") as f:
            cached_path = f.read().strip()
    except (OSError, UnicodeDecodeError):
        # A missing or unreadable cache file is simply a cache miss.
        cached_path = ""

    # Only trust the cache while the executable is still on disk; otherwise
    # fall through and resolve the path again.
    if cached_path and os.path.exists(cached_path):
        return cached_path

    # Imported lazily so playwright is only loaded on a cache miss.
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser_path = getattr(p, name).executable_path

    if not browser_path:
        raise RuntimeError(f"Browser executable not found for type: {name}")

    # Save the path in the crawl4ai home folder, in a text file named after
    # the browser type
    with open(path_file, "w", encoding="utf-8") as f:
        f.write(browser_path)

    return browser_path
|
||||||
|
|
||||||
def beautify_html(escaped_html):
|
def beautify_html(escaped_html):
|
||||||
"""
|
"""
|
||||||
|
|||||||
3
main.py
3
main.py
@@ -27,8 +27,9 @@ from crawl4ai.extraction_strategy import (
|
|||||||
|
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
class TaskStatus(str, Enum):
|
class TaskStatus(str, Enum):
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ dependencies = [
|
|||||||
"aiofiles",
|
"aiofiles",
|
||||||
"rich>=13.9.4",
|
"rich>=13.9.4",
|
||||||
"cssselect>=1.2.0",
|
"cssselect>=1.2.0",
|
||||||
|
"httpx==0.27.2",
|
||||||
]
|
]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"Development Status :: 3 - Alpha",
|
"Development Status :: 3 - Alpha",
|
||||||
@@ -77,4 +78,12 @@ packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
|||||||
crawl4ai = ["js_snippet/*.js"]
|
crawl4ai = ["js_snippet/*.js"]
|
||||||
|
|
||||||
[tool.setuptools.dynamic]
|
[tool.setuptools.dynamic]
|
||||||
version = {attr = "crawl4ai.__version__.__version__"}
|
version = {attr = "crawl4ai.__version__.__version__"}
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
crawl4ai = { workspace = true }
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"crawl4ai",
|
||||||
|
]
|
||||||
|
|||||||
@@ -1,16 +0,0 @@
|
|||||||
{
|
|
||||||
"tests": [
|
|
||||||
{
|
|
||||||
"case": "complicated_exclude_all_links",
|
|
||||||
"lxml_mode": {
|
|
||||||
"differences": {},
|
|
||||||
"execution_time": 0.0019578933715820312
|
|
||||||
},
|
|
||||||
"original_time": 0.0059909820556640625
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"summary": {
|
|
||||||
"passed": 1,
|
|
||||||
"failed": 0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
{
|
|
||||||
"original": {
|
|
||||||
"performance": [],
|
|
||||||
"differences": []
|
|
||||||
},
|
|
||||||
"batch": {
|
|
||||||
"performance": [
|
|
||||||
{
|
|
||||||
"case": "basic",
|
|
||||||
"metrics": {
|
|
||||||
"time": 0.8874530792236328,
|
|
||||||
"memory": 98.328125
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"differences": [
|
|
||||||
{
|
|
||||||
"case": "basic",
|
|
||||||
"differences": {
|
|
||||||
"images_count": {
|
|
||||||
"old": 50,
|
|
||||||
"new": 0,
|
|
||||||
"diff": -50
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"lxml": {
|
|
||||||
"performance": [
|
|
||||||
{
|
|
||||||
"case": "basic",
|
|
||||||
"metrics": {
|
|
||||||
"time": 1.210719108581543,
|
|
||||||
"memory": 99.921875
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"differences": [
|
|
||||||
{
|
|
||||||
"case": "basic",
|
|
||||||
"differences": {
|
|
||||||
"images_count": {
|
|
||||||
"old": 50,
|
|
||||||
"new": 0,
|
|
||||||
"diff": -50
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user