- Introduced new async crawl strategy with session management. - Added BrowserManager for improved browser management. - Enhanced documentation, focusing on storage state and usage examples. - Improved error handling and logging for sessions. - Added JavaScript snippets for customizing navigator properties.
184 lines
7.3 KiB
Python
184 lines
7.3 KiB
Python
import asyncio
|
|
import base64
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from typing import Callable, Dict, Any, List, Optional, Awaitable
|
|
import os, sys, shutil
|
|
import tempfile, subprocess
|
|
from playwright.async_api import async_playwright, Page, Browser, Error
|
|
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
|
from io import BytesIO
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
from pathlib import Path
|
|
from playwright.async_api import ProxySettings
|
|
from pydantic import BaseModel
|
|
import hashlib
|
|
import json
|
|
import uuid
|
|
from .models import AsyncCrawlResponse
|
|
from .utils import create_box_message
|
|
from .user_agent_generator import UserAgentGenerator
|
|
from playwright_stealth import StealthConfig, stealth_async
|
|
|
|
|
|
class ManagedBrowser:
    """Run a locally installed browser as a child process with remote debugging enabled.

    The browser is launched with a remote-debugging port and a profile
    directory. When no ``user_data_dir`` is supplied, a temporary profile
    directory is created on ``start()`` and removed again by ``cleanup()``.
    """

    def __init__(
        self,
        browser_type: str = "chromium",
        user_data_dir: Optional[str] = None,
        headless: bool = False,
        logger=None,
        host: str = "localhost",
        debugging_port: int = 9222,
    ):
        """Store the launch configuration; nothing is started here.

        Args:
            browser_type: "chromium" or "firefox" (other values make
                ``_get_browser_args`` raise ``NotImplementedError``).
            user_data_dir: Profile directory; a temporary one is created by
                ``start()`` when this is empty.
            headless: Whether to pass the browser's headless flag.
            logger: Optional object exposing ``error``/``info`` methods that
                accept ``message``, ``tag`` and ``params`` keyword arguments.
                When ``None``, log output is silently skipped.
            host: Host name used to build the endpoint URL returned by ``start()``.
            debugging_port: Remote-debugging port passed to the browser.
        """
        self.browser_type = browser_type
        self.user_data_dir = user_data_dir
        self.headless = headless
        self.browser_process = None
        self.temp_dir = None
        self.debugging_port = debugging_port
        self.host = host
        self.logger = logger
        self.shutting_down = False
        # Strong reference to the monitor task: the event loop itself only
        # keeps weak references, so an unreferenced task may be collected.
        self._monitor_task = None

    def _log(self, level: str, message: str, tag: str, params: dict) -> None:
        """Forward a log record to ``self.logger`` if one was configured."""
        if self.logger is None:
            return
        getattr(self.logger, level)(message=message, tag=tag, params=params)

    async def start(self) -> str:
        """Start the browser process and return the CDP endpoint URL.

        If ``user_data_dir`` is not provided, a temporary directory is created.

        Returns:
            ``http://{host}:{debugging_port}``.

        Raises:
            NotImplementedError: If the browser type is not supported. The
                temporary profile directory (if any) is removed first.
            Exception: If the process could not be launched; the original
                error is attached as ``__cause__``.
        """
        # Create temp dir if needed
        if not self.user_data_dir:
            self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
            self.user_data_dir = self.temp_dir

        # Build the command line (executable path + browser-specific flags).
        # An unsupported browser type must not leak the directory made above.
        try:
            args = self._get_browser_args()
        except NotImplementedError:
            await self.cleanup()
            raise

        # Start browser process
        try:
            self.browser_process = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            # Monitor browser process output for errors
            self._monitor_task = asyncio.create_task(self._monitor_browser_process())
            # Fixed grace period; this is not a readiness check.
            await asyncio.sleep(2)  # Give browser time to start
            return f"http://{self.host}:{self.debugging_port}"
        except Exception as e:
            await self.cleanup()
            raise Exception(f"Failed to start browser: {e}") from e

    async def _monitor_browser_process(self):
        """Monitor the browser process for unexpected termination."""
        if not self.browser_process:
            return

        try:
            # Blocking pipe reads run in worker threads; they return once the
            # pipes reach EOF.
            stdout, stderr = await asyncio.gather(
                asyncio.to_thread(self.browser_process.stdout.read),
                asyncio.to_thread(self.browser_process.stderr.read),
            )

            if self.browser_process.poll() is None:
                return

            # Check shutting_down flag BEFORE logging anything
            if self.shutting_down:
                self._log(
                    "info",
                    message="Browser process terminated normally | Code: {code}",
                    tag="INFO",
                    params={"code": self.browser_process.returncode},
                )
                return

            self._log(
                "error",
                message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
                tag="ERROR",
                params={
                    "code": self.browser_process.returncode,
                    # Browser output is not guaranteed to be valid UTF-8;
                    # never let a decode error mask the crash report.
                    "stdout": stdout.decode(errors="replace"),
                    "stderr": stderr.decode(errors="replace"),
                },
            )
            await self.cleanup()
        except Exception as e:
            if not self.shutting_down:
                self._log(
                    "error",
                    message="Error monitoring browser process: {error}",
                    tag="ERROR",
                    params={"error": str(e)},
                )

    def _get_browser_path(self) -> Optional[str]:
        """Return the browser executable path based on OS and browser type.

        Returns ``None`` when the browser type has no entry for the current
        platform (including unknown browser types).
        """
        if sys.platform == "darwin":  # macOS
            paths = {
                "chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
                "webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
            }
        elif sys.platform == "win32":  # Windows
            paths = {
                "chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
                "firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
                "webkit": None,  # WebKit not supported on Windows
            }
        else:  # Linux
            paths = {
                "chromium": "google-chrome",
                "firefox": "firefox",
                "webkit": None,  # WebKit not supported on Linux
            }

        return paths.get(self.browser_type)

    def _get_browser_args(self) -> List[str]:
        """Return the full command line: executable path plus browser-specific flags.

        Raises:
            NotImplementedError: If ``browser_type`` is neither "chromium"
                nor "firefox".
        """
        base_args = [self._get_browser_path()]

        if self.browser_type == "chromium":
            args = [
                f"--remote-debugging-port={self.debugging_port}",
                f"--user-data-dir={self.user_data_dir}",
            ]
            if self.headless:
                args.append("--headless=new")
        elif self.browser_type == "firefox":
            args = [
                "--remote-debugging-port", str(self.debugging_port),
                "--profile", self.user_data_dir,
            ]
            if self.headless:
                args.append("--headless")
        else:
            raise NotImplementedError(f"Browser type {self.browser_type} not supported")

        return base_args + args

    async def cleanup(self):
        """Cleanup browser process and temporary directory.

        Best-effort: failures are logged (when a logger is configured) and
        never raised.
        """
        # Set shutting_down flag BEFORE any termination actions
        self.shutting_down = True

        if self.browser_process:
            try:
                self.browser_process.terminate()
                # Wait for process to end gracefully
                for _ in range(10):  # 10 attempts, 100ms each
                    if self.browser_process.poll() is not None:
                        break
                    await asyncio.sleep(0.1)

                # Force kill if still running
                if self.browser_process.poll() is None:
                    self.browser_process.kill()
                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect
            except Exception as e:
                self._log(
                    "error",
                    message="Error terminating browser: {error}",
                    tag="ERROR",
                    params={"error": str(e)},
                )

        if self.temp_dir and os.path.exists(self.temp_dir):
            try:
                shutil.rmtree(self.temp_dir)
            except Exception as e:
                self._log(
                    "error",
                    message="Error removing temporary directory: {error}",
                    tag="ERROR",
                    params={"error": str(e)},
                )
|
|
|