feat(browser): enhance browser context creation with user data directory support and improved storage state handling
This commit is contained in:
@@ -139,6 +139,112 @@ class BaseBrowserStrategy(ABC):
|
|||||||
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
||||||
return signature_hash
|
return signature_hash
|
||||||
|
|
||||||
|
async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
    """Create a new browser context from the strategy's browser configuration.

    The context is built from ``self.config`` (user agent, viewport, proxy,
    download/HTTPS/JavaScript flags and optional storage state). When
    ``crawlerRunConfig.proxy_config`` is set it replaces the browser-level
    proxy. In text mode JavaScript is disabled and requests for a fixed list
    of static-resource extensions are aborted.

    Args:
        crawlerRunConfig: Configuration object for the crawler run. Only its
            ``proxy_config`` attribute is read here.

    Returns:
        BrowserContext: The configured context, or a context created with
        default settings if applying the configured settings failed.

    Raises:
        ValueError: If ``self.browser`` has not been initialized.
    """
    if not self.browser:
        raise ValueError("Browser must be initialized before creating context")

    # Base settings. An explicit "User-Agent" header wins over the configured
    # user agent; tolerate a config whose headers are unset (None).
    headers = self.config.headers or {}
    user_agent = headers.get("User-Agent", self.config.user_agent)
    viewport_settings = {
        "width": self.config.viewport_width,
        "height": self.config.viewport_height,
    }
    proxy_settings = {"server": self.config.proxy} if self.config.proxy else None

    # Define blocked extensions for resource optimization (text mode only)
    blocked_extensions = [
        # Images
        "jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd",
        # Fonts
        "woff", "woff2", "ttf", "otf", "eot",
        # Media
        "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v", "mp3", "wav", "aac",
        "m4a", "opus", "flac",
        # Documents
        "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        # Archives
        "zip", "rar", "7z", "tar", "gz",
        # Scripts and data
        "xml", "swf", "wasm",
    ]

    # Common context settings
    context_settings = {
        "user_agent": user_agent,
        "viewport": viewport_settings,
        "proxy": proxy_settings,
        "accept_downloads": self.config.accept_downloads,
        "ignore_https_errors": self.config.ignore_https_errors,
        "device_scale_factor": 1.0,
        "java_script_enabled": self.config.java_script_enabled,
    }

    # Apply text mode settings if enabled
    if self.config.text_mode:
        context_settings.update({
            "has_touch": False,
            "is_mobile": False,
            # Disable javascript in text mode
            "java_script_enabled": False,
        })
        if self.logger:
            self.logger.debug("Text mode enabled for browser context", tag="BROWSER")

    # Handle storage state properly - this is key for persistence
    if self.config.storage_state:
        context_settings["storage_state"] = self.config.storage_state
        if self.logger:
            if isinstance(self.config.storage_state, str):
                self.logger.debug(f"Using storage state from file: {self.config.storage_state}", tag="BROWSER")
            else:
                self.logger.debug("Using storage state from config object", tag="BROWSER")

    # user_data_dir is only reported here; nothing derived from it is added
    # to the context settings in this method.
    if self.config.user_data_dir and self.logger:
        self.logger.debug(f"Using user data directory: {self.config.user_data_dir}", tag="BROWSER")

    # A per-run proxy configuration replaces the browser-level proxy.
    if crawlerRunConfig and crawlerRunConfig.proxy_config:
        proxy_settings = {
            "server": crawlerRunConfig.proxy_config.server,
        }
        if crawlerRunConfig.proxy_config.username:
            proxy_settings.update({
                "username": crawlerRunConfig.proxy_config.username,
                "password": crawlerRunConfig.proxy_config.password,
            })
        context_settings["proxy"] = proxy_settings

    # Create and return the context
    context = None
    try:
        context = await self.browser.new_context(**context_settings)

        # Apply text mode resource blocking if enabled
        if self.config.text_mode:
            for ext in blocked_extensions:
                await context.route(f"**/*.{ext}", lambda route: route.abort())

        return context
    except Exception as e:
        if self.logger:
            self.logger.error(f"Error creating browser context: {str(e)}", tag="BROWSER")
        # If the context was created but configuring it failed, close it so
        # it is not leaked when the fallback context is returned instead.
        if context is not None:
            try:
                await context.close()
            except Exception as close_error:
                if self.logger:
                    self.logger.debug(
                        f"Failed to close partially configured context: {close_error}",
                        tag="BROWSER",
                    )
        # Fallback to basic context creation if the advanced settings fail
        return await self.browser.new_context()
|
||||||
|
|
||||||
async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None):
|
async def setup_context(self, context: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None):
|
||||||
"""Set up a browser context with the configured options.
|
"""Set up a browser context with the configured options.
|
||||||
|
|
||||||
@@ -301,97 +407,32 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
|
|||||||
async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
    """Creates and returns a new browser context with configured settings.

    This implementation extends the base class version to handle user_data_dir
    specifically: when ``self.config.user_data_dir`` is set, a storage state
    file at ``<user_data_dir>/Default/storage_state.json`` is created if it is
    missing and ``self.config.storage_state`` is pointed at it (replacing any
    value already set there) before delegating to the base class.

    Args:
        crawlerRunConfig: Configuration object for the crawler run

    Returns:
        BrowserContext: Browser context object with the specified configurations
    """
    # Handle user_data_dir explicitly to ensure storage persistence
    if self.config.user_data_dir:
        storage_path = os.path.join(self.config.user_data_dir, "Default", "storage_state.json")

        # Create the file if it doesn't exist. Exclusive-create ("x") avoids
        # the check-then-write race in which a concurrent creator's file
        # could be truncated by a second open(..., "w").
        os.makedirs(os.path.dirname(storage_path), exist_ok=True)
        try:
            with open(storage_path, "x", encoding="utf-8") as f:
                json.dump({}, f)
        except FileExistsError:
            # An existing storage state file is reused as-is.
            pass

        # Override storage_state with our specific path
        self.config.storage_state = storage_path
        if self.logger:
            self.logger.debug(f"Using persistent storage state at: {storage_path}", tag="BROWSER")

    # Now call the base class implementation which handles everything else
    return await super().create_browser_context(crawlerRunConfig)
|
|
||||||
|
|
||||||
def _cleanup_expired_sessions(self):
|
def _cleanup_expired_sessions(self):
|
||||||
"""Clean up expired sessions based on TTL."""
|
"""Clean up expired sessions based on TTL."""
|
||||||
@@ -704,13 +745,28 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
async def create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> BrowserContext:
    """Create a new browser context.

    Uses the base class implementation which handles all configurations.
    When ``self.config.user_data_dir`` is set, a storage state file at
    ``<user_data_dir>/storage_state.json`` is created if it is missing and
    ``self.config.storage_state`` is pointed at it (replacing any value
    already set there) before delegating.

    Args:
        crawlerRunConfig: Configuration object for the crawler run

    Returns:
        BrowserContext: Browser context object
    """
    # Handle user_data_dir for CDP browsers
    if self.config.user_data_dir:
        # For CDP-based browsers, storage persistence is typically handled by the user_data_dir
        # at the browser level, but we'll create a storage_state location for Playwright as well
        storage_path = os.path.join(self.config.user_data_dir, "storage_state.json")

        # Create parent directory if it doesn't exist, then create the file
        # exclusively ("x") so a file written concurrently by another
        # creator is never truncated by a check-then-write race.
        os.makedirs(os.path.dirname(storage_path), exist_ok=True)
        try:
            with open(storage_path, "x", encoding="utf-8") as f:
                json.dump({}, f)
        except FileExistsError:
            # An existing storage state file is reused as-is.
            pass
        self.config.storage_state = storage_path

    # Use the base class implementation
    return await super().create_browser_context(crawlerRunConfig)
|
||||||
|
|
||||||
def _cleanup_expired_sessions(self):
|
def _cleanup_expired_sessions(self):
|
||||||
"""Clean up expired sessions based on TTL."""
|
"""Clean up expired sessions based on TTL."""
|
||||||
|
|||||||
Reference in New Issue
Block a user