Fixed issues with the managed browser: it could not connect to the user data directory, and it could not create new pages within the managed browser's context. Both issues are now resolved.

This commit is contained in:
UncleCode
2024-11-07 20:15:03 +08:00
parent 16f918621f
commit b120965b6a

View File

@@ -187,6 +187,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.use_managed_browser = kwargs.get("use_managed_browser", False) self.use_managed_browser = kwargs.get("use_managed_browser", False)
self.user_data_dir = kwargs.get("user_data_dir", None) self.user_data_dir = kwargs.get("user_data_dir", None)
self.managed_browser = None self.managed_browser = None
self.default_context = None
self.hooks = { self.hooks = {
'on_browser_created': None, 'on_browser_created': None,
'on_user_agent_updated': None, 'on_user_agent_updated': None,
@@ -217,6 +218,25 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
) )
cdp_url = await self.managed_browser.start() cdp_url = await self.managed_browser.start()
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
# Get the default context that maintains the user profile
contexts = self.browser.contexts
if contexts:
self.default_context = contexts[0]
else:
# If no default context exists, create one
self.default_context = await self.browser.new_context(
viewport={"width": 1920, "height": 1080}
)
# Set up the default context
if self.default_context:
await self.default_context.set_extra_http_headers(self.headers)
if self.user_agent:
await self.default_context.set_extra_http_headers({
"User-Agent": self.user_agent
})
else: else:
browser_args = { browser_args = {
"headless": self.headless, "headless": self.headless,
@@ -254,12 +274,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def close(self): async def close(self):
if self.sleep_on_close: if self.sleep_on_close:
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
# Close all active sessions
session_ids = list(self.sessions.keys())
for session_id in session_ids:
await self.kill_session(session_id)
if self.browser: if self.browser:
await self.browser.close() await self.browser.close()
self.browser = None self.browser = None
if self.managed_browser: if self.managed_browser:
await self.managed_browser.cleanup() await self.managed_browser.cleanup()
self.managed_browser = None self.managed_browser = None
if self.playwright: if self.playwright:
await self.playwright.stop() await self.playwright.stop()
self.playwright = None self.playwright = None
@@ -293,6 +321,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if session_id in self.sessions: if session_id in self.sessions:
context, page, _ = self.sessions[session_id] context, page, _ = self.sessions[session_id]
await page.close() await page.close()
if not self.use_managed_browser:
await context.close() await context.close()
del self.sessions[session_id] del self.sessions[session_id]
@@ -415,6 +444,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self._cleanup_expired_sessions() self._cleanup_expired_sessions()
session_id = kwargs.get("session_id") session_id = kwargs.get("session_id")
# Handle page creation differently for managed browser
if self.use_managed_browser:
if session_id:
# Reuse existing session if available
context, page, _ = self.sessions.get(session_id, (None, None, None))
if not page:
# Create new page in default context if session doesn't exist
page = await self.default_context.new_page()
self.sessions[session_id] = (self.default_context, page, time.time())
else:
# Create new page in default context for non-session requests
page = await self.default_context.new_page()
else:
if session_id: if session_id:
context, page, _ = self.sessions.get(session_id, (None, None, None)) context, page, _ = self.sessions.get(session_id, (None, None, None))
if not context: if not context: