refactor(browser): improve browser strategy architecture and lifecycle management
Major refactoring of browser strategy implementations to improve code organization and reliability: - Move CrawlResultContainer and RunManyReturn types from async_webcrawler to models.py - Simplify browser lifecycle management in AsyncWebCrawler - Standardize browser strategy interface with _generate_page method - Improve headless mode handling and browser args construction - Clean up Docker and Playwright strategy implementations - Fix session management and context handling across strategies BREAKING CHANGE: Browser strategy interface has changed with new _generate_page method requirement
This commit is contained in:
@@ -270,7 +270,7 @@ class BrowserConfig:
|
|||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
):
|
):
|
||||||
self.browser_type = browser_type
|
self.browser_type = browser_type
|
||||||
self.headless = headless
|
self.headless = headless and "new" or False
|
||||||
self.browser_mode = browser_mode
|
self.browser_mode = browser_mode
|
||||||
self.use_managed_browser = use_managed_browser
|
self.use_managed_browser = use_managed_browser
|
||||||
self.cdp_url = cdp_url
|
self.cdp_url = cdp_url
|
||||||
|
|||||||
@@ -4,18 +4,25 @@ import sys
|
|||||||
import time
|
import time
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List, Generic, TypeVar
|
from typing import Optional, List
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
# from contextlib import nullcontext, asynccontextmanager
|
# from contextlib import nullcontext, asynccontextmanager
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
|
from .models import (
|
||||||
|
CrawlResult,
|
||||||
|
MarkdownGenerationResult,
|
||||||
|
DispatchResult,
|
||||||
|
ScrapingResult,
|
||||||
|
CrawlResultContainer,
|
||||||
|
RunManyReturn
|
||||||
|
)
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import * # noqa: F403
|
from .chunking_strategy import * # noqa: F403
|
||||||
from .chunking_strategy import IdentityChunking
|
from .chunking_strategy import IdentityChunking
|
||||||
from .content_filter_strategy import * # noqa: F403
|
from .content_filter_strategy import * # noqa: F403
|
||||||
from .extraction_strategy import * # noqa: F403
|
from .extraction_strategy import * # noqa: F403
|
||||||
from .extraction_strategy import NoExtractionStrategy
|
from .extraction_strategy import NoExtractionStrategy
|
||||||
from .async_crawler_strategy import (
|
from .async_crawler_strategy import (
|
||||||
AsyncCrawlerStrategy,
|
AsyncCrawlerStrategy,
|
||||||
@@ -30,7 +37,7 @@ from .markdown_generation_strategy import (
|
|||||||
from .deep_crawling import DeepCrawlDecorator
|
from .deep_crawling import DeepCrawlDecorator
|
||||||
from .async_logger import AsyncLogger, AsyncLoggerBase
|
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .async_dispatcher import * # noqa: F403
|
from .async_dispatcher import * # noqa: F403
|
||||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||||
|
|
||||||
from .utils import (
|
from .utils import (
|
||||||
@@ -42,45 +49,6 @@ from .utils import (
|
|||||||
RobotsParser,
|
RobotsParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
from typing import Union, AsyncGenerator
|
|
||||||
|
|
||||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
|
||||||
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
|
||||||
|
|
||||||
class CrawlResultContainer(Generic[CrawlResultT]):
|
|
||||||
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
|
|
||||||
# Normalize to a list
|
|
||||||
if isinstance(results, list):
|
|
||||||
self._results = results
|
|
||||||
else:
|
|
||||||
self._results = [results]
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
return iter(self._results)
|
|
||||||
|
|
||||||
def __getitem__(self, index):
|
|
||||||
return self._results[index]
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
return len(self._results)
|
|
||||||
|
|
||||||
def __getattr__(self, attr):
|
|
||||||
# Delegate attribute access to the first element.
|
|
||||||
if self._results:
|
|
||||||
return getattr(self._results[0], attr)
|
|
||||||
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"{self.__class__.__name__}({self._results!r})"
|
|
||||||
|
|
||||||
# Redefine the union type. Now synchronous calls always return a container,
|
|
||||||
# while stream mode is handled with an AsyncGenerator.
|
|
||||||
RunManyReturn = Union[
|
|
||||||
CrawlResultContainer[CrawlResultT],
|
|
||||||
AsyncGenerator[CrawlResultT, None]
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncWebCrawler:
|
class AsyncWebCrawler:
|
||||||
"""
|
"""
|
||||||
@@ -193,45 +161,18 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Decorate arun method with deep crawling capabilities
|
# Decorate arun method with deep crawling capabilities
|
||||||
self._deep_handler = DeepCrawlDecorator(self)
|
self._deep_handler = DeepCrawlDecorator(self)
|
||||||
self.arun = self._deep_handler(self.arun)
|
self.arun = self._deep_handler(self.arun)
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
"""
|
"""
|
||||||
Start the crawler explicitly without using context manager.
|
Start the crawler explicitly without using context manager.
|
||||||
This is equivalent to using 'async with' but gives more control over the lifecycle.
|
This is equivalent to using 'async with' but gives more control over the lifecycle.
|
||||||
|
|
||||||
This method will:
|
|
||||||
1. Check for builtin browser if browser_mode is 'builtin'
|
|
||||||
2. Initialize the browser and context
|
|
||||||
3. Perform warmup sequence
|
|
||||||
4. Return the crawler instance for method chaining
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
AsyncWebCrawler: The initialized crawler instance
|
AsyncWebCrawler: The initialized crawler instance
|
||||||
"""
|
"""
|
||||||
# Check for builtin browser if requested
|
|
||||||
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
|
|
||||||
# Import here to avoid circular imports
|
|
||||||
from .browser_profiler import BrowserProfiler
|
|
||||||
profiler = BrowserProfiler(logger=self.logger)
|
|
||||||
|
|
||||||
# Get builtin browser info or launch if needed
|
|
||||||
browser_info = profiler.get_builtin_browser_info()
|
|
||||||
if not browser_info:
|
|
||||||
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
|
|
||||||
cdp_url = await profiler.launch_builtin_browser()
|
|
||||||
if not cdp_url:
|
|
||||||
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
|
|
||||||
else:
|
|
||||||
self.browser_config.cdp_url = cdp_url
|
|
||||||
self.browser_config.use_managed_browser = True
|
|
||||||
else:
|
|
||||||
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
|
|
||||||
self.browser_config.cdp_url = browser_info.get('cdp_url')
|
|
||||||
self.browser_config.use_managed_browser = True
|
|
||||||
|
|
||||||
await self.crawler_strategy.__aenter__()
|
await self.crawler_strategy.__aenter__()
|
||||||
await self.awarmup()
|
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
|
||||||
|
self.ready = True
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
@@ -251,18 +192,6 @@ class AsyncWebCrawler:
|
|||||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
await self.close()
|
await self.close()
|
||||||
|
|
||||||
async def awarmup(self):
|
|
||||||
"""
|
|
||||||
Initialize the crawler with warm-up sequence.
|
|
||||||
|
|
||||||
This method:
|
|
||||||
1. Logs initialization info
|
|
||||||
2. Sets up browser configuration
|
|
||||||
3. Marks the crawler as ready
|
|
||||||
"""
|
|
||||||
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
|
|
||||||
self.ready = True
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def nullcontext(self):
|
async def nullcontext(self):
|
||||||
"""异步空上下文管理器"""
|
"""异步空上下文管理器"""
|
||||||
@@ -305,7 +234,7 @@ class AsyncWebCrawler:
|
|||||||
# Auto-start if not ready
|
# Auto-start if not ready
|
||||||
if not self.ready:
|
if not self.ready:
|
||||||
await self.start()
|
await self.start()
|
||||||
|
|
||||||
config = config or CrawlerRunConfig()
|
config = config or CrawlerRunConfig()
|
||||||
if not isinstance(url, str) or not url:
|
if not isinstance(url, str) or not url:
|
||||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||||
@@ -319,9 +248,7 @@ class AsyncWebCrawler:
|
|||||||
config.cache_mode = CacheMode.ENABLED
|
config.cache_mode = CacheMode.ENABLED
|
||||||
|
|
||||||
# Create cache context
|
# Create cache context
|
||||||
cache_context = CacheContext(
|
cache_context = CacheContext(url, config.cache_mode, False)
|
||||||
url, config.cache_mode, False
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize processing variables
|
# Initialize processing variables
|
||||||
async_response: AsyncCrawlResponse = None
|
async_response: AsyncCrawlResponse = None
|
||||||
@@ -351,7 +278,7 @@ class AsyncWebCrawler:
|
|||||||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||||||
if config.screenshot and not screenshot_data:
|
if config.screenshot and not screenshot_data:
|
||||||
cached_result = None
|
cached_result = None
|
||||||
|
|
||||||
if config.pdf and not pdf_data:
|
if config.pdf and not pdf_data:
|
||||||
cached_result = None
|
cached_result = None
|
||||||
|
|
||||||
@@ -383,14 +310,18 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Check robots.txt if enabled
|
# Check robots.txt if enabled
|
||||||
if config and config.check_robots_txt:
|
if config and config.check_robots_txt:
|
||||||
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
|
if not await self.robots_parser.can_fetch(
|
||||||
|
url, self.browser_config.user_agent
|
||||||
|
):
|
||||||
return CrawlResult(
|
return CrawlResult(
|
||||||
url=url,
|
url=url,
|
||||||
html="",
|
html="",
|
||||||
success=False,
|
success=False,
|
||||||
status_code=403,
|
status_code=403,
|
||||||
error_message="Access denied by robots.txt",
|
error_message="Access denied by robots.txt",
|
||||||
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
response_headers={
|
||||||
|
"X-Robots-Status": "Blocked by robots.txt"
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
##############################
|
##############################
|
||||||
@@ -417,7 +348,7 @@ class AsyncWebCrawler:
|
|||||||
###############################################################
|
###############################################################
|
||||||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
###############################################################
|
###############################################################
|
||||||
crawl_result : CrawlResult = await self.aprocess_html(
|
crawl_result: CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
@@ -494,7 +425,7 @@ class AsyncWebCrawler:
|
|||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
)
|
)
|
||||||
|
|
||||||
return CrawlResultContainer(
|
return CrawlResultContainer(
|
||||||
CrawlResult(
|
CrawlResult(
|
||||||
url=url, html="", success=False, error_message=error_message
|
url=url, html="", success=False, error_message=error_message
|
||||||
)
|
)
|
||||||
@@ -539,15 +470,14 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Process HTML content
|
# Process HTML content
|
||||||
params = config.__dict__.copy()
|
params = config.__dict__.copy()
|
||||||
params.pop("url", None)
|
params.pop("url", None)
|
||||||
# add keys from kwargs to params that doesn't exist in params
|
# add keys from kwargs to params that doesn't exist in params
|
||||||
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
||||||
|
|
||||||
|
|
||||||
################################
|
################################
|
||||||
# Scraping Strategy Execution #
|
# Scraping Strategy Execution #
|
||||||
################################
|
################################
|
||||||
result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
|
result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -596,7 +526,10 @@ class AsyncWebCrawler:
|
|||||||
self.logger.info(
|
self.logger.info(
|
||||||
message="{url:.50}... | Time: {timing}s",
|
message="{url:.50}... | Time: {timing}s",
|
||||||
tag="SCRAPE",
|
tag="SCRAPE",
|
||||||
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
|
params={
|
||||||
|
"url": _url,
|
||||||
|
"timing": int((time.perf_counter() - t1) * 1000) / 1000,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
################################
|
################################
|
||||||
@@ -671,7 +604,7 @@ class AsyncWebCrawler:
|
|||||||
async def arun_many(
|
async def arun_many(
|
||||||
self,
|
self,
|
||||||
urls: List[str],
|
urls: List[str],
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
@@ -685,8 +618,8 @@ class AsyncWebCrawler:
|
|||||||
# pdf: bool = False,
|
# pdf: bool = False,
|
||||||
# user_agent: str = None,
|
# user_agent: str = None,
|
||||||
# verbose=True,
|
# verbose=True,
|
||||||
**kwargs
|
**kwargs,
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
|
||||||
|
|
||||||
@@ -742,37 +675,32 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
def transform_result(task_result):
|
def transform_result(task_result):
|
||||||
return (
|
return (
|
||||||
setattr(task_result.result, 'dispatch_result',
|
setattr(
|
||||||
DispatchResult(
|
task_result.result,
|
||||||
task_id=task_result.task_id,
|
"dispatch_result",
|
||||||
memory_usage=task_result.memory_usage,
|
DispatchResult(
|
||||||
peak_memory=task_result.peak_memory,
|
task_id=task_result.task_id,
|
||||||
start_time=task_result.start_time,
|
memory_usage=task_result.memory_usage,
|
||||||
end_time=task_result.end_time,
|
peak_memory=task_result.peak_memory,
|
||||||
error_message=task_result.error_message,
|
start_time=task_result.start_time,
|
||||||
)
|
end_time=task_result.end_time,
|
||||||
) or task_result.result
|
error_message=task_result.error_message,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
or task_result.result
|
||||||
|
)
|
||||||
|
|
||||||
stream = config.stream
|
stream = config.stream
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
|
||||||
async def result_transformer():
|
async def result_transformer():
|
||||||
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
|
async for task_result in dispatcher.run_urls_stream(
|
||||||
|
crawler=self, urls=urls, config=config
|
||||||
|
):
|
||||||
yield transform_result(task_result)
|
yield transform_result(task_result)
|
||||||
|
|
||||||
return result_transformer()
|
return result_transformer()
|
||||||
else:
|
else:
|
||||||
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
|
||||||
return [transform_result(res) for res in _results]
|
return [transform_result(res) for res in _results]
|
||||||
|
|
||||||
async def aclear_cache(self):
|
|
||||||
"""Clear the cache database."""
|
|
||||||
await async_db_manager.cleanup()
|
|
||||||
|
|
||||||
async def aflush_cache(self):
|
|
||||||
"""Flush the cache database."""
|
|
||||||
await async_db_manager.aflush_db()
|
|
||||||
|
|
||||||
async def aget_cache_size(self):
|
|
||||||
"""Get the total number of cached items."""
|
|
||||||
return await async_db_manager.aget_total_count()
|
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ class BrowserManager:
|
|||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
|
||||||
# Create strategy based on configuration
|
# Create strategy based on configuration
|
||||||
self._strategy = self._create_strategy()
|
self.strategy = self._create_strategy()
|
||||||
|
|
||||||
# Initialize state variables for compatibility with existing code
|
# Initialize state variables for compatibility with existing code
|
||||||
self.browser = None
|
self.browser = None
|
||||||
@@ -92,23 +92,23 @@ class BrowserManager:
|
|||||||
self: For method chaining
|
self: For method chaining
|
||||||
"""
|
"""
|
||||||
# Start the strategy
|
# Start the strategy
|
||||||
await self._strategy.start()
|
await self.strategy.start()
|
||||||
|
|
||||||
# Update legacy references
|
# Update legacy references
|
||||||
self.browser = self._strategy.browser
|
self.browser = self.strategy.browser
|
||||||
self.default_context = self._strategy.default_context
|
self.default_context = self.strategy.default_context
|
||||||
|
|
||||||
# Set browser process reference (for CDP strategy)
|
# Set browser process reference (for CDP strategy)
|
||||||
if hasattr(self._strategy, 'browser_process'):
|
if hasattr(self.strategy, 'browser_process'):
|
||||||
self.managed_browser = self._strategy
|
self.managed_browser = self.strategy
|
||||||
|
|
||||||
# Set Playwright reference
|
# Set Playwright reference
|
||||||
self.playwright = self._strategy.playwright
|
self.playwright = self.strategy.playwright
|
||||||
|
|
||||||
# Sync sessions if needed
|
# Sync sessions if needed
|
||||||
if hasattr(self._strategy, 'sessions'):
|
if hasattr(self.strategy, 'sessions'):
|
||||||
self.sessions = self._strategy.sessions
|
self.sessions = self.strategy.sessions
|
||||||
self.session_ttl = self._strategy.session_ttl
|
self.session_ttl = self.strategy.session_ttl
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
@@ -122,11 +122,11 @@ class BrowserManager:
|
|||||||
Tuple of (Page, BrowserContext)
|
Tuple of (Page, BrowserContext)
|
||||||
"""
|
"""
|
||||||
# Delegate to strategy
|
# Delegate to strategy
|
||||||
page, context = await self._strategy.get_page(crawlerRunConfig)
|
page, context = await self.strategy.get_page(crawlerRunConfig)
|
||||||
|
|
||||||
# Sync sessions if needed
|
# Sync sessions if needed
|
||||||
if hasattr(self._strategy, 'sessions'):
|
if hasattr(self.strategy, 'sessions'):
|
||||||
self.sessions = self._strategy.sessions
|
self.sessions = self.strategy.sessions
|
||||||
|
|
||||||
return page, context
|
return page, context
|
||||||
|
|
||||||
@@ -144,14 +144,15 @@ class BrowserManager:
|
|||||||
List of (Page, Context) tuples
|
List of (Page, Context) tuples
|
||||||
"""
|
"""
|
||||||
# Delegate to strategy
|
# Delegate to strategy
|
||||||
pages = await self._strategy.get_pages(crawlerRunConfig, count)
|
pages = await self.strategy.get_pages(crawlerRunConfig, count)
|
||||||
|
|
||||||
# Sync sessions if needed
|
# Sync sessions if needed
|
||||||
if hasattr(self._strategy, 'sessions'):
|
if hasattr(self.strategy, 'sessions'):
|
||||||
self.sessions = self._strategy.sessions
|
self.sessions = self.strategy.sessions
|
||||||
|
|
||||||
return pages
|
return pages
|
||||||
|
|
||||||
|
# Just for legacy compatibility
|
||||||
async def kill_session(self, session_id: str):
|
async def kill_session(self, session_id: str):
|
||||||
"""Kill a browser session and clean up resources.
|
"""Kill a browser session and clean up resources.
|
||||||
|
|
||||||
@@ -159,33 +160,16 @@ class BrowserManager:
|
|||||||
session_id: The session ID to kill
|
session_id: The session ID to kill
|
||||||
"""
|
"""
|
||||||
# Handle kill_session via our strategy if it supports it
|
# Handle kill_session via our strategy if it supports it
|
||||||
await self._strategy.kill_session(session_id)
|
await self.strategy.kill_session(session_id)
|
||||||
|
|
||||||
# sync sessions if needed
|
# sync sessions if needed
|
||||||
if hasattr(self._strategy, 'sessions'):
|
if hasattr(self.strategy, 'sessions'):
|
||||||
self.sessions = self._strategy.sessions
|
self.sessions = self.strategy.sessions
|
||||||
|
|
||||||
def _cleanup_expired_sessions(self):
|
|
||||||
"""Clean up expired sessions based on TTL."""
|
|
||||||
# Use strategy's implementation if available
|
|
||||||
if hasattr(self._strategy, '_cleanup_expired_sessions'):
|
|
||||||
self._strategy._cleanup_expired_sessions()
|
|
||||||
return
|
|
||||||
|
|
||||||
# Otherwise use our own implementation
|
|
||||||
current_time = time.time()
|
|
||||||
expired_sessions = [
|
|
||||||
sid
|
|
||||||
for sid, (_, _, last_used) in self.sessions.items()
|
|
||||||
if current_time - last_used > self.session_ttl
|
|
||||||
]
|
|
||||||
for sid in expired_sessions:
|
|
||||||
asyncio.create_task(self.kill_session(sid))
|
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close the browser and clean up resources."""
|
"""Close the browser and clean up resources."""
|
||||||
# Delegate to strategy
|
# Delegate to strategy
|
||||||
await self._strategy.close()
|
await self.strategy.close()
|
||||||
|
|
||||||
# Reset legacy references
|
# Reset legacy references
|
||||||
self.browser = None
|
self.browser = None
|
||||||
|
|||||||
@@ -82,6 +82,9 @@ class BaseBrowserStrategy(ABC):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
|
pass
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
"""Get a page with specified configuration.
|
"""Get a page with specified configuration.
|
||||||
|
|
||||||
@@ -94,6 +97,23 @@ class BaseBrowserStrategy(ABC):
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (Page, BrowserContext)
|
Tuple of (Page, BrowserContext)
|
||||||
"""
|
"""
|
||||||
|
# Clean up expired sessions first
|
||||||
|
self._cleanup_expired_sessions()
|
||||||
|
|
||||||
|
# If a session_id is provided and we already have it, reuse that page + context
|
||||||
|
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
|
||||||
|
context, page, _ = self.sessions[crawlerRunConfig.session_id]
|
||||||
|
# Update last-used timestamp
|
||||||
|
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||||
|
return page, context
|
||||||
|
|
||||||
|
page, context = await self._generate_page(crawlerRunConfig)
|
||||||
|
|
||||||
|
# If a session_id is specified, store this session so we can reuse later
|
||||||
|
if crawlerRunConfig.session_id:
|
||||||
|
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||||
|
|
||||||
|
return page, context
|
||||||
pass
|
pass
|
||||||
|
|
||||||
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
|
async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
|
||||||
@@ -120,31 +140,29 @@ class BaseBrowserStrategy(ABC):
|
|||||||
"""
|
"""
|
||||||
# Define common browser arguments that improve performance and stability
|
# Define common browser arguments that improve performance and stability
|
||||||
args = [
|
args = [
|
||||||
"--disable-gpu",
|
|
||||||
"--disable-gpu-compositing",
|
|
||||||
"--disable-software-rasterizer",
|
|
||||||
"--no-sandbox",
|
"--no-sandbox",
|
||||||
"--disable-dev-shm-usage",
|
|
||||||
"--no-first-run",
|
"--no-first-run",
|
||||||
"--no-default-browser-check",
|
"--no-default-browser-check",
|
||||||
"--disable-infobars",
|
|
||||||
"--window-position=0,0",
|
"--window-position=0,0",
|
||||||
"--ignore-certificate-errors",
|
"--ignore-certificate-errors",
|
||||||
"--ignore-certificate-errors-spki-list",
|
"--ignore-certificate-errors-spki-list",
|
||||||
"--disable-blink-features=AutomationControlled",
|
|
||||||
"--window-position=400,0",
|
"--window-position=400,0",
|
||||||
"--disable-renderer-backgrounding",
|
|
||||||
"--disable-ipc-flooding-protection",
|
|
||||||
"--force-color-profile=srgb",
|
"--force-color-profile=srgb",
|
||||||
"--mute-audio",
|
"--mute-audio",
|
||||||
|
"--disable-gpu",
|
||||||
|
"--disable-gpu-compositing",
|
||||||
|
"--disable-software-rasterizer",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--disable-infobars",
|
||||||
|
"--disable-blink-features=AutomationControlled",
|
||||||
|
"--disable-renderer-backgrounding",
|
||||||
|
"--disable-ipc-flooding-protection",
|
||||||
"--disable-background-timer-throttling",
|
"--disable-background-timer-throttling",
|
||||||
f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
|
f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Define browser disable options for light mode
|
# Define browser disable options for light mode
|
||||||
browser_disable_options = [
|
browser_disable_options = [
|
||||||
"--disable-background-networking",
|
|
||||||
"--disable-background-timer-throttling",
|
|
||||||
"--disable-backgrounding-occluded-windows",
|
"--disable-backgrounding-occluded-windows",
|
||||||
"--disable-breakpad",
|
"--disable-breakpad",
|
||||||
"--disable-client-side-phishing-detection",
|
"--disable-client-side-phishing-detection",
|
||||||
@@ -153,13 +171,10 @@ class BaseBrowserStrategy(ABC):
|
|||||||
"--disable-extensions",
|
"--disable-extensions",
|
||||||
"--disable-features=TranslateUI",
|
"--disable-features=TranslateUI",
|
||||||
"--disable-hang-monitor",
|
"--disable-hang-monitor",
|
||||||
"--disable-ipc-flooding-protection",
|
|
||||||
"--disable-popup-blocking",
|
"--disable-popup-blocking",
|
||||||
"--disable-prompt-on-repost",
|
"--disable-prompt-on-repost",
|
||||||
"--disable-sync",
|
"--disable-sync",
|
||||||
"--force-color-profile=srgb",
|
|
||||||
"--metrics-recording-only",
|
"--metrics-recording-only",
|
||||||
"--no-first-run",
|
|
||||||
"--password-store=basic",
|
"--password-store=basic",
|
||||||
"--use-mock-keychain",
|
"--use-mock-keychain",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -115,24 +115,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN")
|
self.logger.error(f"Failed to start built-in browser: {str(e)}", tag="BUILTIN")
|
||||||
|
|
||||||
|
# There is a possibility that at this point I need to clean up some resourece
|
||||||
raise
|
raise
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
def _get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]:
|
||||||
"""Get a page for the given configuration.
|
|
||||||
|
|
||||||
Inherits behavior from CDPBrowserStrategy for page management.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
crawlerRunConfig: Configuration object for the crawler run
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (Page, BrowserContext)
|
|
||||||
"""
|
|
||||||
# For built-in browsers, we use the same page management as CDP strategy
|
|
||||||
return await super().get_page(crawlerRunConfig)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def get_builtin_browser_info(cls, debugging_port: int, config_file: str, logger: Optional[AsyncLogger] = None) -> Optional[Dict[str, Any]]:
|
|
||||||
"""Get information about the built-in browser for a specific debugging port.
|
"""Get information about the built-in browser for a specific debugging port.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -157,15 +144,14 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
browser_info = browser_info_dict["port_map"][port_str]
|
browser_info = browser_info_dict["port_map"][port_str]
|
||||||
|
|
||||||
# Check if the browser is still running
|
# Check if the browser is still running
|
||||||
pids = browser_info.get('pid')
|
pids = browser_info.get('pid', '')
|
||||||
if type(pids) == str and len(pids.split("\n")) > 1:
|
if isinstance(pids, str):
|
||||||
pids = [int(pid) for pid in pids.split("\n") if pid.isdigit()]
|
pids = [int(pid) for pid in pids.split() if pid.isdigit()]
|
||||||
elif type(pids) == str and pids.isdigit():
|
elif isinstance(pids, int):
|
||||||
pids = [int(pids)]
|
|
||||||
elif type(pids) == int:
|
|
||||||
pids = [pids]
|
pids = [pids]
|
||||||
else:
|
else:
|
||||||
pids = []
|
pids = []
|
||||||
|
|
||||||
# Check if any of the PIDs are running
|
# Check if any of the PIDs are running
|
||||||
if not pids:
|
if not pids:
|
||||||
if logger:
|
if logger:
|
||||||
@@ -205,7 +191,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
dict: Browser information or None if no running browser is configured
|
dict: Browser information or None if no running browser is configured
|
||||||
"""
|
"""
|
||||||
return self.get_builtin_browser_info(
|
return self._get_builtin_browser_info(
|
||||||
debugging_port=self.config.debugging_port,
|
debugging_port=self.config.debugging_port,
|
||||||
config_file=self.builtin_config_file,
|
config_file=self.builtin_config_file,
|
||||||
logger=self.logger
|
logger=self.logger
|
||||||
@@ -226,7 +212,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
str: CDP URL for the browser, or None if launch failed
|
str: CDP URL for the browser, or None if launch failed
|
||||||
"""
|
"""
|
||||||
# Check if there's an existing browser still running
|
# Check if there's an existing browser still running
|
||||||
browser_info = self.get_builtin_browser_info(
|
browser_info = self._get_builtin_browser_info(
|
||||||
debugging_port=debugging_port,
|
debugging_port=debugging_port,
|
||||||
config_file=self.builtin_config_file,
|
config_file=self.builtin_config_file,
|
||||||
logger=self.logger
|
logger=self.logger
|
||||||
@@ -238,6 +224,7 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
|
|
||||||
# Create a user data directory for the built-in browser
|
# Create a user data directory for the built-in browser
|
||||||
user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
|
user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
|
||||||
|
|
||||||
# Raise error if user data dir is already engaged
|
# Raise error if user data dir is already engaged
|
||||||
if self._check_user_dir_is_engaged(user_data_dir):
|
if self._check_user_dir_is_engaged(user_data_dir):
|
||||||
raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
|
raise Exception(f"User data directory {user_data_dir} is already engaged by another browser instance.")
|
||||||
@@ -246,15 +233,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
os.makedirs(user_data_dir, exist_ok=True)
|
os.makedirs(user_data_dir, exist_ok=True)
|
||||||
|
|
||||||
# Prepare browser launch arguments
|
# Prepare browser launch arguments
|
||||||
|
browser_args = super()._build_browser_args()
|
||||||
browser_path = await get_browser_executable(browser_type)
|
browser_path = await get_browser_executable(browser_type)
|
||||||
|
base_args = [browser_path]
|
||||||
|
|
||||||
if browser_type == "chromium":
|
if browser_type == "chromium":
|
||||||
args = [
|
args = [
|
||||||
browser_path,
|
browser_path,
|
||||||
f"--remote-debugging-port={debugging_port}",
|
f"--remote-debugging-port={debugging_port}",
|
||||||
f"--user-data-dir={user_data_dir}",
|
f"--user-data-dir={user_data_dir}",
|
||||||
]
|
]
|
||||||
if headless:
|
# if headless:
|
||||||
args.append("--headless=new")
|
# args.append("--headless=new")
|
||||||
|
|
||||||
elif browser_type == "firefox":
|
elif browser_type == "firefox":
|
||||||
args = [
|
args = [
|
||||||
browser_path,
|
browser_path,
|
||||||
@@ -270,6 +261,8 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN")
|
self.logger.error(f"Browser type {browser_type} not supported for built-in browser", tag="BUILTIN")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
args = base_args + browser_args + args
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
# Check if the port is already in use
|
# Check if the port is already in use
|
||||||
@@ -333,11 +326,12 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
# Check if it already uses port mapping
|
# Check if it already uses port mapping
|
||||||
if isinstance(existing_data, dict) and "port_map" in existing_data:
|
if isinstance(existing_data, dict) and "port_map" in existing_data:
|
||||||
port_map = existing_data["port_map"]
|
port_map = existing_data["port_map"]
|
||||||
# Convert legacy format to port mapping
|
|
||||||
elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
|
# # Convert legacy format to port mapping
|
||||||
old_port = str(existing_data.get("debugging_port"))
|
# elif isinstance(existing_data, dict) and "debugging_port" in existing_data:
|
||||||
if self._is_browser_running(existing_data.get("pid")):
|
# old_port = str(existing_data.get("debugging_port"))
|
||||||
port_map[old_port] = existing_data
|
# if self._is_browser_running(existing_data.get("pid")):
|
||||||
|
# port_map[old_port] = existing_data
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
|
self.logger.warning(f"Could not read existing config: {str(e)}", tag="BUILTIN")
|
||||||
@@ -413,15 +407,19 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
# Update config file to remove this browser
|
# Update config file to remove this browser
|
||||||
with open(self.builtin_config_file, 'r') as f:
|
with open(self.builtin_config_file, 'r') as f:
|
||||||
browser_info_dict = json.load(f)
|
browser_info_dict = json.load(f)
|
||||||
|
|
||||||
# Remove this port from the dictionary
|
# Remove this port from the dictionary
|
||||||
port_str = str(self.config.debugging_port)
|
port_str = str(self.config.debugging_port)
|
||||||
if port_str in browser_info_dict.get("port_map", {}):
|
if port_str in browser_info_dict.get("port_map", {}):
|
||||||
del browser_info_dict["port_map"][port_str]
|
del browser_info_dict["port_map"][port_str]
|
||||||
|
|
||||||
with open(self.builtin_config_file, 'w') as f:
|
with open(self.builtin_config_file, 'w') as f:
|
||||||
json.dump(browser_info_dict, f, indent=2)
|
json.dump(browser_info_dict, f, indent=2)
|
||||||
|
|
||||||
# Remove user data directory if it exists
|
# Remove user data directory if it exists
|
||||||
if os.path.exists(self.builtin_browser_dir):
|
if os.path.exists(self.builtin_browser_dir):
|
||||||
shutil.rmtree(self.builtin_browser_dir)
|
shutil.rmtree(self.builtin_browser_dir)
|
||||||
|
|
||||||
# Clear the browser info cache
|
# Clear the browser info cache
|
||||||
self.browser = None
|
self.browser = None
|
||||||
self.temp_dir = None
|
self.temp_dir = None
|
||||||
@@ -460,14 +458,11 @@ class BuiltinBrowserStrategy(CDPBrowserStrategy):
|
|||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close the built-in browser and clean up resources."""
|
"""Close the built-in browser and clean up resources."""
|
||||||
# Store the shutting_down state
|
|
||||||
was_shutting_down = getattr(self, 'shutting_down', False)
|
|
||||||
|
|
||||||
# Call parent class close method
|
# Call parent class close method
|
||||||
await super().close()
|
await super().close()
|
||||||
|
|
||||||
# Clean up built-in browser if we created it and were in shutdown mode
|
# Clean up built-in browser if we created it and were in shutdown mode
|
||||||
if was_shutting_down:
|
if self.shutting_down:
|
||||||
await self.kill_builtin_browser()
|
await self.kill_builtin_browser()
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN")
|
self.logger.debug("Killed built-in browser during shutdown", tag="BUILTIN")
|
||||||
@@ -68,9 +68,11 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
|
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP")
|
self.logger.debug(f"Connected to CDP browser at {cdp_url}", tag="CDP")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP")
|
self.logger.error(f"Failed to connect to CDP browser: {str(e)}", tag="CDP")
|
||||||
|
|
||||||
# Clean up any resources before re-raising
|
# Clean up any resources before re-raising
|
||||||
await self._cleanup_process()
|
await self._cleanup_process()
|
||||||
raise
|
raise
|
||||||
@@ -95,7 +97,32 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
user_data_dir = self.config.user_data_dir
|
user_data_dir = self.config.user_data_dir
|
||||||
|
|
||||||
# Get browser args based on OS and browser type
|
# Get browser args based on OS and browser type
|
||||||
args = await self._get_browser_args(user_data_dir)
|
# args = await self._get_browser_args(user_data_dir)
|
||||||
|
browser_args = super()._build_browser_args()
|
||||||
|
browser_path = await get_browser_executable(self.config.browser_type)
|
||||||
|
base_args = [browser_path]
|
||||||
|
|
||||||
|
if self.config.browser_type == "chromium":
|
||||||
|
args = [
|
||||||
|
f"--remote-debugging-port={self.config.debugging_port}",
|
||||||
|
f"--user-data-dir={user_data_dir}",
|
||||||
|
]
|
||||||
|
# if self.config.headless:
|
||||||
|
# args.append("--headless=new")
|
||||||
|
|
||||||
|
elif self.config.browser_type == "firefox":
|
||||||
|
args = [
|
||||||
|
"--remote-debugging-port",
|
||||||
|
str(self.config.debugging_port),
|
||||||
|
"--profile",
|
||||||
|
user_data_dir,
|
||||||
|
]
|
||||||
|
if self.config.headless:
|
||||||
|
args.append("--headless")
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Browser type {self.config.browser_type} not supported")
|
||||||
|
|
||||||
|
args = base_args + browser_args + args
|
||||||
|
|
||||||
# Start browser process
|
# Start browser process
|
||||||
try:
|
try:
|
||||||
@@ -136,40 +163,6 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
await self._cleanup_process()
|
await self._cleanup_process()
|
||||||
raise Exception(f"Failed to start browser: {e}")
|
raise Exception(f"Failed to start browser: {e}")
|
||||||
|
|
||||||
async def _get_browser_args(self, user_data_dir: str) -> List[str]:
|
|
||||||
"""Returns browser-specific command line arguments.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
user_data_dir: Path to user data directory
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of command-line arguments for the browser
|
|
||||||
"""
|
|
||||||
browser_args = super()._build_browser_args()
|
|
||||||
browser_path = await get_browser_executable(self.config.browser_type)
|
|
||||||
base_args = [browser_path]
|
|
||||||
|
|
||||||
if self.config.browser_type == "chromium":
|
|
||||||
args = [
|
|
||||||
f"--remote-debugging-port={self.config.debugging_port}",
|
|
||||||
f"--user-data-dir={user_data_dir}",
|
|
||||||
]
|
|
||||||
if self.config.headless:
|
|
||||||
args.append("--headless=new")
|
|
||||||
elif self.config.browser_type == "firefox":
|
|
||||||
args = [
|
|
||||||
"--remote-debugging-port",
|
|
||||||
str(self.config.debugging_port),
|
|
||||||
"--profile",
|
|
||||||
user_data_dir,
|
|
||||||
]
|
|
||||||
if self.config.headless:
|
|
||||||
args.append("--headless")
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"Browser type {self.config.browser_type} not supported")
|
|
||||||
|
|
||||||
return base_args + browser_args + args
|
|
||||||
|
|
||||||
async def _cleanup_process(self):
|
async def _cleanup_process(self):
|
||||||
"""Cleanup browser process and temporary directory."""
|
"""Cleanup browser process and temporary directory."""
|
||||||
@@ -204,15 +197,40 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
if self.temp_dir and os.path.exists(self.temp_dir):
|
if self.temp_dir and os.path.exists(self.temp_dir):
|
||||||
try:
|
try:
|
||||||
shutil.rmtree(self.temp_dir)
|
shutil.rmtree(self.temp_dir)
|
||||||
|
self.temp_dir = None
|
||||||
|
if self.logger:
|
||||||
|
self.logger.debug("Removed temporary directory", tag="CDP")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
message="Error removing temporary directory: {error}",
|
message="Error removing temporary directory: {error}",
|
||||||
tag="ERROR",
|
tag="CDP",
|
||||||
params={"error": str(e)},
|
params={"error": str(e)}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.browser_process = None
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
|
# For CDP, we typically use the shared default_context
|
||||||
|
context = self.default_context
|
||||||
|
pages = context.pages
|
||||||
|
|
||||||
|
# Otherwise, check if we have an existing context for this config
|
||||||
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
self.contexts_by_config[config_signature] = context
|
||||||
|
|
||||||
|
await self.setup_context(context, crawlerRunConfig)
|
||||||
|
|
||||||
|
# Check if there's already a page with the target URL
|
||||||
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
|
|
||||||
|
# If not found, create a new page
|
||||||
|
if not page:
|
||||||
|
page = await context.new_page()
|
||||||
|
|
||||||
|
return page, context
|
||||||
|
|
||||||
|
async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
"""Get a page for the given configuration.
|
"""Get a page for the given configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -221,15 +239,8 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (Page, BrowserContext)
|
Tuple of (Page, BrowserContext)
|
||||||
"""
|
"""
|
||||||
# Clean up expired sessions using base class method
|
# Call parent method to ensure browser is started
|
||||||
self._cleanup_expired_sessions()
|
await super().get_page(crawlerRunConfig)
|
||||||
|
|
||||||
# If a session_id is provided and we already have it, reuse that page + context
|
|
||||||
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
|
|
||||||
context, page, _ = self.sessions[crawlerRunConfig.session_id]
|
|
||||||
# Update last-used timestamp
|
|
||||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
|
||||||
return page, context
|
|
||||||
|
|
||||||
# For CDP, we typically use the shared default_context
|
# For CDP, we typically use the shared default_context
|
||||||
context = self.default_context
|
context = self.default_context
|
||||||
@@ -266,24 +277,5 @@ class CDPBrowserStrategy(BaseBrowserStrategy):
|
|||||||
await super().close()
|
await super().close()
|
||||||
|
|
||||||
# Additional CDP-specific cleanup
|
# Additional CDP-specific cleanup
|
||||||
if self.browser_process:
|
await asyncio.sleep(0.5)
|
||||||
await asyncio.sleep(0.5)
|
await self._cleanup_process()
|
||||||
await self._cleanup_process()
|
|
||||||
self.browser_process = None
|
|
||||||
if self.logger:
|
|
||||||
self.logger.debug("Cleaned up CDP browser process", tag="CDP")
|
|
||||||
|
|
||||||
# Clean up temporary directory
|
|
||||||
if self.temp_dir and os.path.exists(self.temp_dir):
|
|
||||||
try:
|
|
||||||
shutil.rmtree(self.temp_dir)
|
|
||||||
self.temp_dir = None
|
|
||||||
if self.logger:
|
|
||||||
self.logger.debug("Removed temporary directory", tag="CDP")
|
|
||||||
except Exception as e:
|
|
||||||
if self.logger:
|
|
||||||
self.logger.error(
|
|
||||||
message="Error removing temporary directory: {error}",
|
|
||||||
tag="CDP",
|
|
||||||
params={"error": str(e)}
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ from ..models import DockerConfig
|
|||||||
from ..docker_registry import DockerRegistry
|
from ..docker_registry import DockerRegistry
|
||||||
from ..docker_utils import DockerUtils
|
from ..docker_utils import DockerUtils
|
||||||
from .builtin import CDPBrowserStrategy
|
from .builtin import CDPBrowserStrategy
|
||||||
|
from .base import BaseBrowserStrategy
|
||||||
|
|
||||||
class DockerBrowserStrategy(CDPBrowserStrategy):
|
class DockerBrowserStrategy(CDPBrowserStrategy):
|
||||||
"""Docker-based browser strategy.
|
"""Docker-based browser strategy.
|
||||||
@@ -79,9 +79,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy):
|
|||||||
self: For method chaining
|
self: For method chaining
|
||||||
"""
|
"""
|
||||||
# Initialize Playwright
|
# Initialize Playwright
|
||||||
from ..utils import get_playwright
|
await BaseBrowserStrategy.start(self)
|
||||||
|
|
||||||
self.playwright = await get_playwright()
|
|
||||||
|
|
||||||
if self.logger:
|
if self.logger:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
@@ -172,121 +170,6 @@ class DockerBrowserStrategy(CDPBrowserStrategy):
|
|||||||
# Use the utility method to generate the hash
|
# Use the utility method to generate the hash
|
||||||
return self.docker_utils.generate_config_hash(config_dict)
|
return self.docker_utils.generate_config_hash(config_dict)
|
||||||
|
|
||||||
async def _get_or_create_cdp_url1(self) -> str:
|
|
||||||
"""Get CDP URL by either creating a new container or using an existing one.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
CDP URL for connecting to the browser
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
Exception: If container creation or browser launch fails
|
|
||||||
"""
|
|
||||||
# If CDP URL is explicitly provided, use it
|
|
||||||
if self.config.cdp_url:
|
|
||||||
return self.config.cdp_url
|
|
||||||
|
|
||||||
# Ensure Docker image exists (will build if needed)
|
|
||||||
image_name = await self.docker_utils.ensure_docker_image_exists(
|
|
||||||
self.docker_config.image, self.docker_config.mode
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate config hash for container matching
|
|
||||||
config_hash = await self._generate_config_hash()
|
|
||||||
|
|
||||||
# Look for existing container with matching config
|
|
||||||
container_id = self.registry.find_container_by_config(
|
|
||||||
config_hash, self.docker_utils
|
|
||||||
)
|
|
||||||
|
|
||||||
if container_id:
|
|
||||||
# Use existing container
|
|
||||||
self.container_id = container_id
|
|
||||||
host_port = self.registry.get_container_host_port(container_id)
|
|
||||||
if self.logger:
|
|
||||||
self.logger.info(
|
|
||||||
f"Using existing Docker container: {container_id[:12]}",
|
|
||||||
tag="DOCKER",
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# Get a port for the new container
|
|
||||||
host_port = (
|
|
||||||
self.docker_config.host_port
|
|
||||||
or self.registry.get_next_available_port(self.docker_utils)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Prepare volumes list
|
|
||||||
volumes = list(self.docker_config.volumes)
|
|
||||||
|
|
||||||
# Add user data directory if specified
|
|
||||||
if self.docker_config.user_data_dir:
|
|
||||||
# Ensure user data directory exists
|
|
||||||
os.makedirs(self.docker_config.user_data_dir, exist_ok=True)
|
|
||||||
volumes.append(
|
|
||||||
f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Update config user_data_dir to point to container path
|
|
||||||
self.config.user_data_dir = self.docker_config.container_user_data_dir
|
|
||||||
|
|
||||||
# Create a new container
|
|
||||||
container_id = await self.docker_utils.create_container(
|
|
||||||
image_name=image_name,
|
|
||||||
host_port=host_port,
|
|
||||||
container_name=self.container_name,
|
|
||||||
volumes=volumes,
|
|
||||||
network=self.docker_config.network,
|
|
||||||
env_vars=self.docker_config.env_vars,
|
|
||||||
extra_args=self.docker_config.extra_args,
|
|
||||||
)
|
|
||||||
|
|
||||||
if not container_id:
|
|
||||||
raise Exception("Failed to create Docker container")
|
|
||||||
|
|
||||||
self.container_id = container_id
|
|
||||||
|
|
||||||
# Register the container
|
|
||||||
self.registry.register_container(container_id, host_port, config_hash)
|
|
||||||
|
|
||||||
# Wait for container to be ready
|
|
||||||
await self.docker_utils.wait_for_container_ready(container_id)
|
|
||||||
|
|
||||||
# Handle specific setup based on mode
|
|
||||||
if self.docker_config.mode == "launch":
|
|
||||||
# In launch mode, we need to start socat and Chrome
|
|
||||||
await self.docker_utils.start_socat_in_container(container_id)
|
|
||||||
|
|
||||||
# Build browser arguments
|
|
||||||
browser_args = self._build_browser_args()
|
|
||||||
|
|
||||||
# Launch Chrome
|
|
||||||
await self.docker_utils.launch_chrome_in_container(
|
|
||||||
container_id, browser_args
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get PIDs for later cleanup
|
|
||||||
self.chrome_process_id = (
|
|
||||||
await self.docker_utils.get_process_id_in_container(
|
|
||||||
container_id, "chrome"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
self.socat_process_id = (
|
|
||||||
await self.docker_utils.get_process_id_in_container(
|
|
||||||
container_id, "socat"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Wait for CDP to be ready
|
|
||||||
await self.docker_utils.wait_for_cdp_ready(host_port)
|
|
||||||
|
|
||||||
if self.logger:
|
|
||||||
self.logger.success(
|
|
||||||
f"Docker container ready: {container_id[:12]} on port {host_port}",
|
|
||||||
tag="DOCKER",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Return CDP URL
|
|
||||||
return f"http://localhost:{host_port}"
|
|
||||||
|
|
||||||
async def _get_or_create_cdp_url(self) -> str:
|
async def _get_or_create_cdp_url(self) -> str:
|
||||||
"""Get CDP URL by either creating a new container or using an existing one.
|
"""Get CDP URL by either creating a new container or using an existing one.
|
||||||
|
|
||||||
@@ -465,8 +348,7 @@ class DockerBrowserStrategy(CDPBrowserStrategy):
|
|||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close the browser and clean up Docker container if needed."""
|
"""Close the browser and clean up Docker container if needed."""
|
||||||
# Set flag to track if we were the ones initiating shutdown
|
# Set flag to track if we were the ones initiating shutdown
|
||||||
initiated_shutdown = not getattr(self, "shutting_down", False)
|
initiated_shutdown = not self.shutting_down
|
||||||
|
|
||||||
# Storage persistence for Docker needs special handling
|
# Storage persistence for Docker needs special handling
|
||||||
# We need to store state before calling super().close() which will close the browser
|
# We need to store state before calling super().close() which will close the browser
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -80,8 +80,26 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
|
|||||||
raise
|
raise
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
async def _generate_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
|
# Otherwise, check if we have an existing context for this config
|
||||||
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|
||||||
|
async with self._contexts_lock:
|
||||||
|
if config_signature in self.contexts_by_config:
|
||||||
|
context = self.contexts_by_config[config_signature]
|
||||||
|
else:
|
||||||
|
# Create and setup a new context
|
||||||
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
|
await self.setup_context(context, crawlerRunConfig)
|
||||||
|
self.contexts_by_config[config_signature] = context
|
||||||
|
|
||||||
|
# Create a new page from the chosen context
|
||||||
|
page = await context.new_page()
|
||||||
|
|
||||||
|
return page, context
|
||||||
|
|
||||||
|
async def _get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
|
||||||
"""Get a page for the given configuration.
|
"""Get a page for the given configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -90,15 +108,8 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (Page, BrowserContext)
|
Tuple of (Page, BrowserContext)
|
||||||
"""
|
"""
|
||||||
# Clean up expired sessions first
|
# Call parent method to ensure browser is started
|
||||||
self._cleanup_expired_sessions()
|
await super().get_page(crawlerRunConfig)
|
||||||
|
|
||||||
# If a session_id is provided and we already have it, reuse that page + context
|
|
||||||
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
|
|
||||||
context, page, _ = self.sessions[crawlerRunConfig.session_id]
|
|
||||||
# Update last-used timestamp
|
|
||||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
|
||||||
return page, context
|
|
||||||
|
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
@@ -121,8 +132,3 @@ class PlaywrightBrowserStrategy(BaseBrowserStrategy):
|
|||||||
|
|
||||||
return page, context
|
return page, context
|
||||||
|
|
||||||
async def close(self):
|
|
||||||
"""Close the Playwright browser and clean up resources."""
|
|
||||||
# The base implementation already handles everything needed for Playwright
|
|
||||||
# including storage persistence, sessions, contexts, browser and playwright
|
|
||||||
await super().close()
|
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||||
|
from typing import AsyncGenerator
|
||||||
|
from typing import Generic, TypeVar
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
@@ -34,34 +36,12 @@ class CrawlerTaskResult:
|
|||||||
def success(self) -> bool:
|
def success(self) -> bool:
|
||||||
return self.result.success
|
return self.result.success
|
||||||
|
|
||||||
|
|
||||||
class CrawlStatus(Enum):
|
class CrawlStatus(Enum):
|
||||||
QUEUED = "QUEUED"
|
QUEUED = "QUEUED"
|
||||||
IN_PROGRESS = "IN_PROGRESS"
|
IN_PROGRESS = "IN_PROGRESS"
|
||||||
COMPLETED = "COMPLETED"
|
COMPLETED = "COMPLETED"
|
||||||
FAILED = "FAILED"
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
|
||||||
# @dataclass
|
|
||||||
# class CrawlStats:
|
|
||||||
# task_id: str
|
|
||||||
# url: str
|
|
||||||
# status: CrawlStatus
|
|
||||||
# start_time: Optional[datetime] = None
|
|
||||||
# end_time: Optional[datetime] = None
|
|
||||||
# memory_usage: float = 0.0
|
|
||||||
# peak_memory: float = 0.0
|
|
||||||
# error_message: str = ""
|
|
||||||
|
|
||||||
# @property
|
|
||||||
# def duration(self) -> str:
|
|
||||||
# if not self.start_time:
|
|
||||||
# return "0:00"
|
|
||||||
# end = self.end_time or datetime.now()
|
|
||||||
# duration = end - self.start_time
|
|
||||||
# return str(timedelta(seconds=int(duration.total_seconds())))
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CrawlStats:
|
class CrawlStats:
|
||||||
task_id: str
|
task_id: str
|
||||||
@@ -95,7 +75,6 @@ class CrawlStats:
|
|||||||
duration = end - start
|
duration = end - start
|
||||||
return str(timedelta(seconds=int(duration.total_seconds())))
|
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
|
||||||
class DisplayMode(Enum):
|
class DisplayMode(Enum):
|
||||||
DETAILED = "DETAILED"
|
DETAILED = "DETAILED"
|
||||||
AGGREGATED = "AGGREGATED"
|
AGGREGATED = "AGGREGATED"
|
||||||
@@ -112,12 +91,10 @@ class TokenUsage:
|
|||||||
completion_tokens_details: Optional[dict] = None
|
completion_tokens_details: Optional[dict] = None
|
||||||
prompt_tokens_details: Optional[dict] = None
|
prompt_tokens_details: Optional[dict] = None
|
||||||
|
|
||||||
|
|
||||||
class UrlModel(BaseModel):
|
class UrlModel(BaseModel):
|
||||||
url: HttpUrl
|
url: HttpUrl
|
||||||
forced: bool = False
|
forced: bool = False
|
||||||
|
|
||||||
|
|
||||||
class MarkdownGenerationResult(BaseModel):
|
class MarkdownGenerationResult(BaseModel):
|
||||||
raw_markdown: str
|
raw_markdown: str
|
||||||
markdown_with_citations: str
|
markdown_with_citations: str
|
||||||
@@ -284,6 +261,40 @@ class StringCompatibleMarkdown(str):
|
|||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
return getattr(self._markdown_result, name)
|
return getattr(self._markdown_result, name)
|
||||||
|
|
||||||
|
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||||
|
|
||||||
|
class CrawlResultContainer(Generic[CrawlResultT]):
|
||||||
|
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
|
||||||
|
# Normalize to a list
|
||||||
|
if isinstance(results, list):
|
||||||
|
self._results = results
|
||||||
|
else:
|
||||||
|
self._results = [results]
|
||||||
|
|
||||||
|
def __iter__(self):
|
||||||
|
return iter(self._results)
|
||||||
|
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return self._results[index]
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._results)
|
||||||
|
|
||||||
|
def __getattr__(self, attr):
|
||||||
|
# Delegate attribute access to the first element.
|
||||||
|
if self._results:
|
||||||
|
return getattr(self._results[0], attr)
|
||||||
|
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return f"{self.__class__.__name__}({self._results!r})"
|
||||||
|
|
||||||
|
RunManyReturn = Union[
|
||||||
|
CrawlResultContainer[CrawlResultT],
|
||||||
|
AsyncGenerator[CrawlResultT, None]
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
# END of backward compatibility code for markdown/markdown_v2.
|
# END of backward compatibility code for markdown/markdown_v2.
|
||||||
# When removing this code in the future, make sure to:
|
# When removing this code in the future, make sure to:
|
||||||
# 1. Replace the private attribute and property with a standard field
|
# 1. Replace the private attribute and property with a standard field
|
||||||
@@ -304,7 +315,6 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|
||||||
|
|
||||||
###############################
|
###############################
|
||||||
# Scraping Models
|
# Scraping Models
|
||||||
###############################
|
###############################
|
||||||
|
|||||||
@@ -530,7 +530,7 @@ async def test_docker_registry_reuse():
|
|||||||
logger.info("First browser started successfully", tag="TEST")
|
logger.info("First browser started successfully", tag="TEST")
|
||||||
|
|
||||||
# Get container ID from the strategy
|
# Get container ID from the strategy
|
||||||
docker_strategy1 = manager1._strategy
|
docker_strategy1 = manager1.strategy
|
||||||
container_id1 = docker_strategy1.container_id
|
container_id1 = docker_strategy1.container_id
|
||||||
logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")
|
logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")
|
||||||
|
|
||||||
@@ -560,7 +560,7 @@ async def test_docker_registry_reuse():
|
|||||||
logger.info("Second browser started successfully", tag="TEST")
|
logger.info("Second browser started successfully", tag="TEST")
|
||||||
|
|
||||||
# Get container ID from the second strategy
|
# Get container ID from the second strategy
|
||||||
docker_strategy2 = manager2._strategy
|
docker_strategy2 = manager2.strategy
|
||||||
container_id2 = docker_strategy2.container_id
|
container_id2 = docker_strategy2.container_id
|
||||||
logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")
|
logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")
|
||||||
|
|
||||||
|
|||||||
@@ -56,13 +56,13 @@ async def test_builtin_browser_creation():
|
|||||||
|
|
||||||
# Step 2: Check if we have a BuiltinBrowserStrategy
|
# Step 2: Check if we have a BuiltinBrowserStrategy
|
||||||
print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
|
print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
|
||||||
if isinstance(manager._strategy, BuiltinBrowserStrategy):
|
if isinstance(manager.strategy, BuiltinBrowserStrategy):
|
||||||
print(
|
print(
|
||||||
f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}"
|
f"{SUCCESS}Correct strategy type: {manager.strategy.__class__.__name__}{RESET}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}"
|
f"{ERROR}Wrong strategy type: {manager.strategy.__class__.__name__}{RESET}"
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -77,7 +77,7 @@ async def test_builtin_browser_creation():
|
|||||||
|
|
||||||
# Step 4: Get browser info from the strategy
|
# Step 4: Get browser info from the strategy
|
||||||
print(f"\n{INFO}4. Getting browser information{RESET}")
|
print(f"\n{INFO}4. Getting browser information{RESET}")
|
||||||
browser_info = manager._strategy.get_browser_info()
|
browser_info = manager.strategy.get_browser_info()
|
||||||
if browser_info:
|
if browser_info:
|
||||||
print(f"{SUCCESS}Browser info retrieved:{RESET}")
|
print(f"{SUCCESS}Browser info retrieved:{RESET}")
|
||||||
for key, value in browser_info.items():
|
for key, value in browser_info.items():
|
||||||
@@ -149,7 +149,7 @@ async def test_browser_status_management(manager: BrowserManager):
|
|||||||
# Step 1: Get browser status
|
# Step 1: Get browser status
|
||||||
print(f"\n{INFO}1. Getting browser status{RESET}")
|
print(f"\n{INFO}1. Getting browser status{RESET}")
|
||||||
try:
|
try:
|
||||||
status = await manager._strategy.get_builtin_browser_status()
|
status = await manager.strategy.get_builtin_browser_status()
|
||||||
print(f"{SUCCESS}Browser status:{RESET}")
|
print(f"{SUCCESS}Browser status:{RESET}")
|
||||||
print(f" Running: {status['running']}")
|
print(f" Running: {status['running']}")
|
||||||
print(f" CDP URL: {status['cdp_url']}")
|
print(f" CDP URL: {status['cdp_url']}")
|
||||||
@@ -160,7 +160,7 @@ async def test_browser_status_management(manager: BrowserManager):
|
|||||||
# Step 2: Test killing the browser
|
# Step 2: Test killing the browser
|
||||||
print(f"\n{INFO}2. Testing killing the browser{RESET}")
|
print(f"\n{INFO}2. Testing killing the browser{RESET}")
|
||||||
try:
|
try:
|
||||||
result = await manager._strategy.kill_builtin_browser()
|
result = await manager.strategy.kill_builtin_browser()
|
||||||
if result:
|
if result:
|
||||||
print(f"{SUCCESS}Browser killed successfully{RESET}")
|
print(f"{SUCCESS}Browser killed successfully{RESET}")
|
||||||
else:
|
else:
|
||||||
@@ -172,7 +172,7 @@ async def test_browser_status_management(manager: BrowserManager):
|
|||||||
# Step 3: Check status after kill
|
# Step 3: Check status after kill
|
||||||
print(f"\n{INFO}3. Checking status after kill{RESET}")
|
print(f"\n{INFO}3. Checking status after kill{RESET}")
|
||||||
try:
|
try:
|
||||||
status = await manager._strategy.get_builtin_browser_status()
|
status = await manager.strategy.get_builtin_browser_status()
|
||||||
if not status["running"]:
|
if not status["running"]:
|
||||||
print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
|
print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
|
||||||
else:
|
else:
|
||||||
@@ -184,7 +184,7 @@ async def test_browser_status_management(manager: BrowserManager):
|
|||||||
# Step 4: Launch a new browser
|
# Step 4: Launch a new browser
|
||||||
print(f"\n{INFO}4. Launching a new browser{RESET}")
|
print(f"\n{INFO}4. Launching a new browser{RESET}")
|
||||||
try:
|
try:
|
||||||
cdp_url = await manager._strategy.launch_builtin_browser(
|
cdp_url = await manager.strategy.launch_builtin_browser(
|
||||||
browser_type="chromium", headless=True
|
browser_type="chromium", headless=True
|
||||||
)
|
)
|
||||||
if cdp_url:
|
if cdp_url:
|
||||||
@@ -223,8 +223,8 @@ async def test_multiple_managers():
|
|||||||
print(f"{SUCCESS}Second manager started{RESET}")
|
print(f"{SUCCESS}Second manager started{RESET}")
|
||||||
|
|
||||||
# Check if they got the same CDP URL
|
# Check if they got the same CDP URL
|
||||||
cdp_url1 = manager1._strategy.config.cdp_url
|
cdp_url1 = manager1.strategy.config.cdp_url
|
||||||
cdp_url2 = manager2._strategy.config.cdp_url
|
cdp_url2 = manager2.strategy.config.cdp_url
|
||||||
|
|
||||||
if cdp_url1 == cdp_url2:
|
if cdp_url1 == cdp_url2:
|
||||||
print(
|
print(
|
||||||
@@ -316,7 +316,7 @@ async def test_edge_cases():
|
|||||||
|
|
||||||
# Kill the browser directly
|
# Kill the browser directly
|
||||||
print(f"{INFO}Killing the browser...{RESET}")
|
print(f"{INFO}Killing the browser...{RESET}")
|
||||||
await manager._strategy.kill_builtin_browser()
|
await manager.strategy.kill_builtin_browser()
|
||||||
print(f"{SUCCESS}Browser killed{RESET}")
|
print(f"{SUCCESS}Browser killed{RESET}")
|
||||||
|
|
||||||
# Try to get a page (should fail or launch a new browser)
|
# Try to get a page (should fail or launch a new browser)
|
||||||
@@ -350,7 +350,7 @@ async def cleanup_browsers():
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# No need to start, just access the strategy directly
|
# No need to start, just access the strategy directly
|
||||||
strategy = manager._strategy
|
strategy = manager.strategy
|
||||||
if isinstance(strategy, BuiltinBrowserStrategy):
|
if isinstance(strategy, BuiltinBrowserStrategy):
|
||||||
result = await strategy.kill_builtin_browser()
|
result = await strategy.kill_builtin_browser()
|
||||||
if result:
|
if result:
|
||||||
@@ -420,7 +420,7 @@ async def test_performance_scaling():
|
|||||||
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
||||||
)
|
)
|
||||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
manager._strategy.shutting_down = True
|
manager.strategy.shutting_down = True
|
||||||
manager_configs.append((manager, i, port))
|
manager_configs.append((manager, i, port))
|
||||||
|
|
||||||
# Define async function to start a single manager
|
# Define async function to start a single manager
|
||||||
@@ -614,7 +614,7 @@ async def test_performance_scaling_lab( num_browsers: int = 10, pages_per_browse
|
|||||||
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
|
||||||
)
|
)
|
||||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||||
manager._strategy.shutting_down = True
|
manager.strategy.shutting_down = True
|
||||||
manager_configs.append((manager, i, port))
|
manager_configs.append((manager, i, port))
|
||||||
|
|
||||||
# Define async function to start a single manager
|
# Define async function to start a single manager
|
||||||
|
|||||||
Reference in New Issue
Block a user