feat(browser): improve browser context management and add shared data support
Add a shared_data parameter to CrawlerRunConfig so that data can be shared between hooks. Reuse browser contexts based on config signatures to reduce memory usage. Fix the Firefox/WebKit channel settings. Pass a config parameter to hook callbacks for better context access. Remove debug print statements. BREAKING CHANGE: Hook callback signatures now include a config parameter.
This commit is contained in:
@@ -112,6 +112,9 @@ class BrowserConfig:
|
|||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
|
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
|
||||||
self.channel = channel or self.browser_type or "chromium"
|
self.channel = channel or self.browser_type or "chromium"
|
||||||
|
if self.browser_type in ["firefox", "webkit"]:
|
||||||
|
self.channel = ""
|
||||||
|
self.chrome_channel = ""
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
self.viewport_width = viewport_width
|
self.viewport_width = viewport_width
|
||||||
@@ -239,6 +242,8 @@ class CrawlerRunConfig:
|
|||||||
Default: False.
|
Default: False.
|
||||||
no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
|
no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
shared_data (dict or None): Shared data to be passed between hooks.
|
||||||
|
Default: None.
|
||||||
|
|
||||||
# Page Navigation and Timing Parameters
|
# Page Navigation and Timing Parameters
|
||||||
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
|
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
|
||||||
@@ -344,6 +349,7 @@ class CrawlerRunConfig:
|
|||||||
disable_cache: bool = False,
|
disable_cache: bool = False,
|
||||||
no_cache_read: bool = False,
|
no_cache_read: bool = False,
|
||||||
no_cache_write: bool = False,
|
no_cache_write: bool = False,
|
||||||
|
shared_data: dict = None,
|
||||||
# Page Navigation and Timing Parameters
|
# Page Navigation and Timing Parameters
|
||||||
wait_until: str = "domcontentloaded",
|
wait_until: str = "domcontentloaded",
|
||||||
page_timeout: int = PAGE_TIMEOUT,
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
@@ -411,6 +417,7 @@ class CrawlerRunConfig:
|
|||||||
self.disable_cache = disable_cache
|
self.disable_cache = disable_cache
|
||||||
self.no_cache_read = no_cache_read
|
self.no_cache_read = no_cache_read
|
||||||
self.no_cache_write = no_cache_write
|
self.no_cache_write = no_cache_write
|
||||||
|
self.shared_data = shared_data
|
||||||
|
|
||||||
# Page Navigation and Timing Parameters
|
# Page Navigation and Timing Parameters
|
||||||
self.wait_until = wait_until
|
self.wait_until = wait_until
|
||||||
@@ -501,6 +508,7 @@ class CrawlerRunConfig:
|
|||||||
disable_cache=kwargs.get("disable_cache", False),
|
disable_cache=kwargs.get("disable_cache", False),
|
||||||
no_cache_read=kwargs.get("no_cache_read", False),
|
no_cache_read=kwargs.get("no_cache_read", False),
|
||||||
no_cache_write=kwargs.get("no_cache_write", False),
|
no_cache_write=kwargs.get("no_cache_write", False),
|
||||||
|
shared_data=kwargs.get("shared_data", None),
|
||||||
# Page Navigation and Timing Parameters
|
# Page Navigation and Timing Parameters
|
||||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||||
page_timeout=kwargs.get("page_timeout", 60000),
|
page_timeout=kwargs.get("page_timeout", 60000),
|
||||||
@@ -574,6 +582,8 @@ class CrawlerRunConfig:
|
|||||||
"disable_cache": self.disable_cache,
|
"disable_cache": self.disable_cache,
|
||||||
"no_cache_read": self.no_cache_read,
|
"no_cache_read": self.no_cache_read,
|
||||||
"no_cache_write": self.no_cache_write,
|
"no_cache_write": self.no_cache_write,
|
||||||
|
"shared_data": self.shared_data,
|
||||||
|
# Page Navigation and Timing Parameters
|
||||||
"wait_until": self.wait_until,
|
"wait_until": self.wait_until,
|
||||||
"page_timeout": self.page_timeout,
|
"page_timeout": self.page_timeout,
|
||||||
"wait_for": self.wait_for,
|
"wait_for": self.wait_for,
|
||||||
|
|||||||
@@ -326,6 +326,10 @@ class BrowserManager:
|
|||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
self.session_ttl = 1800 # 30 minutes
|
self.session_ttl = 1800 # 30 minutes
|
||||||
|
|
||||||
|
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
||||||
|
self.contexts_by_config = {}
|
||||||
|
self._contexts_lock = asyncio.Lock()
|
||||||
|
|
||||||
# Initialize ManagedBrowser if needed
|
# Initialize ManagedBrowser if needed
|
||||||
if self.config.use_managed_browser:
|
if self.config.use_managed_browser:
|
||||||
self.managed_browser = ManagedBrowser(
|
self.managed_browser = ManagedBrowser(
|
||||||
@@ -642,7 +646,38 @@ class BrowserManager:
|
|||||||
await context.route(f"**/*.{ext}", lambda route: route.abort())
|
await context.route(f"**/*.{ext}", lambda route: route.abort())
|
||||||
return context
|
return context
|
||||||
|
|
||||||
# async def get_page(self, session_id: Optional[str], user_agent: str):
|
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
|
||||||
|
"""
|
||||||
|
Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
|
||||||
|
then returns a hash of the sorted JSON. This yields a stable signature
|
||||||
|
that identifies configurations requiring a unique browser context.
|
||||||
|
"""
|
||||||
|
import json, hashlib
|
||||||
|
|
||||||
|
config_dict = crawlerRunConfig.__dict__.copy()
|
||||||
|
# Exclude items that do not affect browser-level setup.
|
||||||
|
# Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
|
||||||
|
ephemeral_keys = [
|
||||||
|
"session_id",
|
||||||
|
"js_code",
|
||||||
|
"scraping_strategy",
|
||||||
|
"extraction_strategy",
|
||||||
|
"chunking_strategy",
|
||||||
|
"cache_mode",
|
||||||
|
"content_filter",
|
||||||
|
"semaphore_count",
|
||||||
|
"url"
|
||||||
|
]
|
||||||
|
for key in ephemeral_keys:
|
||||||
|
if key in config_dict:
|
||||||
|
del config_dict[key]
|
||||||
|
# Convert to canonical JSON string
|
||||||
|
signature_json = json.dumps(config_dict, sort_keys=True, default=str)
|
||||||
|
|
||||||
|
# Hash the JSON so we get a compact, unique string
|
||||||
|
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
||||||
|
return signature_hash
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
||||||
"""
|
"""
|
||||||
Get a page for the given session ID, creating a new one if needed.
|
Get a page for the given session ID, creating a new one if needed.
|
||||||
@@ -651,24 +686,38 @@ class BrowserManager:
|
|||||||
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
|
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Page: The page object for the given session ID.
|
(page, context): The Page and its BrowserContext
|
||||||
BrowserContext: The browser context for the given session ID.
|
|
||||||
"""
|
"""
|
||||||
self._cleanup_expired_sessions()
|
self._cleanup_expired_sessions()
|
||||||
|
|
||||||
|
# If a session_id is provided and we already have it, reuse that page + context
|
||||||
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
|
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
|
||||||
context, page, _ = self.sessions[crawlerRunConfig.session_id]
|
context, page, _ = self.sessions[crawlerRunConfig.session_id]
|
||||||
|
# Update last-used timestamp
|
||||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||||
return page, context
|
return page, context
|
||||||
|
|
||||||
|
# If using a managed browser, just grab the shared default_context
|
||||||
if self.config.use_managed_browser:
|
if self.config.use_managed_browser:
|
||||||
context = self.default_context
|
context = self.default_context
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
else:
|
else:
|
||||||
context = await self.create_browser_context()
|
# Otherwise, check if we have an existing context for this config
|
||||||
await self.setup_context(context, crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|
||||||
|
async with self._contexts_lock:
|
||||||
|
if config_signature in self.contexts_by_config:
|
||||||
|
context = self.contexts_by_config[config_signature]
|
||||||
|
else:
|
||||||
|
# Create and setup a new context
|
||||||
|
context = await self.create_browser_context()
|
||||||
|
await self.setup_context(context, crawlerRunConfig)
|
||||||
|
self.contexts_by_config[config_signature] = context
|
||||||
|
|
||||||
|
# Create a new page from the chosen context
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
|
|
||||||
|
# If a session_id is specified, store this session so we can reuse later
|
||||||
if crawlerRunConfig.session_id:
|
if crawlerRunConfig.session_id:
|
||||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||||
|
|
||||||
@@ -708,6 +757,18 @@ class BrowserManager:
|
|||||||
for session_id in session_ids:
|
for session_id in session_ids:
|
||||||
await self.kill_session(session_id)
|
await self.kill_session(session_id)
|
||||||
|
|
||||||
|
# Now close all contexts we created. This reclaims memory from ephemeral contexts.
|
||||||
|
for ctx in self.contexts_by_config.values():
|
||||||
|
try:
|
||||||
|
await ctx.close()
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(
|
||||||
|
message="Error closing context: {error}",
|
||||||
|
tag="ERROR",
|
||||||
|
params={"error": str(e)}
|
||||||
|
)
|
||||||
|
self.contexts_by_config.clear()
|
||||||
|
|
||||||
if self.browser:
|
if self.browser:
|
||||||
await self.browser.close()
|
await self.browser.close()
|
||||||
self.browser = None
|
self.browser = None
|
||||||
@@ -1204,7 +1265,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||||
|
|
||||||
# Call hook after page creation
|
# Call hook after page creation
|
||||||
await self.execute_hook("on_page_context_created", page, context=context)
|
await self.execute_hook("on_page_context_created", page, context=context, config=config)
|
||||||
|
|
||||||
# Set up console logging if requested
|
# Set up console logging if requested
|
||||||
if config.log_console:
|
if config.log_console:
|
||||||
@@ -1245,7 +1306,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
# Handle page navigation and content loading
|
# Handle page navigation and content loading
|
||||||
if not config.js_only:
|
if not config.js_only:
|
||||||
await self.execute_hook("before_goto", page, context=context, url=url)
|
await self.execute_hook("before_goto", page, context=context, url=url, config=config)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Generate a unique nonce for this request
|
# Generate a unique nonce for this request
|
||||||
@@ -1265,7 +1326,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||||
|
|
||||||
await self.execute_hook(
|
await self.execute_hook(
|
||||||
"after_goto", page, context=context, url=url, response=response
|
"after_goto", page, context=context, url=url, response=response, config=config
|
||||||
)
|
)
|
||||||
|
|
||||||
if response is None:
|
if response is None:
|
||||||
@@ -1439,7 +1500,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
params={"error": execution_result.get("error")},
|
params={"error": execution_result.get("error")},
|
||||||
)
|
)
|
||||||
|
|
||||||
await self.execute_hook("on_execution_started", page, context=context)
|
await self.execute_hook("on_execution_started", page, context=context, config=config)
|
||||||
|
|
||||||
# Handle user simulation
|
# Handle user simulation
|
||||||
if config.simulate_user or config.magic:
|
if config.simulate_user or config.magic:
|
||||||
@@ -1482,7 +1543,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
page = await self.process_iframes(page)
|
page = await self.process_iframes(page)
|
||||||
|
|
||||||
# Pre-content retrieval hooks and delay
|
# Pre-content retrieval hooks and delay
|
||||||
await self.execute_hook("before_retrieve_html", page, context=context)
|
await self.execute_hook("before_retrieve_html", page, context=context, config=config)
|
||||||
if config.delay_before_return_html:
|
if config.delay_before_return_html:
|
||||||
await asyncio.sleep(config.delay_before_return_html)
|
await asyncio.sleep(config.delay_before_return_html)
|
||||||
|
|
||||||
@@ -1493,7 +1554,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# Get final HTML content
|
# Get final HTML content
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook(
|
await self.execute_hook(
|
||||||
"before_return_html", page=page, html=html, context=context
|
"before_return_html", page=page, html=html, context=context, config=config
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle PDF and screenshot generation
|
# Handle PDF and screenshot generation
|
||||||
@@ -1989,10 +2050,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
try:
|
try:
|
||||||
await page.wait_for_load_state("domcontentloaded", timeout=5000)
|
await page.wait_for_load_state("domcontentloaded", timeout=5000)
|
||||||
print(
|
|
||||||
"DOM content loaded after script execution in",
|
|
||||||
time.time() - t1,
|
|
||||||
)
|
|
||||||
except Error as e:
|
except Error as e:
|
||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="DOM content load timeout: {error}",
|
message="DOM content load timeout: {error}",
|
||||||
@@ -2099,13 +2156,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# Wait for network idle after script execution
|
# Wait for network idle after script execution
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
await page.wait_for_load_state("domcontentloaded", timeout=5000)
|
await page.wait_for_load_state("domcontentloaded", timeout=5000)
|
||||||
print(
|
|
||||||
"DOM content loaded after script execution in", time.time() - t1
|
|
||||||
)
|
|
||||||
|
|
||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
await page.wait_for_load_state("networkidle", timeout=5000)
|
await page.wait_for_load_state("networkidle", timeout=5000)
|
||||||
print("Network idle after script execution in", time.time() - t1)
|
|
||||||
|
|
||||||
results.append(result if result else {"success": True})
|
results.append(result if result else {"success": True})
|
||||||
|
|
||||||
|
|||||||
@@ -231,7 +231,7 @@ async def extract_structured_data_using_css_extractor():
|
|||||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||||
schema = {
|
schema = {
|
||||||
"name": "KidoCode Courses",
|
"name": "KidoCode Courses",
|
||||||
"baseSelector": "section.charge-methodology .w-tab-content > div",
|
"baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"name": "section_title",
|
"name": "section_title",
|
||||||
@@ -279,6 +279,7 @@ async def extract_structured_data_using_css_extractor():
|
|||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||||
js_code=[js_click_tabs],
|
js_code=[js_click_tabs],
|
||||||
|
delay_before_return_html=1
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
@@ -591,29 +592,26 @@ async def speed_comparison():
|
|||||||
# Main execution
|
# Main execution
|
||||||
async def main():
|
async def main():
|
||||||
# Basic examples
|
# Basic examples
|
||||||
# await simple_crawl()
|
await simple_crawl()
|
||||||
# await simple_example_with_running_js_code()
|
await simple_example_with_running_js_code()
|
||||||
# await simple_example_with_css_selector()
|
await simple_example_with_css_selector()
|
||||||
|
|
||||||
# Advanced examples
|
# Advanced examples
|
||||||
# await extract_structured_data_using_css_extractor()
|
await extract_structured_data_using_css_extractor()
|
||||||
await extract_structured_data_using_llm(
|
await extract_structured_data_using_llm(
|
||||||
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
|
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
|
||||||
)
|
)
|
||||||
# await crawl_dynamic_content_pages_method_1()
|
await crawl_dynamic_content_pages_method_1()
|
||||||
# await crawl_dynamic_content_pages_method_2()
|
await crawl_dynamic_content_pages_method_2()
|
||||||
|
|
||||||
# Browser comparisons
|
# Browser comparisons
|
||||||
# await crawl_custom_browser_type()
|
await crawl_custom_browser_type()
|
||||||
|
|
||||||
# Performance testing
|
|
||||||
# await speed_comparison()
|
|
||||||
|
|
||||||
# Screenshot example
|
# Screenshot example
|
||||||
# await capture_and_save_screenshot(
|
await capture_and_save_screenshot(
|
||||||
# "https://www.example.com",
|
"https://www.example.com",
|
||||||
# os.path.join(__location__, "tmp/example_screenshot.jpg")
|
os.path.join(__location__, "tmp/example_screenshot.jpg")
|
||||||
# )
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
85
tests/20241401/test_cache_context.py
Normal file
85
tests/20241401/test_cache_context.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
from playwright.async_api import Page, BrowserContext
|
||||||
|
|
||||||
|
async def test_reuse_context_by_config():
    """
    Check that crawls sharing a run config reuse one BrowserContext.

    Two run configs that differ in ``only_text`` are each crawled twice. A
    hook on ``on_page_context_created`` records ``id(context)`` under the
    label stored in the config's ``shared_data``. The test then asserts that
    each config saw exactly one context and that the two configs did not
    share one.
    """
    # Context ids seen by the hook, grouped by config label
    context_ids_for_A = []
    context_ids_for_B = []

    # Hook that records which context each crawl was given
    async def on_page_context_created(page: Page, context: BrowserContext, config: CrawlerRunConfig, **kwargs):
        c_id = id(context)
        print(f"[HOOK] on_page_context_created - Context ID: {c_id}")
        # The label travels in config.shared_data; shared_data defaults to None,
        # so guard before reading it.
        config_label = (config.shared_data or {}).get("config_label", "unknown")
        if config_label == "A":
            context_ids_for_A.append(c_id)
        elif config_label == "B":
            context_ids_for_B.append(c_id)
        return page

    # Browser config - headless, verbose so we see logs
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Two crawler run configs that differ in only_text (and in their label)
    configA = CrawlerRunConfig(
        only_text=True,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "A"},
    )
    configB = CrawlerRunConfig(
        only_text=False,
        cache_mode=CacheMode.BYPASS,
        wait_until="domcontentloaded",
        shared_data={"config_label": "B"},
    )

    # Create the crawler and attach the hook before the browser starts
    crawler = AsyncWebCrawler(config=browser_config)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)

    # Start the crawler (launches the browser)
    await crawler.start()

    # Crawl a benign site twice with each config. The finally clause makes
    # sure the browser is shut down even if a crawl raises.
    test_url = "https://example.com"
    try:
        print("\n--- Crawling with config A (only_text=True) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configA)

        print("\n--- Crawling with config B (only_text=False) ---")
        for _ in range(2):
            await crawler.arun(test_url, config=configB)
    finally:
        # Close the crawler (shuts down the browser, closes contexts)
        await crawler.close()

    # NOTE(review): comparing id() values assumes both contexts stay alive
    # until crawler.close(), so their ids cannot be recycled - confirm.
    unique_a = set(context_ids_for_A)
    unique_b = set(context_ids_for_B)

    # Show the results
    print("\n=== RESULTS ===")
    print(f"Config A context IDs: {context_ids_for_A}")
    print(f"Config B context IDs: {context_ids_for_B}")
    if len(unique_a) == 1:
        print("✅ All config A crawls used the SAME BrowserContext.")
    else:
        print("❌ Config A crawls created multiple contexts unexpectedly.")
    if len(unique_b) == 1:
        print("✅ All config B crawls used the SAME BrowserContext.")
    else:
        print("❌ Config B crawls created multiple contexts unexpectedly.")
    if unique_a.isdisjoint(unique_b):
        print("✅ Config A context is different from Config B context.")
    else:
        print("❌ A and B ended up sharing the same context somehow!")

    # Fail the test instead of only printing, so regressions are caught
    assert len(unique_a) == 1, f"config A used {len(unique_a)} contexts"
    assert len(unique_b) == 1, f"config B used {len(unique_b)} contexts"
    assert unique_a.isdisjoint(unique_b), "configs A and B shared a context"
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(test_reuse_context_by_config())
|
||||||
Reference in New Issue
Block a user