Files
crawl4ai/tests/async/test_browser_memory.py
unclecode c046918bb4 Add memory-saving mode, browser recycling, and CDP leak fixes
- Add memory_saving_mode config: aggressive cache discard + V8 heap cap
  flags for high-volume crawling (1000+ pages)
- Add max_pages_before_recycle config: automatic browser process recycling
  after N pages to reclaim leaked memory (recommended 500-1000)
- Add default Chrome flags to disable unused features (OptimizationHints,
  MediaRouter, component updates, domain reliability)
- Fix CDP session leak: detach CDP session after viewport adjustment
- Fix session kill: only close context when refcount reaches 0, preventing
  use-after-close for shared contexts
- Add browser lifecycle and memory tests
2026-02-04 02:00:53 +00:00

1170 lines
44 KiB
Python

"""
Tests for browser memory management: memory_saving_mode, browser recycling,
and CDP session leak fixes.
These are integration tests that launch real browsers and crawl real pages.
They verify:
1. memory_saving_mode Chrome flags are applied
2. Browser recycling fires at the right threshold and doesn't break crawling
3. Concurrent crawls survive a recycle boundary without errors
4. Recycling resets all internal tracking state cleanly
5. Memory doesn't grow unbounded over many pages
6. CDP session detach fix doesn't regress viewport adjustment
"""
import asyncio
import os
import time
import threading
from http.server import HTTPServer, SimpleHTTPRequestHandler
import psutil
import pytest
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
# ---------------------------------------------------------------------------
# Local test server — avoids network flakiness
# ---------------------------------------------------------------------------
# Pre-rendered bodies for /page0 .. /page199, keyed by request path. Every page
# links to its successor and the last one wraps around to /page0.
PAGES_HTML = {
    f"/page{i}": f"""<!DOCTYPE html>
<html><head><title>Page {i}</title></head>
<body>
<h1>Test page {i}</h1>
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
Paragraph {i} with enough text to exercise the content pipeline.</p>
<a href="/page{(i+1) % 200}">Next</a>
</body></html>"""
    for i in range(200)
}
class MemTestHandler(SimpleHTTPRequestHandler):
    """Request handler backing the local test server.

    GET routes, matched against the exact request path:
      * /login      - login page; also sends a Set-Cookie header for auth_token.
      * /dashboard  - welcomes the user when the request's Cookie header
                      contains auth_token=valid123, otherwise reports
                      NOT AUTHENTICATED.
      * /step1..3   - fixed pages used by the multi-step session tests.
      * other paths - the matching PAGES_HTML entry, or /page0 when the path
                      is not in PAGES_HTML (this includes "/").
    Every response is a 200 with Content-type text/html.
    """

    # Static bodies for the multi-step routes, keyed by request path.
    _STEP_PAGES = {
        "/step1": b"""<!DOCTYPE html>
<html><head><title>Step 1</title></head>
<body><h1>Step 1</h1><p>First step complete</p></body></html>""",
        "/step2": b"""<!DOCTYPE html>
<html><head><title>Step 2</title></head>
<body><h1>Step 2</h1><p>Second step complete</p></body></html>""",
        "/step3": b"""<!DOCTYPE html>
<html><head><title>Step 3</title></head>
<body><h1>Step 3</h1><p>Third step complete</p></body></html>""",
    }

    def log_message(self, *args):
        """Discard access-log output so test runs stay quiet."""
        return None

    def _send_html(self, payload, extra_headers=()):
        """Write a 200 text/html response whose body is the bytes *payload*.

        *extra_headers* is an iterable of (name, value) pairs that are sent
        right after the Content-type header.
        """
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        for header_name, header_value in extra_headers:
            self.send_header(header_name, header_value)
        self.end_headers()
        self.wfile.write(payload)

    def do_GET(self):
        """Serve one GET request according to the routes in the class docstring."""
        route = self.path

        if route == "/login":
            self._send_html(
                b"""<!DOCTYPE html>
<html><head><title>Login</title></head>
<body><h1>Login Page</h1><p>You are now logged in.</p>
<a href="/dashboard">Go to dashboard</a></body></html>""",
                extra_headers=(("Set-Cookie", "auth_token=valid123; Path=/"),),
            )
            return

        if route == "/dashboard":
            has_token = "auth_token=valid123" in self.headers.get("Cookie", "")
            body = (
                "<h1>Dashboard</h1><p>Welcome, authenticated user!</p>"
                if has_token
                else "<h1>Dashboard</h1><p>NOT AUTHENTICATED</p>"
            )
            document = (
                f"<!DOCTYPE html><html><head><title>Dashboard</title></head>"
                f"<body>{body}</body></html>"
            )
            self._send_html(document.encode())
            return

        if route in self._STEP_PAGES:
            self._send_html(self._STEP_PAGES[route])
            return

        page = PAGES_HTML.get(route)
        if page is None:
            # Root and unknown paths are answered with the first page.
            page = PAGES_HTML["/page0"]
        self._send_html(page.encode())
class ReuseAddrHTTPServer(HTTPServer):
    """HTTPServer subclass that explicitly sets ``allow_reuse_address``."""

    # NOTE(review): http.server.HTTPServer appears to enable this already, so
    # the override may be redundant — confirm before relying on it.
    allow_reuse_address = True
@pytest.fixture(scope="module")
def test_server():
    """Run a local HTTP server for the duration of the test module.

    Binds MemTestHandler to an OS-assigned port on 127.0.0.1, serves requests
    from a daemon thread, and yields the server's base URL
    (``http://127.0.0.1:<port>``).

    On teardown the serve loop is stopped, the listening socket is closed and
    the serving thread is joined, so nothing is left open after the module.
    """
    server = ReuseAddrHTTPServer(("127.0.0.1", 0), MemTestHandler)
    port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    try:
        yield f"http://127.0.0.1:{port}"
    finally:
        # shutdown() only stops serve_forever(); server_close() is what
        # actually releases the listening socket.
        server.shutdown()
        server.server_close()
        # Bounded join so a stuck handler cannot hang the whole test session.
        thread.join(timeout=5)
def _url(base, i):
return f"{base}/page{i}"
def _get_chromium_rss_mb():
    """Sum the RSS, in MB, of Chromium/Chrome processes started by this test run.

    Only descendants of the current Python process are inspected, so an
    unrelated Chrome running on the same machine (a developer's own browser,
    another test job) cannot skew the measurement. A process counts when
    "chrom" appears in its name or its command line.

    Returns 0.0 when no matching process is found or the process tree cannot
    be read; callers treat a non-positive value as "measurement unavailable".
    """
    try:
        descendants = psutil.Process().children(recursive=True)
    except psutil.Error:
        return 0.0

    total = 0
    for proc in descendants:
        try:
            name = (proc.name() or "").lower()
            cmdline = " ".join(proc.cmdline() or []).lower()
            if "chrom" in name or "chrom" in cmdline:
                total += proc.memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # Best effort: processes may exit or be unreadable mid-scan.
            pass
    return total / (1024 * 1024)
# ---------------------------------------------------------------------------
# Helpers to reach into BrowserManager internals
# ---------------------------------------------------------------------------
def _bm(crawler: AsyncWebCrawler):
"""Shortcut to get the BrowserManager from a crawler."""
return crawler.crawler_strategy.browser_manager
# ===========================================================================
# Test 1: memory_saving_mode flag propagation
# ===========================================================================
@pytest.mark.asyncio
async def test_memory_saving_flags_applied(test_server):
    """Check that memory_saving_mode toggles its Chrome launch flags.

    With the mode on, --aggressive-cache-discard and a max-old-space-size
    setting must appear in the built launch args; with it off, neither may.
    The OptimizationHints flag is expected in both cases.
    """

    def _launch_args(manager):
        # Accept either a dict carrying an "args" entry or the argument
        # collection itself, and hand back the collection to search.
        built = manager._build_browser_args()
        if isinstance(built, dict):
            return built.get("args", built)
        return built

    config_on = BrowserConfig(headless=True, verbose=False, memory_saving_mode=True)
    config_off = BrowserConfig(headless=True, verbose=False, memory_saving_mode=False)

    # Mode enabled: both memory-saving flags must be present.
    async with AsyncWebCrawler(config=config_on) as crawler:
        args_list = _launch_args(_bm(crawler))
        assert "--aggressive-cache-discard" in args_list, (
            "memory_saving_mode=True should add --aggressive-cache-discard"
        )
        assert any("max-old-space-size" in a for a in args_list), (
            "memory_saving_mode=True should add V8 heap cap"
        )
        assert any("OptimizationHints" in a for a in args_list)

    # Mode disabled: the memory-saving flags must be gone, the always-on one stays.
    async with AsyncWebCrawler(config=config_off) as crawler:
        args_list = _launch_args(_bm(crawler))
        assert "--aggressive-cache-discard" not in args_list, (
            "memory_saving_mode=False should NOT add --aggressive-cache-discard"
        )
        assert all("max-old-space-size" not in a for a in args_list), (
            "memory_saving_mode=False should NOT add V8 heap cap"
        )
        assert any("OptimizationHints" in a for a in args_list)
# ===========================================================================
# Test 2: Always-on flags present in both code paths
# ===========================================================================
@pytest.mark.asyncio
async def test_always_on_flags_present(test_server):
    """With a default BrowserConfig, the built launch args must still contain
    the three always-on flags checked below."""
    expected_markers = (
        "disable-component-update",
        "disable-domain-reliability",
        "OptimizationHints",
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        built = _bm(crawler)._build_browser_args()
        # Search the "args" entry when a dict is returned, else the value itself.
        args_list = built.get("args", built) if isinstance(built, dict) else built
        for marker in expected_markers:
            assert any(marker in a for a in args_list)
# ===========================================================================
# Test 3: Basic recycling — counter increments, recycle fires, crawls resume
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_fires_at_threshold(test_server):
    """Crawl 8 pages one after another with max_pages_before_recycle=5.

    Expects every crawl to succeed and the manager's page counter to read 3
    afterwards, i.e. it was reset once along the way.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=True,
        max_pages_before_recycle=5,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)
        assert bm._pages_served == 0

        # Sequential on purpose: each crawl finishes before the next starts.
        results = [
            await crawler.arun(url=_url(test_server, page), config=run_config)
            for page in range(8)
        ]

        # The recycle must be invisible to callers: nothing may fail.
        assert len(results) == 8
        assert all(r.success for r in results), (
            f"Failed crawls: {[i for i, r in enumerate(results) if not r.success]}"
        )
        # 5 pages before the reset plus 3 after it.
        assert bm._pages_served == 3, (
            f"Expected 3 pages after recycle, got {bm._pages_served}"
        )
# ===========================================================================
# Test 4: Recycling resets all tracking state
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_clears_tracking_state(test_server):
    """Hit the recycle threshold (3 pages) and check the manager's counters.

    After the recycle the page counter and the summed context refcounts must
    both be zero, and one further crawl must succeed and count as page 1.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)

        # Exactly enough pages to reach the threshold.
        for page in range(3):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_config)
            assert outcome.success

        # Short pause so a recycle kicked off by the last release can finish.
        await asyncio.sleep(0.5)

        assert bm._pages_served == 0, f"Counter not reset: {bm._pages_served}"
        assert sum(bm._context_refcounts.values()) == 0, (
            f"Refcounts not zero after recycle: {bm._context_refcounts}"
        )

        # The replacement browser must be usable straight away.
        outcome = await crawler.arun(url=_url(test_server, 99), config=run_config)
        assert outcome.success
        assert bm._pages_served == 1
# ===========================================================================
# Test 5: Concurrent crawls across a recycle boundary
# ===========================================================================
@pytest.mark.asyncio
async def test_concurrent_crawls_across_recycle(test_server):
    """Launch 10 concurrent crawls with a recycle threshold of 5.

    The batch straddles the threshold, so a recycle is expected while crawls
    are in flight. None of the crawls may raise, and all 10 must succeed.
    """
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        urls = [_url(test_server, i) for i in range(10)]
        tasks = [crawler.arun(url=u, config=run_config) for u in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Match BaseException, not Exception: gather(return_exceptions=True)
        # can also hand back asyncio.CancelledError, which is not an Exception
        # subclass and would otherwise blow up below with AttributeError on
        # ``r.success`` instead of producing a readable failure.
        exceptions = [r for r in results if isinstance(r, BaseException)]
        assert len(exceptions) == 0, (
            f"Got {len(exceptions)} exceptions during concurrent recycle: "
            f"{exceptions[:3]}"
        )
        successes = [
            r for r in results if not isinstance(r, BaseException) and r.success
        ]
        assert len(successes) == 10, (
            f"Only {len(successes)}/10 crawls succeeded"
        )
# ===========================================================================
# Test 6: Recycle with sessions — blocked while a session is open; sessions cleared after a recycle
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_blocked_by_active_session(test_server):
    """Reach the recycle threshold (3) while a named session is open.

    Two plain crawls plus one session crawl bring the counter to 3. The test
    expects no recycle to be in progress at that point and the session to stay
    registered and usable for a further crawl.
    """
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)
        run_no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        run_with_session = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="test_session",
            verbose=False,
        )

        # Two session-less crawls first.
        for page in range(2):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_no_session)
            assert outcome.success

        # Third page is served through a named session, reaching the threshold.
        outcome = await crawler.arun(url=_url(test_server, 2), config=run_with_session)
        assert outcome.success
        assert "test_session" in bm.sessions

        # Counter is at the threshold, yet no recycle may be running.
        assert bm._pages_served == 3
        assert not bm._recycling, (
            "Recycle should not fire while a session holds a refcount"
        )

        # Re-using the session proves the browser was not torn down.
        outcome = await crawler.arun(url=_url(test_server, 50), config=run_with_session)
        assert outcome.success, "Session should still work even past recycle threshold"

        # The counter must not have been reset by the second session crawl.
        assert bm._pages_served >= 3
        assert not bm._recycling
@pytest.mark.asyncio
async def test_sessions_cleared_by_recycle(test_server):
    """Trigger a recycle with plain crawls, then open a session afterwards.

    Expects the manager's sessions mapping to be empty after the recycle and
    a newly requested session to be registered and usable.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)

        # Three session-less pages reach the threshold.
        for page in range(3):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_config)
            assert outcome.success
        await asyncio.sleep(0.5)

        assert len(bm.sessions) == 0, (
            f"Sessions should be empty after recycle, got {list(bm.sessions.keys())}"
        )

        # A session requested now must be created on the current browser.
        run_with_session = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="post_recycle_session",
            verbose=False,
        )
        outcome = await crawler.arun(url=_url(test_server, 99), config=run_with_session)
        assert outcome.success
        assert "post_recycle_session" in bm.sessions
# ===========================================================================
# Test 7: Multiple recycle cycles — browser survives repeated recycling
# ===========================================================================
@pytest.mark.asyncio
async def test_multiple_recycle_cycles(test_server):
    """Crawl 22 pages sequentially with a recycle threshold of 5.

    That spans four thresholds (5, 10, 15, 20). Every crawl must succeed and
    the page counter must read 2 at the end.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)
        all_results = [
            await crawler.arun(url=_url(test_server, page % 200), config=run_config)
            for page in range(22)
        ]
        assert all(r.success for r in all_results), (
            f"Failed at pages: "
            f"{[i for i, r in enumerate(all_results) if not r.success]}"
        )
        # Two pages were served after the last reset at page 20.
        assert bm._pages_served == 2
# ===========================================================================
# Test 8: Recycling disabled by default (max_pages_before_recycle=0)
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_disabled_by_default(test_server):
    """Crawl 10 pages with a BrowserConfig that sets no recycle threshold.

    Expects the page counter to reach 10 without ever being reset and no
    recycle to be in progress.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        bm = _bm(crawler)
        for page in range(10):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_config)
            assert outcome.success
        # One increment per crawl, no reset.
        assert bm._pages_served == 10
        assert not bm._recycling
# ===========================================================================
# Test 9: _recycle_done event blocks get_page during recycle
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_event_blocks_new_pages(test_server):
    """Simulate an in-progress recycle and check that crawls wait for it.

    The manager's ``_recycle_done`` event is cleared by hand, a crawl is
    started in a background task, and the test asserts that the crawl has not
    completed after 0.5 s. Setting the event must then let it finish
    successfully within 15 s.

    Cleanup is unconditional: whatever happens, the event is set again and the
    background task is finished or cancelled before the crawler is closed, so
    a failing assertion cannot leave a blocked task behind.
    """
    config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        got_page = False

        async def try_get_page():
            nonlocal got_page
            r = await crawler.arun(url=_url(test_server, 0), config=run_config)
            got_page = r.success

        # Manually block the gate, then start a crawl behind it.
        bm._recycle_done.clear()
        task = asyncio.create_task(try_get_page())
        try:
            # Wait a bit — the crawl should be blocked.
            await asyncio.sleep(0.5)
            assert not got_page, "get_page should block while _recycle_done is cleared"
            # Release the gate and let the crawl run to completion.
            bm._recycle_done.set()
            await asyncio.wait_for(task, timeout=15.0)
        finally:
            # Reopen the gate even on failure (set() is idempotent) and make
            # sure the task is not still pending when the crawler shuts down.
            bm._recycle_done.set()
            if not task.done():
                task.cancel()
            await asyncio.gather(task, return_exceptions=True)
        assert got_page, "Crawl should succeed after recycle_done is set"
# ===========================================================================
# Test 10: BrowserConfig serialization round-trip
# ===========================================================================
@pytest.mark.asyncio
async def test_config_serialization_roundtrip():
    """Check that memory_saving_mode and max_pages_before_recycle are kept by
    BrowserConfig's to_dict/from_kwargs, clone and dump/load paths."""
    original = BrowserConfig(
        headless=True,
        memory_saving_mode=True,
        max_pages_before_recycle=500,
    )

    # Dictionary export and re-import.
    as_dict = original.to_dict()
    assert as_dict["memory_saving_mode"] is True
    assert as_dict["max_pages_before_recycle"] == 500
    rebuilt = BrowserConfig.from_kwargs(as_dict)
    assert rebuilt.memory_saving_mode is True
    assert rebuilt.max_pages_before_recycle == 500

    # Cloning with one field overridden; the other must carry over.
    variant = original.clone(max_pages_before_recycle=1000)
    assert variant.memory_saving_mode is True
    assert variant.max_pages_before_recycle == 1000

    # dump() output fed straight back into load().
    reloaded = BrowserConfig.load(original.dump())
    assert reloaded.memory_saving_mode is True
    assert reloaded.max_pages_before_recycle == 500
# ===========================================================================
# Test 11: Memory stays bounded over many pages with recycling
# ===========================================================================
@pytest.mark.asyncio
async def test_memory_bounded_with_recycling(test_server):
    """Crawl 40 pages with recycling every 10 and compare Chromium RSS.

    RSS is sampled after pages 10, 20, 30 and 40 (shortly after each recycle
    threshold). The last sample must be less than 2x the first one; the wide
    margin absorbs GC timing and other measurement noise.

    If the first sample is not positive, RSS could not be measured and the
    test is skipped rather than reported as a pass.
    """
    config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=True,
        max_pages_before_recycle=10,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        rss_samples = []
        for i in range(40):
            r = await crawler.arun(url=_url(test_server, i % 200), config=run_config)
            assert r.success, f"Page {i} failed"
            # Sample once per recycle boundary (every 10th page).
            if (i + 1) % 10 == 0:
                await asyncio.sleep(0.3)  # let recycle finish
                rss_samples.append(_get_chromium_rss_mb())

        # One sample each at pages 10, 20, 30 and 40.
        assert len(rss_samples) == 4

        if rss_samples[0] <= 0:
            # No baseline means the ratio below is meaningless; make that
            # visible in the report instead of passing silently.
            pytest.skip("Chromium RSS could not be measured; growth check not run")

        # The key assertion: RSS at page 40 must stay below twice the RSS at
        # page 10 (i.e. up to 100% growth is tolerated).
        growth_ratio = rss_samples[-1] / rss_samples[0]
        assert growth_ratio < 2.0, (
            f"Memory grew {growth_ratio:.1f}x from {rss_samples[0]:.0f}MB "
            f"to {rss_samples[-1]:.0f}MB over 30 pages with recycling. "
            f"All samples: {[f'{s:.0f}' for s in rss_samples]} MB"
        )
# ===========================================================================
# Test 12: Memory grows WITHOUT recycling (control test)
# ===========================================================================
@pytest.mark.asyncio
async def test_memory_grows_without_recycling(test_server):
    """Control run: crawl 30 pages with recycling disabled and report RSS.

    After a 3-page warm-up, Chromium RSS is read before and after 30 further
    crawls and the difference is printed for comparison with the recycling
    test. The amount of growth is informational only and is NOT asserted,
    because it varies too much between machines. The one hard check is that
    the RSS reading is still non-zero at the end, i.e. the measurement works.

    If the starting RSS is not positive the measurement is unavailable and the
    test is skipped.
    """
    config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=False,
        max_pages_before_recycle=0,  # disabled
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        # Warm up — let initial browser memory stabilize
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success
        await asyncio.sleep(0.3)
        rss_start = _get_chromium_rss_mb()

        # Crawl 30 more pages
        for i in range(3, 33):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success
        await asyncio.sleep(0.3)
        rss_end = _get_chromium_rss_mb()

        if rss_start <= 0:
            pytest.skip("Chromium RSS could not be measured; control run not evaluated")

        # The browser is still open here, so a zero reading would mean the
        # measurement itself is broken.
        assert rss_end > 0, (
            f"Chromium RSS read as {rss_end:.0f}MB while the browser is still running"
        )
        print(
            f"\n[CONTROL] RSS without recycling: "
            f"{rss_start:.0f}MB → {rss_end:.0f}MB "
            f"(+{rss_end - rss_start:.0f}MB over 30 pages)"
        )
# ===========================================================================
# Test 13: Viewport adjustment doesn't leak CDP sessions
# ===========================================================================
@pytest.mark.asyncio
async def test_viewport_adjustment_no_cdp_leak(test_server):
    """Crawl 15 pages with scan_full_page=True and require each to succeed.

    This is a smoke test only: it does not count CDP sessions, it just checks
    that repeated full-page scans keep working on one browser.
    """
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        scan_full_page=True,  # the option under test
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        for i in range(15):
            outcome = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert outcome.success, f"Page {i} failed with scan_full_page"
# ===========================================================================
# Test 14: Recycle under concurrent load with arun_many
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_with_arun_many(test_server):
    """Crawl 12 URLs through arun_many with a recycle threshold of 5.

    The batch is larger than the threshold; all 12 results must be successful.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        batch = [_url(test_server, page) for page in range(12)]
        results = await crawler.arun_many(batch, config=run_config)
        successes = list(filter(lambda r: r.success, results))
        assert len(successes) == 12, (
            f"Only {len(successes)}/12 succeeded with arun_many + recycling"
        )
# ===========================================================================
# Test 15: _global_pages_in_use cleaned after recycle
# ===========================================================================
@pytest.mark.asyncio
async def test_global_pages_in_use_cleared(test_server):
    """After reaching the recycle threshold, BrowserManager's class-level
    _global_pages_in_use entry for this manager's endpoint key must be empty.

    The check is only made when the manager has a truthy endpoint key.
    """
    from crawl4ai.browser_manager import BrowserManager

    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)
        for page in range(3):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_config)
            assert outcome.success
        await asyncio.sleep(0.5)

        endpoint_key = bm._browser_endpoint_key
        if endpoint_key:
            # A missing entry counts as empty.
            piu = BrowserManager._global_pages_in_use.get(endpoint_key, set())
            assert len(piu) == 0, (
                f"_global_pages_in_use should be empty after recycle, "
                f"has {len(piu)} stale entries"
            )
# ===========================================================================
# Test 16: Content integrity across recycle — page content is correct
# ===========================================================================
@pytest.mark.asyncio
async def test_content_integrity_across_recycle(test_server):
    """Reach the recycle threshold with pages 0-2, then crawl page 150.

    The HTML returned for page 150 must contain that page's own heading and
    paragraph text.
    """
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Pages 0, 1 and 2 reach the threshold.
        for page in range(3):
            outcome = await crawler.arun(url=_url(test_server, page), config=run_config)
            assert outcome.success
        await asyncio.sleep(0.5)

        # This crawl happens after the threshold was hit.
        outcome = await crawler.arun(url=_url(test_server, 150), config=run_config)
        assert outcome.success
        assert "Test page 150" in outcome.html, (
            "Content after recycle should be from the correct page"
        )
        assert "Paragraph 150" in outcome.html
# ===========================================================================
# SESSION + RECYCLE INTERACTION TESTS
# ===========================================================================
# ===========================================================================
# Test 17: Multi-step session crawl — login → dashboard with cookie
# ===========================================================================
@pytest.mark.asyncio
async def test_multistep_session_login_flow(test_server):
    """Crawl /login and then /dashboard under one session_id.

    /login sets the auth cookie; /dashboard must then render the
    authenticated greeting, showing the cookie was carried over. No recycle
    threshold is configured, so this is the baseline session behaviour.
    """
    session_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        session_id="login_flow",
        verbose=False,
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True, verbose=False)) as crawler:
        # First request: the server hands out the cookie.
        login = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert login.success
        assert "Login Page" in login.html

        # Second request in the same session: the cookie must be sent back.
        dashboard = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html, (
            "Session should carry cookies from login to dashboard"
        )
# ===========================================================================
# Test 18: Multi-step session survives non-session crawls past threshold
# ===========================================================================
@pytest.mark.asyncio
async def test_session_survives_threshold_with_interleaved_crawls(test_server):
    """Keep a session open while plain crawls push past the threshold (5).

    One session crawl plus eight session-less crawls must leave the counter
    at 9 with no recycle in progress and the session still registered. The
    session is then used for /dashboard, which must show the authenticated
    greeting.
    """
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    session_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        session_id="persistent_session",
        verbose=False,
    )
    no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)

        # Open the session by logging in.
        outcome = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert outcome.success
        assert "persistent_session" in bm.sessions

        # Eight further pages without a session: 1 + 8 = 9 served in total.
        for i in range(8):
            outcome = await crawler.arun(url=_url(test_server, i), config=no_session)
            assert outcome.success, f"Non-session crawl {i} failed"

        # No reset may have happened while the session was open.
        assert bm._pages_served == 9, (
            f"Expected 9 pages served, got {bm._pages_served}"
        )
        assert not bm._recycling
        assert "persistent_session" in bm.sessions, (
            "Session should still exist — recycle blocked by refcount"
        )

        # The login cookie must still be attached to the session.
        outcome = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert outcome.success
        assert "Welcome, authenticated user" in outcome.html, (
            "Session cookies should still work after interleaved non-session crawls"
        )
# ===========================================================================
# Test 19: 3-step session flow with recycle threshold — recycle blocked
# ===========================================================================
@pytest.mark.asyncio
async def test_three_step_session_blocks_recycle(test_server):
    """Run /step1, /step2 and /step3 in one session with a threshold of 2.

    Each step must return its own page. Afterwards the test expects the page
    counter to be 1, no recycle to be in progress, and the session to still
    be registered.
    """
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=2,  # deliberately tiny
    )
    session_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        session_id="multistep",
        verbose=False,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)

        # Walk through the three steps in order, all in the same session.
        for path, heading in (("/step1", "Step 1"), ("/step2", "Step 2"), ("/step3", "Step 3")):
            outcome = await crawler.arun(url=f"{test_server}{path}", config=session_cfg)
            assert outcome.success
            assert heading in outcome.html

        # Only the first call is expected to count as a served page; the
        # later two ran in the already-open session.
        assert bm._pages_served == 1
        assert not bm._recycling
        assert "multistep" in bm.sessions
# ===========================================================================
# Test 20: Two concurrent sessions — both survive past threshold
# ===========================================================================
@pytest.mark.asyncio
async def test_two_concurrent_sessions_block_recycle(test_server):
    """Hold two sessions open while plain crawls exceed the threshold (3).

    After two session crawls and five session-less ones, no recycle may be in
    progress and both sessions must still be registered and usable.
    """
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    session_a = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, session_id="sess_a", verbose=False,
    )
    session_b = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS, session_id="sess_b", verbose=False,
    )
    no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        bm = _bm(crawler)

        # Open both sessions: A logs in, B starts a step flow.
        opened_a = await crawler.arun(url=f"{test_server}/login", config=session_a)
        assert opened_a.success
        opened_b = await crawler.arun(url=f"{test_server}/step1", config=session_b)
        assert opened_b.success

        # Five session-less crawls take the total well past the threshold.
        for page in range(5):
            outcome = await crawler.arun(url=_url(test_server, page), config=no_session)
            assert outcome.success

        assert not bm._recycling
        assert "sess_a" in bm.sessions
        assert "sess_b" in bm.sessions

        # Session A still carries its login cookie.
        outcome = await crawler.arun(url=f"{test_server}/dashboard", config=session_a)
        assert outcome.success
        assert "Welcome, authenticated user" in outcome.html

        # Session B can continue its flow.
        outcome = await crawler.arun(url=f"{test_server}/step2", config=session_b)
        assert outcome.success
        assert "Step 2" in outcome.html
# ===========================================================================
# Test 21: Session killed, then recycle fires on next non-session crawl
# ===========================================================================
@pytest.mark.asyncio
async def test_recycle_fires_after_session_killed(test_server):
    """Verify that killing the only open session lets a pending recycle run.

    With ``max_pages_before_recycle=3``, one session crawl plus three
    session-less crawls bring the served-page counter to 4 without a recycle
    in progress. After the session is killed, one more session-less crawl is
    expected to trigger the recycle, which shows up as the counter dropping
    below its pre-kill value.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=3,
        headless=True,
        verbose=False,
    )
    plain_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    pinned_cfg = CrawlerRunConfig(
        session_id="temp_sess", cache_mode=CacheMode.BYPASS, verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        # The session crawl accounts for the first served page.
        opened = await crawler.arun(url=f"{test_server}/step1", config=pinned_cfg)
        assert opened.success

        # Three more pages without a session: 4 served in total, which is
        # over the threshold of 3 while the session is still open.
        for page_no in range(3):
            plain = await crawler.arun(
                url=_url(test_server, page_no), config=plain_cfg,
            )
            assert plain.success

        pages_before_kill = manager._pages_served
        assert pages_before_kill == 4
        assert not manager._recycling

        # Drop the session; it must disappear from the manager's registry.
        await crawler.crawler_strategy.kill_session("temp_sess")
        assert "temp_sess" not in manager.sessions

        # With nothing left to hold it back, the next crawl should recycle.
        trigger = await crawler.arun(url=_url(test_server, 99), config=plain_cfg)
        assert trigger.success

        # Short pause before inspecting the counter, as the original test does.
        await asyncio.sleep(0.5)

        # A reset counter is the observable evidence that the recycle ran.
        assert manager._pages_served < pages_before_kill, (
            f"Expected counter reset after recycle, got {manager._pages_served}"
        )
# ===========================================================================
# Test 22: Concurrent session crawls — same session from multiple tasks
# ===========================================================================
@pytest.mark.asyncio
async def test_concurrent_same_session_crawls(test_server):
    """Run several crawls concurrently against one shared ``session_id``.

    After an initial ``/login`` crawl creates the session, five ``arun``
    calls using the same session config are gathered concurrently. The test
    only requires that none of them raises; the ``success`` flag of the
    individual results is deliberately not checked, because concurrent
    navigations on a shared page may conflict with each other.
    """
    config = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="shared_session",
            verbose=False,
        )

        # Login first to set cookie
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success

        # Fire 5 concurrent crawls on the same session
        urls = [f"{test_server}/page{i}" for i in range(5)]
        results = await asyncio.gather(
            *(crawler.arun(url=u, config=session_cfg) for u in urls),
            return_exceptions=True,
        )

        # With return_exceptions=True a cancelled child shows up as a
        # CancelledError instance, which derives from BaseException rather
        # than Exception — filter on BaseException so it is not missed.
        exceptions = [r for r in results if isinstance(r, BaseException)]

        # Some crawls may report failure due to navigation conflicts (same
        # page, concurrent goto), but nothing may raise: no crashes and no
        # browser death.
        assert not exceptions, (
            f"Exceptions in concurrent same-session crawls: {exceptions[:3]}"
        )
# ===========================================================================
# Test 23: Session + recycling — session killed mid-batch, recycle fires,
# new session works after
# ===========================================================================
@pytest.mark.asyncio
async def test_session_lifecycle_across_recycle(test_server):
    """Exercise a full session lifecycle on both sides of a browser recycle.

    Sequence: open a session and reuse it, kill it, run enough session-less
    crawls to exceed ``max_pages_before_recycle=4``, confirm the served-page
    counter is below 6 (taken as evidence of a recycle), then open a second
    session and confirm its login cookie is honoured on the dashboard page.
    """
    browser_cfg = BrowserConfig(
        max_pages_before_recycle=4,
        headless=True,
        verbose=False,
    )
    plain_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
    first_cfg = CrawlerRunConfig(
        session_id="lifecycle_sess", cache_mode=CacheMode.BYPASS, verbose=False,
    )
    second_cfg = CrawlerRunConfig(
        session_id="lifecycle_sess_v2", cache_mode=CacheMode.BYPASS, verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        manager = _bm(crawler)

        # Phase 1: log in through the first session, then reuse it.
        login = await crawler.arun(url=f"{test_server}/login", config=first_cfg)
        assert login.success
        dashboard = await crawler.arun(
            url=f"{test_server}/dashboard", config=first_cfg,
        )
        assert dashboard.success
        assert "Welcome, authenticated user" in dashboard.html

        # Phase 2: tear the first session down.
        await crawler.crawler_strategy.kill_session("lifecycle_sess")

        # Phase 3: session-less crawls push the page count over the limit.
        for page_no in range(5):
            plain = await crawler.arun(
                url=_url(test_server, page_no), config=plain_cfg,
            )
            assert plain.success

        # Short pause before inspecting the counter, as the original test does.
        await asyncio.sleep(0.5)

        # Without a recycle the counter would have reached 6 (1 session page
        # + 5 plain pages); a lower value means it was reset.
        assert manager._pages_served < 6, (
            f"Expected reset after recycle, got {manager._pages_served}"
        )

        # Phase 4: a brand-new session must work on the recycled browser.
        relogin = await crawler.arun(url=f"{test_server}/login", config=second_cfg)
        assert relogin.success
        assert "lifecycle_sess_v2" in manager.sessions

        redash = await crawler.arun(
            url=f"{test_server}/dashboard", config=second_cfg,
        )
        assert redash.success
        assert "Welcome, authenticated user" in redash.html, (
            "New session after recycle should work with cookies"
        )
# ===========================================================================
# Test 24: One open session + non-session crawls batched with arun_many
# ===========================================================================
@pytest.mark.asyncio
async def test_session_with_arun_many_interleaved(test_server):
    """Open a session, run an ``arun_many`` batch, then reuse the session.

    One session crawl is followed by a batch of eight session-less URLs.
    The test checks that the batch yields one successful result per URL,
    that the session id is still registered on the browser manager, and
    that the session can still load the dashboard page with its cookie.
    """
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=10,
    )
    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Open session
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="batch_sess", verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success

        # Batch of non-session crawls
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        urls = [_url(test_server, i) for i in range(8)]
        results = await crawler.arun_many(urls, config=no_session)

        # all() is vacuously true for an empty sequence, so pin the result
        # count first; otherwise a batch that returned nothing would pass.
        assert len(results) == len(urls), (
            f"Expected {len(urls)} batch results, got {len(results)}"
        )
        assert all(r.success for r in results), "All batch crawls should succeed"

        # Session still alive
        assert "batch_sess" in bm.sessions
        r = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert r.success
        assert "Welcome, authenticated user" in r.html
# ===========================================================================
# Test 25: Session refcount tracking correctness
# ===========================================================================
@pytest.mark.asyncio
async def test_session_refcount_stays_at_one(test_server):
    """Check that a session holds exactly one context refcount while open.

    The session's page is looked up in ``bm.sessions`` and mapped to a
    context signature through ``bm._page_to_sig``. The refcount stored for
    that signature in ``bm._context_refcounts`` must be 1 after creation,
    remain 1 after every reuse, and be 0 after the session is killed.

    If no signature is recorded for the session page the test is skipped
    explicitly, rather than passing without having checked any refcount.
    """
    config = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="refcount_test", verbose=False,
        )

        # Create session
        r = await crawler.arun(url=f"{test_server}/step1", config=session_cfg)
        assert r.success

        # Find the session's context signature
        _, page, _ = bm.sessions["refcount_test"]
        sig = bm._page_to_sig.get(page)
        if sig is None:
            # Without a signature there is nothing to look up; report that
            # visibly instead of silently skipping every assertion below.
            pytest.skip(
                "Session page has no tracked context signature; "
                "refcount cannot be verified"
            )

        refcount = bm._context_refcounts.get(sig, 0)
        assert refcount == 1, (
            f"Session should hold exactly 1 refcount, got {refcount}"
        )

        # Reuse session multiple times — refcount should stay at 1
        for url in ["/step2", "/step3", "/dashboard"]:
            r = await crawler.arun(url=f"{test_server}{url}", config=session_cfg)
            assert r.success
            refcount = bm._context_refcounts.get(sig, 0)
            assert refcount == 1, (
                f"After reuse, refcount should still be 1, got {refcount}"
            )

        # Kill session — refcount should drop to 0
        await crawler.crawler_strategy.kill_session("refcount_test")
        refcount = bm._context_refcounts.get(sig, 0)
        assert refcount == 0, (
            f"After kill, refcount should be 0, got {refcount}"
        )