- Add memory_saving_mode config: aggressive cache discard + V8 heap cap flags for high-volume crawling (1000+ pages)
- Add max_pages_before_recycle config: automatic browser process recycling after N pages to reclaim leaked memory (recommended 500-1000)
- Add default Chrome flags to disable unused features (OptimizationHints, MediaRouter, component updates, domain reliability)
- Fix CDP session leak: detach CDP session after viewport adjustment
- Fix session kill: only close context when refcount reaches 0, preventing use-after-close for shared contexts
- Add browser lifecycle and memory tests
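A minimal usage sketch of the two new config options together, hedged: it assumes only the BrowserConfig / AsyncWebCrawler API exercised by the tests below, the crawl_many helper and example URL are illustrative, and 500 is simply the low end of the recommended recycle range.

    import asyncio
    from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

    async def crawl_many(urls):
        # memory_saving_mode turns on the aggressive cache discard + V8 heap cap flags;
        # max_pages_before_recycle restarts the browser process every N pages.
        browser_cfg = BrowserConfig(
            headless=True,
            memory_saving_mode=True,
            max_pages_before_recycle=500,
        )
        run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            for url in urls:
                result = await crawler.arun(url=url, config=run_cfg)
                if not result.success:
                    print(f"failed: {url}")

    asyncio.run(crawl_many(["https://example.com"]))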
"""
|
|
Tests for browser memory management: memory_saving_mode, browser recycling,
|
|
and CDP session leak fixes.
|
|
|
|
These are integration tests that launch real browsers and crawl real pages.
|
|
They verify:
|
|
1. memory_saving_mode Chrome flags are applied
|
|
2. Browser recycling fires at the right threshold and doesn't break crawling
|
|
3. Concurrent crawls survive a recycle boundary without errors
|
|
4. Recycling resets all internal tracking state cleanly
|
|
5. Memory doesn't grow unbounded over many pages
|
|
6. CDP session detach fix doesn't regress viewport adjustment
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import time
|
|
import threading
|
|
from http.server import HTTPServer, SimpleHTTPRequestHandler
|
|
|
|
import psutil
|
|
import pytest
|
|
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Local test server — avoids network flakiness
|
|
# ---------------------------------------------------------------------------
|
|
|
|
PAGES_HTML = {}
|
|
for i in range(200):
|
|
PAGES_HTML[f"/page{i}"] = f"""<!DOCTYPE html>
|
|
<html><head><title>Page {i}</title></head>
|
|
<body>
|
|
<h1>Test page {i}</h1>
|
|
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit.
|
|
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
|
Paragraph {i} with enough text to exercise the content pipeline.</p>
|
|
<a href="/page{(i+1) % 200}">Next</a>
|
|
</body></html>"""
|
|
|
|
|
|
class MemTestHandler(SimpleHTTPRequestHandler):
    """Serves lightweight HTML pages for memory tests.

    Also serves /login and /dashboard for multi-step session tests.
    /login sets a cookie, /dashboard checks the cookie to prove session state.
    """

    def log_message(self, *args):
        pass  # silent

    def do_GET(self):
        if self.path == "/login":
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.send_header("Set-Cookie", "auth_token=valid123; Path=/")
            self.end_headers()
            self.wfile.write(b"""<!DOCTYPE html>
<html><head><title>Login</title></head>
<body><h1>Login Page</h1><p>You are now logged in.</p>
<a href="/dashboard">Go to dashboard</a></body></html>""")
            return

        if self.path == "/dashboard":
            cookie = self.headers.get("Cookie", "")
            if "auth_token=valid123" in cookie:
                body = "<h1>Dashboard</h1><p>Welcome, authenticated user!</p>"
            else:
                body = "<h1>Dashboard</h1><p>NOT AUTHENTICATED</p>"
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(
                f"<!DOCTYPE html><html><head><title>Dashboard</title></head>"
                f"<body>{body}</body></html>".encode()
            )
            return

        if self.path == "/step1":
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(b"""<!DOCTYPE html>
<html><head><title>Step 1</title></head>
<body><h1>Step 1</h1><p>First step complete</p></body></html>""")
            return

        if self.path == "/step2":
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(b"""<!DOCTYPE html>
<html><head><title>Step 2</title></head>
<body><h1>Step 2</h1><p>Second step complete</p></body></html>""")
            return

        if self.path == "/step3":
            self.send_response(200)
            self.send_header("Content-type", "text/html")
            self.end_headers()
            self.wfile.write(b"""<!DOCTYPE html>
<html><head><title>Step 3</title></head>
<body><h1>Step 3</h1><p>Third step complete</p></body></html>""")
            return

        html = PAGES_HTML.get(self.path)
        if html is None:
            # Fallback for root and unknown paths
            html = PAGES_HTML["/page0"]
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()
        self.wfile.write(html.encode())


class ReuseAddrHTTPServer(HTTPServer):
    allow_reuse_address = True


@pytest.fixture(scope="module")
def test_server():
    """Start a local HTTP server for the test module."""
    server = ReuseAddrHTTPServer(("127.0.0.1", 0), MemTestHandler)
    port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()
    yield f"http://127.0.0.1:{port}"
    server.shutdown()


def _url(base, i):
    return f"{base}/page{i}"


def _get_chromium_rss_mb():
    """Sum RSS of all chromium/chrome child processes in MB."""
    total = 0
    for proc in psutil.process_iter(["name", "cmdline"]):
        try:
            name = (proc.info["name"] or "").lower()
            cmdline = " ".join(proc.info["cmdline"] or []).lower()
            if "chrom" in name or "chrom" in cmdline:
                total += proc.memory_info().rss
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return total / (1024 * 1024)


# ---------------------------------------------------------------------------
# Helpers to reach into BrowserManager internals
# ---------------------------------------------------------------------------

def _bm(crawler: AsyncWebCrawler):
    """Shortcut to get the BrowserManager from a crawler."""
    return crawler.crawler_strategy.browser_manager


# ===========================================================================
# Test 1: memory_saving_mode flag propagation
# ===========================================================================

@pytest.mark.asyncio
async def test_memory_saving_flags_applied(test_server):
    """Verify --aggressive-cache-discard and --js-flags are in the launch args
    when memory_saving_mode=True, and absent when False."""
    config_on = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=True,
    )
    config_off = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=False,
    )

    async with AsyncWebCrawler(config=config_on) as crawler:
        bm = _bm(crawler)
        browser_args = bm._build_browser_args()
        # _build_browser_args returns a dict with an "args" key
        args_list = browser_args.get("args", browser_args) if isinstance(browser_args, dict) else browser_args
        assert "--aggressive-cache-discard" in args_list, (
            "memory_saving_mode=True should add --aggressive-cache-discard"
        )
        assert any("max-old-space-size" in a for a in args_list), (
            "memory_saving_mode=True should add V8 heap cap"
        )
        # Always-on flags should be present regardless
        assert any("OptimizationHints" in a for a in args_list)

    async with AsyncWebCrawler(config=config_off) as crawler:
        bm = _bm(crawler)
        browser_args = bm._build_browser_args()
        args_list = browser_args.get("args", browser_args) if isinstance(browser_args, dict) else browser_args
        assert "--aggressive-cache-discard" not in args_list, (
            "memory_saving_mode=False should NOT add --aggressive-cache-discard"
        )
        assert not any("max-old-space-size" in a for a in args_list), (
            "memory_saving_mode=False should NOT add V8 heap cap"
        )
        # Always-on flags should still be there
        assert any("OptimizationHints" in a for a in args_list)


# ===========================================================================
# Test 2: Always-on flags present in both code paths
# ===========================================================================

@pytest.mark.asyncio
async def test_always_on_flags_present(test_server):
    """The 3 always-on memory flags should appear in _build_browser_args
    even with default BrowserConfig."""
    config = BrowserConfig(headless=True, verbose=False)
    async with AsyncWebCrawler(config=config) as crawler:
        browser_args = _bm(crawler)._build_browser_args()
        args_list = browser_args.get("args", browser_args) if isinstance(browser_args, dict) else browser_args
        assert any("disable-component-update" in a for a in args_list)
        assert any("disable-domain-reliability" in a for a in args_list)
        assert any("OptimizationHints" in a for a in args_list)


# ===========================================================================
# Test 3: Basic recycling — counter increments, recycle fires, crawls resume
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_fires_at_threshold(test_server):
    """Set max_pages_before_recycle=5, crawl 8 pages sequentially.
    Verify the counter resets after recycle and all crawls succeed."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=True,
        max_pages_before_recycle=5,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        assert bm._pages_served == 0

        results = []
        for i in range(8):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            results.append(r)

        # All 8 crawls should succeed — recycle happened transparently
        assert len(results) == 8
        assert all(r.success for r in results), (
            f"Failed crawls: {[i for i, r in enumerate(results) if not r.success]}"
        )

        # After 8 pages with threshold=5, recycle happened once (at page 5).
        # Pages 6,7,8 served after recycle → counter should be 3.
        assert bm._pages_served == 3, (
            f"Expected 3 pages after recycle, got {bm._pages_served}"
        )


# ===========================================================================
# Test 4: Recycling resets all tracking state
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_clears_tracking_state(test_server):
    """After a recycle, internal dicts should be clean."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Crawl 3 pages → triggers recycle
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        # Give recycle a moment to complete (it fires in release_page_with_context)
        await asyncio.sleep(0.5)

        # Recycle should have reset these
        assert bm._pages_served == 0, f"Counter not reset: {bm._pages_served}"
        assert sum(bm._context_refcounts.values()) == 0, (
            f"Refcounts not zero after recycle: {bm._context_refcounts}"
        )

        # Crawl one more page to prove browser is alive
        r = await crawler.arun(url=_url(test_server, 99), config=run_config)
        assert r.success
        assert bm._pages_served == 1


# ===========================================================================
# Test 5: Concurrent crawls across a recycle boundary
# ===========================================================================

@pytest.mark.asyncio
async def test_concurrent_crawls_across_recycle(test_server):
    """Launch concurrent crawls that straddle the recycle threshold.
    Recycling should wait for in-flight crawls to finish, not crash them."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        # Fire 10 concurrent crawls with threshold=5
        urls = [_url(test_server, i) for i in range(10)]
        tasks = [crawler.arun(url=u, config=run_config) for u in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        exceptions = [r for r in results if isinstance(r, Exception)]
        assert len(exceptions) == 0, (
            f"Got {len(exceptions)} exceptions during concurrent recycle: "
            f"{exceptions[:3]}"
        )
        successes = [r for r in results if not isinstance(r, Exception) and r.success]
        assert len(successes) == 10, (
            f"Only {len(successes)}/10 crawls succeeded"
        )


# ===========================================================================
# Test 6: Recycle with sessions — sessions cleared, new session works after
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_blocked_by_active_session(test_server):
    """An active session holds a context refcount, so the browser should NOT
    recycle while the session is open — even if pages_served >= threshold.
    This proves recycling is safe around sessions."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        run_no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

        # Crawl 2 non-session pages (released immediately)
        for i in range(2):
            r = await crawler.arun(url=_url(test_server, i), config=run_no_session)
            assert r.success

        # Create a named session on page 3 — hits the threshold
        run_with_session = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="test_session",
            verbose=False,
        )
        r = await crawler.arun(url=_url(test_server, 2), config=run_with_session)
        assert r.success
        assert "test_session" in bm.sessions

        # We've hit 3 pages (the threshold), but the session holds a refcount
        # so recycle must NOT fire
        assert bm._pages_served == 3
        assert not bm._recycling, (
            "Recycle should not fire while a session holds a refcount"
        )

        # Browser should still be alive — use the session again
        r = await crawler.arun(url=_url(test_server, 50), config=run_with_session)
        assert r.success, "Session should still work even past recycle threshold"

        # Session reuses the same page, so counter stays at 3
        # (only get_page increments it, and session reuse skips get_page)
        assert bm._pages_served >= 3
        assert not bm._recycling


@pytest.mark.asyncio
async def test_sessions_cleared_by_recycle(test_server):
    """After a recycle, the sessions dict is empty and new sessions work."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Crawl 3 non-session pages → recycle fires (all refcounts 0)
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        await asyncio.sleep(0.5)

        # Sessions dict cleared by recycle
        assert len(bm.sessions) == 0, (
            f"Sessions should be empty after recycle, got {list(bm.sessions.keys())}"
        )

        # New session should work on the fresh browser
        run_with_session = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="post_recycle_session",
            verbose=False,
        )
        r = await crawler.arun(url=_url(test_server, 99), config=run_with_session)
        assert r.success
        assert "post_recycle_session" in bm.sessions


# ===========================================================================
# Test 7: Multiple recycle cycles — browser survives repeated recycling
# ===========================================================================

@pytest.mark.asyncio
async def test_multiple_recycle_cycles(test_server):
    """Recycle the browser 4 times (threshold=5, crawl 22 pages).
    Every single crawl must succeed."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        all_results = []

        for i in range(22):
            r = await crawler.arun(url=_url(test_server, i % 200), config=run_config)
            all_results.append(r)

        assert all(r.success for r in all_results), (
            f"Failed at pages: "
            f"{[i for i, r in enumerate(all_results) if not r.success]}"
        )
        # 22 pages, threshold 5 → recycles at 5, 10, 15, 20 → 4 recycles
        # After last recycle at page 20, pages 21,22 served → counter = 2
        assert bm._pages_served == 2


# ===========================================================================
# Test 8: Recycling disabled by default (max_pages_before_recycle=0)
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_disabled_by_default(test_server):
    """With default config (max_pages_before_recycle=0), no recycling happens
    no matter how many pages are crawled."""
    config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        for i in range(10):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        # Counter increments but never resets
        assert bm._pages_served == 10
        assert not bm._recycling


# ===========================================================================
# Test 9: _recycle_done event blocks get_page during recycle
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_event_blocks_new_pages(test_server):
    """Simulate a recycle by manually clearing the event, then verify that
    get_page blocks until the event is set."""
    config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Manually block the gate
        bm._recycle_done.clear()

        got_page = False

        async def try_get_page():
            nonlocal got_page
            r = await crawler.arun(url=_url(test_server, 0), config=run_config)
            got_page = r.success

        task = asyncio.create_task(try_get_page())

        # Wait a bit — the crawl should be blocked
        await asyncio.sleep(0.5)
        assert not got_page, "get_page should block while _recycle_done is cleared"

        # Release the gate
        bm._recycle_done.set()
        await asyncio.wait_for(task, timeout=15.0)
        assert got_page, "Crawl should succeed after recycle_done is set"


# ===========================================================================
# Test 10: BrowserConfig serialization round-trip
# ===========================================================================

@pytest.mark.asyncio
async def test_config_serialization_roundtrip():
    """memory_saving_mode and max_pages_before_recycle survive
    to_dict → from_kwargs → clone round-trips."""
    original = BrowserConfig(
        headless=True,
        memory_saving_mode=True,
        max_pages_before_recycle=500,
    )

    # to_dict → from_kwargs
    d = original.to_dict()
    assert d["memory_saving_mode"] is True
    assert d["max_pages_before_recycle"] == 500

    restored = BrowserConfig.from_kwargs(d)
    assert restored.memory_saving_mode is True
    assert restored.max_pages_before_recycle == 500

    # clone with override
    cloned = original.clone(max_pages_before_recycle=1000)
    assert cloned.memory_saving_mode is True  # inherited
    assert cloned.max_pages_before_recycle == 1000  # overridden

    # dump / load
    dumped = original.dump()
    loaded = BrowserConfig.load(dumped)
    assert loaded.memory_saving_mode is True
    assert loaded.max_pages_before_recycle == 500


# ===========================================================================
# Test 11: Memory stays bounded over many pages with recycling
# ===========================================================================

@pytest.mark.asyncio
async def test_memory_bounded_with_recycling(test_server):
    """Crawl 40 pages with recycling every 10. Measure RSS at page 10
    (just after first recycle) and at page 40. Memory should not grow
    significantly — the recycle should keep it bounded.

    This is the core proof that recycling controls memory growth.
    Without recycling, Chromium RSS grows ~2-5 MB per page.
    With recycling, it should stay roughly flat."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=True,
        max_pages_before_recycle=10,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        rss_samples = []

        for i in range(40):
            r = await crawler.arun(url=_url(test_server, i % 200), config=run_config)
            assert r.success, f"Page {i} failed"

            # Sample after each recycle boundary
            if (i + 1) % 10 == 0:
                await asyncio.sleep(0.3)  # let recycle finish
                rss_samples.append(_get_chromium_rss_mb())

        # We should have 4 samples (at pages 10, 20, 30, 40)
        assert len(rss_samples) == 4

        # The key assertion: RSS at page 40 should not be dramatically larger
        # than at page 10. Allow up to 2x growth as tolerance for GC timing etc.
        # Without recycling, we'd expect 60-150 MB growth over 30 extra pages.
        if rss_samples[0] > 0:  # guard against measurement issues
            growth_ratio = rss_samples[-1] / rss_samples[0]
            assert growth_ratio < 2.0, (
                f"Memory grew {growth_ratio:.1f}x from {rss_samples[0]:.0f}MB "
                f"to {rss_samples[-1]:.0f}MB over 30 pages with recycling. "
                f"All samples: {[f'{s:.0f}' for s in rss_samples]} MB"
            )


# ===========================================================================
# Test 12: Memory grows WITHOUT recycling (control test)
# ===========================================================================

@pytest.mark.asyncio
async def test_memory_grows_without_recycling(test_server):
    """Control test: crawl 30 pages WITHOUT recycling and observe that
    chromium RSS is higher at the end than at the start.
    This proves that recycling is what keeps memory bounded."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        memory_saving_mode=False,
        max_pages_before_recycle=0,  # disabled
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        # Warm up — let initial browser memory stabilize
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success
        await asyncio.sleep(0.3)
        rss_start = _get_chromium_rss_mb()

        # Crawl 30 more pages
        for i in range(3, 33):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        await asyncio.sleep(0.3)
        rss_end = _get_chromium_rss_mb()

        # RSS should be at least somewhat higher (chromium leaks)
        # We just need this to not be 0 — proving our measurement works
        if rss_start > 0:
            print(
                f"\n[CONTROL] RSS without recycling: "
                f"{rss_start:.0f}MB → {rss_end:.0f}MB "
                f"(+{rss_end - rss_start:.0f}MB over 30 pages)"
            )


# ===========================================================================
# Test 13: Viewport adjustment doesn't leak CDP sessions
# ===========================================================================

@pytest.mark.asyncio
async def test_viewport_adjustment_no_cdp_leak(test_server):
    """Crawl several pages that trigger viewport adjustment (scan_full_page).
    If CDP sessions leak, Chromium's DevTools session count grows and
    eventually causes slowdowns. We just verify all crawls succeed and
    the browser stays healthy."""
    config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        scan_full_page=True,  # triggers fit_to_viewport_adjustment → CDP session
        verbose=False,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        for i in range(15):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success, f"Page {i} failed with scan_full_page"
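
# Hedged sketch of the detach-after-use pattern that the CDP leak fix relies on,
# written against Playwright's CDP session API; the exact call site inside
# crawl4ai's viewport adjustment is an assumption here, not a copy of it:
#
#     cdp = await page.context.new_cdp_session(page)
#     try:
#         metrics = await cdp.send("Page.getLayoutMetrics")
#         # ...resize the viewport from the returned content size...
#     finally:
#         await cdp.detach()  # without this, each crawl leaks a DevTools session
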
# ===========================================================================
# Test 14: Recycle under concurrent load with arun_many
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_with_arun_many(test_server):
    """Use arun_many to crawl a batch that exceeds the recycle threshold.
    This tests the dispatcher + recycling interaction."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        urls = [_url(test_server, i) for i in range(12)]
        results = await crawler.arun_many(urls, config=run_config)

        successes = [r for r in results if r.success]
        assert len(successes) == 12, (
            f"Only {len(successes)}/12 succeeded with arun_many + recycling"
        )


# ===========================================================================
# Test 15: _global_pages_in_use cleaned after recycle
# ===========================================================================

@pytest.mark.asyncio
async def test_global_pages_in_use_cleared(test_server):
    """After a recycle, the _global_pages_in_use set for this browser's
    endpoint should be empty (old pages are dead)."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        await asyncio.sleep(0.5)

        # After recycle, pages_in_use for old endpoint should be empty
        from crawl4ai.browser_manager import BrowserManager
        if bm._browser_endpoint_key:
            piu = BrowserManager._global_pages_in_use.get(
                bm._browser_endpoint_key, set()
            )
            assert len(piu) == 0, (
                f"_global_pages_in_use should be empty after recycle, "
                f"has {len(piu)} stale entries"
            )


# ===========================================================================
# Test 16: Content integrity across recycle — page content is correct
# ===========================================================================

@pytest.mark.asyncio
async def test_content_integrity_across_recycle(test_server):
    """Verify that pages crawled AFTER a recycle return correct content,
    not stale data from before the recycle."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        # Crawl pages 0,1,2 → triggers recycle
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=run_config)
            assert r.success

        await asyncio.sleep(0.5)

        # Crawl page 150 after recycle — content should match page 150
        r = await crawler.arun(url=_url(test_server, 150), config=run_config)
        assert r.success
        assert "Test page 150" in r.html, (
            "Content after recycle should be from the correct page"
        )
        assert "Paragraph 150" in r.html


# ===========================================================================
# SESSION + RECYCLE INTERACTION TESTS
# ===========================================================================


# ===========================================================================
# Test 17: Multi-step session crawl — login → dashboard with cookie
# ===========================================================================

@pytest.mark.asyncio
async def test_multistep_session_login_flow(test_server):
    """Simulate login → dashboard multi-step crawl using session_id.
    The session preserves cookies, so dashboard should see authenticated state.
    No recycling involved — baseline session behavior."""
    config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="login_flow",
            verbose=False,
        )

        # Step 1: login — sets cookie
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success
        assert "Login Page" in r.html

        # Step 2: dashboard — cookie should carry over via session
        r = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert r.success
        assert "Welcome, authenticated user" in r.html, (
            "Session should carry cookies from login to dashboard"
        )


# ===========================================================================
# Test 18: Multi-step session survives non-session crawls past threshold
# ===========================================================================

@pytest.mark.asyncio
async def test_session_survives_threshold_with_interleaved_crawls(test_server):
    """Open a session, then do many non-session crawls that push
    pages_served past the recycle threshold. The session should prevent
    recycle from firing (refcount > 0). Then continue using the session
    and it should still work."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=5,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Start a session — step 1
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="persistent_session",
            verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success
        assert "persistent_session" in bm.sessions

        # Fire 8 non-session crawls — pushes pages_served to 9
        # (1 from session + 8 = 9, well past threshold of 5)
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        for i in range(8):
            r = await crawler.arun(url=_url(test_server, i), config=no_session)
            assert r.success, f"Non-session crawl {i} failed"

        # Recycle should NOT have fired — session holds refcount
        assert bm._pages_served == 9, (
            f"Expected 9 pages served, got {bm._pages_served}"
        )
        assert not bm._recycling
        assert "persistent_session" in bm.sessions, (
            "Session should still exist — recycle blocked by refcount"
        )

        # Session should still work — navigate to dashboard with cookies
        r = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert r.success
        assert "Welcome, authenticated user" in r.html, (
            "Session cookies should still work after interleaved non-session crawls"
        )


# ===========================================================================
# Test 19: 3-step session flow with recycle threshold — recycle blocked
# ===========================================================================

@pytest.mark.asyncio
async def test_three_step_session_blocks_recycle(test_server):
    """3-step session (step1 → step2 → step3) with low threshold.
    The session's refcount should block recycle for the entire flow."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=2,  # very low threshold
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="multistep",
            verbose=False,
        )

        # Step 1
        r = await crawler.arun(url=f"{test_server}/step1", config=session_cfg)
        assert r.success
        assert "Step 1" in r.html

        # Step 2 — pages_served is still 1 (session reuse doesn't increment)
        # but even if it did, refcount blocks recycle
        r = await crawler.arun(url=f"{test_server}/step2", config=session_cfg)
        assert r.success
        assert "Step 2" in r.html

        # Step 3
        r = await crawler.arun(url=f"{test_server}/step3", config=session_cfg)
        assert r.success
        assert "Step 3" in r.html

        # Session page reuse doesn't increment counter (only get_page does)
        # Initial creation = 1 page, subsequent calls reuse it
        assert bm._pages_served == 1
        assert not bm._recycling
        assert "multistep" in bm.sessions


# ===========================================================================
# Test 20: Two concurrent sessions — both survive past threshold
# ===========================================================================

@pytest.mark.asyncio
async def test_two_concurrent_sessions_block_recycle(test_server):
    """Two sessions open at the same time, with non-session crawls interleaved.
    Both sessions should prevent recycle and remain functional."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        session_a = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="sess_a", verbose=False,
        )
        session_b = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="sess_b", verbose=False,
        )
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

        # Open session A
        r = await crawler.arun(url=f"{test_server}/login", config=session_a)
        assert r.success

        # Open session B
        r = await crawler.arun(url=f"{test_server}/step1", config=session_b)
        assert r.success

        # 5 non-session crawls — pages_served goes to 7 (2 sessions + 5)
        for i in range(5):
            r = await crawler.arun(url=_url(test_server, i), config=no_session)
            assert r.success

        # Both sessions hold refcounts → recycle blocked
        assert not bm._recycling
        assert "sess_a" in bm.sessions
        assert "sess_b" in bm.sessions

        # Both sessions still work
        r = await crawler.arun(url=f"{test_server}/dashboard", config=session_a)
        assert r.success
        assert "Welcome, authenticated user" in r.html

        r = await crawler.arun(url=f"{test_server}/step2", config=session_b)
        assert r.success
        assert "Step 2" in r.html


# ===========================================================================
# Test 21: Session killed, then recycle fires on next non-session crawl
# ===========================================================================

@pytest.mark.asyncio
async def test_recycle_fires_after_session_killed(test_server):
    """Session blocks recycle. After session is killed (refcount drops to 0),
    the next non-session crawl that pushes past threshold triggers recycle."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=3,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

        # Open a session (1 page)
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="temp_sess", verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/step1", config=session_cfg)
        assert r.success

        # 3 non-session crawls (4 pages total, threshold=3, but session blocks)
        for i in range(3):
            r = await crawler.arun(url=_url(test_server, i), config=no_session)
            assert r.success

        pages_before_kill = bm._pages_served
        assert pages_before_kill == 4
        assert not bm._recycling

        # Kill the session — refcount drops to 0
        await crawler.crawler_strategy.kill_session("temp_sess")
        assert "temp_sess" not in bm.sessions

        # One more crawl — should trigger recycle (pages_served=5 >= 3, refcounts=0)
        r = await crawler.arun(url=_url(test_server, 99), config=no_session)
        assert r.success

        await asyncio.sleep(0.5)

        # Recycle should have fired — counter reset
        assert bm._pages_served < pages_before_kill, (
            f"Expected counter reset after recycle, got {bm._pages_served}"
        )


# ===========================================================================
# Test 22: Concurrent session crawls — same session from multiple tasks
# ===========================================================================

@pytest.mark.asyncio
async def test_concurrent_same_session_crawls(test_server):
    """Multiple asyncio tasks using the same session_id concurrently.
    The session page should be shared safely between them."""
    config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            session_id="shared_session",
            verbose=False,
        )

        # Login first to set cookie
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success

        # Fire 5 concurrent crawls on the same session
        urls = [f"{test_server}/page{i}" for i in range(5)]
        tasks = [
            crawler.arun(url=u, config=session_cfg) for u in urls
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        exceptions = [r for r in results if isinstance(r, Exception)]
        # Some may fail due to navigation conflicts (same page, concurrent goto),
        # but there should be no crashes or browser death
        assert len(exceptions) == 0, (
            f"Exceptions in concurrent same-session crawls: {exceptions[:3]}"
        )


# ===========================================================================
# Test 23: Session + recycling — session killed mid-batch, recycle fires,
#          new session works after
# ===========================================================================

@pytest.mark.asyncio
async def test_session_lifecycle_across_recycle(test_server):
    """Full lifecycle: create session → use it → kill it → recycle fires →
    create new session → use it. End-to-end proof that recycling is safe."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=4,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)

        # Phase 1: create and use a session
        sess_v1 = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="lifecycle_sess", verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/login", config=sess_v1)
        assert r.success

        r = await crawler.arun(url=f"{test_server}/dashboard", config=sess_v1)
        assert r.success
        assert "Welcome, authenticated user" in r.html

        # Phase 2: kill session
        await crawler.crawler_strategy.kill_session("lifecycle_sess")

        # Phase 3: push past threshold with non-session crawls
        for i in range(5):
            r = await crawler.arun(url=_url(test_server, i), config=no_session)
            assert r.success

        await asyncio.sleep(0.5)

        # Recycle should have happened (session killed, refcount=0)
        assert bm._pages_served < 6, (
            f"Expected reset after recycle, got {bm._pages_served}"
        )

        # Phase 4: new session on the fresh browser
        sess_v2 = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="lifecycle_sess_v2", verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/login", config=sess_v2)
        assert r.success
        assert "lifecycle_sess_v2" in bm.sessions

        r = await crawler.arun(url=f"{test_server}/dashboard", config=sess_v2)
        assert r.success
        assert "Welcome, authenticated user" in r.html, (
            "New session after recycle should work with cookies"
        )


# ===========================================================================
# Test 24: Parallel sessions + non-session crawls with arun_many
# ===========================================================================

@pytest.mark.asyncio
async def test_session_with_arun_many_interleaved(test_server):
    """Open a session, then fire arun_many for non-session URLs.
    The session should survive the batch and remain usable after."""
    config = BrowserConfig(
        headless=True,
        verbose=False,
        max_pages_before_recycle=10,
    )

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        # Open session
        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="batch_sess", verbose=False,
        )
        r = await crawler.arun(url=f"{test_server}/login", config=session_cfg)
        assert r.success

        # Batch of non-session crawls
        no_session = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=False)
        urls = [_url(test_server, i) for i in range(8)]
        results = await crawler.arun_many(urls, config=no_session)
        assert all(r.success for r in results), "All batch crawls should succeed"

        # Session still alive
        assert "batch_sess" in bm.sessions
        r = await crawler.arun(url=f"{test_server}/dashboard", config=session_cfg)
        assert r.success
        assert "Welcome, authenticated user" in r.html


# ===========================================================================
# Test 25: Session refcount tracking correctness
# ===========================================================================

@pytest.mark.asyncio
async def test_session_refcount_stays_at_one(test_server):
    """Verify that a session holds exactly 1 refcount throughout its
    lifecycle, regardless of how many times it's reused."""
    config = BrowserConfig(headless=True, verbose=False)

    async with AsyncWebCrawler(config=config) as crawler:
        bm = _bm(crawler)

        session_cfg = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS, session_id="refcount_test", verbose=False,
        )

        # Create session
        r = await crawler.arun(url=f"{test_server}/step1", config=session_cfg)
        assert r.success

        # Find the session's context signature
        _, page, _ = bm.sessions["refcount_test"]
        sig = bm._page_to_sig.get(page)
        if sig:
            refcount = bm._context_refcounts.get(sig, 0)
            assert refcount == 1, (
                f"Session should hold exactly 1 refcount, got {refcount}"
            )

        # Reuse session multiple times — refcount should stay at 1
        for url in ["/step2", "/step3", "/dashboard"]:
            r = await crawler.arun(url=f"{test_server}{url}", config=session_cfg)
            assert r.success

        if sig:
            refcount = bm._context_refcounts.get(sig, 0)
            assert refcount == 1, (
                f"After reuse, refcount should still be 1, got {refcount}"
            )

        # Kill session — refcount should drop to 0
        await crawler.crawler_strategy.kill_session("refcount_test")
        if sig:
            refcount = bm._context_refcounts.get(sig, 0)
            assert refcount == 0, (
                f"After kill, refcount should be 0, got {refcount}"
            )