Fix proxy auth for persistent browser contexts
Chromium's --proxy-server CLI flag silently ignores inline credentials (user:pass@server). For persistent contexts, crawl4ai was embedding credentials in this flag via ManagedBrowser.build_browser_flags(), causing proxy auth to fail and the browser to fall back to a direct connection.

Fix: Use Playwright's launch_persistent_context(proxy=...) API instead of subprocess + CDP when use_persistent_context=True. This handles proxy authentication properly via the HTTP CONNECT handshake. The non-persistent and CDP paths remain unchanged.

Changes:
- Strip credentials from the --proxy-server flag in build_browser_flags()
- Add a launch_persistent_context() path in BrowserManager.start()
- Add a cleanup path in BrowserManager.close()
- Guard create_browser_context() when self.browser is None
- Add regression tests covering all 4 proxy/persistence combinations
This commit is contained in:
96
tests/proxy/test_proxy_regression.py
Normal file
96
tests/proxy/test_proxy_regression.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Regression tests for proxy fix:
|
||||
1. Persistent context + proxy (new path via launch_persistent_context)
|
||||
2. Persistent context WITHOUT proxy (should still use launch_persistent_context)
|
||||
3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break)
|
||||
4. Non-persistent, no proxy (basic crawl, must not break)
|
||||
"""
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import ProxyConfig
|
||||
|
||||
# Endpoint crawled by every scenario in this script. httpbin.org/ip is a simple
# endpoint that returns JSON with an "origin" key (the caller's IP).
TEST_URL = "https://httpbin.org/ip"
|
||||
|
||||
|
||||
async def test(label, browser_config, run_config=None):
    """Crawl ``TEST_URL`` once and print a short report for the scenario.

    Args:
        label: Human-readable scenario name printed in the banner.
        browser_config: Configuration handed to ``AsyncWebCrawler``.
        run_config: Optional per-run configuration; when falsy a default
            ``CrawlerRunConfig()`` is used instead.

    Returns:
        The result object returned by ``crawler.arun``, or ``None`` when any
        exception was raised while crawling (the exception is printed, not
        propagated).

    NOTE(review): the name ``test`` matches pytest's default function prefix,
    so pytest would presumably try to collect this helper -- confirm and
    consider renaming together with its callers.
    """
    separator = "=" * 60
    print(f"\n{separator}")
    print(f"Test: {label}")
    print(separator)

    if not run_config:
        run_config = CrawlerRunConfig()

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            outcome = await crawler.arun(TEST_URL, config=run_config)
            print(f" Status: {outcome.status_code}")
            print(f" HTML bytes: {len(outcome.html)}")

            if outcome.markdown:
                # httpbin.org/ip returns JSON with an "origin" key; show at
                # most the first 200 characters of the rendered markdown.
                text = outcome.markdown.raw_markdown.strip()
                print(f" Content: {text[:200]}")

            if outcome.error_message:
                print(f" ERROR: {outcome.error_message}")

            return outcome
    except Exception as exc:
        # Best-effort reporting: a failing scenario must not abort the
        # remaining scenarios, so report the error and fall through.
        print(f" EXCEPTION: {exc}")

    return None
|
||||
|
||||
|
||||
def _proxy_from_env():
    """Build the test proxy configuration from environment variables.

    Reads ``CRAWL4AI_TEST_PROXY_USERNAME`` and ``CRAWL4AI_TEST_PROXY_PASSWORD``
    (both required) and ``CRAWL4AI_TEST_PROXY_SERVER`` (optional, defaults to
    the server this script has always targeted).

    Returns:
        A ``ProxyConfig`` when both credentials are set, otherwise ``None``.
    """
    username = os.environ.get("CRAWL4AI_TEST_PROXY_USERNAME")
    password = os.environ.get("CRAWL4AI_TEST_PROXY_PASSWORD")
    if not username or not password:
        return None
    return ProxyConfig(
        server=os.environ.get(
            "CRAWL4AI_TEST_PROXY_SERVER",
            "https://network.joinmassive.com:65535",
        ),
        username=username,
        password=password,
    )


async def _run_persistent(label, **browser_kwargs):
    """Run one scenario against a throwaway persistent profile directory.

    The directory is created under ``~/.crawl4ai`` with a random suffix and is
    removed afterwards even if the scenario fails.
    """
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    try:
        return await test(
            label,
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=profile_dir,
                **browser_kwargs,
            ),
        )
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)


async def main():
    """Run the four proxy/persistence regression scenarios in order.

    Scenarios that need a proxy are skipped (with a message) when no proxy
    credentials are supplied through the environment; see ``_proxy_from_env``.
    """
    # Credentials must never live in source control; they are injected via the
    # environment instead.
    # NOTE(review): the values that used to be hard-coded here remain in the
    # repository history and should be rotated.
    proxy = _proxy_from_env()
    if proxy is None:
        print(
            "Proxy credentials not set "
            "(CRAWL4AI_TEST_PROXY_USERNAME / CRAWL4AI_TEST_PROXY_PASSWORD); "
            "skipping proxy scenarios."
        )

    # 1. Persistent context + proxy (the fixed path)
    if proxy is not None:
        await _run_persistent(
            "Persistent + proxy (launch_persistent_context)",
            proxy_config=proxy,
        )

    # 2. Persistent context WITHOUT proxy
    await _run_persistent("Persistent, no proxy (launch_persistent_context)")

    # 3. Non-persistent + proxy on CrawlerRunConfig
    if proxy is not None:
        await test(
            "Non-persistent + proxy on RunConfig",
            BrowserConfig(headless=True),
            CrawlerRunConfig(proxy_config=proxy),
        )

    # 4. Basic crawl - no proxy, no persistent
    await test(
        "Basic crawl (no proxy, no persistent)",
        BrowserConfig(headless=True),
    )

    print("\n" + "=" * 60)
    print("All regression tests complete.")
|
||||
|
||||
# Run the scenarios only when executed as a script. Without this guard, merely
# importing the module (for example during pytest collection, which picks up
# test_*.py files) would launch browsers and make network requests.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user