Fix proxy auth for persistent browser contexts

Chromium's --proxy-server CLI flag silently ignores inline credentials
(user:pass@server). For persistent contexts, crawl4ai was embedding
credentials in this flag via ManagedBrowser.build_browser_flags(),
causing proxy auth to fail and the browser to fall back to direct
connection.

Fix: Use Playwright's launch_persistent_context(proxy=...) API instead
of subprocess + CDP when use_persistent_context=True. This handles
proxy authentication properly via the HTTP CONNECT handshake. The
non-persistent and CDP paths remain unchanged.

Changes:
- Strip credentials from --proxy-server flag in build_browser_flags()
- Add launch_persistent_context() path in BrowserManager.start()
- Add cleanup path in BrowserManager.close()
- Guard create_browser_context() when self.browser is None
- Add regression tests covering all 4 proxy/persistence combinations
This commit is contained in:
unclecode
2026-02-12 11:19:29 +00:00
parent 1a24ac785e
commit 112f44a97d
8 changed files with 690 additions and 5 deletions

View File

@@ -0,0 +1,61 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def crawl_chanel(url: str):
    """Crawl *url* using a throwaway persistent browser profile.

    A fresh user-data directory is created per call (a reused profile gets
    flagged after one use) and removed again once the crawl finishes,
    whether or not it succeeded.
    """
    # Unique, disposable profile directory under ~/.crawl4ai/.
    profile = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile, exist_ok=True)

    # Navigation-shaped headers sent with every request.
    stealth_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    browser_cfg = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers=stealth_headers,
    )
    run_cfg = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            return await crawler.arun(url, config=run_cfg)
    finally:
        # Always discard the profile, even if the crawl raised.
        shutil.rmtree(profile, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not launch a crawl as a side effect.
    asyncio.run(main())

View File

@@ -0,0 +1,62 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def main():
    """Crawl the Chanel listing through an authenticated proxy using a
    throwaway persistent browser profile, then dump the full response."""
    # Disposable profile: persistent-context crawls need a user_data_dir,
    # but a reused one gets flagged, so create a unique dir per run.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        # SECURITY NOTE(review): credentials hardcoded for this repro script
        # only; real code should read them from the environment.
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="networkidle",
        delay_before_return_html=15.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f"Status: {result.status_code}")
            print(f"HTML bytes: {len(result.html)}")
            print(f"\n=== FULL HTML ===\n{result.html}")
            # Fixed: was an f-string with no placeholders.
            print("\n=== RESPONSE HEADERS ===")
            if result.response_headers:
                for k, v in sorted(result.response_headers.items()):
                    print(f" {k}: {v}")
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)


if __name__ == "__main__":
    # Guard so importing this module does not kick off a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,147 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
async def attempt(label, browser_config, run_config, crawler_strategy=None):
    """Run one crawl attempt, print a short report, and return the result.

    Returns None (after printing the exception) if the crawl raises.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Attempt: {label}")
    print(f"{banner}")
    target = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    crawler_kwargs = {"config": browser_config}
    if crawler_strategy:
        crawler_kwargs["crawler_strategy"] = crawler_strategy
    try:
        async with AsyncWebCrawler(**crawler_kwargs) as crawler:
            outcome = await crawler.arun(target, config=run_config)
            print(f"Status: {outcome.status_code}")
            print(f"Success: {outcome.success}")
            print(f"HTML: {len(outcome.html):,} bytes")
            if outcome.markdown:
                print(f"Markdown: {len(outcome.markdown.raw_markdown):,} chars")
            if outcome.error_message:
                print(f"Error: {outcome.error_message}")
            # Scan the page for common anti-bot markers.
            lowered = outcome.html.lower()
            for marker in ["access denied", "403", "blocked", "captcha", "challenge"]:
                if marker in lowered:
                    print(f" Anti-bot indicator found: '{marker}'")
            return outcome
    except Exception as exc:
        print(f"Exception: {exc}")
        return None
async def main():
    """Try three escalating anti-bot strategies against the Chanel listing.

    Each attempt gets its own throwaway persistent profile which is removed
    afterwards; a short pause between attempts avoids hammering the site.
    """
    mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    # ---- Attempt 1: Mac UA + stealth (user's original approach) ----
    profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile1, exist_ok=True)
    try:
        bc1 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile1,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc1 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=10.0,
        )
        await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1)
    finally:
        shutil.rmtree(profile1, ignore_errors=True)
    await asyncio.sleep(3)

    # ---- Attempt 2: Undetected adapter (patchright) ----
    profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile2, exist_ok=True)
    try:
        bc2 = BrowserConfig(
            headless=True,
            use_persistent_context=True,
            user_data_dir=profile2,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc2 = CrawlerRunConfig(
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=15.0,
        )
        adapter = UndetectedAdapter()
        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=bc2,
            browser_adapter=adapter,
        )
        await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy)
    finally:
        shutil.rmtree(profile2, ignore_errors=True)
    await asyncio.sleep(3)

    # ---- Attempt 3: Longer delay + networkidle ----
    profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile3, exist_ok=True)
    try:
        bc3 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile3,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc3 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="networkidle",
            delay_before_return_html=20.0,
            js_code="""
            // Simulate human-like scrolling
            await new Promise(r => setTimeout(r, 2000));
            window.scrollTo({top: 300, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1500));
            window.scrollTo({top: 600, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1000));
            """,
        )
        await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3)
    finally:
        shutil.rmtree(profile3, ignore_errors=True)


if __name__ == "__main__":
    # Guard so importing this module does not launch browsers as a side effect.
    asyncio.run(main())

View File

@@ -0,0 +1,62 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def crawl_chanel(url: str):
    """Crawl *url* non-headless (via Xvfb) with a disposable persistent profile.

    Returns the crawl4ai result object; the profile directory is always
    removed afterwards, even if the crawl raises.
    """
    # Fresh profile per call — the site flags a profile after one use.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=False,  # Non-headless via Xvfb - harder to detect
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        # NOTE(review): a Mac UA on a Linux host mismatches navigator.platform
        # — presumably deliberate here, but confirm against the UA-match test.
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,  # milliseconds
        wait_until="load",
        delay_before_return_html=10.0,  # seconds
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        # Only preview when there is enough content to be meaningful.
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not start a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,68 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def crawl_chanel(url: str):
    """Crawl *url* through an authenticated proxy with a one-shot
    persistent browser profile; the profile is removed afterwards."""
    # One disposable profile per crawl.
    workdir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(workdir, exist_ok=True)

    nav_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    upstream_proxy = ProxyConfig(
        server="https://network.joinmassive.com:65535",
        username="mpuQHs4sWZ-country-US",
        password="D0yWxVQo8wQ05RWqz1Bn",
    )
    browser_cfg = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=workdir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers=nav_headers,
        proxy_config=upstream_proxy,
    )
    run_cfg = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            return await crawler.arun(url, config=run_cfg)
    finally:
        # Discard the profile regardless of outcome.
        shutil.rmtree(workdir, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        # Only preview when there is enough content to be meaningful.
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not start a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,84 @@
"""Test if matching UA to actual platform fixes Akamai detection."""
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def test_with_ua(label, user_agent):
    """Crawl the Chanel listing once with *user_agent* and report whether
    the request was blocked.

    Args:
        label: Human-readable name printed in the banner.
        user_agent: Exact UA string to run the browser with.
    """
    print(f"\n{'='*60}")
    print(f"Test: {label}")
    print(f"{'='*60}")
    # Disposable persistent profile; removed in the finally block below.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=user_agent,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        # NOTE(review): proxy credentials are hardcoded — acceptable in a
        # throwaway repro script, but they should come from the environment.
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,  # milliseconds
        wait_until="load",
        delay_before_return_html=10.0,  # seconds
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f" Status: {result.status_code}")
            print(f" HTML bytes: {len(result.html)}")
            # Heuristic: the denial page contains "access denied".
            blocked = "access denied" in result.html.lower()
            print(f" Blocked: {blocked}")
            if not blocked and len(result.html) > 1000:
                print(f" SUCCESS! Got real content")
    except Exception as e:
        print(f" EXCEPTION: {e}")
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)
async def main():
    """Compare a platform-mismatched UA against a matching one."""
    # Mac UA on Linux = platform mismatch
    await test_with_ua(
        "Mac UA (mismatched platform)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
    # Brief pause between attempts.
    await asyncio.sleep(3)
    # Linux UA = matches actual navigator.platform
    await test_with_ua(
        "Linux UA (matching platform)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )


if __name__ == "__main__":
    # Guard so importing this module does not launch browsers.
    asyncio.run(main())

View File

@@ -0,0 +1,96 @@
"""Regression tests for proxy fix:
1. Persistent context + proxy (new path via launch_persistent_context)
2. Persistent context WITHOUT proxy (should still use launch_persistent_context)
3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break)
4. Non-persistent, no proxy (basic crawl, must not break)
"""
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
TEST_URL = "https://httpbin.org/ip"  # Simple endpoint, returns IP


async def test(label, browser_config, run_config=None):
    """Crawl TEST_URL once, print a short report, and return the result.

    Returns None (after printing the exception) if the crawl raises.
    """
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"Test: {label}")
    print(f"{divider}")
    run_config = run_config or CrawlerRunConfig()
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            outcome = await crawler.arun(TEST_URL, config=run_config)
            print(f" Status: {outcome.status_code}")
            print(f" HTML bytes: {len(outcome.html)}")
            if outcome.markdown:
                # httpbin.org/ip returns JSON with "origin" key
                body_md = outcome.markdown.raw_markdown.strip()
                print(f" Content: {body_md[:200]}")
            if outcome.error_message:
                print(f" ERROR: {outcome.error_message}")
            return outcome
    except Exception as exc:
        print(f" EXCEPTION: {exc}")
        return None
async def main():
    """Run all four proxy/persistence regression combinations in sequence."""
    # SECURITY NOTE(review): hardcoded proxy credentials are acceptable only
    # in a throwaway test script; use environment variables elsewhere.
    proxy = ProxyConfig(
        server="https://network.joinmassive.com:65535",
        username="mpuQHs4sWZ-country-US",
        password="D0yWxVQo8wQ05RWqz1Bn",
    )

    # 1. Persistent context + proxy (the fixed path)
    pd = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd, exist_ok=True)
    try:
        await test(
            "Persistent + proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd,
                proxy_config=proxy,
            ),
        )
    finally:
        shutil.rmtree(pd, ignore_errors=True)

    # 2. Persistent context WITHOUT proxy
    pd2 = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd2, exist_ok=True)
    try:
        await test(
            "Persistent, no proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd2,
            ),
        )
    finally:
        shutil.rmtree(pd2, ignore_errors=True)

    # 3. Non-persistent + proxy on CrawlerRunConfig
    await test(
        "Non-persistent + proxy on RunConfig",
        BrowserConfig(headless=True),
        CrawlerRunConfig(
            proxy_config=proxy,
        ),
    )

    # 4. Basic crawl - no proxy, no persistent
    await test(
        "Basic crawl (no proxy, no persistent)",
        BrowserConfig(headless=True),
    )
    print("\n" + "="*60)
    print("All regression tests complete.")


if __name__ == "__main__":
    # Guard so importing this module does not run the regression suite.
    asyncio.run(main())