Fix proxy auth for persistent browser contexts
Chromium's `--proxy-server` CLI flag silently ignores inline credentials (`user:pass@server`). For persistent contexts, crawl4ai was embedding credentials in this flag via `ManagedBrowser.build_browser_flags()`, causing proxy auth to fail and the browser to fall back to a direct connection.

Fix: Use Playwright's `launch_persistent_context(proxy=...)` API instead of subprocess + CDP when `use_persistent_context=True`. This handles proxy authentication properly via the HTTP CONNECT handshake. The non-persistent and CDP paths remain unchanged.

Changes:
- Strip credentials from the `--proxy-server` flag in `build_browser_flags()`
- Add a `launch_persistent_context()` path in `BrowserManager.start()`
- Add a cleanup path in `BrowserManager.close()`
- Guard `create_browser_context()` when `self.browser` is `None`
- Add regression tests covering all 4 proxy/persistence combinations
This commit is contained in:
84
tests/proxy/test_platform_match.py
Normal file
84
tests/proxy/test_platform_match.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Test if matching UA to actual platform fixes Akamai detection."""
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import ProxyConfig
|
||||
|
||||
|
||||
async def test_with_ua(label, user_agent):
    """Crawl the target page through the proxy using the given User-Agent.

    Starts a headless, stealth-enabled persistent browser context backed by a
    throwaway profile directory, fetches the page, and prints the HTTP status,
    the HTML size, and whether the HTML contains "access denied".

    Proxy settings default to the values this script has always used and can
    be overridden without editing the file via the environment variables
    ``C4A_TEST_PROXY_SERVER``, ``C4A_TEST_PROXY_USERNAME`` and
    ``C4A_TEST_PROXY_PASSWORD``.

    Args:
        label: Human-readable name printed in the banner for this run.
        user_agent: User-Agent string passed to the browser configuration.

    Returns:
        None. Results are printed; exceptions raised while crawling are
        printed rather than propagated.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Test: {label}")
    print(f"{banner}")

    # Each run gets its own profile so state from one UA cannot leak into the next.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    # NOTE(review): the fallback credentials below are committed in source;
    # prefer supplying them through the environment and rotating these values.
    proxy_config = ProxyConfig(
        server=os.environ.get(
            "C4A_TEST_PROXY_SERVER", "https://network.joinmassive.com:65535"
        ),
        username=os.environ.get("C4A_TEST_PROXY_USERNAME", "mpuQHs4sWZ-country-US"),
        password=os.environ.get("C4A_TEST_PROXY_PASSWORD", "D0yWxVQo8wQ05RWqz1Bn"),
    )

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=user_agent,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=proxy_config,
    )

    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            html = result.html
            print(f"  Status: {result.status_code}")
            print(f"  HTML bytes: {len(html)}")
            blocked = "access denied" in html.lower()
            print(f"  Blocked: {blocked}")
            if not blocked and len(html) > 1000:
                print("  SUCCESS! Got real content")
    except Exception as e:
        # Diagnostic script: report the failure and let the next scenario run.
        print(f"  EXCEPTION: {e}")
    finally:
        # Always remove the throwaway profile, even when the crawl failed.
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def main():
    """Run the UA scenarios one after another, pausing 3 seconds between them."""
    scenarios = (
        # Mac UA on Linux = platform mismatch
        (
            "Mac UA (mismatched platform)",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ),
        # Linux UA = matches actual navigator.platform
        (
            "Linux UA (matching platform)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ),
    )

    for position, (title, ua_string) in enumerate(scenarios):
        if position:
            # Pause between runs; no pause before the first scenario.
            await asyncio.sleep(3)
        await test_with_ua(title, ua_string)
|
||||
|
||||
|
||||
# Run the comparison only when executed as a script. This file is named
# test_*.py and lives under tests/, so without the guard any import of the
# module (including test collection) would start a live, proxied crawl.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user