Fix proxy auth for persistent browser contexts

Chromium's --proxy-server CLI flag silently ignores inline credentials
(user:pass@server). For persistent contexts, crawl4ai was embedding
credentials in this flag via ManagedBrowser.build_browser_flags(),
causing proxy auth to fail and the browser to fall back to direct
connection.

Fix: Use Playwright's launch_persistent_context(proxy=...) API instead
of subprocess + CDP when use_persistent_context=True. This handles
proxy authentication properly via the HTTP CONNECT handshake. The
non-persistent and CDP paths remain unchanged.

Changes:
- Strip credentials from --proxy-server flag in build_browser_flags()
- Add launch_persistent_context() path in BrowserManager.start()
- Add cleanup path in BrowserManager.close()
- Guard create_browser_context() when self.browser is None
- Add regression tests covering all 4 proxy/persistence combinations
This commit is contained in:
unclecode
2026-02-12 11:19:29 +00:00
parent 1a24ac785e
commit 112f44a97d
8 changed files with 690 additions and 5 deletions

View File

@@ -0,0 +1,61 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def crawl_chanel(url: str):
    """Crawl *url* using a throwaway persistent browser profile.

    A fresh user-data directory is created per call (a reused profile gets
    flagged after one use) and removed again once the crawl finishes,
    whether or not it succeeded.
    """
    # Unique, disposable profile directory under ~/.crawl4ai/.
    profile = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile, exist_ok=True)

    # Navigation-shaped headers sent with every request.
    stealth_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    browser_cfg = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers=stealth_headers,
    )
    run_cfg = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            return await crawler.arun(url, config=run_cfg)
    finally:
        # Always discard the profile, even if the crawl raised.
        shutil.rmtree(profile, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not launch a crawl as a side effect.
    asyncio.run(main())

View File

@@ -0,0 +1,62 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def main():
    """Crawl the Chanel listing through an authenticated proxy using a
    throwaway persistent browser profile, then dump the full response."""
    # Disposable profile: persistent-context crawls need a user_data_dir,
    # but a reused one gets flagged, so create a unique dir per run.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        # SECURITY NOTE(review): credentials hardcoded for this repro script
        # only; real code should read them from the environment.
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="networkidle",
        delay_before_return_html=15.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f"Status: {result.status_code}")
            print(f"HTML bytes: {len(result.html)}")
            print(f"\n=== FULL HTML ===\n{result.html}")
            # Fixed: was an f-string with no placeholders.
            print("\n=== RESPONSE HEADERS ===")
            if result.response_headers:
                for k, v in sorted(result.response_headers.items()):
                    print(f" {k}: {v}")
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)


if __name__ == "__main__":
    # Guard so importing this module does not kick off a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,147 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
async def attempt(label, browser_config, run_config, crawler_strategy=None):
    """Run one crawl attempt, print a short report, and return the result.

    Returns None (after printing the exception) if the crawl raises.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Attempt: {label}")
    print(f"{banner}")
    target = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    crawler_kwargs = {"config": browser_config}
    if crawler_strategy:
        crawler_kwargs["crawler_strategy"] = crawler_strategy
    try:
        async with AsyncWebCrawler(**crawler_kwargs) as crawler:
            outcome = await crawler.arun(target, config=run_config)
            print(f"Status: {outcome.status_code}")
            print(f"Success: {outcome.success}")
            print(f"HTML: {len(outcome.html):,} bytes")
            if outcome.markdown:
                print(f"Markdown: {len(outcome.markdown.raw_markdown):,} chars")
            if outcome.error_message:
                print(f"Error: {outcome.error_message}")
            # Scan the page for common anti-bot markers.
            lowered = outcome.html.lower()
            for marker in ["access denied", "403", "blocked", "captcha", "challenge"]:
                if marker in lowered:
                    print(f" Anti-bot indicator found: '{marker}'")
            return outcome
    except Exception as exc:
        print(f"Exception: {exc}")
        return None
async def main():
    """Try three escalating anti-bot strategies against the Chanel listing.

    Each attempt gets its own throwaway persistent profile which is removed
    afterwards; a short pause between attempts avoids hammering the site.
    """
    mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    # ---- Attempt 1: Mac UA + stealth (user's original approach) ----
    profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile1, exist_ok=True)
    try:
        bc1 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile1,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc1 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=10.0,
        )
        await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1)
    finally:
        shutil.rmtree(profile1, ignore_errors=True)
    await asyncio.sleep(3)

    # ---- Attempt 2: Undetected adapter (patchright) ----
    profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile2, exist_ok=True)
    try:
        bc2 = BrowserConfig(
            headless=True,
            use_persistent_context=True,
            user_data_dir=profile2,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc2 = CrawlerRunConfig(
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=15.0,
        )
        adapter = UndetectedAdapter()
        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=bc2,
            browser_adapter=adapter,
        )
        await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy)
    finally:
        shutil.rmtree(profile2, ignore_errors=True)
    await asyncio.sleep(3)

    # ---- Attempt 3: Longer delay + networkidle ----
    profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile3, exist_ok=True)
    try:
        bc3 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile3,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc3 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="networkidle",
            delay_before_return_html=20.0,
            js_code="""
            // Simulate human-like scrolling
            await new Promise(r => setTimeout(r, 2000));
            window.scrollTo({top: 300, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1500));
            window.scrollTo({top: 600, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1000));
            """,
        )
        await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3)
    finally:
        shutil.rmtree(profile3, ignore_errors=True)


if __name__ == "__main__":
    # Guard so importing this module does not launch browsers as a side effect.
    asyncio.run(main())

View File

@@ -0,0 +1,62 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def crawl_chanel(url: str):
    """Crawl *url* non-headless (via Xvfb) with a disposable persistent profile.

    Returns the crawl4ai result object; the profile directory is always
    removed afterwards, even if the crawl raises.
    """
    # Fresh profile per call — the site flags a profile after one use.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=False,  # Non-headless via Xvfb - harder to detect
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        # NOTE(review): a Mac UA on a Linux host mismatches navigator.platform
        # — presumably deliberate here, but confirm against the UA-match test.
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,  # milliseconds
        wait_until="load",
        delay_before_return_html=10.0,  # seconds
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        # Only preview when there is enough content to be meaningful.
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not start a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,68 @@
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def crawl_chanel(url: str):
    """Crawl *url* through an authenticated proxy with a one-shot
    persistent browser profile; the profile is removed afterwards."""
    # One disposable profile per crawl.
    workdir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(workdir, exist_ok=True)

    nav_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    upstream_proxy = ProxyConfig(
        server="https://network.joinmassive.com:65535",
        username="mpuQHs4sWZ-country-US",
        password="D0yWxVQo8wQ05RWqz1Bn",
    )
    browser_cfg = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=workdir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers=nav_headers,
        proxy_config=upstream_proxy,
    )
    run_cfg = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            return await crawler.arun(url, config=run_cfg)
    finally:
        # Discard the profile regardless of outcome.
        shutil.rmtree(workdir, ignore_errors=True)
async def main():
    """Entry point: crawl the Chanel handbags listing and print summary stats."""
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        # Only preview when there is enough content to be meaningful.
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")


if __name__ == "__main__":
    # Guard so importing this module does not start a crawl.
    asyncio.run(main())

View File

@@ -0,0 +1,84 @@
"""Test if matching UA to actual platform fixes Akamai detection."""
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
async def test_with_ua(label, user_agent):
    """Crawl the Chanel listing once with *user_agent* and report whether
    the request was blocked.

    Args:
        label: Human-readable name printed in the banner.
        user_agent: Exact UA string to run the browser with.
    """
    print(f"\n{'='*60}")
    print(f"Test: {label}")
    print(f"{'='*60}")
    # Disposable persistent profile; removed in the finally block below.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=user_agent,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        # NOTE(review): proxy credentials are hardcoded — acceptable in a
        # throwaway repro script, but they should come from the environment.
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,  # milliseconds
        wait_until="load",
        delay_before_return_html=10.0,  # seconds
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f" Status: {result.status_code}")
            print(f" HTML bytes: {len(result.html)}")
            # Heuristic: the denial page contains "access denied".
            blocked = "access denied" in result.html.lower()
            print(f" Blocked: {blocked}")
            if not blocked and len(result.html) > 1000:
                print(f" SUCCESS! Got real content")
    except Exception as e:
        print(f" EXCEPTION: {e}")
    finally:
        # Always clean up the throwaway profile.
        shutil.rmtree(profile_dir, ignore_errors=True)
async def main():
    """Compare a platform-mismatched UA against a matching one."""
    # Mac UA on Linux = platform mismatch
    await test_with_ua(
        "Mac UA (mismatched platform)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
    # Brief pause between attempts.
    await asyncio.sleep(3)
    # Linux UA = matches actual navigator.platform
    await test_with_ua(
        "Linux UA (matching platform)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )


if __name__ == "__main__":
    # Guard so importing this module does not launch browsers.
    asyncio.run(main())

View File

@@ -0,0 +1,96 @@
"""Regression tests for proxy fix:
1. Persistent context + proxy (new path via launch_persistent_context)
2. Persistent context WITHOUT proxy (should still use launch_persistent_context)
3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break)
4. Non-persistent, no proxy (basic crawl, must not break)
"""
import asyncio
import os
import shutil
import uuid
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
TEST_URL = "https://httpbin.org/ip"  # Simple endpoint, returns IP


async def test(label, browser_config, run_config=None):
    """Crawl TEST_URL once, print a short report, and return the result.

    Returns None (after printing the exception) if the crawl raises.
    """
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"Test: {label}")
    print(f"{divider}")
    run_config = run_config or CrawlerRunConfig()
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            outcome = await crawler.arun(TEST_URL, config=run_config)
            print(f" Status: {outcome.status_code}")
            print(f" HTML bytes: {len(outcome.html)}")
            if outcome.markdown:
                # httpbin.org/ip returns JSON with "origin" key
                body_md = outcome.markdown.raw_markdown.strip()
                print(f" Content: {body_md[:200]}")
            if outcome.error_message:
                print(f" ERROR: {outcome.error_message}")
            return outcome
    except Exception as exc:
        print(f" EXCEPTION: {exc}")
        return None
async def main():
    """Run all four proxy/persistence regression combinations in sequence."""
    # SECURITY NOTE(review): hardcoded proxy credentials are acceptable only
    # in a throwaway test script; use environment variables elsewhere.
    proxy = ProxyConfig(
        server="https://network.joinmassive.com:65535",
        username="mpuQHs4sWZ-country-US",
        password="D0yWxVQo8wQ05RWqz1Bn",
    )

    # 1. Persistent context + proxy (the fixed path)
    pd = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd, exist_ok=True)
    try:
        await test(
            "Persistent + proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd,
                proxy_config=proxy,
            ),
        )
    finally:
        shutil.rmtree(pd, ignore_errors=True)

    # 2. Persistent context WITHOUT proxy
    pd2 = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd2, exist_ok=True)
    try:
        await test(
            "Persistent, no proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd2,
            ),
        )
    finally:
        shutil.rmtree(pd2, ignore_errors=True)

    # 3. Non-persistent + proxy on CrawlerRunConfig
    await test(
        "Non-persistent + proxy on RunConfig",
        BrowserConfig(headless=True),
        CrawlerRunConfig(
            proxy_config=proxy,
        ),
    )

    # 4. Basic crawl - no proxy, no persistent
    await test(
        "Basic crawl (no proxy, no persistent)",
        BrowserConfig(headless=True),
    )
    print("\n" + "="*60)
    print("All regression tests complete.")


if __name__ == "__main__":
    # Guard so importing this module does not run the regression suite.
    asyncio.run(main())