Fix proxy auth for persistent browser contexts

Chromium's --proxy-server CLI flag silently ignores inline credentials (user:pass@server). For persistent contexts, crawl4ai was embedding credentials in this flag via ManagedBrowser.build_browser_flags(), causing proxy auth to fail and the browser to fall back to direct connection. Fix: Use Playwright's launch_persistent_context(proxy=...) API instead of subprocess + CDP when use_persistent_context=True. This handles proxy authentication properly via the HTTP CONNECT handshake. The non-persistent and CDP paths remain unchanged. Changes: - Strip credentials from --proxy-server flag in build_browser_flags() - Add launch_persistent_context() path in BrowserManager.start() - Add cleanup path in BrowserManager.close() - Guard create_browser_context() when self.browser is None - Add regression tests covering all 4 proxy/persistence combinations
2026-02-12 11:19:29 +00:00
parent 1a24ac785e
commit 112f44a97d
8 changed files with 690 additions and 5 deletions
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -109,14 +109,13 @@ class ManagedBrowser:
                "--disable-software-rasterizer",
                "--disable-dev-shm-usage",
            ])
-        # proxy support
+        # proxy support — only pass server URL, never credentials.
        # Chromium's --proxy-server flag silently ignores inline user:pass@.
        # Auth credentials are handled at the Playwright context level instead.
        if config.proxy:
            flags.append(f"--proxy-server={config.proxy}")
        elif config.proxy_config:
-            creds = ""
+            flags.append(f"--proxy-server={config.proxy_config.server}")
            if config.proxy_config.username and config.proxy_config.password:
                creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
            flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
        # dedupe
        return list(dict.fromkeys(flags))
@@ -711,6 +710,7 @@ class BrowserManager:
        self.managed_browser = None
        self.playwright = None
        self._using_cached_cdp = False
        self._launched_persistent = False  # True when using launch_persistent_context
        # Session management
        self.sessions = {}
@@ -793,6 +793,76 @@ class BrowserManager:
            # Initialize playwright
            self.playwright = await async_playwright().start()
        # ── Persistent context via Playwright's native API ──────────────
        # When use_persistent_context is set and we're not connecting to an
        # external CDP endpoint, use launch_persistent_context() instead of
        # subprocess + CDP.  This properly supports proxy authentication
        # (server + username + password) which the --proxy-server CLI flag
        # cannot handle.
        if (
            self.config.use_persistent_context
            and not self.config.cdp_url
            and not self._using_cached_cdp
        ):
            # Collect stealth / optimization CLI flags, excluding ones that
            # launch_persistent_context handles via keyword arguments.
            _skip_prefixes = (
                "--proxy-server",
                "--remote-debugging-port",
                "--user-data-dir",
                "--headless",
                "--window-size",
            )
            cli_args = [
                flag
                for flag in ManagedBrowser.build_browser_flags(self.config)
                if not flag.startswith(_skip_prefixes)
            ]
            if self.config.extra_args:
                cli_args.extend(self.config.extra_args)
            launch_kwargs = {
                "headless": self.config.headless,
                "args": list(dict.fromkeys(cli_args)),  # dedupe
                "viewport": {
                    "width": self.config.viewport_width,
                    "height": self.config.viewport_height,
                },
                "user_agent": self.config.user_agent or None,
                "ignore_https_errors": self.config.ignore_https_errors,
                "accept_downloads": self.config.accept_downloads,
            }
            if self.config.proxy_config:
                launch_kwargs["proxy"] = {
                    "server": self.config.proxy_config.server,
                    "username": self.config.proxy_config.username,
                    "password": self.config.proxy_config.password,
                }
            if self.config.storage_state:
                launch_kwargs["storage_state"] = self.config.storage_state
            user_data_dir = self.config.user_data_dir or tempfile.mkdtemp(
                prefix="crawl4ai-persistent-"
            )
            self.default_context = (
                await self.playwright.chromium.launch_persistent_context(
                    user_data_dir, **launch_kwargs
                )
            )
            self.browser = None  # persistent context has no separate Browser
            self._launched_persistent = True
            await self.setup_context(self.default_context)
            # Set the browser endpoint key for global page tracking
            self._browser_endpoint_key = self._compute_browser_endpoint_key()
            if self._browser_endpoint_key not in BrowserManager._global_pages_in_use:
                BrowserManager._global_pages_in_use[self._browser_endpoint_key] = set()
            return
        if self.config.cdp_url or self.config.use_managed_browser:
            self.config.use_managed_browser = True
@@ -1158,6 +1228,12 @@ class BrowserManager:
        Returns:
            Context: Browser context object with the specified configurations
        """
        if self.browser is None:
            raise RuntimeError(
                "Cannot create new browser contexts when using "
                "use_persistent_context=True. Persistent context uses a "
                "single shared context."
            )
        # Base settings
        user_agent = self.config.headers.get("User-Agent", self.config.user_agent) 
        viewport_settings = {
@@ -1858,6 +1934,35 @@ class BrowserManager:
                    self.playwright = None
            return
        # ── Persistent context launched via launch_persistent_context ──
        if self._launched_persistent:
            session_ids = list(self.sessions.keys())
            for session_id in session_ids:
                await self.kill_session(session_id)
            for ctx in self.contexts_by_config.values():
                try:
                    await ctx.close()
                except Exception:
                    pass
            self.contexts_by_config.clear()
            self._context_refcounts.clear()
            self._context_last_used.clear()
            self._page_to_sig.clear()
            # Closing the persistent context also terminates the browser
            if self.default_context:
                try:
                    await self.default_context.close()
                except Exception:
                    pass
                self.default_context = None
            if self.playwright:
                await self.playwright.stop()
                self.playwright = None
            self._launched_persistent = False
            return
        if self.config.sleep_on_close:
            await asyncio.sleep(0.5)
--- a/tests/proxy/test_chanel_basic.py
+++ b/tests/proxy/test_chanel_basic.py
@@ -0,0 +1,61 @@
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 async def crawl_chanel(url: str):
    # Fresh profile each time (gets flagged after one use)
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
 async def main():
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
    if result.error_message:
        print(f"Error: {result.error_message}")
 asyncio.run(main())
--- a/tests/proxy/test_chanel_debug.py
+++ b/tests/proxy/test_chanel_debug.py
@@ -0,0 +1,62 @@
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_configs import ProxyConfig
 async def main():
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="networkidle",
        delay_before_return_html=15.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f"Status: {result.status_code}")
            print(f"HTML bytes: {len(result.html)}")
            print(f"\n=== FULL HTML ===\n{result.html}")
            print(f"\n=== RESPONSE HEADERS ===")
            if result.response_headers:
                for k, v in sorted(result.response_headers.items()):
                    print(f"  {k}: {v}")
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
 asyncio.run(main())
--- a/tests/proxy/test_chanel_multi_attempt.py
+++ b/tests/proxy/test_chanel_multi_attempt.py
@@ -0,0 +1,147 @@
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
 from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
 async def attempt(label, browser_config, run_config, crawler_strategy=None):
    print(f"\n{'='*60}")
    print(f"Attempt: {label}")
    print(f"{'='*60}")
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    kwargs = {"config": browser_config}
    if crawler_strategy:
        kwargs["crawler_strategy"] = crawler_strategy
    try:
        async with AsyncWebCrawler(**kwargs) as crawler:
            result = await crawler.arun(url, config=run_config)
            print(f"Status: {result.status_code}")
            print(f"Success: {result.success}")
            print(f"HTML: {len(result.html):,} bytes")
            if result.markdown:
                print(f"Markdown: {len(result.markdown.raw_markdown):,} chars")
            if result.error_message:
                print(f"Error: {result.error_message}")
            # Check for anti-bot indicators
            html_lower = result.html.lower()
            for indicator in ["access denied", "403", "blocked", "captcha", "challenge"]:
                if indicator in html_lower:
                    print(f"  Anti-bot indicator found: '{indicator}'")
            return result
    except Exception as e:
        print(f"Exception: {e}")
        return None
 async def main():
    mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    # ---- Attempt 1: Mac UA + stealth (user's original approach) ----
    profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile1, exist_ok=True)
    try:
        bc1 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile1,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc1 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=10.0,
        )
        await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1)
    finally:
        shutil.rmtree(profile1, ignore_errors=True)
    await asyncio.sleep(3)
    # ---- Attempt 2: Undetected adapter (patchright) ----
    profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile2, exist_ok=True)
    try:
        bc2 = BrowserConfig(
            headless=True,
            use_persistent_context=True,
            user_data_dir=profile2,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc2 = CrawlerRunConfig(
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="load",
            delay_before_return_html=15.0,
        )
        adapter = UndetectedAdapter()
        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=bc2,
            browser_adapter=adapter,
        )
        await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy)
    finally:
        shutil.rmtree(profile2, ignore_errors=True)
    await asyncio.sleep(3)
    # ---- Attempt 3: Longer delay + networkidle ----
    profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile3, exist_ok=True)
    try:
        bc3 = BrowserConfig(
            headless=True,
            enable_stealth=True,
            use_persistent_context=True,
            user_data_dir=profile3,
            viewport_width=1920,
            viewport_height=1080,
            user_agent=mac_ua,
            headers=headers,
        )
        rc3 = CrawlerRunConfig(
            magic=True,
            simulate_user=True,
            override_navigator=True,
            page_timeout=120000,
            wait_until="networkidle",
            delay_before_return_html=20.0,
            js_code="""
            // Simulate human-like scrolling
            await new Promise(r => setTimeout(r, 2000));
            window.scrollTo({top: 300, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1500));
            window.scrollTo({top: 600, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1000));
            """,
        )
        await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3)
    finally:
        shutil.rmtree(profile3, ignore_errors=True)
 asyncio.run(main())
--- a/tests/proxy/test_chanel_xvfb.py
+++ b/tests/proxy/test_chanel_xvfb.py
@@ -0,0 +1,62 @@
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 async def crawl_chanel(url: str):
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=False,  # Non-headless via Xvfb - harder to detect
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        }
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
 async def main():
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")
 asyncio.run(main())
--- a/tests/proxy/test_persistent_proxy.py
+++ b/tests/proxy/test_persistent_proxy.py
@@ -0,0 +1,68 @@
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_configs import ProxyConfig
 async def crawl_chanel(url: str):
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
 async def main():
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    result = await crawl_chanel(url)
    print(f"Status: {result.status_code}")
    print(f"Success: {result.success}")
    print(f"HTML: {len(result.html):,} bytes")
    if result.markdown:
        md_len = len(result.markdown.raw_markdown)
        print(f"Markdown: {md_len:,} chars")
        if md_len > 500:
            print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
    if result.error_message:
        print(f"Error: {result.error_message}")
 asyncio.run(main())
--- a/tests/proxy/test_platform_match.py
+++ b/tests/proxy/test_platform_match.py
@@ -0,0 +1,84 @@
 """Test if matching UA to actual platform fixes Akamai detection."""
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_configs import ProxyConfig
 async def test_with_ua(label, user_agent):
    print(f"\n{'='*60}")
    print(f"Test: {label}")
    print(f"{'='*60}")
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)
    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=user_agent,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server="https://network.joinmassive.com:65535",
            username="mpuQHs4sWZ-country-US",
            password="D0yWxVQo8wQ05RWqz1Bn",
        ),
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f"  Status: {result.status_code}")
            print(f"  HTML bytes: {len(result.html)}")
            blocked = "access denied" in result.html.lower()
            print(f"  Blocked: {blocked}")
            if not blocked and len(result.html) > 1000:
                print(f"  SUCCESS! Got real content")
    except Exception as e:
        print(f"  EXCEPTION: {e}")
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
 async def main():
    # Mac UA on Linux = platform mismatch
    await test_with_ua(
        "Mac UA (mismatched platform)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
    await asyncio.sleep(3)
    # Linux UA = matches actual navigator.platform
    await test_with_ua(
        "Linux UA (matching platform)",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    )
 asyncio.run(main())
--- a/tests/proxy/test_proxy_regression.py
+++ b/tests/proxy/test_proxy_regression.py
@@ -0,0 +1,96 @@
 """Regression tests for proxy fix:
 1. Persistent context + proxy (new path via launch_persistent_context)
 2. Persistent context WITHOUT proxy (should still use launch_persistent_context)
 3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break)
 4. Non-persistent, no proxy (basic crawl, must not break)
 """
 import asyncio
 import os
 import shutil
 import uuid
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
 from crawl4ai.async_configs import ProxyConfig
 TEST_URL = "https://httpbin.org/ip"  # Simple endpoint, returns IP
 async def test(label, browser_config, run_config=None):
    print(f"\n{'='*60}")
    print(f"Test: {label}")
    print(f"{'='*60}")
    run_config = run_config or CrawlerRunConfig()
    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(TEST_URL, config=run_config)
            print(f"  Status: {result.status_code}")
            print(f"  HTML bytes: {len(result.html)}")
            if result.markdown:
                # httpbin.org/ip returns JSON with "origin" key
                md = result.markdown.raw_markdown.strip()
                print(f"  Content: {md[:200]}")
            if result.error_message:
                print(f"  ERROR: {result.error_message}")
            return result
    except Exception as e:
        print(f"  EXCEPTION: {e}")
        return None
 async def main():
    proxy = ProxyConfig(
        server="https://network.joinmassive.com:65535",
        username="mpuQHs4sWZ-country-US",
        password="D0yWxVQo8wQ05RWqz1Bn",
    )
    # 1. Persistent context + proxy (the fixed path)
    pd = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd, exist_ok=True)
    try:
        await test(
            "Persistent + proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd,
                proxy_config=proxy,
            ),
        )
    finally:
        shutil.rmtree(pd, ignore_errors=True)
    # 2. Persistent context WITHOUT proxy
    pd2 = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd2, exist_ok=True)
    try:
        await test(
            "Persistent, no proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd2,
            ),
        )
    finally:
        shutil.rmtree(pd2, ignore_errors=True)
    # 3. Non-persistent + proxy on CrawlerRunConfig
    await test(
        "Non-persistent + proxy on RunConfig",
        BrowserConfig(headless=True),
        CrawlerRunConfig(
            proxy_config=proxy,
        ),
    )
    # 4. Basic crawl - no proxy, no persistent
    await test(
        "Basic crawl (no proxy, no persistent)",
        BrowserConfig(headless=True),
    )
    print("\n" + "="*60)
    print("All regression tests complete.")
 asyncio.run(main())