From 112f44a97d6896fda1200a3f5f926c625d62c05e Mon Sep 17 00:00:00 2001 From: unclecode Date: Thu, 12 Feb 2026 11:19:29 +0000 Subject: [PATCH] Fix proxy auth for persistent browser contexts Chromium's --proxy-server CLI flag silently ignores inline credentials (user:pass@server). For persistent contexts, crawl4ai was embedding credentials in this flag via ManagedBrowser.build_browser_flags(), causing proxy auth to fail and the browser to fall back to direct connection. Fix: Use Playwright's launch_persistent_context(proxy=...) API instead of subprocess + CDP when use_persistent_context=True. This handles proxy authentication properly via the HTTP CONNECT handshake. The non-persistent and CDP paths remain unchanged. Changes: - Strip credentials from --proxy-server flag in build_browser_flags() - Add launch_persistent_context() path in BrowserManager.start() - Add cleanup path in BrowserManager.close() - Guard create_browser_context() when self.browser is None - Add regression tests covering all 4 proxy/persistence combinations --- crawl4ai/browser_manager.py | 115 +++++++++++++++++- tests/proxy/test_chanel_basic.py | 61 ++++++++++ tests/proxy/test_chanel_debug.py | 62 ++++++++++ tests/proxy/test_chanel_multi_attempt.py | 147 +++++++++++++++++++++++ tests/proxy/test_chanel_xvfb.py | 62 ++++++++++ tests/proxy/test_persistent_proxy.py | 68 +++++++++++ tests/proxy/test_platform_match.py | 84 +++++++++++++ tests/proxy/test_proxy_regression.py | 96 +++++++++++++++ 8 files changed, 690 insertions(+), 5 deletions(-) create mode 100644 tests/proxy/test_chanel_basic.py create mode 100644 tests/proxy/test_chanel_debug.py create mode 100644 tests/proxy/test_chanel_multi_attempt.py create mode 100644 tests/proxy/test_chanel_xvfb.py create mode 100644 tests/proxy/test_persistent_proxy.py create mode 100644 tests/proxy/test_platform_match.py create mode 100644 tests/proxy/test_proxy_regression.py diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 54c51347..51e8f99b 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -109,14 +109,13 @@ class ManagedBrowser: "--disable-software-rasterizer", "--disable-dev-shm-usage", ]) - # proxy support + # proxy support — only pass server URL, never credentials. + # Chromium's --proxy-server flag silently ignores inline user:pass@. + # Auth credentials are handled at the Playwright context level instead. if config.proxy: flags.append(f"--proxy-server={config.proxy}") elif config.proxy_config: - creds = "" - if config.proxy_config.username and config.proxy_config.password: - creds = f"{config.proxy_config.username}:{config.proxy_config.password}@" - flags.append(f"--proxy-server={creds}{config.proxy_config.server}") + flags.append(f"--proxy-server={config.proxy_config.server}") # dedupe return list(dict.fromkeys(flags)) @@ -711,6 +710,7 @@ class BrowserManager: self.managed_browser = None self.playwright = None self._using_cached_cdp = False + self._launched_persistent = False # True when using launch_persistent_context # Session management self.sessions = {} @@ -793,6 +793,76 @@ class BrowserManager: # Initialize playwright self.playwright = await async_playwright().start() + # ── Persistent context via Playwright's native API ────────────── + # When use_persistent_context is set and we're not connecting to an + # external CDP endpoint, use launch_persistent_context() instead of + # subprocess + CDP. This properly supports proxy authentication + # (server + username + password) which the --proxy-server CLI flag + # cannot handle. + if ( + self.config.use_persistent_context + and not self.config.cdp_url + and not self._using_cached_cdp + ): + # Collect stealth / optimization CLI flags, excluding ones that + # launch_persistent_context handles via keyword arguments. + _skip_prefixes = ( + "--proxy-server", + "--remote-debugging-port", + "--user-data-dir", + "--headless", + "--window-size", + ) + cli_args = [ + flag + for flag in ManagedBrowser.build_browser_flags(self.config) + if not flag.startswith(_skip_prefixes) + ] + if self.config.extra_args: + cli_args.extend(self.config.extra_args) + + launch_kwargs = { + "headless": self.config.headless, + "args": list(dict.fromkeys(cli_args)), # dedupe + "viewport": { + "width": self.config.viewport_width, + "height": self.config.viewport_height, + }, + "user_agent": self.config.user_agent or None, + "ignore_https_errors": self.config.ignore_https_errors, + "accept_downloads": self.config.accept_downloads, + } + + if self.config.proxy_config: + launch_kwargs["proxy"] = { + "server": self.config.proxy_config.server, + "username": self.config.proxy_config.username, + "password": self.config.proxy_config.password, + } + + if self.config.storage_state: + launch_kwargs["storage_state"] = self.config.storage_state + + user_data_dir = self.config.user_data_dir or tempfile.mkdtemp( + prefix="crawl4ai-persistent-" + ) + + self.default_context = ( + await self.playwright.chromium.launch_persistent_context( + user_data_dir, **launch_kwargs + ) + ) + self.browser = None # persistent context has no separate Browser + self._launched_persistent = True + + await self.setup_context(self.default_context) + + # Set the browser endpoint key for global page tracking + self._browser_endpoint_key = self._compute_browser_endpoint_key() + if self._browser_endpoint_key not in BrowserManager._global_pages_in_use: + BrowserManager._global_pages_in_use[self._browser_endpoint_key] = set() + return + if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True @@ -1158,6 +1228,12 @@ class BrowserManager: Returns: Context: Browser context object with the specified configurations """ + if self.browser is None: + raise RuntimeError( + "Cannot create new browser contexts when using " + "use_persistent_context=True. Persistent context uses a " + "single shared context." + ) # Base settings user_agent = self.config.headers.get("User-Agent", self.config.user_agent) viewport_settings = { @@ -1858,6 +1934,35 @@ class BrowserManager: self.playwright = None return + # ── Persistent context launched via launch_persistent_context ── + if self._launched_persistent: + session_ids = list(self.sessions.keys()) + for session_id in session_ids: + await self.kill_session(session_id) + for ctx in self.contexts_by_config.values(): + try: + await ctx.close() + except Exception: + pass + self.contexts_by_config.clear() + self._context_refcounts.clear() + self._context_last_used.clear() + self._page_to_sig.clear() + + # Closing the persistent context also terminates the browser + if self.default_context: + try: + await self.default_context.close() + except Exception: + pass + self.default_context = None + + if self.playwright: + await self.playwright.stop() + self.playwright = None + self._launched_persistent = False + return + if self.config.sleep_on_close: await asyncio.sleep(0.5) diff --git a/tests/proxy/test_chanel_basic.py b/tests/proxy/test_chanel_basic.py new file mode 100644 index 00000000..86201c5c --- /dev/null +++ b/tests/proxy/test_chanel_basic.py @@ -0,0 +1,61 @@ +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +async def crawl_chanel(url: str): + # Fresh profile each time (gets flagged after one use) + profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile_dir, + viewport_width=1920, + viewport_height=1080, + user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + } + ) + + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=10.0, + ) + + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url, config=run_config) + return result + finally: + shutil.rmtree(profile_dir, ignore_errors=True) + + +async def main(): + url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + result = await crawl_chanel(url) + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"HTML: {len(result.html):,} bytes") + if result.markdown: + md_len = len(result.markdown.raw_markdown) + print(f"Markdown: {md_len:,} chars") + if result.error_message: + print(f"Error: {result.error_message}") + + +asyncio.run(main()) diff --git a/tests/proxy/test_chanel_debug.py b/tests/proxy/test_chanel_debug.py new file mode 100644 index 00000000..a656d512 --- /dev/null +++ b/tests/proxy/test_chanel_debug.py @@ -0,0 +1,62 @@ +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig + + +async def main(): + profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile_dir, + viewport_width=1920, + viewport_height=1080, + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + }, + proxy_config=ProxyConfig( + server="https://network.joinmassive.com:65535", + username="mpuQHs4sWZ-country-US", + password="D0yWxVQo8wQ05RWqz1Bn", + ), + ) + + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="networkidle", + delay_before_return_html=15.0, + ) + + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + "https://www.chanel.com/us/fashion/handbags/c/1x1x1/", + config=run_config, + ) + print(f"Status: {result.status_code}") + print(f"HTML bytes: {len(result.html)}") + print(f"\n=== FULL HTML ===\n{result.html}") + print(f"\n=== RESPONSE HEADERS ===") + if result.response_headers: + for k, v in sorted(result.response_headers.items()): + print(f" {k}: {v}") + finally: + shutil.rmtree(profile_dir, ignore_errors=True) + + +asyncio.run(main()) diff --git a/tests/proxy/test_chanel_multi_attempt.py b/tests/proxy/test_chanel_multi_attempt.py new file mode 100644 index 00000000..a02b97aa --- /dev/null +++ b/tests/proxy/test_chanel_multi_attempt.py @@ -0,0 +1,147 @@ +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + + +async def attempt(label, browser_config, run_config, crawler_strategy=None): + print(f"\n{'='*60}") + print(f"Attempt: {label}") + print(f"{'='*60}") + + url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + + kwargs = {"config": browser_config} + if crawler_strategy: + kwargs["crawler_strategy"] = crawler_strategy + + try: + async with AsyncWebCrawler(**kwargs) as crawler: + result = await crawler.arun(url, config=run_config) + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"HTML: {len(result.html):,} bytes") + if result.markdown: + print(f"Markdown: {len(result.markdown.raw_markdown):,} chars") + if result.error_message: + print(f"Error: {result.error_message}") + # Check for anti-bot indicators + html_lower = result.html.lower() + for indicator in ["access denied", "403", "blocked", "captcha", "challenge"]: + if indicator in html_lower: + print(f" Anti-bot indicator found: '{indicator}'") + return result + except Exception as e: + print(f"Exception: {e}") + return None + + +async def main(): + mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + + headers = { + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + } + + # ---- Attempt 1: Mac UA + stealth (user's original approach) ---- + profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile1, exist_ok=True) + try: + bc1 = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile1, + viewport_width=1920, + viewport_height=1080, + user_agent=mac_ua, + headers=headers, + ) + rc1 = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=10.0, + ) + await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1) + finally: + shutil.rmtree(profile1, ignore_errors=True) + + await asyncio.sleep(3) + + # ---- Attempt 2: Undetected adapter (patchright) ---- + profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile2, exist_ok=True) + try: + bc2 = BrowserConfig( + headless=True, + use_persistent_context=True, + user_data_dir=profile2, + viewport_width=1920, + viewport_height=1080, + user_agent=mac_ua, + headers=headers, + ) + rc2 = CrawlerRunConfig( + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=15.0, + ) + adapter = UndetectedAdapter() + strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=bc2, + browser_adapter=adapter, + ) + await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy) + finally: + shutil.rmtree(profile2, ignore_errors=True) + + await asyncio.sleep(3) + + # ---- Attempt 3: Longer delay + networkidle ---- + profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile3, exist_ok=True) + try: + bc3 = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile3, + viewport_width=1920, + viewport_height=1080, + user_agent=mac_ua, + headers=headers, + ) + rc3 = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="networkidle", + delay_before_return_html=20.0, + js_code=""" + // Simulate human-like scrolling + await new Promise(r => setTimeout(r, 2000)); + window.scrollTo({top: 300, behavior: 'smooth'}); + await new Promise(r => setTimeout(r, 1500)); + window.scrollTo({top: 600, behavior: 'smooth'}); + await new Promise(r => setTimeout(r, 1000)); + """, + ) + await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3) + finally: + shutil.rmtree(profile3, ignore_errors=True) + + +asyncio.run(main()) diff --git a/tests/proxy/test_chanel_xvfb.py b/tests/proxy/test_chanel_xvfb.py new file mode 100644 index 00000000..c9221c17 --- /dev/null +++ b/tests/proxy/test_chanel_xvfb.py @@ -0,0 +1,62 @@ +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +async def crawl_chanel(url: str): + profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + headless=False, # Non-headless via Xvfb - harder to detect + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile_dir, + viewport_width=1920, + viewport_height=1080, + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + } + ) + + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=10.0, + ) + + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url, config=run_config) + return result + finally: + shutil.rmtree(profile_dir, ignore_errors=True) + + +async def main(): + url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + result = await crawl_chanel(url) + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"HTML: {len(result.html):,} bytes") + if result.markdown: + md_len = len(result.markdown.raw_markdown) + print(f"Markdown: {md_len:,} chars") + if md_len > 500: + print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}") + if result.error_message: + print(f"Error: {result.error_message}") + + +asyncio.run(main()) diff --git a/tests/proxy/test_persistent_proxy.py b/tests/proxy/test_persistent_proxy.py new file mode 100644 index 00000000..b700825a --- /dev/null +++ b/tests/proxy/test_persistent_proxy.py @@ -0,0 +1,68 @@ +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig + + +async def crawl_chanel(url: str): + profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile_dir, + viewport_width=1920, + viewport_height=1080, + user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + }, + proxy_config=ProxyConfig( + server="https://network.joinmassive.com:65535", + username="mpuQHs4sWZ-country-US", + password="D0yWxVQo8wQ05RWqz1Bn", + ), + ) + + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=10.0, + ) + + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url, config=run_config) + return result + finally: + shutil.rmtree(profile_dir, ignore_errors=True) + + +async def main(): + url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + result = await crawl_chanel(url) + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"HTML: {len(result.html):,} bytes") + if result.markdown: + md_len = len(result.markdown.raw_markdown) + print(f"Markdown: {md_len:,} chars") + if md_len > 500: + print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}") + if result.error_message: + print(f"Error: {result.error_message}") + + +asyncio.run(main()) diff --git a/tests/proxy/test_platform_match.py b/tests/proxy/test_platform_match.py new file mode 100644 index 00000000..3e503242 --- /dev/null +++ b/tests/proxy/test_platform_match.py @@ -0,0 +1,84 @@ +"""Test if matching UA to actual platform fixes Akamai detection.""" +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig + + +async def test_with_ua(label, user_agent): + print(f"\n{'='*60}") + print(f"Test: {label}") + print(f"{'='*60}") + + profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}") + os.makedirs(profile_dir, exist_ok=True) + + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + use_persistent_context=True, + user_data_dir=profile_dir, + viewport_width=1920, + viewport_height=1080, + user_agent=user_agent, + headers={ + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + "Sec-Fetch-Site": "none", + "Sec-Fetch-User": "?1", + }, + proxy_config=ProxyConfig( + server="https://network.joinmassive.com:65535", + username="mpuQHs4sWZ-country-US", + password="D0yWxVQo8wQ05RWqz1Bn", + ), + ) + + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + page_timeout=120000, + wait_until="load", + delay_before_return_html=10.0, + ) + + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + "https://www.chanel.com/us/fashion/handbags/c/1x1x1/", + config=run_config, + ) + print(f" Status: {result.status_code}") + print(f" HTML bytes: {len(result.html)}") + blocked = "access denied" in result.html.lower() + print(f" Blocked: {blocked}") + if not blocked and len(result.html) > 1000: + print(f" SUCCESS! Got real content") + except Exception as e: + print(f" EXCEPTION: {e}") + finally: + shutil.rmtree(profile_dir, ignore_errors=True) + + +async def main(): + # Mac UA on Linux = platform mismatch + await test_with_ua( + "Mac UA (mismatched platform)", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + ) + + await asyncio.sleep(3) + + # Linux UA = matches actual navigator.platform + await test_with_ua( + "Linux UA (matching platform)", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + ) + + +asyncio.run(main()) diff --git a/tests/proxy/test_proxy_regression.py b/tests/proxy/test_proxy_regression.py new file mode 100644 index 00000000..764322d7 --- /dev/null +++ b/tests/proxy/test_proxy_regression.py @@ -0,0 +1,96 @@ +"""Regression tests for proxy fix: +1. Persistent context + proxy (new path via launch_persistent_context) +2. Persistent context WITHOUT proxy (should still use launch_persistent_context) +3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break) +4. Non-persistent, no proxy (basic crawl, must not break) +""" +import asyncio +import os +import shutil +import uuid +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig + +TEST_URL = "https://httpbin.org/ip" # Simple endpoint, returns IP + + +async def test(label, browser_config, run_config=None): + print(f"\n{'='*60}") + print(f"Test: {label}") + print(f"{'='*60}") + run_config = run_config or CrawlerRunConfig() + try: + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(TEST_URL, config=run_config) + print(f" Status: {result.status_code}") + print(f" HTML bytes: {len(result.html)}") + if result.markdown: + # httpbin.org/ip returns JSON with "origin" key + md = result.markdown.raw_markdown.strip() + print(f" Content: {md[:200]}") + if result.error_message: + print(f" ERROR: {result.error_message}") + return result + except Exception as e: + print(f" EXCEPTION: {e}") + return None + + +async def main(): + proxy = ProxyConfig( + server="https://network.joinmassive.com:65535", + username="mpuQHs4sWZ-country-US", + password="D0yWxVQo8wQ05RWqz1Bn", + ) + + # 1. Persistent context + proxy (the fixed path) + pd = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}") + os.makedirs(pd, exist_ok=True) + try: + await test( + "Persistent + proxy (launch_persistent_context)", + BrowserConfig( + headless=True, + use_persistent_context=True, + user_data_dir=pd, + proxy_config=proxy, + ), + ) + finally: + shutil.rmtree(pd, ignore_errors=True) + + # 2. Persistent context WITHOUT proxy + pd2 = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}") + os.makedirs(pd2, exist_ok=True) + try: + await test( + "Persistent, no proxy (launch_persistent_context)", + BrowserConfig( + headless=True, + use_persistent_context=True, + user_data_dir=pd2, + ), + ) + finally: + shutil.rmtree(pd2, ignore_errors=True) + + # 3. Non-persistent + proxy on CrawlerRunConfig + await test( + "Non-persistent + proxy on RunConfig", + BrowserConfig(headless=True), + CrawlerRunConfig( + proxy_config=proxy, + ), + ) + + # 4. Basic crawl - no proxy, no persistent + await test( + "Basic crawl (no proxy, no persistent)", + BrowserConfig(headless=True), + ) + + print("\n" + "="*60) + print("All regression tests complete.") + + +asyncio.run(main())