From fdd989785fa8aa3ccdaa6175a8ce8db54b9a100f Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 13 Feb 2026 04:10:47 +0000 Subject: [PATCH] Sync sec-ch-ua with User-Agent and keep WebGL alive in stealth mode Fix a bug where magic mode and per-request UA overrides would change the User-Agent header without updating the sec-ch-ua (browser hint) header to match. Anti-bot systems like Akamai detect this mismatch as a bot signal. Changes: - Regenerate browser_hint via UAGen.generate_client_hints() whenever the UA is changed at crawl time (magic mode or explicit override) - Re-apply updated headers to the page via set_extra_http_headers() - Skip per-crawl UA override for persistent contexts where the UA is locked at launch time by Playwright's protocol layer - Move --disable-gpu flags behind enable_stealth check so WebGL works via SwiftShader when stealth mode is active (missing WebGL is a detectable headless signal) - Clean up old test scripts, add clean anti-bot test --- crawl4ai/async_crawler_strategy.py | 40 ++++-- crawl4ai/browser_manager.py | 11 +- tests/proxy/test_chanel_basic.py | 61 ---------- tests/proxy/test_chanel_cdp_proxy.py | 112 +++++++++++++++++ tests/proxy/test_chanel_debug.py | 62 ---------- tests/proxy/test_chanel_multi_attempt.py | 147 ----------------------- tests/proxy/test_chanel_xvfb.py | 62 ---------- tests/proxy/test_platform_match.py | 84 ------------- tests/proxy/test_proxy_verify.py | 109 +++++++++++++++++ 9 files changed, 261 insertions(+), 427 deletions(-) delete mode 100644 tests/proxy/test_chanel_basic.py create mode 100644 tests/proxy/test_chanel_cdp_proxy.py delete mode 100644 tests/proxy/test_chanel_debug.py delete mode 100644 tests/proxy/test_chanel_multi_attempt.py delete mode 100644 tests/proxy/test_chanel_xvfb.py delete mode 100644 tests/proxy/test_platform_match.py create mode 100644 tests/proxy/test_proxy_verify.py diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 9ac3f7fc..858f4cfc 
100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -19,7 +19,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig from .async_logger import AsyncLogger from .ssl_certificate import SSLCertificate -from .user_agent_generator import ValidUAGenerator +from .user_agent_generator import ValidUAGenerator, UAGen from .browser_manager import BrowserManager from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter @@ -534,18 +534,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): captured_requests = [] captured_console = [] - # Handle user agent with magic mode - user_agent_to_override = config.user_agent - if user_agent_to_override: - self.browser_config.user_agent = user_agent_to_override - elif config.magic or config.user_agent_mode == "random": - self.browser_config.user_agent = ValidUAGenerator().generate( - **(config.user_agent_generator_config or {}) + # Handle user agent with magic mode. + # For persistent contexts the UA is locked at browser launch time + # (launch_persistent_context bakes it into the protocol layer), so + # changing it here would only desync browser_config from reality. + # Users should set user_agent or user_agent_mode on BrowserConfig. 
+ ua_changed = False + if not self.browser_config.use_persistent_context: + user_agent_to_override = config.user_agent + if user_agent_to_override: + self.browser_config.user_agent = user_agent_to_override + ua_changed = True + elif config.magic or config.user_agent_mode == "random": + self.browser_config.user_agent = ValidUAGenerator().generate( + **(config.user_agent_generator_config or {}) + ) + ua_changed = True + + # Keep sec-ch-ua in sync whenever the UA changed + if ua_changed: + self.browser_config.browser_hint = UAGen.generate_client_hints( + self.browser_config.user_agent ) + self.browser_config.headers["sec-ch-ua"] = self.browser_config.browser_hint # Get page for session page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + # Push updated UA + sec-ch-ua to the page so the server sees them + if ua_changed: + combined_headers = { + "User-Agent": self.browser_config.user_agent, + "sec-ch-ua": self.browser_config.browser_hint, + } + combined_headers.update(self.browser_config.headers) + await page.set_extra_http_headers(combined_headers) + # await page.goto(URL) # Add default cookie diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 51e8f99b..2296d076 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -70,9 +70,6 @@ class ManagedBrowser: def build_browser_flags(config: BrowserConfig) -> List[str]: """Common CLI flags for launching Chromium""" flags = [ - "--disable-gpu", - "--disable-gpu-compositing", - "--disable-software-rasterizer", "--no-sandbox", "--disable-dev-shm-usage", "--no-first-run", @@ -93,6 +90,14 @@ class ManagedBrowser: "--disable-component-update", "--disable-domain-reliability", ] + # GPU flags disable WebGL which anti-bot sensors detect as headless. + # Keep WebGL working (via SwiftShader) when stealth mode is active. 
+ if not config.enable_stealth: + flags.extend([ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + ]) if config.memory_saving_mode: flags.extend([ "--aggressive-cache-discard", diff --git a/tests/proxy/test_chanel_basic.py b/tests/proxy/test_chanel_basic.py deleted file mode 100644 index 86201c5c..00000000 --- a/tests/proxy/test_chanel_basic.py +++ /dev/null @@ -1,61 +0,0 @@ -import asyncio -import os -import shutil -import uuid -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - - -async def crawl_chanel(url: str): - # Fresh profile each time (gets flagged after one use) - profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile_dir, exist_ok=True) - - browser_config = BrowserConfig( - headless=True, - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile_dir, - viewport_width=1920, - viewport_height=1080, - user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - headers={ - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - } - ) - - run_config = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="load", - delay_before_return_html=10.0, - ) - - try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url, config=run_config) - return result - finally: - shutil.rmtree(profile_dir, ignore_errors=True) - - -async def main(): - url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" - result = await crawl_chanel(url) - print(f"Status: {result.status_code}") - print(f"Success: {result.success}") - print(f"HTML: {len(result.html):,} bytes") - if result.markdown: - md_len = 
len(result.markdown.raw_markdown) - print(f"Markdown: {md_len:,} chars") - if result.error_message: - print(f"Error: {result.error_message}") - - -asyncio.run(main()) diff --git a/tests/proxy/test_chanel_cdp_proxy.py b/tests/proxy/test_chanel_cdp_proxy.py new file mode 100644 index 00000000..3fc90bb1 --- /dev/null +++ b/tests/proxy/test_chanel_cdp_proxy.py @@ -0,0 +1,112 @@ +""" +Test: Chanel.com anti-bot bypass via crawl4ai + +Requires env vars: + MASSIVE_USERNAME — Massive residential proxy username + MASSIVE_PASSWORD — Massive residential proxy password + +Optional: + --cdp URL Connect to external browser via CDP (e.g. http://localhost:9223) + --attempts N Number of attempts per test (default 3) + +Usage: + export MASSIVE_USERNAME="your_user" + export MASSIVE_PASSWORD="your_pass" + .venv/bin/python tests/proxy/test_chanel_cdp_proxy.py + .venv/bin/python tests/proxy/test_chanel_cdp_proxy.py --cdp http://localhost:9223 +""" + +import asyncio +import os +import sys +import re +import tempfile +import shutil +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_configs import ProxyConfig + +URL = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + +MASSIVE_USERNAME = os.environ.get("MASSIVE_USERNAME", "") +MASSIVE_PASSWORD = os.environ.get("MASSIVE_PASSWORD", "") +MASSIVE_SERVER = "https://network.joinmassive.com:65535" + + +def get_proxy_config(): + if not MASSIVE_USERNAME or not MASSIVE_PASSWORD: + print("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars") + sys.exit(1) + return ProxyConfig( + server=MASSIVE_SERVER, + username=MASSIVE_USERNAME, + password=MASSIVE_PASSWORD, + ) + + +async def test_isolated_context(cdp_url: str = None, attempts: int = 3): + """Test with isolated context (works with both Playwright and CDP).""" + mode = f"CDP ({cdp_url})" if cdp_url else "Playwright Chromium" + print(f"\n{'='*60}") + print(f"Mode: Isolated context — {mode}") + print(f"{'='*60}\n") + + kwargs = dict( + 
enable_stealth=True, + create_isolated_context=True, + viewport_width=1920, + viewport_height=1080, + ) + if cdp_url: + kwargs["cdp_url"] = cdp_url + else: + kwargs["headless"] = True + + config = BrowserConfig(**kwargs) + run_config = CrawlerRunConfig( + magic=True, + simulate_user=True, + override_navigator=True, + proxy_config=get_proxy_config(), + page_timeout=120000, + wait_until="load", + delay_before_return_html=15.0, + ) + + passed = 0 + async with AsyncWebCrawler(config=config) as crawler: + for i in range(attempts): + result = await crawler.arun(URL, config=run_config) + ok = result.status_code == 200 and len(result.html) > 10000 + title = "" + if ok: + passed += 1 + m = re.search(r"<title>(.*?)</title>", result.html) + title = f" title={m.group(1)}" if m else "" + print(f" Attempt {i+1}: status={result.status_code} html={len(result.html):>10,} bytes {'PASS' if ok else 'FAIL'}{title}") + + print(f"\nResult: {passed}/{attempts} passed") + return passed > 0 + + +async def main(): + cdp_url = None + attempts = 3 + + args = sys.argv[1:] + for j, arg in enumerate(args): + if arg == "--cdp" and j + 1 < len(args): + cdp_url = args[j + 1] + if arg == "--attempts" and j + 1 < len(args): + attempts = int(args[j + 1]) + + ok = await test_isolated_context(cdp_url=cdp_url, attempts=attempts) + + print(f"\n{'='*60}") + print(f"Result: {'PASS' if ok else 'FAIL'}") + print(f"{'='*60}") + return ok + + +if __name__ == "__main__": + ok = asyncio.run(main()) + sys.exit(0 if ok else 1) diff --git a/tests/proxy/test_chanel_debug.py b/tests/proxy/test_chanel_debug.py deleted file mode 100644 index a656d512..00000000 --- a/tests/proxy/test_chanel_debug.py +++ /dev/null @@ -1,62 +0,0 @@ -import asyncio -import os -import shutil -import uuid -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.async_configs import ProxyConfig - - -async def main(): - profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile_dir, 
exist_ok=True) - - browser_config = BrowserConfig( - headless=True, - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile_dir, - viewport_width=1920, - viewport_height=1080, - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - headers={ - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - }, - proxy_config=ProxyConfig( - server="https://network.joinmassive.com:65535", - username="mpuQHs4sWZ-country-US", - password="D0yWxVQo8wQ05RWqz1Bn", - ), - ) - - run_config = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="networkidle", - delay_before_return_html=15.0, - ) - - try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - "https://www.chanel.com/us/fashion/handbags/c/1x1x1/", - config=run_config, - ) - print(f"Status: {result.status_code}") - print(f"HTML bytes: {len(result.html)}") - print(f"\n=== FULL HTML ===\n{result.html}") - print(f"\n=== RESPONSE HEADERS ===") - if result.response_headers: - for k, v in sorted(result.response_headers.items()): - print(f" {k}: {v}") - finally: - shutil.rmtree(profile_dir, ignore_errors=True) - - -asyncio.run(main()) diff --git a/tests/proxy/test_chanel_multi_attempt.py b/tests/proxy/test_chanel_multi_attempt.py deleted file mode 100644 index a02b97aa..00000000 --- a/tests/proxy/test_chanel_multi_attempt.py +++ /dev/null @@ -1,147 +0,0 @@ -import asyncio -import os -import shutil -import uuid -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter -from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy - - -async def attempt(label, browser_config, 
run_config, crawler_strategy=None): - print(f"\n{'='*60}") - print(f"Attempt: {label}") - print(f"{'='*60}") - - url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" - - kwargs = {"config": browser_config} - if crawler_strategy: - kwargs["crawler_strategy"] = crawler_strategy - - try: - async with AsyncWebCrawler(**kwargs) as crawler: - result = await crawler.arun(url, config=run_config) - print(f"Status: {result.status_code}") - print(f"Success: {result.success}") - print(f"HTML: {len(result.html):,} bytes") - if result.markdown: - print(f"Markdown: {len(result.markdown.raw_markdown):,} chars") - if result.error_message: - print(f"Error: {result.error_message}") - # Check for anti-bot indicators - html_lower = result.html.lower() - for indicator in ["access denied", "403", "blocked", "captcha", "challenge"]: - if indicator in html_lower: - print(f" Anti-bot indicator found: '{indicator}'") - return result - except Exception as e: - print(f"Exception: {e}") - return None - - -async def main(): - mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" - - headers = { - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - } - - # ---- Attempt 1: Mac UA + stealth (user's original approach) ---- - profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile1, exist_ok=True) - try: - bc1 = BrowserConfig( - headless=True, - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile1, - viewport_width=1920, - viewport_height=1080, - user_agent=mac_ua, - headers=headers, - ) - rc1 = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="load", - delay_before_return_html=10.0, - 
) - await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1) - finally: - shutil.rmtree(profile1, ignore_errors=True) - - await asyncio.sleep(3) - - # ---- Attempt 2: Undetected adapter (patchright) ---- - profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile2, exist_ok=True) - try: - bc2 = BrowserConfig( - headless=True, - use_persistent_context=True, - user_data_dir=profile2, - viewport_width=1920, - viewport_height=1080, - user_agent=mac_ua, - headers=headers, - ) - rc2 = CrawlerRunConfig( - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="load", - delay_before_return_html=15.0, - ) - adapter = UndetectedAdapter() - strategy = AsyncPlaywrightCrawlerStrategy( - browser_config=bc2, - browser_adapter=adapter, - ) - await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy) - finally: - shutil.rmtree(profile2, ignore_errors=True) - - await asyncio.sleep(3) - - # ---- Attempt 3: Longer delay + networkidle ---- - profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile3, exist_ok=True) - try: - bc3 = BrowserConfig( - headless=True, - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile3, - viewport_width=1920, - viewport_height=1080, - user_agent=mac_ua, - headers=headers, - ) - rc3 = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="networkidle", - delay_before_return_html=20.0, - js_code=""" - // Simulate human-like scrolling - await new Promise(r => setTimeout(r, 2000)); - window.scrollTo({top: 300, behavior: 'smooth'}); - await new Promise(r => setTimeout(r, 1500)); - window.scrollTo({top: 600, behavior: 'smooth'}); - await new Promise(r => setTimeout(r, 1000)); - """, - ) - await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3) - finally: - shutil.rmtree(profile3, ignore_errors=True) - - 
-asyncio.run(main()) diff --git a/tests/proxy/test_chanel_xvfb.py b/tests/proxy/test_chanel_xvfb.py deleted file mode 100644 index c9221c17..00000000 --- a/tests/proxy/test_chanel_xvfb.py +++ /dev/null @@ -1,62 +0,0 @@ -import asyncio -import os -import shutil -import uuid -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig - - -async def crawl_chanel(url: str): - profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}") - os.makedirs(profile_dir, exist_ok=True) - - browser_config = BrowserConfig( - headless=False, # Non-headless via Xvfb - harder to detect - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile_dir, - viewport_width=1920, - viewport_height=1080, - user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - headers={ - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - } - ) - - run_config = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="load", - delay_before_return_html=10.0, - ) - - try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url, config=run_config) - return result - finally: - shutil.rmtree(profile_dir, ignore_errors=True) - - -async def main(): - url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" - result = await crawl_chanel(url) - print(f"Status: {result.status_code}") - print(f"Success: {result.success}") - print(f"HTML: {len(result.html):,} bytes") - if result.markdown: - md_len = len(result.markdown.raw_markdown) - print(f"Markdown: {md_len:,} chars") - if md_len > 500: - print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}") - if 
result.error_message: - print(f"Error: {result.error_message}") - - -asyncio.run(main()) diff --git a/tests/proxy/test_platform_match.py b/tests/proxy/test_platform_match.py deleted file mode 100644 index 3e503242..00000000 --- a/tests/proxy/test_platform_match.py +++ /dev/null @@ -1,84 +0,0 @@ -"""Test if matching UA to actual platform fixes Akamai detection.""" -import asyncio -import os -import shutil -import uuid -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.async_configs import ProxyConfig - - -async def test_with_ua(label, user_agent): - print(f"\n{'='*60}") - print(f"Test: {label}") - print(f"{'='*60}") - - profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}") - os.makedirs(profile_dir, exist_ok=True) - - browser_config = BrowserConfig( - headless=True, - enable_stealth=True, - use_persistent_context=True, - user_data_dir=profile_dir, - viewport_width=1920, - viewport_height=1080, - user_agent=user_agent, - headers={ - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.9", - "Sec-Fetch-Dest": "document", - "Sec-Fetch-Mode": "navigate", - "Sec-Fetch-Site": "none", - "Sec-Fetch-User": "?1", - }, - proxy_config=ProxyConfig( - server="https://network.joinmassive.com:65535", - username="mpuQHs4sWZ-country-US", - password="D0yWxVQo8wQ05RWqz1Bn", - ), - ) - - run_config = CrawlerRunConfig( - magic=True, - simulate_user=True, - override_navigator=True, - page_timeout=120000, - wait_until="load", - delay_before_return_html=10.0, - ) - - try: - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun( - "https://www.chanel.com/us/fashion/handbags/c/1x1x1/", - config=run_config, - ) - print(f" Status: {result.status_code}") - print(f" HTML bytes: {len(result.html)}") - blocked = "access denied" in result.html.lower() - print(f" Blocked: {blocked}") - if not blocked and len(result.html) > 
1000: - print(f" SUCCESS! Got real content") - except Exception as e: - print(f" EXCEPTION: {e}") - finally: - shutil.rmtree(profile_dir, ignore_errors=True) - - -async def main(): - # Mac UA on Linux = platform mismatch - await test_with_ua( - "Mac UA (mismatched platform)", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - ) - - await asyncio.sleep(3) - - # Linux UA = matches actual navigator.platform - await test_with_ua( - "Linux UA (matching platform)", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", - ) - - -asyncio.run(main()) diff --git a/tests/proxy/test_proxy_verify.py b/tests/proxy/test_proxy_verify.py new file mode 100644 index 00000000..bf9b4f4d --- /dev/null +++ b/tests/proxy/test_proxy_verify.py @@ -0,0 +1,109 @@ +""" +Verify proxies are working and check what IPs they resolve to. +Then test Chanel through NST proxy (different provider). +""" +import requests + +# Check our real IP +def check_ip(label, proxy=None): + print(f"\n--- {label} ---") + try: + kwargs = {"url": "https://httpbin.org/ip", "timeout": 15} + if proxy: + kwargs["proxies"] = {"https": proxy, "http": proxy} + resp = requests.get(**kwargs) + print(f" IP: {resp.json()}") + except Exception as e: + print(f" ERROR: {e}") + +# Get NST proxy credentials +def get_nst_proxy(channel_id): + token = "NSTPROXY-DA9C7A614946EA8FCEFDA9FD3B3F4A9D" + api_url = f"https://api.nstproxy.com/api/v1/generate/apiproxies?count=1&country=US&protocol=http&sessionDuration=0&channelId={channel_id}&token={token}" + print(f"\nFetching NST proxy ({channel_id[:8]}...):") + print(f" URL: {api_url}") + try: + resp = requests.get(api_url, timeout=15) + print(f" HTTP {resp.status_code}") + print(f" Body: {resp.text[:500]}") + data = resp.json() + if data.get("code") == 200 and data.get("data"): + proxy_str = data["data"][0] + parts = proxy_str.split(":") + if len(parts) == 4: + ip, port, 
user, pwd = parts + proxy_url = f"http://{user}:{pwd}@{ip}:{port}" + print(f" Proxy URL: http://{user[:10]}...@{ip}:{port}") + return proxy_url + except Exception as e: + print(f" ERROR: {e}") + return None + +# Test Chanel +def test_chanel(label, proxy=None, use_cffi=False): + url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/" + headers = { + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + } + print(f"\n{'='*60}") + print(f"TEST: {label}") + try: + if use_cffi: + from curl_cffi import requests as cffi_requests + kwargs = {"url": url, "headers": headers, "impersonate": "chrome", "timeout": 30, "allow_redirects": True} + if proxy: + kwargs["proxies"] = {"https": proxy, "http": proxy} + resp = cffi_requests.get(**kwargs) + else: + kwargs = {"url": url, "headers": headers, "timeout": 30, "allow_redirects": True} + if proxy: + kwargs["proxies"] = {"https": proxy, "http": proxy} + resp = requests.get(**kwargs) + + blocked = "Access Denied" in resp.text + print(f" Status: {resp.status_code}") + print(f" Size: {len(resp.text):,} bytes") + print(f" Result: {'BLOCKED' if blocked else 'SUCCESS' if resp.status_code == 200 and len(resp.text) > 10000 else 'UNCLEAR'}") + if not blocked and resp.status_code == 200: + print(f" First 300 chars: {resp.text[:300]}") + except Exception as e: + print(f" ERROR: {e}") + + +if __name__ == "__main__": + MASSIVE_RES = "https://mpuQHs4sWZ-country-US:D0yWxVQo8wQ05RWqz1Bn@network.joinmassive.com:65535" + MASSIVE_DC = "http://mpuQHs4sWZ-country-US:D0yWxVQo8wQ05RWqz1Bn@isp.joinmassive.com:8000" + + # Step 1: Verify IPs + print("="*60) + print("STEP 1: Verify proxy IPs") + check_ip("Direct (Hetzner)") + check_ip("Massive Residential", MASSIVE_RES) + check_ip("Massive Datacenter/ISP", MASSIVE_DC) + + # Step 2: Get NST proxies + 
print("\n" + "="*60) + print("STEP 2: Get NST proxy credentials") + nst_res = get_nst_proxy("7864DDA266D5899C") # residential + nst_dc = get_nst_proxy("AE0C3B5547F8A021") # datacenter + + if nst_res: + check_ip("NST Residential", nst_res) + if nst_dc: + check_ip("NST Datacenter", nst_dc) + + # Step 3: Test Chanel with all available proxies + print("\n" + "="*60) + print("STEP 3: Test Chanel.com") + + if nst_res: + test_chanel("curl_cffi + NST residential", proxy=nst_res, use_cffi=True) + test_chanel("plain requests + NST residential", proxy=nst_res, use_cffi=False) + + if nst_dc: + test_chanel("curl_cffi + NST datacenter", proxy=nst_dc, use_cffi=True) + + # Also try Massive ISP/datacenter (different from residential) + test_chanel("curl_cffi + Massive ISP", proxy=MASSIVE_DC, use_cffi=True)