Files
crawl4ai/tests/proxy/test_chanel_cdp_proxy.py
unclecode fdd989785f Sync sec-ch-ua with User-Agent and keep WebGL alive in stealth mode
Fix a bug where magic mode and per-request UA overrides would change
the User-Agent header without updating the sec-ch-ua (browser hint)
header to match. Anti-bot systems like Akamai detect this mismatch
as a bot signal.

Changes:
- Regenerate browser_hint via UAGen.generate_client_hints() whenever
  the UA is changed at crawl time (magic mode or explicit override)
- Re-apply updated headers to the page via set_extra_http_headers()
- Skip per-crawl UA override for persistent contexts where the UA is
  locked at launch time by Playwright's protocol layer
- Move --disable-gpu flags behind enable_stealth check so WebGL works
  via SwiftShader when stealth mode is active (missing WebGL is a
  detectable headless signal)
- Clean up old test scripts, add clean anti-bot test
2026-02-13 04:10:47 +00:00

113 lines
3.3 KiB
Python

"""
Test: Chanel.com anti-bot bypass via crawl4ai
Requires env vars:
MASSIVE_USERNAME — Massive residential proxy username
MASSIVE_PASSWORD — Massive residential proxy password
Optional:
--cdp URL Connect to external browser via CDP (e.g. http://localhost:9223)
--attempts N Number of attempts per test (default 3)
Usage:
export MASSIVE_USERNAME="your_user"
export MASSIVE_PASSWORD="your_pass"
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py --cdp http://localhost:9223
"""
import asyncio
import os
import sys
import re
import tempfile
import shutil
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.async_configs import ProxyConfig
URL = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
MASSIVE_USERNAME = os.environ.get("MASSIVE_USERNAME", "")
MASSIVE_PASSWORD = os.environ.get("MASSIVE_PASSWORD", "")
MASSIVE_SERVER = "https://network.joinmassive.com:65535"
def get_proxy_config():
if not MASSIVE_USERNAME or not MASSIVE_PASSWORD:
print("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars")
sys.exit(1)
return ProxyConfig(
server=MASSIVE_SERVER,
username=MASSIVE_USERNAME,
password=MASSIVE_PASSWORD,
)
async def test_isolated_context(cdp_url: str = None, attempts: int = 3):
"""Test with isolated context (works with both Playwright and CDP)."""
mode = f"CDP ({cdp_url})" if cdp_url else "Playwright Chromium"
print(f"\n{'='*60}")
print(f"Mode: Isolated context — {mode}")
print(f"{'='*60}\n")
kwargs = dict(
enable_stealth=True,
create_isolated_context=True,
viewport_width=1920,
viewport_height=1080,
)
if cdp_url:
kwargs["cdp_url"] = cdp_url
else:
kwargs["headless"] = True
config = BrowserConfig(**kwargs)
run_config = CrawlerRunConfig(
magic=True,
simulate_user=True,
override_navigator=True,
proxy_config=get_proxy_config(),
page_timeout=120000,
wait_until="load",
delay_before_return_html=15.0,
)
passed = 0
async with AsyncWebCrawler(config=config) as crawler:
for i in range(attempts):
result = await crawler.arun(URL, config=run_config)
ok = result.status_code == 200 and len(result.html) > 10000
title = ""
if ok:
passed += 1
m = re.search(r"<title>(.*?)</title>", result.html)
title = f" title={m.group(1)}" if m else ""
print(f" Attempt {i+1}: status={result.status_code} html={len(result.html):>10,} bytes {'PASS' if ok else 'FAIL'}{title}")
print(f"\nResult: {passed}/{attempts} passed")
return passed > 0
async def main():
cdp_url = None
attempts = 3
args = sys.argv[1:]
for j, arg in enumerate(args):
if arg == "--cdp" and j + 1 < len(args):
cdp_url = args[j + 1]
if arg == "--attempts" and j + 1 < len(args):
attempts = int(args[j + 1])
ok = await test_isolated_context(cdp_url=cdp_url, attempts=attempts)
print(f"\n{'='*60}")
print(f"Result: {'PASS' if ok else 'FAIL'}")
print(f"{'='*60}")
return ok
if __name__ == "__main__":
ok = asyncio.run(main())
sys.exit(0 if ok else 1)