Fix a bug where magic mode and per-request UA overrides would change the User-Agent header without updating the sec-ch-ua (browser hint) header to match. Anti-bot systems like Akamai detect this mismatch as a bot signal.

Changes:
- Regenerate browser_hint via UAGen.generate_client_hints() whenever the UA is changed at crawl time (magic mode or explicit override)
- Re-apply updated headers to the page via set_extra_http_headers()
- Skip per-crawl UA override for persistent contexts where the UA is locked at launch time by Playwright's protocol layer
- Move --disable-gpu flags behind enable_stealth check so WebGL works via SwiftShader when stealth mode is active (missing WebGL is a detectable headless signal)
- Clean up old test scripts, add clean anti-bot test
113 lines
3.3 KiB
Python
113 lines
3.3 KiB
Python
"""
|
|
Test: Chanel.com anti-bot bypass via crawl4ai

Requires env vars:
    MASSIVE_USERNAME — Massive residential proxy username
    MASSIVE_PASSWORD — Massive residential proxy password

Optional:
    --cdp URL        Connect to external browser via CDP (e.g. http://localhost:9223)
    --attempts N     Number of attempts per test (default 3)

Usage:
    export MASSIVE_USERNAME="your_user"
    export MASSIVE_PASSWORD="your_pass"
    .venv/bin/python tests/proxy/test_chanel_cdp_proxy.py
    .venv/bin/python tests/proxy/test_chanel_cdp_proxy.py --cdp http://localhost:9223
|
|
"""
|
|
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
import re
|
|
import tempfile
|
|
import shutil
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
from crawl4ai.async_configs import ProxyConfig
|
|
|
|
# Page requested by the test.
URL = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"

# Proxy credentials come from the environment; an unset variable yields "".
MASSIVE_USERNAME = os.getenv("MASSIVE_USERNAME", "")
MASSIVE_PASSWORD = os.getenv("MASSIVE_PASSWORD", "")

# Proxy endpoint handed to ProxyConfig as its `server` value.
MASSIVE_SERVER = "https://network.joinmassive.com:65535"
|
|
|
|
|
|
def get_proxy_config():
    """Return a ProxyConfig built from the module-level proxy settings.

    If either MASSIVE_USERNAME or MASSIVE_PASSWORD is empty, an error
    message is printed and the process exits with status 1.
    """
    credentials_present = bool(MASSIVE_USERNAME and MASSIVE_PASSWORD)
    if credentials_present:
        return ProxyConfig(
            server=MASSIVE_SERVER,
            username=MASSIVE_USERNAME,
            password=MASSIVE_PASSWORD,
        )

    # Without both credentials the proxy cannot be configured, so stop here.
    print("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars")
    sys.exit(1)
|
|
|
|
|
|
async def test_isolated_context(cdp_url: str = None, attempts: int = 3):
    """Crawl URL `attempts` times in an isolated browser context.

    Args:
        cdp_url: When truthy, it is passed to BrowserConfig as ``cdp_url``;
            otherwise the browser is configured with ``headless=True``.
        attempts: Number of crawl attempts to run.

    Returns:
        True if at least one attempt passed, i.e. returned status 200 with
        an HTML body longer than 10000 characters; False otherwise.
    """
    if cdp_url:
        mode = f"CDP ({cdp_url})"
    else:
        mode = "Playwright Chromium"

    print(f"\n{'='*60}")
    print(f"Mode: Isolated context — {mode}")
    print(f"{'='*60}\n")

    # Options shared by both modes; exactly one mode-specific key is added.
    browser_options = {
        "enable_stealth": True,
        "create_isolated_context": True,
        "viewport_width": 1920,
        "viewport_height": 1080,
    }
    if not cdp_url:
        browser_options["headless"] = True
    else:
        browser_options["cdp_url"] = cdp_url

    browser_config = BrowserConfig(**browser_options)
    crawl_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        proxy_config=get_proxy_config(),
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=15.0,
    )

    passed = 0
    async with AsyncWebCrawler(config=browser_config) as crawler:
        for i in range(attempts):
            result = await crawler.arun(URL, config=crawl_config)

            # An attempt passes only with HTTP 200 and more than 10000 chars of HTML.
            ok = result.status_code == 200 and len(result.html) > 10000

            title = ""
            if ok:
                passed += 1
                # Include the page title in the report line when one is found.
                m = re.search(r"<title>(.*?)</title>", result.html)
                if m:
                    title = f" title={m.group(1)}"

            print(f" Attempt {i+1}: status={result.status_code} html={len(result.html):>10,} bytes {'PASS' if ok else 'FAIL'}{title}")

    print(f"\nResult: {passed}/{attempts} passed")
    return passed > 0
|
|
|
|
|
|
async def main():
    """Parse command-line options, run the isolated-context test, and report.

    Options are read from ``sys.argv[1:]``:
        --cdp URL       Connect to an external browser via CDP.
        --attempts N    Number of attempts (integer >= 1, default 3).

    A missing option value, a non-integer ``--attempts`` or an ``--attempts``
    below 1 is reported as a usage error by argparse (exit status 2) instead
    of being silently ignored or surfacing as a raw traceback.

    Returns:
        The boolean result of ``test_isolated_context``.
    """
    # Local import: argparse is only needed by this entry point.
    import argparse

    def positive_int(text):
        # A ValueError from int() is turned into a usage error by argparse.
        value = int(text)
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
        return value

    parser = argparse.ArgumentParser(
        description="Chanel.com anti-bot bypass test via crawl4ai",
    )
    parser.add_argument(
        "--cdp",
        dest="cdp_url",
        metavar="URL",
        default=None,
        help="connect to an external browser via CDP (e.g. http://localhost:9223)",
    )
    parser.add_argument(
        "--attempts",
        type=positive_int,
        metavar="N",
        default=3,
        help="number of attempts per test (default 3)",
    )
    # parse_known_args keeps the earlier tolerance for unrelated arguments.
    options, _unrecognised = parser.parse_known_args(sys.argv[1:])

    ok = await test_isolated_context(
        cdp_url=options.cdp_url,
        attempts=options.attempts,
    )

    print(f"\n{'='*60}")
    print(f"Result: {'PASS' if ok else 'FAIL'}")
    print(f"{'='*60}")
    return ok
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the async entry point and map its boolean result onto the process
    # exit status: 0 when it returned a truthy value, 1 otherwise.
    ok = asyncio.run(main())
    if ok:
        sys.exit(0)
    sys.exit(1)
|