Sync sec-ch-ua with User-Agent and keep WebGL alive in stealth mode
Fix a bug where magic mode and per-request UA overrides changed the User-Agent header without updating the sec-ch-ua (browser client hint) header to match. Anti-bot systems such as Akamai treat this mismatch as a bot signal.

Changes:
- Regenerate browser_hint via UAGen.generate_client_hints() whenever the UA is changed at crawl time (magic mode or explicit override).
- Re-apply the updated headers to the page via set_extra_http_headers().
- Skip the per-crawl UA override for persistent contexts, where the UA is locked at launch time by Playwright's protocol layer.
- Move the --disable-gpu flags behind the enable_stealth check so that WebGL keeps working via SwiftShader when stealth mode is active (missing WebGL is a detectable headless signal).
- Clean up old test scripts and add a clean anti-bot test.
This commit is contained in:
@@ -19,7 +19,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
||||
from .async_logger import AsyncLogger
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .user_agent_generator import ValidUAGenerator
|
||||
from .user_agent_generator import ValidUAGenerator, UAGen
|
||||
from .browser_manager import BrowserManager
|
||||
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
|
||||
|
||||
@@ -534,18 +534,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
captured_requests = []
|
||||
captured_console = []
|
||||
|
||||
# Handle user agent with magic mode
|
||||
user_agent_to_override = config.user_agent
|
||||
if user_agent_to_override:
|
||||
self.browser_config.user_agent = user_agent_to_override
|
||||
elif config.magic or config.user_agent_mode == "random":
|
||||
self.browser_config.user_agent = ValidUAGenerator().generate(
|
||||
**(config.user_agent_generator_config or {})
|
||||
# Handle user agent with magic mode.
|
||||
# For persistent contexts the UA is locked at browser launch time
|
||||
# (launch_persistent_context bakes it into the protocol layer), so
|
||||
# changing it here would only desync browser_config from reality.
|
||||
# Users should set user_agent or user_agent_mode on BrowserConfig.
|
||||
ua_changed = False
|
||||
if not self.browser_config.use_persistent_context:
|
||||
user_agent_to_override = config.user_agent
|
||||
if user_agent_to_override:
|
||||
self.browser_config.user_agent = user_agent_to_override
|
||||
ua_changed = True
|
||||
elif config.magic or config.user_agent_mode == "random":
|
||||
self.browser_config.user_agent = ValidUAGenerator().generate(
|
||||
**(config.user_agent_generator_config or {})
|
||||
)
|
||||
ua_changed = True
|
||||
|
||||
# Keep sec-ch-ua in sync whenever the UA changed
|
||||
if ua_changed:
|
||||
self.browser_config.browser_hint = UAGen.generate_client_hints(
|
||||
self.browser_config.user_agent
|
||||
)
|
||||
self.browser_config.headers["sec-ch-ua"] = self.browser_config.browser_hint
|
||||
|
||||
# Get page for session
|
||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||
|
||||
# Push updated UA + sec-ch-ua to the page so the server sees them
|
||||
if ua_changed:
|
||||
combined_headers = {
|
||||
"User-Agent": self.browser_config.user_agent,
|
||||
"sec-ch-ua": self.browser_config.browser_hint,
|
||||
}
|
||||
combined_headers.update(self.browser_config.headers)
|
||||
await page.set_extra_http_headers(combined_headers)
|
||||
|
||||
# await page.goto(URL)
|
||||
|
||||
# Add default cookie
|
||||
|
||||
@@ -70,9 +70,6 @@ class ManagedBrowser:
|
||||
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
||||
"""Common CLI flags for launching Chromium"""
|
||||
flags = [
|
||||
"--disable-gpu",
|
||||
"--disable-gpu-compositing",
|
||||
"--disable-software-rasterizer",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-first-run",
|
||||
@@ -93,6 +90,14 @@ class ManagedBrowser:
|
||||
"--disable-component-update",
|
||||
"--disable-domain-reliability",
|
||||
]
|
||||
# GPU flags disable WebGL which anti-bot sensors detect as headless.
|
||||
# Keep WebGL working (via SwiftShader) when stealth mode is active.
|
||||
if not config.enable_stealth:
|
||||
flags.extend([
|
||||
"--disable-gpu",
|
||||
"--disable-gpu-compositing",
|
||||
"--disable-software-rasterizer",
|
||||
])
|
||||
if config.memory_saving_mode:
|
||||
flags.extend([
|
||||
"--aggressive-cache-discard",
|
||||
|
||||
@@ -1,61 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def crawl_chanel(url: str):
|
||||
# Fresh profile each time (gets flagged after one use)
|
||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile_dir, exist_ok=True)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile_dir,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
headers={
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
}
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="load",
|
||||
delay_before_return_html=10.0,
|
||||
)
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url, config=run_config)
|
||||
return result
|
||||
finally:
|
||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def main():
|
||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
||||
result = await crawl_chanel(url)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"HTML: {len(result.html):,} bytes")
|
||||
if result.markdown:
|
||||
md_len = len(result.markdown.raw_markdown)
|
||||
print(f"Markdown: {md_len:,} chars")
|
||||
if result.error_message:
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
112
tests/proxy/test_chanel_cdp_proxy.py
Normal file
112
tests/proxy/test_chanel_cdp_proxy.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
Test: Chanel.com anti-bot bypass via crawl4ai
|
||||
|
||||
Requires env vars:
|
||||
MASSIVE_USERNAME — Massive residential proxy username
|
||||
MASSIVE_PASSWORD — Massive residential proxy password
|
||||
|
||||
Optional:
|
||||
--cdp URL Connect to external browser via CDP (e.g. http://localhost:9223)
|
||||
--attempts N Number of attempts per test (default 3)
|
||||
|
||||
Usage:
|
||||
export MASSIVE_USERNAME="your_user"
|
||||
export MASSIVE_PASSWORD="your_pass"
|
||||
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py
|
||||
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py --cdp http://localhost:9223
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import tempfile
|
||||
import shutil
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import ProxyConfig
|
||||
|
||||
URL = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
||||
|
||||
MASSIVE_USERNAME = os.environ.get("MASSIVE_USERNAME", "")
|
||||
MASSIVE_PASSWORD = os.environ.get("MASSIVE_PASSWORD", "")
|
||||
MASSIVE_SERVER = "https://network.joinmassive.com:65535"
|
||||
|
||||
|
||||
def get_proxy_config():
    """Build the Massive proxy settings from the module-level credentials.

    Returns:
        A ``ProxyConfig`` for ``MASSIVE_SERVER`` using ``MASSIVE_USERNAME``
        and ``MASSIVE_PASSWORD``.

    If either credential is empty, an error is printed and the process
    exits with status 1.
    """
    credentials_present = bool(MASSIVE_USERNAME) and bool(MASSIVE_PASSWORD)
    if credentials_present:
        return ProxyConfig(
            server=MASSIVE_SERVER,
            username=MASSIVE_USERNAME,
            password=MASSIVE_PASSWORD,
        )

    # Without credentials the proxy cannot be used, so abort the whole run.
    print("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars")
    sys.exit(1)
|
||||
|
||||
|
||||
async def test_isolated_context(cdp_url: str = None, attempts: int = 3):
    """Crawl ``URL`` repeatedly in an isolated, stealth-enabled browser context.

    Args:
        cdp_url: When given, it is passed to ``BrowserConfig`` as ``cdp_url``;
            otherwise ``headless=True`` is requested instead.
        attempts: Number of crawl attempts made with the same crawler.

    Returns:
        ``True`` if at least one attempt returned status 200 with more than
        10,000 characters of HTML, ``False`` otherwise.
    """
    mode = f"CDP ({cdp_url})" if cdp_url else "Playwright Chromium"
    print(f"\n{'='*60}")
    print(f"Mode: Isolated context — {mode}")
    print(f"{'='*60}\n")

    kwargs = dict(
        enable_stealth=True,
        create_isolated_context=True,
        viewport_width=1920,
        viewport_height=1080,
    )
    if cdp_url:
        kwargs["cdp_url"] = cdp_url
    else:
        kwargs["headless"] = True

    config = BrowserConfig(**kwargs)
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        proxy_config=get_proxy_config(),
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=15.0,
    )

    passed = 0
    async with AsyncWebCrawler(config=config) as crawler:
        for i in range(attempts):
            result = await crawler.arun(URL, config=run_config)
            # Guard against a result that carries no HTML so that a failed
            # attempt is reported as FAIL instead of crashing in len().
            html = result.html or ""
            ok = result.status_code == 200 and len(html) > 10000
            title = ""
            if ok:
                passed += 1
                # Titles may span lines, carry attributes, or be upper-case.
                m = re.search(
                    r"<title[^>]*>(.*?)</title>",
                    html,
                    re.IGNORECASE | re.DOTALL,
                )
                title = f" title={m.group(1).strip()}" if m else ""
            print(f" Attempt {i+1}: status={result.status_code} html={len(html):>10,} bytes {'PASS' if ok else 'FAIL'}{title}")

    print(f"\nResult: {passed}/{attempts} passed")
    return passed > 0
|
||||
|
||||
|
||||
async def main():
    """Parse ``--cdp URL`` and ``--attempts N`` from argv and run the test.

    Returns:
        The boolean outcome of ``test_isolated_context``.
    """
    cdp_url, attempts = None, 3

    argv = sys.argv[1:]
    # Walk adjacent (flag, value) pairs; a flag in the final position has no
    # value after it and is therefore ignored.
    for flag, value in zip(argv, argv[1:]):
        if flag == "--cdp":
            cdp_url = value
        elif flag == "--attempts":
            attempts = int(value)

    ok = await test_isolated_context(cdp_url=cdp_url, attempts=attempts)

    print(f"\n{'='*60}")
    print(f"Result: {'PASS' if ok else 'FAIL'}")
    print(f"{'='*60}")
    return ok
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ok = asyncio.run(main())
|
||||
sys.exit(0 if ok else 1)
|
||||
@@ -1,62 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import ProxyConfig
|
||||
|
||||
|
||||
async def main():
|
||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile_dir, exist_ok=True)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile_dir,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
headers={
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
},
|
||||
proxy_config=ProxyConfig(
|
||||
server="https://network.joinmassive.com:65535",
|
||||
username="mpuQHs4sWZ-country-US",
|
||||
password="D0yWxVQo8wQ05RWqz1Bn",
|
||||
),
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=15.0,
|
||||
)
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
|
||||
config=run_config,
|
||||
)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"HTML bytes: {len(result.html)}")
|
||||
print(f"\n=== FULL HTML ===\n{result.html}")
|
||||
print(f"\n=== RESPONSE HEADERS ===")
|
||||
if result.response_headers:
|
||||
for k, v in sorted(result.response_headers.items()):
|
||||
print(f" {k}: {v}")
|
||||
finally:
|
||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
@@ -1,147 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def attempt(label, browser_config, run_config, crawler_strategy=None):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Attempt: {label}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
||||
|
||||
kwargs = {"config": browser_config}
|
||||
if crawler_strategy:
|
||||
kwargs["crawler_strategy"] = crawler_strategy
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(**kwargs) as crawler:
|
||||
result = await crawler.arun(url, config=run_config)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"HTML: {len(result.html):,} bytes")
|
||||
if result.markdown:
|
||||
print(f"Markdown: {len(result.markdown.raw_markdown):,} chars")
|
||||
if result.error_message:
|
||||
print(f"Error: {result.error_message}")
|
||||
# Check for anti-bot indicators
|
||||
html_lower = result.html.lower()
|
||||
for indicator in ["access denied", "403", "blocked", "captcha", "challenge"]:
|
||||
if indicator in html_lower:
|
||||
print(f" Anti-bot indicator found: '{indicator}'")
|
||||
return result
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}")
|
||||
return None
|
||||
|
||||
|
||||
async def main():
|
||||
mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||
|
||||
headers = {
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
}
|
||||
|
||||
# ---- Attempt 1: Mac UA + stealth (user's original approach) ----
|
||||
profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile1, exist_ok=True)
|
||||
try:
|
||||
bc1 = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile1,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=mac_ua,
|
||||
headers=headers,
|
||||
)
|
||||
rc1 = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="load",
|
||||
delay_before_return_html=10.0,
|
||||
)
|
||||
await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1)
|
||||
finally:
|
||||
shutil.rmtree(profile1, ignore_errors=True)
|
||||
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# ---- Attempt 2: Undetected adapter (patchright) ----
|
||||
profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile2, exist_ok=True)
|
||||
try:
|
||||
bc2 = BrowserConfig(
|
||||
headless=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile2,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=mac_ua,
|
||||
headers=headers,
|
||||
)
|
||||
rc2 = CrawlerRunConfig(
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="load",
|
||||
delay_before_return_html=15.0,
|
||||
)
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=bc2,
|
||||
browser_adapter=adapter,
|
||||
)
|
||||
await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy)
|
||||
finally:
|
||||
shutil.rmtree(profile2, ignore_errors=True)
|
||||
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# ---- Attempt 3: Longer delay + networkidle ----
|
||||
profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile3, exist_ok=True)
|
||||
try:
|
||||
bc3 = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile3,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=mac_ua,
|
||||
headers=headers,
|
||||
)
|
||||
rc3 = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=20.0,
|
||||
js_code="""
|
||||
// Simulate human-like scrolling
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
window.scrollTo({top: 300, behavior: 'smooth'});
|
||||
await new Promise(r => setTimeout(r, 1500));
|
||||
window.scrollTo({top: 600, behavior: 'smooth'});
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
""",
|
||||
)
|
||||
await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3)
|
||||
finally:
|
||||
shutil.rmtree(profile3, ignore_errors=True)
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
@@ -1,62 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def crawl_chanel(url: str):
|
||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile_dir, exist_ok=True)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Non-headless via Xvfb - harder to detect
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile_dir,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
headers={
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
}
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="load",
|
||||
delay_before_return_html=10.0,
|
||||
)
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url, config=run_config)
|
||||
return result
|
||||
finally:
|
||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def main():
|
||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
||||
result = await crawl_chanel(url)
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"HTML: {len(result.html):,} bytes")
|
||||
if result.markdown:
|
||||
md_len = len(result.markdown.raw_markdown)
|
||||
print(f"Markdown: {md_len:,} chars")
|
||||
if md_len > 500:
|
||||
print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
|
||||
if result.error_message:
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
@@ -1,84 +0,0 @@
|
||||
"""Test if matching UA to actual platform fixes Akamai detection."""
|
||||
import asyncio
|
||||
import os
|
||||
import shutil
|
||||
import uuid
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_configs import ProxyConfig
|
||||
|
||||
|
||||
async def test_with_ua(label, user_agent):
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Test: {label}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
|
||||
os.makedirs(profile_dir, exist_ok=True)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
use_persistent_context=True,
|
||||
user_data_dir=profile_dir,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=user_agent,
|
||||
headers={
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Sec-Fetch-Dest": "document",
|
||||
"Sec-Fetch-Mode": "navigate",
|
||||
"Sec-Fetch-Site": "none",
|
||||
"Sec-Fetch-User": "?1",
|
||||
},
|
||||
proxy_config=ProxyConfig(
|
||||
server="https://network.joinmassive.com:65535",
|
||||
username="mpuQHs4sWZ-country-US",
|
||||
password="D0yWxVQo8wQ05RWqz1Bn",
|
||||
),
|
||||
)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
override_navigator=True,
|
||||
page_timeout=120000,
|
||||
wait_until="load",
|
||||
delay_before_return_html=10.0,
|
||||
)
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
|
||||
config=run_config,
|
||||
)
|
||||
print(f" Status: {result.status_code}")
|
||||
print(f" HTML bytes: {len(result.html)}")
|
||||
blocked = "access denied" in result.html.lower()
|
||||
print(f" Blocked: {blocked}")
|
||||
if not blocked and len(result.html) > 1000:
|
||||
print(f" SUCCESS! Got real content")
|
||||
except Exception as e:
|
||||
print(f" EXCEPTION: {e}")
|
||||
finally:
|
||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
||||
|
||||
|
||||
async def main():
|
||||
# Mac UA on Linux = platform mismatch
|
||||
await test_with_ua(
|
||||
"Mac UA (mismatched platform)",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
)
|
||||
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# Linux UA = matches actual navigator.platform
|
||||
await test_with_ua(
|
||||
"Linux UA (matching platform)",
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
||||
)
|
||||
|
||||
|
||||
asyncio.run(main())
|
||||
109
tests/proxy/test_proxy_verify.py
Normal file
109
tests/proxy/test_proxy_verify.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""
|
||||
Verify proxies are working and check what IPs they resolve to.
|
||||
Then test Chanel through NST proxy (different provider).
|
||||
"""
|
||||
import requests
|
||||
|
||||
# Check our real IP
|
||||
def check_ip(label, proxy=None):
    """Print the JSON returned by httpbin.org/ip, optionally through a proxy.

    Args:
        label: Heading printed before the result.
        proxy: Optional proxy URL applied to both http and https traffic.

    Any exception is reported on stdout rather than raised.
    """
    print(f"\n--- {label} ---")
    # Only pass ``proxies`` when a proxy was actually supplied.
    proxy_kwargs = {"proxies": {"https": proxy, "http": proxy}} if proxy else {}
    try:
        response = requests.get(
            url="https://httpbin.org/ip", timeout=15, **proxy_kwargs
        )
        print(f" IP: {response.json()}")
    except Exception as exc:
        print(f" ERROR: {exc}")
|
||||
|
||||
# Get NST proxy credentials
|
||||
def get_nst_proxy(channel_id):
    """Request one US HTTP proxy from the NST proxy API for ``channel_id``.

    The API token is read from the ``NSTPROXY_TOKEN`` environment variable
    instead of being embedded in the source, and it is masked in the
    diagnostic output.

    Args:
        channel_id: NST channel identifier placed in the request URL.

    Returns:
        ``http://user:password@ip:port`` built from the API's
        ``ip:port:user:password`` entry, or ``None`` when the token is not
        set, the request fails, or the response is not in that shape.
    """
    import os  # local import keeps this fix self-contained

    token = os.environ.get("NSTPROXY_TOKEN", "")
    if not token:
        print("ERROR: Set NSTPROXY_TOKEN env var")
        return None

    api_url = f"https://api.nstproxy.com/api/v1/generate/apiproxies?count=1&country=US&protocol=http&sessionDuration=0&channelId={channel_id}&token={token}"
    print(f"\nFetching NST proxy ({channel_id[:8]}...):")
    # Mask the token: this output can end up in shared logs.
    print(f" URL: {api_url.replace(token, '***')}")
    try:
        resp = requests.get(api_url, timeout=15)
        print(f" HTTP {resp.status_code}")
        data = resp.json()
        if data.get("code") == 200 and data.get("data"):
            proxy_str = data["data"][0]
            parts = proxy_str.split(":")
            if len(parts) == 4:
                ip, port, user, pwd = parts
                proxy_url = f"http://{user}:{pwd}@{ip}:{port}"
                print(f" Proxy URL: http://{user[:10]}...@{ip}:{port}")
                return proxy_url
        # Dump the body only when the response was unusable; a successful
        # body contains the proxy password that is redacted above.
        print(f" Body: {resp.text[:500]}")
    except Exception as e:
        # Best-effort diagnostic helper: report and fall through to None.
        print(f" ERROR: {e}")
    return None
|
||||
|
||||
# Test Chanel
|
||||
def test_chanel(label, proxy=None, use_cffi=False):
    """Fetch the Chanel handbags page once and print whether it was blocked.

    Args:
        label: Text printed in the banner for this run.
        proxy: Optional proxy URL applied to both http and https traffic.
        use_cffi: When true, send the request with ``curl_cffi`` using
            ``impersonate="chrome"``; otherwise use plain ``requests``.

    Any exception is reported on stdout rather than raised.
    """
    url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    print(f"\n{'='*60}")
    print(f"TEST: {label}")
    try:
        # Arguments shared by both HTTP clients.
        request_kwargs = {
            "url": url,
            "headers": headers,
            "timeout": 30,
            "allow_redirects": True,
        }
        if proxy:
            request_kwargs["proxies"] = {"https": proxy, "http": proxy}

        if use_cffi:
            # Imported lazily so the plain-requests path works without curl_cffi.
            from curl_cffi import requests as cffi_requests
            resp = cffi_requests.get(impersonate="chrome", **request_kwargs)
        else:
            resp = requests.get(**request_kwargs)

        body = resp.text
        blocked = "Access Denied" in body
        print(f" Status: {resp.status_code}")
        print(f" Size: {len(body):,} bytes")

        if blocked:
            verdict = "BLOCKED"
        elif resp.status_code == 200 and len(body) > 10000:
            verdict = "SUCCESS"
        else:
            verdict = "UNCLEAR"
        print(f" Result: {verdict}")

        if not blocked and resp.status_code == 200:
            print(f" First 300 chars: {body[:300]}")
    except Exception as exc:
        print(f" ERROR: {exc}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local imports keep this entry point self-contained.
    import os
    from urllib.parse import quote

    # Proxy credentials come from the environment (the same variable names
    # as tests/proxy/test_chanel_cdp_proxy.py) and must never be committed.
    # NOTE(review): the previously embedded username ended in "-country-US";
    # presumably that routing suffix belongs in MASSIVE_USERNAME — confirm.
    massive_user = os.environ.get("MASSIVE_USERNAME", "")
    massive_pass = os.environ.get("MASSIVE_PASSWORD", "")
    if not massive_user or not massive_pass:
        raise SystemExit("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars")

    # Percent-encode so special characters cannot break the proxy URL.
    massive_auth = f"{quote(massive_user, safe='')}:{quote(massive_pass, safe='')}"
    MASSIVE_RES = f"https://{massive_auth}@network.joinmassive.com:65535"
    MASSIVE_DC = f"http://{massive_auth}@isp.joinmassive.com:8000"

    # Step 1: Verify IPs
    print("="*60)
    print("STEP 1: Verify proxy IPs")
    check_ip("Direct (Hetzner)")
    check_ip("Massive Residential", MASSIVE_RES)
    check_ip("Massive Datacenter/ISP", MASSIVE_DC)

    # Step 2: Get NST proxies
    print("\n" + "="*60)
    print("STEP 2: Get NST proxy credentials")
    nst_res = get_nst_proxy("7864DDA266D5899C")  # residential
    nst_dc = get_nst_proxy("AE0C3B5547F8A021")  # datacenter

    if nst_res:
        check_ip("NST Residential", nst_res)
    if nst_dc:
        check_ip("NST Datacenter", nst_dc)

    # Step 3: Test Chanel with all available proxies
    print("\n" + "="*60)
    print("STEP 3: Test Chanel.com")

    if nst_res:
        test_chanel("curl_cffi + NST residential", proxy=nst_res, use_cffi=True)
        test_chanel("plain requests + NST residential", proxy=nst_res, use_cffi=False)

    if nst_dc:
        test_chanel("curl_cffi + NST datacenter", proxy=nst_dc, use_cffi=True)

    # Also try Massive ISP/datacenter (different from residential)
    test_chanel("curl_cffi + Massive ISP", proxy=MASSIVE_DC, use_cffi=True)
|
||||
Reference in New Issue
Block a user