Sync sec-ch-ua with User-Agent and keep WebGL alive in stealth mode
Fix a bug where magic mode and per-request UA overrides changed the User-Agent header without updating the sec-ch-ua (browser client hint) header to match. Anti-bot systems such as Akamai treat this mismatch as a bot signal.

Changes:
- Regenerate browser_hint via UAGen.generate_client_hints() whenever the UA is changed at crawl time (magic mode or explicit override).
- Re-apply the updated headers to the page via set_extra_http_headers().
- Skip the per-crawl UA override for persistent contexts, where the UA is locked at launch time by Playwright's protocol layer.
- Move the --disable-gpu flags behind the enable_stealth check so that WebGL keeps working via SwiftShader when stealth mode is active (missing WebGL is a detectable headless signal).
- Clean up old test scripts and add a clean anti-bot test.
This commit is contained in:
@@ -19,7 +19,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD
|
|||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .ssl_certificate import SSLCertificate
|
from .ssl_certificate import SSLCertificate
|
||||||
from .user_agent_generator import ValidUAGenerator
|
from .user_agent_generator import ValidUAGenerator, UAGen
|
||||||
from .browser_manager import BrowserManager
|
from .browser_manager import BrowserManager
|
||||||
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
|
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
|
||||||
|
|
||||||
@@ -534,18 +534,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
captured_requests = []
|
captured_requests = []
|
||||||
captured_console = []
|
captured_console = []
|
||||||
|
|
||||||
# Handle user agent with magic mode
|
# Handle user agent with magic mode.
|
||||||
user_agent_to_override = config.user_agent
|
# For persistent contexts the UA is locked at browser launch time
|
||||||
if user_agent_to_override:
|
# (launch_persistent_context bakes it into the protocol layer), so
|
||||||
self.browser_config.user_agent = user_agent_to_override
|
# changing it here would only desync browser_config from reality.
|
||||||
elif config.magic or config.user_agent_mode == "random":
|
# Users should set user_agent or user_agent_mode on BrowserConfig.
|
||||||
self.browser_config.user_agent = ValidUAGenerator().generate(
|
ua_changed = False
|
||||||
**(config.user_agent_generator_config or {})
|
if not self.browser_config.use_persistent_context:
|
||||||
|
user_agent_to_override = config.user_agent
|
||||||
|
if user_agent_to_override:
|
||||||
|
self.browser_config.user_agent = user_agent_to_override
|
||||||
|
ua_changed = True
|
||||||
|
elif config.magic or config.user_agent_mode == "random":
|
||||||
|
self.browser_config.user_agent = ValidUAGenerator().generate(
|
||||||
|
**(config.user_agent_generator_config or {})
|
||||||
|
)
|
||||||
|
ua_changed = True
|
||||||
|
|
||||||
|
# Keep sec-ch-ua in sync whenever the UA changed
|
||||||
|
if ua_changed:
|
||||||
|
self.browser_config.browser_hint = UAGen.generate_client_hints(
|
||||||
|
self.browser_config.user_agent
|
||||||
)
|
)
|
||||||
|
self.browser_config.headers["sec-ch-ua"] = self.browser_config.browser_hint
|
||||||
|
|
||||||
# Get page for session
|
# Get page for session
|
||||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||||
|
|
||||||
|
# Push updated UA + sec-ch-ua to the page so the server sees them
|
||||||
|
if ua_changed:
|
||||||
|
combined_headers = {
|
||||||
|
"User-Agent": self.browser_config.user_agent,
|
||||||
|
"sec-ch-ua": self.browser_config.browser_hint,
|
||||||
|
}
|
||||||
|
combined_headers.update(self.browser_config.headers)
|
||||||
|
await page.set_extra_http_headers(combined_headers)
|
||||||
|
|
||||||
# await page.goto(URL)
|
# await page.goto(URL)
|
||||||
|
|
||||||
# Add default cookie
|
# Add default cookie
|
||||||
|
|||||||
@@ -70,9 +70,6 @@ class ManagedBrowser:
|
|||||||
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
||||||
"""Common CLI flags for launching Chromium"""
|
"""Common CLI flags for launching Chromium"""
|
||||||
flags = [
|
flags = [
|
||||||
"--disable-gpu",
|
|
||||||
"--disable-gpu-compositing",
|
|
||||||
"--disable-software-rasterizer",
|
|
||||||
"--no-sandbox",
|
"--no-sandbox",
|
||||||
"--disable-dev-shm-usage",
|
"--disable-dev-shm-usage",
|
||||||
"--no-first-run",
|
"--no-first-run",
|
||||||
@@ -93,6 +90,14 @@ class ManagedBrowser:
|
|||||||
"--disable-component-update",
|
"--disable-component-update",
|
||||||
"--disable-domain-reliability",
|
"--disable-domain-reliability",
|
||||||
]
|
]
|
||||||
|
# GPU flags disable WebGL which anti-bot sensors detect as headless.
|
||||||
|
# Keep WebGL working (via SwiftShader) when stealth mode is active.
|
||||||
|
if not config.enable_stealth:
|
||||||
|
flags.extend([
|
||||||
|
"--disable-gpu",
|
||||||
|
"--disable-gpu-compositing",
|
||||||
|
"--disable-software-rasterizer",
|
||||||
|
])
|
||||||
if config.memory_saving_mode:
|
if config.memory_saving_mode:
|
||||||
flags.extend([
|
flags.extend([
|
||||||
"--aggressive-cache-discard",
|
"--aggressive-cache-discard",
|
||||||
|
|||||||
@@ -1,61 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def crawl_chanel(url: str):
|
|
||||||
# Fresh profile each time (gets flagged after one use)
|
|
||||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile_dir, exist_ok=True)
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile_dir,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
||||||
headers={
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="load",
|
|
||||||
delay_before_return_html=10.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
result = await crawler.arun(url, config=run_config)
|
|
||||||
return result
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
|
||||||
result = await crawl_chanel(url)
|
|
||||||
print(f"Status: {result.status_code}")
|
|
||||||
print(f"Success: {result.success}")
|
|
||||||
print(f"HTML: {len(result.html):,} bytes")
|
|
||||||
if result.markdown:
|
|
||||||
md_len = len(result.markdown.raw_markdown)
|
|
||||||
print(f"Markdown: {md_len:,} chars")
|
|
||||||
if result.error_message:
|
|
||||||
print(f"Error: {result.error_message}")
|
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
112
tests/proxy/test_chanel_cdp_proxy.py
Normal file
112
tests/proxy/test_chanel_cdp_proxy.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
"""
|
||||||
|
Test: Chanel.com anti-bot bypass via crawl4ai
|
||||||
|
|
||||||
|
Requires env vars:
|
||||||
|
MASSIVE_USERNAME — Massive residential proxy username
|
||||||
|
MASSIVE_PASSWORD — Massive residential proxy password
|
||||||
|
|
||||||
|
Optional:
|
||||||
|
--cdp URL Connect to external browser via CDP (e.g. http://localhost:9223)
|
||||||
|
--attempts N Number of attempts per test (default 3)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
export MASSIVE_USERNAME="your_user"
|
||||||
|
export MASSIVE_PASSWORD="your_pass"
|
||||||
|
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py
|
||||||
|
.venv/bin/python tests/proxy/test_chanel_cdp_proxy.py --cdp http://localhost:9223
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_configs import ProxyConfig
|
||||||
|
|
||||||
|
URL = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
||||||
|
|
||||||
|
MASSIVE_USERNAME = os.environ.get("MASSIVE_USERNAME", "")
|
||||||
|
MASSIVE_PASSWORD = os.environ.get("MASSIVE_PASSWORD", "")
|
||||||
|
MASSIVE_SERVER = "https://network.joinmassive.com:65535"
|
||||||
|
|
||||||
|
|
||||||
|
def get_proxy_config():
    """Build the Massive residential proxy settings for the crawl.

    Uses the module-level ``MASSIVE_USERNAME`` / ``MASSIVE_PASSWORD`` values
    (read from the environment at import time). When either one is empty an
    error is printed and the process exits with status 1.

    Returns:
        ProxyConfig: proxy pointing at ``MASSIVE_SERVER`` with the credentials.
    """
    have_credentials = bool(MASSIVE_USERNAME) and bool(MASSIVE_PASSWORD)
    if not have_credentials:
        print("ERROR: Set MASSIVE_USERNAME and MASSIVE_PASSWORD env vars")
        sys.exit(1)

    proxy_settings = {
        "server": MASSIVE_SERVER,
        "username": MASSIVE_USERNAME,
        "password": MASSIVE_PASSWORD,
    }
    return ProxyConfig(**proxy_settings)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_isolated_context(cdp_url: str = None, attempts: int = 3):
    """Crawl ``URL`` repeatedly in an isolated, stealth-enabled context.

    Args:
        cdp_url: CDP endpoint of an external browser. When falsy, a headless
            browser is launched instead.
        attempts: Number of crawls to run with the same crawler instance.

    Returns:
        bool: True if at least one attempt returned HTTP 200 together with
        more than 10,000 characters of HTML.
    """
    mode = f"CDP ({cdp_url})" if cdp_url else "Playwright Chromium"
    print(f"\n{'='*60}")
    print(f"Mode: Isolated context — {mode}")
    print(f"{'='*60}\n")

    kwargs = dict(
        enable_stealth=True,
        create_isolated_context=True,
        viewport_width=1920,
        viewport_height=1080,
    )
    if cdp_url:
        kwargs["cdp_url"] = cdp_url
    else:
        kwargs["headless"] = True

    config = BrowserConfig(**kwargs)
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        proxy_config=get_proxy_config(),
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=15.0,
    )

    # Tolerate attributes on the tag, upper-case markup and multi-line titles.
    title_pattern = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)

    passed = 0
    async with AsyncWebCrawler(config=config) as crawler:
        for i in range(attempts):
            result = await crawler.arun(URL, config=run_config)
            # NOTE(review): a failed crawl may leave html unset; treat that as
            # empty so the attempt is reported as FAIL instead of raising.
            html = result.html or ""
            ok = result.status_code == 200 and len(html) > 10000
            title = ""
            if ok:
                passed += 1
                m = title_pattern.search(html)
                if m:
                    # Collapse internal whitespace so the report stays on one line.
                    title = f" title={' '.join(m.group(1).split())}"
            print(f" Attempt {i+1}: status={result.status_code} html={len(html):>10,} bytes {'PASS' if ok else 'FAIL'}{title}")

    print(f"\nResult: {passed}/{attempts} passed")
    return passed > 0
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Parse CLI options, run the isolated-context test and print a summary.

    Options:
        --cdp URL       Connect to an external browser via CDP.
        --attempts N    Number of crawl attempts (default 3, must be >= 1).

    Malformed options (missing value, non-integer or non-positive attempt
    count, unknown flag) end the process with argparse's usage error instead
    of being silently ignored or surfacing as a traceback.

    Returns:
        bool: the result of ``test_isolated_context``.
    """
    # Local import so the file's top-level import block stays untouched.
    import argparse

    parser = argparse.ArgumentParser(
        description="Chanel.com anti-bot bypass test via crawl4ai"
    )
    parser.add_argument(
        "--cdp",
        dest="cdp_url",
        metavar="URL",
        default=None,
        help="connect to an external browser via CDP (e.g. http://localhost:9223)",
    )
    parser.add_argument(
        "--attempts",
        type=int,
        default=3,
        metavar="N",
        help="number of attempts per test (default 3)",
    )
    args = parser.parse_args()
    if args.attempts < 1:
        parser.error("--attempts must be at least 1")

    ok = await test_isolated_context(cdp_url=args.cdp_url, attempts=args.attempts)

    print(f"\n{'='*60}")
    print(f"Result: {'PASS' if ok else 'FAIL'}")
    print(f"{'='*60}")
    return ok


if __name__ == "__main__":
    # Exit status mirrors the test outcome so the script can gate CI steps.
    ok = asyncio.run(main())
    sys.exit(0 if ok else 1)
|
||||||
@@ -1,62 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
||||||
from crawl4ai.async_configs import ProxyConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile_dir, exist_ok=True)
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile_dir,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
||||||
headers={
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
},
|
|
||||||
proxy_config=ProxyConfig(
|
|
||||||
server="https://network.joinmassive.com:65535",
|
|
||||||
username="mpuQHs4sWZ-country-US",
|
|
||||||
password="D0yWxVQo8wQ05RWqz1Bn",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="networkidle",
|
|
||||||
delay_before_return_html=15.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
|
|
||||||
config=run_config,
|
|
||||||
)
|
|
||||||
print(f"Status: {result.status_code}")
|
|
||||||
print(f"HTML bytes: {len(result.html)}")
|
|
||||||
print(f"\n=== FULL HTML ===\n{result.html}")
|
|
||||||
print(f"\n=== RESPONSE HEADERS ===")
|
|
||||||
if result.response_headers:
|
|
||||||
for k, v in sorted(result.response_headers.items()):
|
|
||||||
print(f" {k}: {v}")
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
@@ -1,147 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
|
||||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
|
||||||
|
|
||||||
|
|
||||||
async def attempt(label, browser_config, run_config, crawler_strategy=None):
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"Attempt: {label}")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
|
|
||||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
|
||||||
|
|
||||||
kwargs = {"config": browser_config}
|
|
||||||
if crawler_strategy:
|
|
||||||
kwargs["crawler_strategy"] = crawler_strategy
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(**kwargs) as crawler:
|
|
||||||
result = await crawler.arun(url, config=run_config)
|
|
||||||
print(f"Status: {result.status_code}")
|
|
||||||
print(f"Success: {result.success}")
|
|
||||||
print(f"HTML: {len(result.html):,} bytes")
|
|
||||||
if result.markdown:
|
|
||||||
print(f"Markdown: {len(result.markdown.raw_markdown):,} chars")
|
|
||||||
if result.error_message:
|
|
||||||
print(f"Error: {result.error_message}")
|
|
||||||
# Check for anti-bot indicators
|
|
||||||
html_lower = result.html.lower()
|
|
||||||
for indicator in ["access denied", "403", "blocked", "captcha", "challenge"]:
|
|
||||||
if indicator in html_lower:
|
|
||||||
print(f" Anti-bot indicator found: '{indicator}'")
|
|
||||||
return result
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Exception: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
}
|
|
||||||
|
|
||||||
# ---- Attempt 1: Mac UA + stealth (user's original approach) ----
|
|
||||||
profile1 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile1, exist_ok=True)
|
|
||||||
try:
|
|
||||||
bc1 = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile1,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent=mac_ua,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
rc1 = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="load",
|
|
||||||
delay_before_return_html=10.0,
|
|
||||||
)
|
|
||||||
await attempt("Mac UA + Stealth + Magic (user's script)", bc1, rc1)
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile1, ignore_errors=True)
|
|
||||||
|
|
||||||
await asyncio.sleep(3)
|
|
||||||
|
|
||||||
# ---- Attempt 2: Undetected adapter (patchright) ----
|
|
||||||
profile2 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile2, exist_ok=True)
|
|
||||||
try:
|
|
||||||
bc2 = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile2,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent=mac_ua,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
rc2 = CrawlerRunConfig(
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="load",
|
|
||||||
delay_before_return_html=15.0,
|
|
||||||
)
|
|
||||||
adapter = UndetectedAdapter()
|
|
||||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
|
||||||
browser_config=bc2,
|
|
||||||
browser_adapter=adapter,
|
|
||||||
)
|
|
||||||
await attempt("Undetected Adapter (patchright)", bc2, rc2, crawler_strategy=strategy)
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile2, ignore_errors=True)
|
|
||||||
|
|
||||||
await asyncio.sleep(3)
|
|
||||||
|
|
||||||
# ---- Attempt 3: Longer delay + networkidle ----
|
|
||||||
profile3 = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile3, exist_ok=True)
|
|
||||||
try:
|
|
||||||
bc3 = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile3,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent=mac_ua,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
rc3 = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="networkidle",
|
|
||||||
delay_before_return_html=20.0,
|
|
||||||
js_code="""
|
|
||||||
// Simulate human-like scrolling
|
|
||||||
await new Promise(r => setTimeout(r, 2000));
|
|
||||||
window.scrollTo({top: 300, behavior: 'smooth'});
|
|
||||||
await new Promise(r => setTimeout(r, 1500));
|
|
||||||
window.scrollTo({top: 600, behavior: 'smooth'});
|
|
||||||
await new Promise(r => setTimeout(r, 1000));
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
await attempt("Stealth + networkidle + scroll + 20s delay", bc3, rc3)
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile3, ignore_errors=True)
|
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
@@ -1,62 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def crawl_chanel(url: str):
|
|
||||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile_dir, exist_ok=True)
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=False, # Non-headless via Xvfb - harder to detect
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile_dir,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
||||||
headers={
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="load",
|
|
||||||
delay_before_return_html=10.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
result = await crawler.arun(url, config=run_config)
|
|
||||||
return result
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
url = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
|
|
||||||
result = await crawl_chanel(url)
|
|
||||||
print(f"Status: {result.status_code}")
|
|
||||||
print(f"Success: {result.success}")
|
|
||||||
print(f"HTML: {len(result.html):,} bytes")
|
|
||||||
if result.markdown:
|
|
||||||
md_len = len(result.markdown.raw_markdown)
|
|
||||||
print(f"Markdown: {md_len:,} chars")
|
|
||||||
if md_len > 500:
|
|
||||||
print(f"\nFirst 500 chars of markdown:\n{result.markdown.raw_markdown[:500]}")
|
|
||||||
if result.error_message:
|
|
||||||
print(f"Error: {result.error_message}")
|
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
@@ -1,84 +0,0 @@
|
|||||||
"""Test if matching UA to actual platform fixes Akamai detection."""
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
import uuid
|
|
||||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
||||||
from crawl4ai.async_configs import ProxyConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def test_with_ua(label, user_agent):
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"Test: {label}")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
|
|
||||||
profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
|
|
||||||
os.makedirs(profile_dir, exist_ok=True)
|
|
||||||
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
headless=True,
|
|
||||||
enable_stealth=True,
|
|
||||||
use_persistent_context=True,
|
|
||||||
user_data_dir=profile_dir,
|
|
||||||
viewport_width=1920,
|
|
||||||
viewport_height=1080,
|
|
||||||
user_agent=user_agent,
|
|
||||||
headers={
|
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
|
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
|
||||||
"Sec-Fetch-Dest": "document",
|
|
||||||
"Sec-Fetch-Mode": "navigate",
|
|
||||||
"Sec-Fetch-Site": "none",
|
|
||||||
"Sec-Fetch-User": "?1",
|
|
||||||
},
|
|
||||||
proxy_config=ProxyConfig(
|
|
||||||
server="https://network.joinmassive.com:65535",
|
|
||||||
username="mpuQHs4sWZ-country-US",
|
|
||||||
password="D0yWxVQo8wQ05RWqz1Bn",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
run_config = CrawlerRunConfig(
|
|
||||||
magic=True,
|
|
||||||
simulate_user=True,
|
|
||||||
override_navigator=True,
|
|
||||||
page_timeout=120000,
|
|
||||||
wait_until="load",
|
|
||||||
delay_before_return_html=10.0,
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
|
|
||||||
config=run_config,
|
|
||||||
)
|
|
||||||
print(f" Status: {result.status_code}")
|
|
||||||
print(f" HTML bytes: {len(result.html)}")
|
|
||||||
blocked = "access denied" in result.html.lower()
|
|
||||||
print(f" Blocked: {blocked}")
|
|
||||||
if not blocked and len(result.html) > 1000:
|
|
||||||
print(f" SUCCESS! Got real content")
|
|
||||||
except Exception as e:
|
|
||||||
print(f" EXCEPTION: {e}")
|
|
||||||
finally:
|
|
||||||
shutil.rmtree(profile_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
# Mac UA on Linux = platform mismatch
|
|
||||||
await test_with_ua(
|
|
||||||
"Mac UA (mismatched platform)",
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
||||||
)
|
|
||||||
|
|
||||||
await asyncio.sleep(3)
|
|
||||||
|
|
||||||
# Linux UA = matches actual navigator.platform
|
|
||||||
await test_with_ua(
|
|
||||||
"Linux UA (matching platform)",
|
|
||||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
109
tests/proxy/test_proxy_verify.py
Normal file
109
tests/proxy/test_proxy_verify.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
"""
|
||||||
|
Verify proxies are working and check what IPs they resolve to.
|
||||||
|
Then test Chanel through NST proxy (different provider).
|
||||||
|
"""
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Check our real IP
|
||||||
|
def check_ip(label, proxy=None):
    """Print the public IP that httpbin.org reports for this machine.

    Args:
        label: Heading printed before the result.
        proxy: Optional proxy URL used for both http and https traffic.

    Any exception raised by the request or by JSON decoding is printed
    rather than propagated.
    """
    print(f"\n--- {label} ---")
    request_args = dict(url="https://httpbin.org/ip", timeout=15)
    if proxy:
        request_args.update(proxies=dict(https=proxy, http=proxy))
    try:
        response = requests.get(**request_args)
        print(f" IP: {response.json()}")
    except Exception as err:
        print(f" ERROR: {err}")
|
||||||
|
|
||||||
|
# Get NST proxy credentials
|
||||||
|
def get_nst_proxy(channel_id):
    """Request one US HTTP proxy from the NST proxy API for *channel_id*.

    The API token is read from the ``NSTPROXY_TOKEN`` environment variable so
    that it is never committed to the repository, and it is kept out of
    everything this function prints.

    Args:
        channel_id: NST channel identifier; only its first 8 characters are
            echoed to the console.

    Returns:
        str | None: ``http://user:pwd@ip:port`` on success. None when the
        token is unset, the request fails, or the response does not contain
        an ``ip:port:user:pwd`` entry.
    """
    # Local import so the file's top-level import block stays untouched.
    import os

    token = os.environ.get("NSTPROXY_TOKEN", "")
    if not token:
        print("\nERROR: Set NSTPROXY_TOKEN env var to fetch NST proxies")
        return None

    api_url = "https://api.nstproxy.com/api/v1/generate/apiproxies"
    query = {
        "count": 1,
        "country": "US",
        "protocol": "http",
        "sessionDuration": 0,
        "channelId": channel_id,
        "token": token,
    }
    print(f"\nFetching NST proxy ({channel_id[:8]}...):")
    # Only the endpoint is logged; the query string carries the token.
    print(f" URL: {api_url}")

    resp = None
    try:
        resp = requests.get(api_url, params=query, timeout=15)
        print(f" HTTP {resp.status_code}")
        data = resp.json()
        if data.get("code") == 200 and data.get("data"):
            parts = data["data"][0].split(":")
            if len(parts) == 4:
                ip, port, user, pwd = parts
                print(f" Proxy URL: http://{user[:10]}...@{ip}:{port}")
                return f"http://{user}:{pwd}@{ip}:{port}"
    except Exception as e:
        # Exception text can embed the request URL, so scrub the token.
        print(f" ERROR: {str(e).replace(token, '***')}")

    # The body is shown only when no proxy could be extracted: a successful
    # body contains the proxy credentials in clear text.
    if resp is not None:
        print(f" Body: {resp.text[:500]}")
    return None
|
||||||
|
|
||||||
|
# Test Chanel
|
||||||
|
def test_chanel(label, proxy=None, use_cffi=False):
    """Fetch the Chanel handbags page once and print a short verdict.

    Args:
        label: Name printed in the test banner.
        proxy: Optional proxy URL used for both http and https traffic.
        use_cffi: When true, fetch with ``curl_cffi`` impersonating Chrome;
            otherwise use plain ``requests``.

    The verdict is BLOCKED when the body contains "Access Denied", SUCCESS
    for a 200 response longer than 10,000 characters, and UNCLEAR otherwise.
    Any exception (including a missing ``curl_cffi``) is printed, not raised.
    """
    target = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    print(f"\n{'='*60}")
    print(f"TEST: {label}")
    try:
        # Arguments shared by both HTTP clients.
        request_args = {
            "url": target,
            "headers": request_headers,
            "timeout": 30,
            "allow_redirects": True,
        }
        if use_cffi:
            from curl_cffi import requests as cffi_requests
            request_args["impersonate"] = "chrome"
            fetch = cffi_requests.get
        else:
            fetch = requests.get
        if proxy:
            request_args["proxies"] = {"https": proxy, "http": proxy}
        resp = fetch(**request_args)

        blocked = "Access Denied" in resp.text
        if blocked:
            verdict = "BLOCKED"
        elif resp.status_code == 200 and len(resp.text) > 10000:
            verdict = "SUCCESS"
        else:
            verdict = "UNCLEAR"
        print(f" Status: {resp.status_code}")
        print(f" Size: {len(resp.text):,} bytes")
        print(f" Result: {verdict}")
        if resp.status_code == 200 and not blocked:
            print(f" First 300 chars: {resp.text[:300]}")
    except Exception as err:
        print(f" ERROR: {err}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Massive credentials come from the environment (same variable names as
    # tests/proxy/test_chanel_cdp_proxy.py) so they are never committed.
    # NOTE(review): credentials that were previously committed in this script
    # should be rotated.
    import os
    from urllib.parse import quote

    massive_user = os.environ.get("MASSIVE_USERNAME", "")
    massive_pass = os.environ.get("MASSIVE_PASSWORD", "")
    if massive_user and massive_pass:
        # Percent-encode so special characters cannot break the proxy URL.
        massive_auth = f"{quote(massive_user, safe='')}:{quote(massive_pass, safe='')}"
        MASSIVE_RES = f"https://{massive_auth}@network.joinmassive.com:65535"
        MASSIVE_DC = f"http://{massive_auth}@isp.joinmassive.com:8000"
    else:
        print("WARNING: MASSIVE_USERNAME / MASSIVE_PASSWORD not set; skipping Massive proxy checks")
        MASSIVE_RES = None
        MASSIVE_DC = None

    # Step 1: Verify IPs
    print("="*60)
    print("STEP 1: Verify proxy IPs")
    check_ip("Direct (Hetzner)")
    if MASSIVE_RES:
        check_ip("Massive Residential", MASSIVE_RES)
    if MASSIVE_DC:
        check_ip("Massive Datacenter/ISP", MASSIVE_DC)

    # Step 2: Get NST proxies
    print("\n" + "="*60)
    print("STEP 2: Get NST proxy credentials")
    nst_res = get_nst_proxy("7864DDA266D5899C")  # residential
    nst_dc = get_nst_proxy("AE0C3B5547F8A021")  # datacenter

    if nst_res:
        check_ip("NST Residential", nst_res)
    if nst_dc:
        check_ip("NST Datacenter", nst_dc)

    # Step 3: Test Chanel with all available proxies
    print("\n" + "="*60)
    print("STEP 3: Test Chanel.com")

    if nst_res:
        test_chanel("curl_cffi + NST residential", proxy=nst_res, use_cffi=True)
        test_chanel("plain requests + NST residential", proxy=nst_res, use_cffi=False)

    if nst_dc:
        test_chanel("curl_cffi + NST datacenter", proxy=nst_dc, use_cffi=True)

    # Also try Massive ISP/datacenter (different from residential)
    if MASSIVE_DC:
        test_chanel("curl_cffi + Massive ISP", proxy=MASSIVE_DC, use_cffi=True)
|
||||||
Reference in New Issue
Block a user