Fix proxy auth for persistent browser contexts
Chromium's --proxy-server CLI flag silently ignores inline credentials (user:pass@server). For persistent contexts, crawl4ai was embedding credentials in this flag via ManagedBrowser.build_browser_flags(), causing proxy auth to fail and the browser to fall back to a direct connection.

Fix: Use Playwright's launch_persistent_context(proxy=...) API instead of subprocess + CDP when use_persistent_context=True. This handles proxy authentication properly via the HTTP CONNECT handshake. The non-persistent and CDP paths remain unchanged.

Changes:
- Strip credentials from the --proxy-server flag in build_browser_flags()
- Add a launch_persistent_context() path in BrowserManager.start()
- Add a cleanup path in BrowserManager.close()
- Guard create_browser_context() when self.browser is None
- Add regression tests covering all 4 proxy/persistence combinations
This commit is contained in:
@@ -109,14 +109,13 @@ class ManagedBrowser:
|
|||||||
"--disable-software-rasterizer",
|
"--disable-software-rasterizer",
|
||||||
"--disable-dev-shm-usage",
|
"--disable-dev-shm-usage",
|
||||||
])
|
])
|
||||||
# proxy support
|
# proxy support — only pass server URL, never credentials.
|
||||||
|
# Chromium's --proxy-server flag silently ignores inline user:pass@.
|
||||||
|
# Auth credentials are handled at the Playwright context level instead.
|
||||||
if config.proxy:
|
if config.proxy:
|
||||||
flags.append(f"--proxy-server={config.proxy}")
|
flags.append(f"--proxy-server={config.proxy}")
|
||||||
elif config.proxy_config:
|
elif config.proxy_config:
|
||||||
creds = ""
|
flags.append(f"--proxy-server={config.proxy_config.server}")
|
||||||
if config.proxy_config.username and config.proxy_config.password:
|
|
||||||
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
|
||||||
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
|
||||||
# dedupe
|
# dedupe
|
||||||
return list(dict.fromkeys(flags))
|
return list(dict.fromkeys(flags))
|
||||||
|
|
||||||
@@ -711,6 +710,7 @@ class BrowserManager:
|
|||||||
self.managed_browser = None
|
self.managed_browser = None
|
||||||
self.playwright = None
|
self.playwright = None
|
||||||
self._using_cached_cdp = False
|
self._using_cached_cdp = False
|
||||||
|
self._launched_persistent = False # True when using launch_persistent_context
|
||||||
|
|
||||||
# Session management
|
# Session management
|
||||||
self.sessions = {}
|
self.sessions = {}
|
||||||
@@ -793,6 +793,76 @@ class BrowserManager:
|
|||||||
# Initialize playwright
|
# Initialize playwright
|
||||||
self.playwright = await async_playwright().start()
|
self.playwright = await async_playwright().start()
|
||||||
|
|
||||||
|
# ── Persistent context via Playwright's native API ──────────────
|
||||||
|
# When use_persistent_context is set and we're not connecting to an
|
||||||
|
# external CDP endpoint, use launch_persistent_context() instead of
|
||||||
|
# subprocess + CDP. This properly supports proxy authentication
|
||||||
|
# (server + username + password) which the --proxy-server CLI flag
|
||||||
|
# cannot handle.
|
||||||
|
if (
|
||||||
|
self.config.use_persistent_context
|
||||||
|
and not self.config.cdp_url
|
||||||
|
and not self._using_cached_cdp
|
||||||
|
):
|
||||||
|
# Collect stealth / optimization CLI flags, excluding ones that
|
||||||
|
# launch_persistent_context handles via keyword arguments.
|
||||||
|
_skip_prefixes = (
|
||||||
|
"--proxy-server",
|
||||||
|
"--remote-debugging-port",
|
||||||
|
"--user-data-dir",
|
||||||
|
"--headless",
|
||||||
|
"--window-size",
|
||||||
|
)
|
||||||
|
cli_args = [
|
||||||
|
flag
|
||||||
|
for flag in ManagedBrowser.build_browser_flags(self.config)
|
||||||
|
if not flag.startswith(_skip_prefixes)
|
||||||
|
]
|
||||||
|
if self.config.extra_args:
|
||||||
|
cli_args.extend(self.config.extra_args)
|
||||||
|
|
||||||
|
launch_kwargs = {
|
||||||
|
"headless": self.config.headless,
|
||||||
|
"args": list(dict.fromkeys(cli_args)), # dedupe
|
||||||
|
"viewport": {
|
||||||
|
"width": self.config.viewport_width,
|
||||||
|
"height": self.config.viewport_height,
|
||||||
|
},
|
||||||
|
"user_agent": self.config.user_agent or None,
|
||||||
|
"ignore_https_errors": self.config.ignore_https_errors,
|
||||||
|
"accept_downloads": self.config.accept_downloads,
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.config.proxy_config:
|
||||||
|
launch_kwargs["proxy"] = {
|
||||||
|
"server": self.config.proxy_config.server,
|
||||||
|
"username": self.config.proxy_config.username,
|
||||||
|
"password": self.config.proxy_config.password,
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.config.storage_state:
|
||||||
|
launch_kwargs["storage_state"] = self.config.storage_state
|
||||||
|
|
||||||
|
user_data_dir = self.config.user_data_dir or tempfile.mkdtemp(
|
||||||
|
prefix="crawl4ai-persistent-"
|
||||||
|
)
|
||||||
|
|
||||||
|
self.default_context = (
|
||||||
|
await self.playwright.chromium.launch_persistent_context(
|
||||||
|
user_data_dir, **launch_kwargs
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.browser = None # persistent context has no separate Browser
|
||||||
|
self._launched_persistent = True
|
||||||
|
|
||||||
|
await self.setup_context(self.default_context)
|
||||||
|
|
||||||
|
# Set the browser endpoint key for global page tracking
|
||||||
|
self._browser_endpoint_key = self._compute_browser_endpoint_key()
|
||||||
|
if self._browser_endpoint_key not in BrowserManager._global_pages_in_use:
|
||||||
|
BrowserManager._global_pages_in_use[self._browser_endpoint_key] = set()
|
||||||
|
return
|
||||||
|
|
||||||
if self.config.cdp_url or self.config.use_managed_browser:
|
if self.config.cdp_url or self.config.use_managed_browser:
|
||||||
self.config.use_managed_browser = True
|
self.config.use_managed_browser = True
|
||||||
|
|
||||||
@@ -1158,6 +1228,12 @@ class BrowserManager:
|
|||||||
Returns:
|
Returns:
|
||||||
Context: Browser context object with the specified configurations
|
Context: Browser context object with the specified configurations
|
||||||
"""
|
"""
|
||||||
|
if self.browser is None:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Cannot create new browser contexts when using "
|
||||||
|
"use_persistent_context=True. Persistent context uses a "
|
||||||
|
"single shared context."
|
||||||
|
)
|
||||||
# Base settings
|
# Base settings
|
||||||
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
||||||
viewport_settings = {
|
viewport_settings = {
|
||||||
@@ -1858,6 +1934,35 @@ class BrowserManager:
|
|||||||
self.playwright = None
|
self.playwright = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# ── Persistent context launched via launch_persistent_context ──
|
||||||
|
if self._launched_persistent:
|
||||||
|
session_ids = list(self.sessions.keys())
|
||||||
|
for session_id in session_ids:
|
||||||
|
await self.kill_session(session_id)
|
||||||
|
for ctx in self.contexts_by_config.values():
|
||||||
|
try:
|
||||||
|
await ctx.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.contexts_by_config.clear()
|
||||||
|
self._context_refcounts.clear()
|
||||||
|
self._context_last_used.clear()
|
||||||
|
self._page_to_sig.clear()
|
||||||
|
|
||||||
|
# Closing the persistent context also terminates the browser
|
||||||
|
if self.default_context:
|
||||||
|
try:
|
||||||
|
await self.default_context.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self.default_context = None
|
||||||
|
|
||||||
|
if self.playwright:
|
||||||
|
await self.playwright.stop()
|
||||||
|
self.playwright = None
|
||||||
|
self._launched_persistent = False
|
||||||
|
return
|
||||||
|
|
||||||
if self.config.sleep_on_close:
|
if self.config.sleep_on_close:
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
|
|||||||
61
tests/proxy/test_chanel_basic.py
Normal file
61
tests/proxy/test_chanel_basic.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def crawl_chanel(url: str):
    """Crawl *url* with a throwaway persistent browser profile.

    A uniquely named profile directory is created under ``~/.crawl4ai`` for
    the call and removed afterwards, whether or not the crawl succeeded.

    Returns:
        The result object produced by ``AsyncWebCrawler.arun``.
    """
    # Fresh profile each time (gets flagged after one use)
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    request_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    linux_ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=linux_ua,
        headers=request_headers,
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            return await crawler.arun(url, config=run_config)
    finally:
        # Always discard the one-shot profile, even when the crawl raised.
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Crawl the Chanel handbags listing once and print a short summary."""
    page = await crawl_chanel("https://www.chanel.com/us/fashion/handbags/c/1x1x1/")

    print(f"Status: {page.status_code}")
    print(f"Success: {page.success}")
    print(f"HTML: {len(page.html):,} bytes")

    # Markdown and error details are only reported when present.
    if page.markdown:
        print(f"Markdown: {len(page.markdown.raw_markdown):,} chars")
    if page.error_message:
        print(f"Error: {page.error_message}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
62
tests/proxy/test_chanel_debug.py
Normal file
62
tests/proxy/test_chanel_debug.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_configs import ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Crawl the Chanel handbags page through an authenticated proxy and dump the response.

    Prints the status code, the HTML size, the full HTML and the sorted
    response headers. The browser profile directory is created for this run
    and removed afterwards.

    Proxy settings come from the environment so that credentials are not
    committed to the repository:

    * ``CRAWL4AI_TEST_PROXY_USERNAME`` (required)
    * ``CRAWL4AI_TEST_PROXY_PASSWORD`` (required)
    * ``CRAWL4AI_TEST_PROXY_SERVER`` (optional, defaults to the Massive endpoint)

    Raises:
        RuntimeError: If the proxy username or password is not set.
    """
    proxy_username = os.environ.get("CRAWL4AI_TEST_PROXY_USERNAME")
    proxy_password = os.environ.get("CRAWL4AI_TEST_PROXY_PASSWORD")
    if not (proxy_username and proxy_password):
        raise RuntimeError(
            "Set CRAWL4AI_TEST_PROXY_USERNAME and CRAWL4AI_TEST_PROXY_PASSWORD "
            "to run this proxy test"
        )
    proxy_server = os.environ.get(
        "CRAWL4AI_TEST_PROXY_SERVER", "https://network.joinmassive.com:65535"
    )

    # Create the profile only after validation so a failed check leaves nothing behind.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server=proxy_server,
            username=proxy_username,
            password=proxy_password,
        ),
    )

    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="networkidle",
        delay_before_return_html=15.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f"Status: {result.status_code}")
            print(f"HTML bytes: {len(result.html)}")
            print(f"\n=== FULL HTML ===\n{result.html}")
            print("\n=== RESPONSE HEADERS ===")
            if result.response_headers:
                for k, v in sorted(result.response_headers.items()):
                    print(f" {k}: {v}")
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
147
tests/proxy/test_chanel_multi_attempt.py
Normal file
147
tests/proxy/test_chanel_multi_attempt.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
||||||
|
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||||
|
|
||||||
|
|
||||||
|
async def attempt(label, browser_config, run_config, crawler_strategy=None):
    """Run one labelled crawl of the Chanel handbags page and print diagnostics.

    Args:
        label: Text shown in the banner printed before the crawl.
        browser_config: Browser configuration passed to ``AsyncWebCrawler``.
        run_config: Run configuration passed to ``arun``.
        crawler_strategy: Optional strategy; forwarded to ``AsyncWebCrawler``
            only when truthy.

    Returns:
        The crawl result, or ``None`` if any exception was raised.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Attempt: {label}")
    print(f"{rule}")

    target = "https://www.chanel.com/us/fashion/handbags/c/1x1x1/"

    crawler_kwargs = {"config": browser_config}
    if crawler_strategy:
        crawler_kwargs["crawler_strategy"] = crawler_strategy

    try:
        async with AsyncWebCrawler(**crawler_kwargs) as crawler:
            outcome = await crawler.arun(target, config=run_config)

            print(f"Status: {outcome.status_code}")
            print(f"Success: {outcome.success}")
            print(f"HTML: {len(outcome.html):,} bytes")
            if outcome.markdown:
                print(f"Markdown: {len(outcome.markdown.raw_markdown):,} chars")
            if outcome.error_message:
                print(f"Error: {outcome.error_message}")

            # Check for anti-bot indicators
            page_text = outcome.html.lower()
            markers = ("access denied", "403", "blocked", "captcha", "challenge")
            for marker in (m for m in markers if m in page_text):
                print(f" Anti-bot indicator found: '{marker}'")

            return outcome
    except Exception as e:
        print(f"Exception: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Try three browser/run configurations against the Chanel page in sequence.

    Each attempt uses its own throwaway profile directory, which is removed
    afterwards, and consecutive attempts are separated by a 3-second pause.
    """
    mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }

    def fresh_profile():
        # Create and return a uniquely named profile directory for one attempt.
        path = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
        os.makedirs(path, exist_ok=True)
        return path

    # Options that every attempt passes unchanged.
    shared_browser = {
        "headless": True,
        "use_persistent_context": True,
        "viewport_width": 1920,
        "viewport_height": 1080,
        "user_agent": mac_ua,
        "headers": headers,
    }
    shared_run = {
        "simulate_user": True,
        "override_navigator": True,
        "page_timeout": 120000,
    }

    # ---- Attempt 1: Mac UA + stealth (user's original approach) ----
    profile = fresh_profile()
    try:
        stealth_browser = BrowserConfig(
            enable_stealth=True, user_data_dir=profile, **shared_browser
        )
        stealth_run = CrawlerRunConfig(
            magic=True,
            wait_until="load",
            delay_before_return_html=10.0,
            **shared_run,
        )
        await attempt("Mac UA + Stealth + Magic (user's script)", stealth_browser, stealth_run)
    finally:
        shutil.rmtree(profile, ignore_errors=True)

    await asyncio.sleep(3)

    # ---- Attempt 2: Undetected adapter (patchright) ----
    profile = fresh_profile()
    try:
        plain_browser = BrowserConfig(user_data_dir=profile, **shared_browser)
        plain_run = CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=15.0,
            **shared_run,
        )
        undetected_strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=plain_browser,
            browser_adapter=UndetectedAdapter(),
        )
        await attempt(
            "Undetected Adapter (patchright)",
            plain_browser,
            plain_run,
            crawler_strategy=undetected_strategy,
        )
    finally:
        shutil.rmtree(profile, ignore_errors=True)

    await asyncio.sleep(3)

    # ---- Attempt 3: Longer delay + networkidle ----
    profile = fresh_profile()
    try:
        patient_browser = BrowserConfig(
            enable_stealth=True, user_data_dir=profile, **shared_browser
        )
        patient_run = CrawlerRunConfig(
            magic=True,
            wait_until="networkidle",
            delay_before_return_html=20.0,
            js_code="""
            // Simulate human-like scrolling
            await new Promise(r => setTimeout(r, 2000));
            window.scrollTo({top: 300, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1500));
            window.scrollTo({top: 600, behavior: 'smooth'});
            await new Promise(r => setTimeout(r, 1000));
            """,
            **shared_run,
        )
        await attempt("Stealth + networkidle + scroll + 20s delay", patient_browser, patient_run)
    finally:
        shutil.rmtree(profile, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
62
tests/proxy/test_chanel_xvfb.py
Normal file
62
tests/proxy/test_chanel_xvfb.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def crawl_chanel(url: str):
    """Crawl *url* in a headed browser with a throwaway persistent profile.

    The profile directory is created under ``~/.crawl4ai`` for this call and
    deleted afterwards, whether or not the crawl succeeded.

    Returns:
        The result object produced by ``AsyncWebCrawler.arun``.
    """
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    request_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
    }
    mac_ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

    browser_config = BrowserConfig(
        headless=False,  # Non-headless via Xvfb - harder to detect
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=mac_ua,
        headers=request_headers,
    )
    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            return await crawler.arun(url, config=run_config)
    finally:
        # Always discard the one-shot profile, even when the crawl raised.
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Crawl the Chanel handbags listing once and print a summary with a markdown preview."""
    page = await crawl_chanel("https://www.chanel.com/us/fashion/handbags/c/1x1x1/")

    print(f"Status: {page.status_code}")
    print(f"Success: {page.success}")
    print(f"HTML: {len(page.html):,} bytes")

    if page.markdown:
        raw = page.markdown.raw_markdown
        print(f"Markdown: {len(raw):,} chars")
        # Only preview the markdown when there is more than 500 characters of it.
        if len(raw) > 500:
            print(f"\nFirst 500 chars of markdown:\n{raw[:500]}")

    if page.error_message:
        print(f"Error: {page.error_message}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
68
tests/proxy/test_persistent_proxy.py
Normal file
68
tests/proxy/test_persistent_proxy.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_configs import ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def crawl_chanel(url: str):
    """Crawl *url* through an authenticated proxy using a persistent context.

    A throwaway profile directory is created under ``~/.crawl4ai`` and
    removed afterwards, whether or not the crawl succeeded.

    Proxy settings come from the environment so that credentials are not
    committed to the repository:

    * ``CRAWL4AI_TEST_PROXY_USERNAME`` (required)
    * ``CRAWL4AI_TEST_PROXY_PASSWORD`` (required)
    * ``CRAWL4AI_TEST_PROXY_SERVER`` (optional, defaults to the Massive endpoint)

    Returns:
        The result object produced by ``AsyncWebCrawler.arun``.

    Raises:
        RuntimeError: If the proxy username or password is not set.
    """
    proxy_username = os.environ.get("CRAWL4AI_TEST_PROXY_USERNAME")
    proxy_password = os.environ.get("CRAWL4AI_TEST_PROXY_PASSWORD")
    if not (proxy_username and proxy_password):
        raise RuntimeError(
            "Set CRAWL4AI_TEST_PROXY_USERNAME and CRAWL4AI_TEST_PROXY_PASSWORD "
            "to run this proxy test"
        )
    proxy_server = os.environ.get(
        "CRAWL4AI_TEST_PROXY_SERVER", "https://network.joinmassive.com:65535"
    )

    # Create the profile only after validation so a failed check leaves nothing behind.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/chanel_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server=proxy_server,
            username=proxy_username,
            password=proxy_password,
        ),
    )

    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url, config=run_config)
            return result
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Crawl the Chanel handbags listing via the proxy and print a summary with a markdown preview."""
    page = await crawl_chanel("https://www.chanel.com/us/fashion/handbags/c/1x1x1/")

    print(f"Status: {page.status_code}")
    print(f"Success: {page.success}")
    print(f"HTML: {len(page.html):,} bytes")

    if page.markdown:
        raw = page.markdown.raw_markdown
        print(f"Markdown: {len(raw):,} chars")
        # Only preview the markdown when there is more than 500 characters of it.
        if len(raw) > 500:
            print(f"\nFirst 500 chars of markdown:\n{raw[:500]}")

    if page.error_message:
        print(f"Error: {page.error_message}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
84
tests/proxy/test_platform_match.py
Normal file
84
tests/proxy/test_platform_match.py
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
"""Test if matching UA to actual platform fixes Akamai detection."""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_configs import ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
|
async def test_with_ua(label, user_agent):
    """Crawl the Chanel page through the proxy with *user_agent* and report whether it was blocked.

    Args:
        label: Text shown in the banner printed before the crawl.
        user_agent: User-Agent string given to the browser configuration.

    Proxy settings come from the environment so that credentials are not
    committed to the repository:

    * ``CRAWL4AI_TEST_PROXY_USERNAME`` (required)
    * ``CRAWL4AI_TEST_PROXY_PASSWORD`` (required)
    * ``CRAWL4AI_TEST_PROXY_SERVER`` (optional, defaults to the Massive endpoint)

    Exceptions raised during the crawl are printed rather than propagated.

    Raises:
        RuntimeError: If the proxy username or password is not set.
    """
    print(f"\n{'='*60}")
    print(f"Test: {label}")
    print(f"{'='*60}")

    proxy_username = os.environ.get("CRAWL4AI_TEST_PROXY_USERNAME")
    proxy_password = os.environ.get("CRAWL4AI_TEST_PROXY_PASSWORD")
    if not (proxy_username and proxy_password):
        raise RuntimeError(
            "Set CRAWL4AI_TEST_PROXY_USERNAME and CRAWL4AI_TEST_PROXY_PASSWORD "
            "to run this proxy test"
        )
    proxy_server = os.environ.get(
        "CRAWL4AI_TEST_PROXY_SERVER", "https://network.joinmassive.com:65535"
    )

    # Create the profile only after validation so a failed check leaves nothing behind.
    profile_dir = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(profile_dir, exist_ok=True)

    browser_config = BrowserConfig(
        headless=True,
        enable_stealth=True,
        use_persistent_context=True,
        user_data_dir=profile_dir,
        viewport_width=1920,
        viewport_height=1080,
        user_agent=user_agent,
        headers={
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
        },
        proxy_config=ProxyConfig(
            server=proxy_server,
            username=proxy_username,
            password=proxy_password,
        ),
    )

    run_config = CrawlerRunConfig(
        magic=True,
        simulate_user=True,
        override_navigator=True,
        page_timeout=120000,
        wait_until="load",
        delay_before_return_html=10.0,
    )

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                "https://www.chanel.com/us/fashion/handbags/c/1x1x1/",
                config=run_config,
            )
            print(f" Status: {result.status_code}")
            print(f" HTML bytes: {len(result.html)}")
            blocked = "access denied" in result.html.lower()
            print(f" Blocked: {blocked}")
            if not blocked and len(result.html) > 1000:
                print(" SUCCESS! Got real content")
    except Exception as e:
        print(f" EXCEPTION: {e}")
    finally:
        shutil.rmtree(profile_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run the crawl with a Mac user agent and then a Linux one, pausing 3 seconds in between."""
    cases = (
        # Mac UA on Linux = platform mismatch
        (
            "Mac UA (mismatched platform)",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ),
        # Linux UA = matches actual navigator.platform
        (
            "Linux UA (matching platform)",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        ),
    )

    for position, (label, user_agent) in enumerate(cases):
        # Pause between runs, but not before the first one.
        if position:
            await asyncio.sleep(3)
        await test_with_ua(label, user_agent)
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
96
tests/proxy/test_proxy_regression.py
Normal file
96
tests/proxy/test_proxy_regression.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
"""Regression tests for proxy fix:
|
||||||
|
1. Persistent context + proxy (new path via launch_persistent_context)
|
||||||
|
2. Persistent context WITHOUT proxy (should still use launch_persistent_context)
|
||||||
|
3. Non-persistent + proxy on CrawlerRunConfig (existing path, must not break)
|
||||||
|
4. Non-persistent, no proxy (basic crawl, must not break)
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.async_configs import ProxyConfig
|
||||||
|
|
||||||
|
TEST_URL = "https://httpbin.org/ip" # Simple endpoint, returns IP
|
||||||
|
|
||||||
|
|
||||||
|
async def test(label, browser_config, run_config=None):
    """Crawl ``TEST_URL`` with the given configuration and print the outcome.

    Args:
        label: Text shown in the banner printed before the crawl.
        browser_config: Browser configuration passed to ``AsyncWebCrawler``.
        run_config: Optional run configuration; a default
            ``CrawlerRunConfig()`` is used when it is falsy.

    Returns:
        The crawl result, or ``None`` if any exception was raised.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Test: {label}")
    print(f"{rule}")

    effective_config = run_config if run_config else CrawlerRunConfig()

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            outcome = await crawler.arun(TEST_URL, config=effective_config)

            print(f" Status: {outcome.status_code}")
            print(f" HTML bytes: {len(outcome.html)}")
            if outcome.markdown:
                # httpbin.org/ip returns JSON with "origin" key
                body = outcome.markdown.raw_markdown.strip()
                print(f" Content: {body[:200]}")
            if outcome.error_message:
                print(f" ERROR: {outcome.error_message}")

            return outcome
    except Exception as e:
        print(f" EXCEPTION: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Run the four proxy/persistence regression scenarios in sequence.

    1. Persistent context with a proxy.
    2. Persistent context without a proxy.
    3. Non-persistent browser with the proxy set on ``CrawlerRunConfig``.
    4. Non-persistent browser without a proxy.

    Proxy settings come from the environment so that credentials are not
    committed to the repository:

    * ``CRAWL4AI_TEST_PROXY_USERNAME`` (required)
    * ``CRAWL4AI_TEST_PROXY_PASSWORD`` (required)
    * ``CRAWL4AI_TEST_PROXY_SERVER`` (optional, defaults to the Massive endpoint)

    Raises:
        RuntimeError: If the proxy username or password is not set.
    """
    proxy_username = os.environ.get("CRAWL4AI_TEST_PROXY_USERNAME")
    proxy_password = os.environ.get("CRAWL4AI_TEST_PROXY_PASSWORD")
    if not (proxy_username and proxy_password):
        raise RuntimeError(
            "Set CRAWL4AI_TEST_PROXY_USERNAME and CRAWL4AI_TEST_PROXY_PASSWORD "
            "to run the proxy regression tests"
        )

    proxy = ProxyConfig(
        server=os.environ.get(
            "CRAWL4AI_TEST_PROXY_SERVER", "https://network.joinmassive.com:65535"
        ),
        username=proxy_username,
        password=proxy_password,
    )

    # 1. Persistent context + proxy (the fixed path)
    pd = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd, exist_ok=True)
    try:
        await test(
            "Persistent + proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd,
                proxy_config=proxy,
            ),
        )
    finally:
        shutil.rmtree(pd, ignore_errors=True)

    # 2. Persistent context WITHOUT proxy
    pd2 = os.path.expanduser(f"~/.crawl4ai/test_{uuid.uuid4().hex[:8]}")
    os.makedirs(pd2, exist_ok=True)
    try:
        await test(
            "Persistent, no proxy (launch_persistent_context)",
            BrowserConfig(
                headless=True,
                use_persistent_context=True,
                user_data_dir=pd2,
            ),
        )
    finally:
        shutil.rmtree(pd2, ignore_errors=True)

    # 3. Non-persistent + proxy on CrawlerRunConfig
    await test(
        "Non-persistent + proxy on RunConfig",
        BrowserConfig(headless=True),
        CrawlerRunConfig(
            proxy_config=proxy,
        ),
    )

    # 4. Basic crawl - no proxy, no persistent
    await test(
        "Basic crawl (no proxy, no persistent)",
        BrowserConfig(headless=True),
    )

    print("\n" + "="*60)
    print("All regression tests complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# Run only when executed as a script. This file is named test_*.py, so an
# unguarded call would start a live crawl whenever the module is imported
# (for example while pytest collects tests).
if __name__ == "__main__":
    asyncio.run(main())
|
||||||
Reference in New Issue
Block a user