#1375 : refactor(proxy) Deprecate 'proxy' parameter in BrowserConfig and enhance proxy string parsing

- Updated ProxyConfig.from_string to support multiple proxy formats, including URLs with credentials.
- Deprecated the 'proxy' parameter in BrowserConfig, replacing it with 'proxy_config' for better flexibility.
- Added warnings for deprecated usage and clarified behavior when both parameters are provided.
- Updated documentation and tests to reflect changes in proxy configuration handling.
This commit is contained in:
AHMET YILMAZ
2025-08-28 17:21:49 +08:00
parent 4e1c4bd24e
commit f7a3366f72
9 changed files with 188 additions and 42 deletions

View File

@@ -1,5 +1,6 @@
import os import os
from typing import Union from typing import Union
import warnings
from .config import ( from .config import (
DEFAULT_PROVIDER, DEFAULT_PROVIDER,
DEFAULT_PROVIDER_API_KEY, DEFAULT_PROVIDER_API_KEY,
@@ -257,24 +258,39 @@ class ProxyConfig:
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
    """Create a ProxyConfig from a proxy string.

    Supported formats:
        - 'http://username:password@ip:port'
        - 'http://username@ip:port' (username only)
        - 'http://ip:port'
        - 'socks5://ip:port'
        - 'ip:port:username:password'
        - 'ip:port'

    Scheme-less forms are treated as HTTP proxies.

    Args:
        proxy_str: Proxy description in one of the formats above.
            Surrounding whitespace is ignored.

    Returns:
        A ProxyConfig whose ``server`` carries a scheme and does not embed
        the credentials; credentials, when present, are passed separately.

    Raises:
        ValueError: If the string is empty or matches none of the formats.
    """
    s = (proxy_str or "").strip()

    # URL forms: anything that carries an explicit scheme.
    if "://" in s:
        scheme, rest = s.split("://", 1)
        if "@" not in rest:
            # No credentials: keep the URL exactly as given.
            return ProxyConfig(server=s)
        # Split on the LAST '@': a password may contain '@', whereas the
        # host:port part cannot.
        credentials, host = rest.rsplit("@", 1)
        # Split on the FIRST ':' so a password containing ':' stays intact.
        username, sep, password = credentials.partition(":")
        if not (scheme and host and username):
            raise ValueError(f"Invalid proxy string format: {proxy_str}")
        return ProxyConfig(
            server=f"{scheme}://{host}",
            username=username,
            # Username-only URLs carry no password at all.
            password=password if sep else None,
        )

    # Colon separated forms: 'ip:port' or 'ip:port:username:password'.
    parts = s.split(":")
    if len(parts) in (2, 4) and parts[0] and parts[1]:
        server = f"http://{parts[0]}:{parts[1]}"
        if len(parts) == 2:
            return ProxyConfig(server=server)
        return ProxyConfig(server=server, username=parts[2], password=parts[3])

    raise ValueError(f"Invalid proxy string format: {proxy_str}")
@staticmethod @staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig": def from_dict(proxy_dict: Dict) -> "ProxyConfig":
@@ -438,6 +454,7 @@ class BrowserConfig:
host: str = "localhost", host: str = "localhost",
enable_stealth: bool = False, enable_stealth: bool = False,
): ):
self.browser_type = browser_type self.browser_type = browser_type
self.headless = headless self.headless = headless
self.browser_mode = browser_mode self.browser_mode = browser_mode
@@ -450,13 +467,23 @@ class BrowserConfig:
if self.browser_type in ["firefox", "webkit"]: if self.browser_type in ["firefox", "webkit"]:
self.channel = "" self.channel = ""
self.chrome_channel = "" self.chrome_channel = ""
if proxy:
warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", DeprecationWarning)
self.proxy = proxy self.proxy = proxy
self.proxy_config = proxy_config self.proxy_config = proxy_config
if isinstance(self.proxy_config, dict): if isinstance(self.proxy_config, dict):
self.proxy_config = ProxyConfig.from_dict(self.proxy_config) self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
if isinstance(self.proxy_config, str): if isinstance(self.proxy_config, str):
self.proxy_config = ProxyConfig.from_string(self.proxy_config) self.proxy_config = ProxyConfig.from_string(self.proxy_config)
if self.proxy and self.proxy_config:
warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
print(f"[DEBUG] Both proxy and proxy_config provided. Setting proxy to None.")
self.proxy = None
elif self.proxy:
# Convert proxy string to ProxyConfig if proxy_config is not provided
self.proxy_config = ProxyConfig.from_string(self.proxy)
self.proxy = None
self.viewport_width = viewport_width self.viewport_width = viewport_width
self.viewport_height = viewport_height self.viewport_height = viewport_height

View File

@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
from .utils import get_chromium_path from .utils import get_chromium_path
import warnings
BROWSER_DISABLE_OPTIONS = [ BROWSER_DISABLE_OPTIONS = [
@@ -741,17 +742,18 @@ class BrowserManager:
) )
os.makedirs(browser_args["downloads_path"], exist_ok=True) os.makedirs(browser_args["downloads_path"], exist_ok=True)
if self.config.proxy or self.config.proxy_config: if self.config.proxy:
warnings.warn(
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
DeprecationWarning,
)
if self.config.proxy_config:
from playwright.async_api import ProxySettings from playwright.async_api import ProxySettings
proxy_settings = ( proxy_settings = ProxySettings(
ProxySettings(server=self.config.proxy) server=self.config.proxy_config.server,
if self.config.proxy username=self.config.proxy_config.username,
else ProxySettings( password=self.config.proxy_config.password,
server=self.config.proxy_config.server,
username=self.config.proxy_config.username,
password=self.config.proxy_config.password,
)
) )
browser_args["proxy"] = proxy_settings browser_args["proxy"] = proxy_settings

View File

@@ -7520,17 +7520,18 @@ class BrowserManager:
) )
os.makedirs(browser_args["downloads_path"], exist_ok=True) os.makedirs(browser_args["downloads_path"], exist_ok=True)
if self.config.proxy or self.config.proxy_config: if self.config.proxy:
warnings.warn(
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
DeprecationWarning,
)
if self.config.proxy_config:
from playwright.async_api import ProxySettings from playwright.async_api import ProxySettings
proxy_settings = ( proxy_settings = ProxySettings(
ProxySettings(server=self.config.proxy) server=self.config.proxy_config.server,
if self.config.proxy username=self.config.proxy_config.username,
else ProxySettings( password=self.config.proxy_config.password,
server=self.config.proxy_config.server,
username=self.config.proxy_config.username,
password=self.config.proxy_config.password,
)
) )
browser_args["proxy"] = proxy_settings browser_args["proxy"] = proxy_settings

View File

@@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`:
```python ```python
from crawl4ai.async_configs import BrowserConfig from crawl4ai.async_configs import BrowserConfig
# Using proxy URL # Using HTTP proxy
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com") result = await crawler.arun(url="https://example.com")
# Using SOCKS proxy # Using SOCKS proxy
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080") browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com") result = await crawler.arun(url="https://example.com")
``` ```
@@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`:
```python ```python
from crawl4ai.async_configs import BrowserConfig from crawl4ai.async_configs import BrowserConfig
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]") browser_config = BrowserConfig(proxy_config={
"server": "http://[host]:[port]",
"username": "[username]",
"password": "[password]",
})
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com") result = await crawler.arun(url="https://example.com")
``` ```

View File

@@ -23,7 +23,7 @@ browser_cfg = BrowserConfig(
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. | | **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. | | **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). | | **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. | | **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. | | **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. | | **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. | | **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |

View File

@@ -0,0 +1,70 @@
import sys
import warnings
from crawl4ai.async_configs import BrowserConfig, ProxyConfig
def main() -> int:
    """Smoke-test the deprecated ``proxy`` parameter of BrowserConfig.

    Two cases are checked:
      1. ``BrowserConfig(proxy=...)`` emits a DeprecationWarning and the
         string is auto-converted into ``proxy_config``.
      2. ``BrowserConfig(proxy_config=...)`` emits no DeprecationWarning.

    Returns:
        0 when every check passes, 1 on the first failing check (a
        ``[FAIL]`` line describing it is printed first).
    """
    # Case 1: Using deprecated proxy string should emit DeprecationWarning and auto-convert
    proxy_str = "23.95.150.145:6114:username:password"
    with warnings.catch_warnings(record=True) as w:
        # Install the filter inside the context manager so it is undone on
        # exit instead of permanently altering the process-wide filters.
        warnings.simplefilter("always", DeprecationWarning)
        cfg = BrowserConfig(proxy=proxy_str, headless=True)
    captured = [m for m in w if issubclass(m.category, DeprecationWarning)]

    if not captured:
        print("[FAIL] No DeprecationWarning emitted for BrowserConfig(proxy=...) usage.")
        return 1

    if cfg.proxy is not None:
        print("[FAIL] cfg.proxy should be None after auto-conversion.")
        return 1

    if not isinstance(cfg.proxy_config, ProxyConfig):
        print("[FAIL] cfg.proxy_config should be a ProxyConfig instance after auto-conversion.")
        return 1

    # Basic sanity checks on auto-parsed proxy_config
    server = cfg.proxy_config.server
    if not server or ":" not in server:
        print("[FAIL] proxy_config.server appears invalid after conversion:", server)
        return 1

    if not cfg.proxy_config.username or not cfg.proxy_config.password:
        print("[FAIL] proxy_config credentials missing after conversion.")
        return 1

    print("[OK] DeprecationWarning captured and proxy auto-converted to proxy_config.")

    # Case 2: Using proxy_config directly should not emit DeprecationWarning
    with warnings.catch_warnings(record=True) as w2:
        # "always" guarantees a DeprecationWarning would be recorded if one
        # were raised, regardless of the interpreter's default filters.
        warnings.simplefilter("always", DeprecationWarning)
        cfg2 = BrowserConfig(
            proxy_config={
                "server": "http://127.0.0.1:8080",
                "username": "u",
                "password": "p",
            },
            headless=True,
        )

    if any(issubclass(m.category, DeprecationWarning) for m in w2):
        print("[FAIL] Unexpected DeprecationWarning when using proxy_config.")
        return 1

    if cfg2.proxy is not None:
        print("[FAIL] cfg2.proxy should be None (only proxy_config should be used).")
        return 1

    if not isinstance(cfg2.proxy_config, ProxyConfig):
        print("[FAIL] cfg2.proxy_config should be a ProxyConfig instance.")
        return 1

    print("[OK] proxy_config path works without deprecation warnings.")
    print("All checks passed.")
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)

View File

@@ -112,7 +112,7 @@ async def test_proxy_settings():
headless=True, headless=True,
verbose=False, verbose=False,
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test
use_managed_browser=False, use_managed_browser=False,
use_persistent_context=False, use_persistent_context=False,
) as crawler: ) as crawler:

View File

@@ -24,7 +24,7 @@ CASES = [
# --- BrowserConfig variants --- # --- BrowserConfig variants ---
"BrowserConfig()", "BrowserConfig()",
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])", "BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')", "BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
] ]
for code in CASES: for code in CASES:

View File

@@ -0,0 +1,42 @@
import warnings
import pytest
from crawl4ai.async_configs import BrowserConfig, ProxyConfig
def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
    """The deprecated ``proxy`` string warns and is converted to ``proxy_config``."""
    proxy_str = "23.95.150.145:6114:username:password"

    with warnings.catch_warnings(record=True) as caught:
        # Configure the filter inside the context manager: it is restored on
        # exit, so the "always" filter cannot leak into other tests.
        warnings.simplefilter("always", DeprecationWarning)
        cfg = BrowserConfig(proxy=proxy_str, headless=True)

    dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
    assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"

    assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
    assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
    assert cfg.proxy_config.username == "username"
    assert cfg.proxy_config.password == "password"
    # Scheme-less 'ip:port:user:pass' input is normalised to an http:// URL.
    assert cfg.proxy_config.server == "http://23.95.150.145:6114"
def test_browser_config_with_proxy_config_emits_no_deprecation():
    """Passing ``proxy_config`` directly must not trigger a DeprecationWarning."""
    with warnings.catch_warnings(record=True) as caught:
        # "always" ensures a DeprecationWarning would be recorded if raised;
        # setting it inside the context keeps global filter state untouched.
        warnings.simplefilter("always", DeprecationWarning)
        cfg = BrowserConfig(
            headless=True,
            proxy_config={
                "server": "http://127.0.0.1:8080",
                "username": "u",
                "password": "p",
            },
        )

    dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
    assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"

    assert cfg.proxy is None
    assert isinstance(cfg.proxy_config, ProxyConfig)