Merge pull request #1464 from unclecode/fix/proxy_deprecation
Fix/proxy deprecation
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -265,7 +265,7 @@ CLAUDE.md
|
|||||||
tests/**/test_site
|
tests/**/test_site
|
||||||
tests/**/reports
|
tests/**/reports
|
||||||
tests/**/benchmark_reports
|
tests/**/benchmark_reports
|
||||||
|
test_scripts/
|
||||||
docs/**/data
|
docs/**/data
|
||||||
.codecat/
|
.codecat/
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
import warnings
|
||||||
from .config import (
|
from .config import (
|
||||||
DEFAULT_PROVIDER,
|
DEFAULT_PROVIDER,
|
||||||
DEFAULT_PROVIDER_API_KEY,
|
DEFAULT_PROVIDER_API_KEY,
|
||||||
@@ -257,24 +258,39 @@ class ProxyConfig:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
"""Create a ProxyConfig from a string.
|
||||||
parts = proxy_str.split(":")
|
|
||||||
if len(parts) == 4: # ip:port:username:password
|
Supported formats:
|
||||||
|
- 'http://username:password@ip:port'
|
||||||
|
- 'http://ip:port'
|
||||||
|
- 'socks5://ip:port'
|
||||||
|
- 'ip:port:username:password'
|
||||||
|
- 'ip:port'
|
||||||
|
"""
|
||||||
|
s = (proxy_str or "").strip()
|
||||||
|
# URL with credentials
|
||||||
|
if "@" in s and "://" in s:
|
||||||
|
auth_part, server_part = s.split("@", 1)
|
||||||
|
protocol, credentials = auth_part.split("://", 1)
|
||||||
|
if ":" in credentials:
|
||||||
|
username, password = credentials.split(":", 1)
|
||||||
|
return ProxyConfig(
|
||||||
|
server=f"{protocol}://{server_part}",
|
||||||
|
username=username,
|
||||||
|
password=password,
|
||||||
|
)
|
||||||
|
# URL without credentials (keep scheme)
|
||||||
|
if "://" in s and "@" not in s:
|
||||||
|
return ProxyConfig(server=s)
|
||||||
|
# Colon separated forms
|
||||||
|
parts = s.split(":")
|
||||||
|
if len(parts) == 4:
|
||||||
ip, port, username, password = parts
|
ip, port, username, password = parts
|
||||||
return ProxyConfig(
|
return ProxyConfig(server=f"http://{ip}:{port}", username=username, password=password)
|
||||||
server=f"http://{ip}:{port}",
|
if len(parts) == 2:
|
||||||
username=username,
|
|
||||||
password=password,
|
|
||||||
ip=ip
|
|
||||||
)
|
|
||||||
elif len(parts) == 2: # ip:port only
|
|
||||||
ip, port = parts
|
ip, port = parts
|
||||||
return ProxyConfig(
|
return ProxyConfig(server=f"http://{ip}:{port}")
|
||||||
server=f"http://{ip}:{port}",
|
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||||
ip=ip
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||||
@@ -438,6 +454,7 @@ class BrowserConfig:
|
|||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
enable_stealth: bool = False,
|
enable_stealth: bool = False,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.browser_type = browser_type
|
self.browser_type = browser_type
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.browser_mode = browser_mode
|
self.browser_mode = browser_mode
|
||||||
@@ -450,13 +467,22 @@ class BrowserConfig:
|
|||||||
if self.browser_type in ["firefox", "webkit"]:
|
if self.browser_type in ["firefox", "webkit"]:
|
||||||
self.channel = ""
|
self.channel = ""
|
||||||
self.chrome_channel = ""
|
self.chrome_channel = ""
|
||||||
|
if proxy:
|
||||||
|
warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", UserWarning)
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
if isinstance(self.proxy_config, dict):
|
if isinstance(self.proxy_config, dict):
|
||||||
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
||||||
if isinstance(self.proxy_config, str):
|
if isinstance(self.proxy_config, str):
|
||||||
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
||||||
|
|
||||||
|
if self.proxy and self.proxy_config:
|
||||||
|
warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
|
||||||
|
self.proxy = None
|
||||||
|
elif self.proxy:
|
||||||
|
# Convert proxy string to ProxyConfig if proxy_config is not provided
|
||||||
|
self.proxy_config = ProxyConfig.from_string(self.proxy)
|
||||||
|
self.proxy = None
|
||||||
|
|
||||||
self.viewport_width = viewport_width
|
self.viewport_width = viewport_width
|
||||||
self.viewport_height = viewport_height
|
self.viewport_height = viewport_height
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
|
|||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
@@ -741,17 +742,18 @@ class BrowserManager:
|
|||||||
)
|
)
|
||||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||||
|
|
||||||
if self.config.proxy or self.config.proxy_config:
|
if self.config.proxy:
|
||||||
|
warnings.warn(
|
||||||
|
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
if self.config.proxy_config:
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
|
|
||||||
proxy_settings = (
|
proxy_settings = ProxySettings(
|
||||||
ProxySettings(server=self.config.proxy)
|
server=self.config.proxy_config.server,
|
||||||
if self.config.proxy
|
username=self.config.proxy_config.username,
|
||||||
else ProxySettings(
|
password=self.config.proxy_config.password,
|
||||||
server=self.config.proxy_config.server,
|
|
||||||
username=self.config.proxy_config.username,
|
|
||||||
password=self.config.proxy_config.password,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|||||||
@@ -7520,17 +7520,18 @@ class BrowserManager:
|
|||||||
)
|
)
|
||||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||||
|
|
||||||
if self.config.proxy or self.config.proxy_config:
|
if self.config.proxy:
|
||||||
|
warnings.warn(
|
||||||
|
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
)
|
||||||
|
if self.config.proxy_config:
|
||||||
from playwright.async_api import ProxySettings
|
from playwright.async_api import ProxySettings
|
||||||
|
|
||||||
proxy_settings = (
|
proxy_settings = ProxySettings(
|
||||||
ProxySettings(server=self.config.proxy)
|
server=self.config.proxy_config.server,
|
||||||
if self.config.proxy
|
username=self.config.proxy_config.username,
|
||||||
else ProxySettings(
|
password=self.config.proxy_config.password,
|
||||||
server=self.config.proxy_config.server,
|
|
||||||
username=self.config.proxy_config.username,
|
|
||||||
password=self.config.proxy_config.password,
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
browser_args["proxy"] = proxy_settings
|
browser_args["proxy"] = proxy_settings
|
||||||
|
|
||||||
|
|||||||
@@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`:
|
|||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai.async_configs import BrowserConfig
|
||||||
|
|
||||||
# Using proxy URL
|
# Using HTTP proxy
|
||||||
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
|
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
|
|
||||||
# Using SOCKS proxy
|
# Using SOCKS proxy
|
||||||
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
|
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
@@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`:
|
|||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
from crawl4ai.async_configs import BrowserConfig
|
||||||
|
|
||||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
browser_config = BrowserConfig(proxy_config={
|
||||||
|
"server": "http://[host]:[port]",
|
||||||
|
"username": "[username]",
|
||||||
|
"password": "[password]",
|
||||||
|
})
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(url="https://example.com")
|
result = await crawler.arun(url="https://example.com")
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ browser_cfg = BrowserConfig(
|
|||||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||||
| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. |
|
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
||||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ async def test_proxy_settings():
|
|||||||
headless=True,
|
headless=True,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||||
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
|
proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test
|
||||||
use_managed_browser=False,
|
use_managed_browser=False,
|
||||||
use_persistent_context=False,
|
use_persistent_context=False,
|
||||||
) as crawler:
|
) as crawler:
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ CASES = [
|
|||||||
# --- BrowserConfig variants ---
|
# --- BrowserConfig variants ---
|
||||||
"BrowserConfig()",
|
"BrowserConfig()",
|
||||||
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
||||||
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
|
"BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
|
||||||
]
|
]
|
||||||
|
|
||||||
for code in CASES:
|
for code in CASES:
|
||||||
|
|||||||
42
tests/proxy/test_proxy_deprecation.py
Normal file
42
tests/proxy/test_proxy_deprecation.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
import warnings
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from crawl4ai.async_configs import BrowserConfig, ProxyConfig
|
||||||
|
|
||||||
|
|
||||||
|
def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
    """Check that BrowserConfig(proxy=...) warns and converts to proxy_config.

    A legacy ``ip:port:username:password`` string passed via ``proxy`` is
    expected to produce a deprecation notice, populate ``proxy_config`` with
    a ProxyConfig instance, and leave ``proxy`` cleared.
    """
    proxy_str = "23.95.150.145:6114:username:password"

    # Install the "always" filter inside catch_warnings so the process-wide
    # filter list is restored on exit and later tests are not affected.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cfg = BrowserConfig(proxy=proxy_str, headless=True)

    # The deprecation notice is issued with the UserWarning category, so a
    # DeprecationWarning-only filter would never match it. Accept either
    # category; for UserWarning also require the message to mention it.
    dep_warnings = [
        w
        for w in caught
        if issubclass(w.category, DeprecationWarning)
        or (issubclass(w.category, UserWarning) and "deprecated" in str(w.message))
    ]
    assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"

    assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
    assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
    assert cfg.proxy_config.username == "username"
    assert cfg.proxy_config.password == "password"
    assert cfg.proxy_config.server.startswith("http://")
    assert cfg.proxy_config.server.endswith(":6114")
|
||||||
|
|
||||||
|
|
||||||
|
def test_browser_config_with_proxy_config_emits_no_deprecation():
    """Check that BrowserConfig(proxy_config=...) emits no deprecation notice.

    Supplying only ``proxy_config`` (as a dict) should construct a
    ProxyConfig instance, leave ``proxy`` unset, and stay silent about the
    deprecated ``proxy`` parameter.
    """
    # Keep the filter change scoped to this block; catch_warnings restores
    # the previous filters on exit.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        cfg = BrowserConfig(
            headless=True,
            proxy_config={
                "server": "http://127.0.0.1:8080",
                "username": "u",
                "password": "p",
            },
        )

    # The deprecation notice for ``proxy`` is issued as a UserWarning, so
    # checking DeprecationWarning alone would make this test vacuous.
    dep_warnings = [
        w
        for w in caught
        if issubclass(w.category, DeprecationWarning)
        or (issubclass(w.category, UserWarning) and "deprecated" in str(w.message))
    ]
    assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"

    assert cfg.proxy is None
    assert isinstance(cfg.proxy_config, ProxyConfig)
|
||||||
Reference in New Issue
Block a user