diff --git a/.gitignore b/.gitignore index 6277b5cf..03c5b355 100644 --- a/.gitignore +++ b/.gitignore @@ -265,7 +265,7 @@ CLAUDE.md tests/**/test_site tests/**/reports tests/**/benchmark_reports - +test_scripts/ docs/**/data .codecat/ diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index acdd6403..50fe99ba 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,5 +1,6 @@ import os from typing import Union +import warnings from .config import ( DEFAULT_PROVIDER, DEFAULT_PROVIDER_API_KEY, @@ -257,24 +258,39 @@ class ProxyConfig: @staticmethod def from_string(proxy_str: str) -> "ProxyConfig": - """Create a ProxyConfig from a string in the format 'ip:port:username:password'.""" - parts = proxy_str.split(":") - if len(parts) == 4: # ip:port:username:password + """Create a ProxyConfig from a string. + + Supported formats: + - 'http://username:password@ip:port' + - 'http://ip:port' + - 'socks5://ip:port' + - 'ip:port:username:password' + - 'ip:port' + """ + s = (proxy_str or "").strip() + # URL with credentials + if "@" in s and "://" in s: + auth_part, server_part = s.split("@", 1) + protocol, credentials = auth_part.split("://", 1) + if ":" in credentials: + username, password = credentials.split(":", 1) + return ProxyConfig( + server=f"{protocol}://{server_part}", + username=username, + password=password, + ) + # URL without credentials (keep scheme) + if "://" in s and "@" not in s: + return ProxyConfig(server=s) + # Colon separated forms + parts = s.split(":") + if len(parts) == 4: ip, port, username, password = parts - return ProxyConfig( - server=f"http://{ip}:{port}", - username=username, - password=password, - ip=ip - ) - elif len(parts) == 2: # ip:port only + return ProxyConfig(server=f"http://{ip}:{port}", username=username, password=password) + if len(parts) == 2: ip, port = parts - return ProxyConfig( - server=f"http://{ip}:{port}", - ip=ip - ) - else: - raise ValueError(f"Invalid proxy string format: {proxy_str}") + 
return ProxyConfig(server=f"http://{ip}:{port}") + raise ValueError(f"Invalid proxy string format: {proxy_str}") @staticmethod def from_dict(proxy_dict: Dict) -> "ProxyConfig": @@ -438,6 +454,7 @@ class BrowserConfig: host: str = "localhost", enable_stealth: bool = False, ): + self.browser_type = browser_type self.headless = headless self.browser_mode = browser_mode @@ -450,13 +467,22 @@ class BrowserConfig: if self.browser_type in ["firefox", "webkit"]: self.channel = "" self.chrome_channel = "" + if proxy: + warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", DeprecationWarning) self.proxy = proxy self.proxy_config = proxy_config if isinstance(self.proxy_config, dict): self.proxy_config = ProxyConfig.from_dict(self.proxy_config) if isinstance(self.proxy_config, str): self.proxy_config = ProxyConfig.from_string(self.proxy_config) - + + if self.proxy and self.proxy_config: + warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning) + self.proxy = None + elif self.proxy: + # Convert proxy string to ProxyConfig if proxy_config is not provided + self.proxy_config = ProxyConfig.from_string(self.proxy) + self.proxy = None self.viewport_width = viewport_width self.viewport_height = viewport_height diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 8fed970c..48737fcb 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -15,6 +15,7 @@ from .js_snippet import load_js_script from .config import DOWNLOAD_PAGE_TIMEOUT from .async_configs import BrowserConfig, CrawlerRunConfig from .utils import get_chromium_path +import warnings BROWSER_DISABLE_OPTIONS = [ @@ -741,17 +742,18 @@ class BrowserManager: ) os.makedirs(browser_args["downloads_path"], exist_ok=True) - if self.config.proxy or self.config.proxy_config: + if self.config.proxy: + warnings.warn( + "BrowserConfig.proxy is deprecated and ignored. 
Use proxy_config instead.", + DeprecationWarning, + ) + if self.config.proxy_config: from playwright.async_api import ProxySettings - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.server, - username=self.config.proxy_config.username, - password=self.config.proxy_config.password, - ) + proxy_settings = ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, ) browser_args["proxy"] = proxy_settings diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index eb29b94c..c18fbc78 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -7520,17 +7520,18 @@ class BrowserManager: ) os.makedirs(browser_args["downloads_path"], exist_ok=True) - if self.config.proxy or self.config.proxy_config: + if self.config.proxy: + warnings.warn( + "BrowserConfig.proxy is deprecated and ignored. 
Use proxy_config instead.", + DeprecationWarning, + ) + if self.config.proxy_config: from playwright.async_api import ProxySettings - proxy_settings = ( - ProxySettings(server=self.config.proxy) - if self.config.proxy - else ProxySettings( - server=self.config.proxy_config.server, - username=self.config.proxy_config.username, - password=self.config.proxy_config.password, - ) + proxy_settings = ProxySettings( + server=self.config.proxy_config.server, + username=self.config.proxy_config.username, + password=self.config.proxy_config.password, ) browser_args["proxy"] = proxy_settings diff --git a/docs/md_v2/advanced/proxy-security.md b/docs/md_v2/advanced/proxy-security.md index 13191cd7..d1c868b2 100644 --- a/docs/md_v2/advanced/proxy-security.md +++ b/docs/md_v2/advanced/proxy-security.md @@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`: ```python from crawl4ai.async_configs import BrowserConfig -# Using proxy URL -browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") +# Using HTTP proxy +browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"}) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") # Using SOCKS proxy -browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080") +browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"}) async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url="https://example.com") ``` @@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`: ```python from crawl4ai.async_configs import BrowserConfig -browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]") +browser_config = BrowserConfig(proxy_config={ + "server": "http://[host]:[port]", + "username": "[username]", + "password": "[password]", +}) async with AsyncWebCrawler(config=browser_config) as crawler: result = await 
crawler.arun(url="https://example.com") ``` diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 47f719c8..368f53fb 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -23,7 +23,7 @@ browser_cfg = BrowserConfig( | **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. | | **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. | | **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). | -| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. | +| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. | | **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. | | **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. | | **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. 
| diff --git a/tests/async/test_0.4.2_browser_manager.py b/tests/async/test_0.4.2_browser_manager.py index 21b4be11..635b731f 100644 --- a/tests/async/test_0.4.2_browser_manager.py +++ b/tests/async/test_0.4.2_browser_manager.py @@ -112,7 +112,7 @@ async def test_proxy_settings(): headless=True, verbose=False, user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36", - proxy="http://127.0.0.1:8080", # Assuming local proxy server for test + proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test use_managed_browser=False, use_persistent_context=False, ) as crawler: diff --git a/tests/memory/test_docker_config_gen.py b/tests/memory/test_docker_config_gen.py index ae6e533c..41beb30a 100644 --- a/tests/memory/test_docker_config_gen.py +++ b/tests/memory/test_docker_config_gen.py @@ -24,7 +24,7 @@ CASES = [ # --- BrowserConfig variants --- "BrowserConfig()", "BrowserConfig(headless=False, extra_args=['--disable-gpu'])", - "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')", + "BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})", ] for code in CASES: diff --git a/tests/proxy/test_proxy_deprecation.py b/tests/proxy/test_proxy_deprecation.py new file mode 100644 index 00000000..95ccfc70 --- /dev/null +++ b/tests/proxy/test_proxy_deprecation.py @@ -0,0 +1,42 @@ +import warnings + +import pytest + +from crawl4ai.async_configs import BrowserConfig, ProxyConfig + + +def test_browser_config_proxy_string_emits_deprecation_and_autoconverts(): + warnings.simplefilter("always", DeprecationWarning) + + proxy_str = "23.95.150.145:6114:username:password" + with warnings.catch_warnings(record=True) as caught: + cfg = BrowserConfig(proxy=proxy_str, headless=True) + + dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)] + assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)" + + assert cfg.proxy is None, "cfg.proxy should be None after 
auto-conversion" + assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance" + assert cfg.proxy_config.username == "username" + assert cfg.proxy_config.password == "password" + assert cfg.proxy_config.server.startswith("http://") + assert cfg.proxy_config.server.endswith(":6114") + + +def test_browser_config_with_proxy_config_emits_no_deprecation(): + warnings.simplefilter("always", DeprecationWarning) + + with warnings.catch_warnings(record=True) as caught: + cfg = BrowserConfig( + headless=True, + proxy_config={ + "server": "http://127.0.0.1:8080", + "username": "u", + "password": "p", + }, + ) + + dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)] + assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config" + assert cfg.proxy is None + assert isinstance(cfg.proxy_config, ProxyConfig)