From 89cf5aba2bf7ed471f3d6e3828b5e39d616d6247 Mon Sep 17 00:00:00 2001 From: AHMET YILMAZ Date: Wed, 6 Aug 2025 18:34:23 +0800 Subject: [PATCH] #1057 : enhance ProxyConfig initialization to support dict and string formats --- crawl4ai/async_configs.py | 9 + tests/proxy/test_proxy_config.py | 582 +++++++++++++++++++++++++++++++ 2 files changed, 591 insertions(+) create mode 100644 tests/proxy/test_proxy_config.py diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 3c70d634..042969a8 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -448,6 +448,10 @@ class BrowserConfig: self.chrome_channel = "" self.proxy = proxy self.proxy_config = proxy_config + if isinstance(self.proxy_config, dict): + self.proxy_config = ProxyConfig.from_dict(self.proxy_config) + if isinstance(self.proxy_config, str): + self.proxy_config = ProxyConfig.from_string(self.proxy_config) self.viewport_width = viewport_width @@ -1159,6 +1163,11 @@ class CrawlerRunConfig(): self.parser_type = parser_type self.scraping_strategy = scraping_strategy or LXMLWebScrapingStrategy() self.proxy_config = proxy_config + if isinstance(proxy_config, dict): + self.proxy_config = ProxyConfig.from_dict(proxy_config) + if isinstance(proxy_config, str): + self.proxy_config = ProxyConfig.from_string(proxy_config) + self.proxy_rotation_strategy = proxy_rotation_strategy # Browser Location and Identity Parameters diff --git a/tests/proxy/test_proxy_config.py b/tests/proxy/test_proxy_config.py new file mode 100644 index 00000000..ebea5676 --- /dev/null +++ b/tests/proxy/test_proxy_config.py @@ -0,0 +1,582 @@ +""" +Comprehensive test suite for ProxyConfig in different forms: +1. String form (ip:port:username:password) +2. Dict form (dictionary with keys) +3. Object form (ProxyConfig instance) +4. Environment variable form (from env vars) + +Tests cover all possible scenarios and edge cases using pytest. +""" + +import asyncio +import os +import pytest +import tempfile +from unittest.mock import patch + +from crawl4ai import AsyncWebCrawler, BrowserConfig +from crawl4ai.async_configs import CrawlerRunConfig, ProxyConfig +from crawl4ai.cache_context import CacheMode + + +class TestProxyConfig: + """Comprehensive test suite for ProxyConfig functionality.""" + + # Test data for different scenarios + # get free proxy server from from webshare.io https://www.webshare.io/?referral_code=3sqog0y1fvsl + TEST_PROXY_DATA = { + "server": "", + "username": "", + "password": "", + "ip": "" + } + + def setup_method(self): + """Setup for each test method.""" + self.test_url = "https://httpbin.org/ip" # Use httpbin for testing + + # ==================== OBJECT FORM TESTS ==================== + + def test_proxy_config_object_creation_basic(self): + """Test basic ProxyConfig object creation.""" + proxy = ProxyConfig(server="127.0.0.1:8080") + assert proxy.server == "127.0.0.1:8080" + assert proxy.username is None + assert proxy.password is None + assert proxy.ip == "127.0.0.1" # Should auto-extract IP + + def test_proxy_config_object_creation_full(self): + """Test ProxyConfig object creation with all parameters.""" + proxy = ProxyConfig( + server=f"http://{self.TEST_PROXY_DATA['server']}", + username=self.TEST_PROXY_DATA['username'], + password=self.TEST_PROXY_DATA['password'], + ip=self.TEST_PROXY_DATA['ip'] + ) + assert proxy.server == f"http://{self.TEST_PROXY_DATA['server']}" + assert proxy.username == self.TEST_PROXY_DATA['username'] + assert proxy.password == self.TEST_PROXY_DATA['password'] + assert proxy.ip == self.TEST_PROXY_DATA['ip'] + + def test_proxy_config_object_ip_extraction(self): + """Test automatic IP extraction from server URL.""" + test_cases = [ + ("http://192.168.1.1:8080", "192.168.1.1"), + ("https://10.0.0.1:3128", "10.0.0.1"), + ("192.168.1.100:8080", "192.168.1.100"), + ("proxy.example.com:8080", "proxy.example.com"), + ] + + for server, expected_ip in test_cases: + proxy = ProxyConfig(server=server) + assert proxy.ip == expected_ip, f"Failed for server: {server}" + + def test_proxy_config_object_invalid_server(self): + """Test ProxyConfig with invalid server formats.""" + # Should not raise exception but may not extract IP properly + proxy = ProxyConfig(server="invalid-format") + assert proxy.server == "invalid-format" + # IP extraction might fail but object should still be created + + # ==================== DICT FORM TESTS ==================== + + def test_proxy_config_from_dict_basic(self): + """Test creating ProxyConfig from basic dictionary.""" + proxy_dict = {"server": "127.0.0.1:8080"} + proxy = ProxyConfig.from_dict(proxy_dict) + assert proxy.server == "127.0.0.1:8080" + assert proxy.username is None + assert proxy.password is None + + def test_proxy_config_from_dict_full(self): + """Test creating ProxyConfig from complete dictionary.""" + proxy_dict = { + "server": f"http://{self.TEST_PROXY_DATA['server']}", + "username": self.TEST_PROXY_DATA['username'], + "password": self.TEST_PROXY_DATA['password'], + "ip": self.TEST_PROXY_DATA['ip'] + } + proxy = ProxyConfig.from_dict(proxy_dict) + assert proxy.server == proxy_dict["server"] + assert proxy.username == proxy_dict["username"] + assert proxy.password == proxy_dict["password"] + assert proxy.ip == proxy_dict["ip"] + + def test_proxy_config_from_dict_missing_keys(self): + """Test creating ProxyConfig from dictionary with missing keys.""" + proxy_dict = {"server": "127.0.0.1:8080", "username": "user"} + proxy = ProxyConfig.from_dict(proxy_dict) + assert proxy.server == "127.0.0.1:8080" + assert proxy.username == "user" + assert proxy.password is None + assert proxy.ip == "127.0.0.1" # Should auto-extract + + def test_proxy_config_from_dict_empty(self): + """Test creating ProxyConfig from empty dictionary.""" + proxy_dict = {} + proxy = ProxyConfig.from_dict(proxy_dict) + assert proxy.server is None + assert proxy.username is None + assert proxy.password is None + assert proxy.ip is None + + def test_proxy_config_from_dict_none_values(self): + """Test creating ProxyConfig from dictionary with None values.""" + proxy_dict = { + "server": "127.0.0.1:8080", + "username": None, + "password": None, + "ip": None + } + proxy = ProxyConfig.from_dict(proxy_dict) + assert proxy.server == "127.0.0.1:8080" + assert proxy.username is None + assert proxy.password is None + assert proxy.ip == "127.0.0.1" # Should auto-extract despite None + + # ==================== STRING FORM TESTS ==================== + + def test_proxy_config_from_string_full_format(self): + """Test creating ProxyConfig from full string format (ip:port:username:password).""" + proxy_str = f"{self.TEST_PROXY_DATA['ip']}:6114:{self.TEST_PROXY_DATA['username']}:{self.TEST_PROXY_DATA['password']}" + proxy = ProxyConfig.from_string(proxy_str) + assert proxy.server == f"http://{self.TEST_PROXY_DATA['ip']}:6114" + assert proxy.username == self.TEST_PROXY_DATA['username'] + assert proxy.password == self.TEST_PROXY_DATA['password'] + assert proxy.ip == self.TEST_PROXY_DATA['ip'] + + def test_proxy_config_from_string_ip_port_only(self): + """Test creating ProxyConfig from string with only ip:port.""" + proxy_str = "192.168.1.1:8080" + proxy = ProxyConfig.from_string(proxy_str) + assert proxy.server == "http://192.168.1.1:8080" + assert proxy.username is None + assert proxy.password is None + assert proxy.ip == "192.168.1.1" + + def test_proxy_config_from_string_invalid_format(self): + """Test creating ProxyConfig from invalid string formats.""" + invalid_formats = [ + "invalid", + "ip:port:user", # Missing password (3 parts) + "ip:port:user:pass:extra", # Too many parts (5 parts) + "", + "::", # Empty parts but 3 total (invalid) + "::::", # Empty parts but 5 total (invalid) + ] + + for proxy_str in invalid_formats: + with pytest.raises(ValueError, match="Invalid proxy string format"): + ProxyConfig.from_string(proxy_str) + + def test_proxy_config_from_string_edge_cases_that_work(self): + """Test string formats that should work but might be edge cases.""" + # These cases actually work as valid formats + edge_cases = [ + (":", "http://:", ""), # ip:port format with empty values + (":::", "http://:", ""), # ip:port:user:pass format with empty values + ] + + for proxy_str, expected_server, expected_ip in edge_cases: + proxy = ProxyConfig.from_string(proxy_str) + assert proxy.server == expected_server + assert proxy.ip == expected_ip + + def test_proxy_config_from_string_edge_cases(self): + """Test string parsing edge cases.""" + # Test with different port numbers + proxy_str = "10.0.0.1:3128:user:pass" + proxy = ProxyConfig.from_string(proxy_str) + assert proxy.server == "http://10.0.0.1:3128" + + # Test with special characters in credentials + proxy_str = "10.0.0.1:8080:user@domain:pass:word" + with pytest.raises(ValueError): # Should fail due to extra colon in password + ProxyConfig.from_string(proxy_str) + + # ==================== ENVIRONMENT VARIABLE TESTS ==================== + + def test_proxy_config_from_env_single_proxy(self): + """Test loading single proxy from environment variable.""" + proxy_str = f"{self.TEST_PROXY_DATA['ip']}:6114:{self.TEST_PROXY_DATA['username']}:{self.TEST_PROXY_DATA['password']}" + + with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}): + proxies = ProxyConfig.from_env('TEST_PROXIES') + assert len(proxies) == 1 + proxy = proxies[0] + assert proxy.ip == self.TEST_PROXY_DATA['ip'] + assert proxy.username == self.TEST_PROXY_DATA['username'] + assert proxy.password == self.TEST_PROXY_DATA['password'] + + def test_proxy_config_from_env_multiple_proxies(self): + """Test loading multiple proxies from environment variable.""" + proxy_list = [ + "192.168.1.1:8080:user1:pass1", + "192.168.1.2:8080:user2:pass2", + "10.0.0.1:3128" # No auth + ] + proxy_str = ",".join(proxy_list) + + with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}): + proxies = ProxyConfig.from_env('TEST_PROXIES') + assert len(proxies) == 3 + + # Check first proxy + assert proxies[0].ip == "192.168.1.1" + assert proxies[0].username == "user1" + assert proxies[0].password == "pass1" + + # Check second proxy + assert proxies[1].ip == "192.168.1.2" + assert proxies[1].username == "user2" + assert proxies[1].password == "pass2" + + # Check third proxy (no auth) + assert proxies[2].ip == "10.0.0.1" + assert proxies[2].username is None + assert proxies[2].password is None + + def test_proxy_config_from_env_empty_var(self): + """Test loading from empty environment variable.""" + with patch.dict(os.environ, {'TEST_PROXIES': ''}): + proxies = ProxyConfig.from_env('TEST_PROXIES') + assert len(proxies) == 0 + + def test_proxy_config_from_env_missing_var(self): + """Test loading from missing environment variable.""" + # Ensure the env var doesn't exist + with patch.dict(os.environ, {}, clear=True): + proxies = ProxyConfig.from_env('NON_EXISTENT_VAR') + assert len(proxies) == 0 + + def test_proxy_config_from_env_with_empty_entries(self): + """Test loading proxies with empty entries in the list.""" + proxy_str = "192.168.1.1:8080:user:pass,,10.0.0.1:3128," + + with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}): + proxies = ProxyConfig.from_env('TEST_PROXIES') + assert len(proxies) == 2 # Empty entries should be skipped + assert proxies[0].ip == "192.168.1.1" + assert proxies[1].ip == "10.0.0.1" + + def test_proxy_config_from_env_with_invalid_entries(self): + """Test loading proxies with some invalid entries.""" + proxy_str = "192.168.1.1:8080:user:pass,invalid_proxy,10.0.0.1:3128" + + with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}): + # Should handle errors gracefully and return valid proxies + proxies = ProxyConfig.from_env('TEST_PROXIES') + # Depending on implementation, might return partial list or empty + # This tests error handling + assert isinstance(proxies, list) + + # ==================== SERIALIZATION TESTS ==================== + + def test_proxy_config_to_dict(self): + """Test converting ProxyConfig to dictionary.""" + proxy = ProxyConfig( + server=f"http://{self.TEST_PROXY_DATA['server']}", + username=self.TEST_PROXY_DATA['username'], + password=self.TEST_PROXY_DATA['password'], + ip=self.TEST_PROXY_DATA['ip'] + ) + + result_dict = proxy.to_dict() + expected = { + "server": f"http://{self.TEST_PROXY_DATA['server']}", + "username": self.TEST_PROXY_DATA['username'], + "password": self.TEST_PROXY_DATA['password'], + "ip": self.TEST_PROXY_DATA['ip'] + } + assert result_dict == expected + + def test_proxy_config_clone(self): + """Test cloning ProxyConfig with modifications.""" + original = ProxyConfig( + server="http://127.0.0.1:8080", + username="user", + password="pass" + ) + + # Clone with modifications + cloned = original.clone(username="new_user", password="new_pass") + + # Original should be unchanged + assert original.username == "user" + assert original.password == "pass" + + # Clone should have new values + assert cloned.username == "new_user" + assert cloned.password == "new_pass" + assert cloned.server == original.server # Unchanged value + + def test_proxy_config_roundtrip_serialization(self): + """Test that ProxyConfig can be serialized and deserialized without loss.""" + original = ProxyConfig( + server=f"http://{self.TEST_PROXY_DATA['server']}", + username=self.TEST_PROXY_DATA['username'], + password=self.TEST_PROXY_DATA['password'], + ip=self.TEST_PROXY_DATA['ip'] + ) + + # Serialize to dict and back + serialized = original.to_dict() + deserialized = ProxyConfig.from_dict(serialized) + + assert deserialized.server == original.server + assert deserialized.username == original.username + assert deserialized.password == original.password + assert deserialized.ip == original.ip + + # ==================== INTEGRATION TESTS ==================== + + @pytest.mark.asyncio + async def test_crawler_with_proxy_config_object(self): + """Test AsyncWebCrawler with ProxyConfig object.""" + proxy_config = ProxyConfig( + server=f"http://{self.TEST_PROXY_DATA['server']}", + username=self.TEST_PROXY_DATA['username'], + password=self.TEST_PROXY_DATA['password'] + ) + + browser_config = BrowserConfig(headless=True) + + # Test that the crawler accepts the ProxyConfig object without errors + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + # Note: This might fail due to actual proxy connection, but should not fail due to config issues + result = await crawler.arun( + url=self.test_url, + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config, + page_timeout=10000 # Short timeout for testing + ) + ) + # If we get here, proxy config was accepted + assert result is not None + except Exception as e: + # We expect connection errors with test proxies, but not config errors + error_msg = str(e).lower() + assert "attribute" not in error_msg, f"Config error: {e}" + assert "proxy_config" not in error_msg, f"Proxy config error: {e}" + + @pytest.mark.asyncio + async def test_crawler_with_proxy_config_dict(self): + """Test AsyncWebCrawler with ProxyConfig from dictionary.""" + proxy_dict = { + "server": f"http://{self.TEST_PROXY_DATA['server']}", + "username": self.TEST_PROXY_DATA['username'], + "password": self.TEST_PROXY_DATA['password'] + } + proxy_config = ProxyConfig.from_dict(proxy_dict) + + browser_config = BrowserConfig(headless=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + result = await crawler.arun( + url=self.test_url, + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config, + page_timeout=10000 + ) + ) + assert result is not None + except Exception as e: + error_msg = str(e).lower() + assert "attribute" not in error_msg, f"Config error: {e}" + + @pytest.mark.asyncio + async def test_crawler_with_proxy_config_from_string(self): + """Test AsyncWebCrawler with ProxyConfig from string.""" + proxy_str = f"{self.TEST_PROXY_DATA['ip']}:6114:{self.TEST_PROXY_DATA['username']}:{self.TEST_PROXY_DATA['password']}" + proxy_config = ProxyConfig.from_string(proxy_str) + + browser_config = BrowserConfig(headless=True) + + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + result = await crawler.arun( + url=self.test_url, + config=CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config, + page_timeout=10000 + ) + ) + assert result is not None + except Exception as e: + error_msg = str(e).lower() + assert "attribute" not in error_msg, f"Config error: {e}" + + # ==================== EDGE CASES AND ERROR HANDLING ==================== + + def test_proxy_config_with_none_server(self): + """Test ProxyConfig behavior with None server.""" + proxy = ProxyConfig(server=None) + assert proxy.server is None + assert proxy.ip is None # Should not crash + + def test_proxy_config_with_empty_string_server(self): + """Test ProxyConfig behavior with empty string server.""" + proxy = ProxyConfig(server="") + assert proxy.server == "" + assert proxy.ip is None or proxy.ip == "" + + def test_proxy_config_special_characters_in_credentials(self): + """Test ProxyConfig with special characters in username/password.""" + special_chars_tests = [ + ("user@domain.com", "pass!@#$%"), + ("user_123", "p@ssw0rd"), + ("user-test", "pass-word"), + ] + + for username, password in special_chars_tests: + proxy = ProxyConfig( + server="http://127.0.0.1:8080", + username=username, + password=password + ) + assert proxy.username == username + assert proxy.password == password + + def test_proxy_config_unicode_handling(self): + """Test ProxyConfig with unicode characters.""" + proxy = ProxyConfig( + server="http://127.0.0.1:8080", + username="ユーザー", # Japanese characters + password="пароль" # Cyrillic characters + ) + assert proxy.username == "ユーザー" + assert proxy.password == "пароль" + + # ==================== PERFORMANCE TESTS ==================== + + def test_proxy_config_creation_performance(self): + """Test that ProxyConfig creation is reasonably fast.""" + import time + + start_time = time.time() + for i in range(1000): + proxy = ProxyConfig( + server=f"http://192.168.1.{i % 255}:8080", + username=f"user{i}", + password=f"pass{i}" + ) + end_time = time.time() + + # Should be able to create 1000 configs in less than 1 second + assert (end_time - start_time) < 1.0 + + def test_proxy_config_from_env_performance(self): + """Test that loading many proxies from env is reasonably fast.""" + import time + + # Create a large list of proxy strings + proxy_list = [f"192.168.1.{i}:8080:user{i}:pass{i}" for i in range(100)] + proxy_str = ",".join(proxy_list) + + with patch.dict(os.environ, {'PERF_TEST_PROXIES': proxy_str}): + start_time = time.time() + proxies = ProxyConfig.from_env('PERF_TEST_PROXIES') + end_time = time.time() + + assert len(proxies) == 100 + # Should be able to parse 100 proxies in less than 1 second + assert (end_time - start_time) < 1.0 + + +# ==================== STANDALONE TEST FUNCTIONS ==================== + +@pytest.mark.asyncio +async def test_dict_proxy(): + """Original test function for dict proxy - kept for backward compatibility.""" + proxy_config = { + "server": "23.95.150.145:6114", + "username": "cfyswbwn", + "password": "1gs266hoqysi" + } + proxy_config_obj = ProxyConfig.from_dict(proxy_config) + + browser_config = BrowserConfig(headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + result = await crawler.arun(url="https://httpbin.org/ip", config=CrawlerRunConfig( + stream=False, + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config_obj, + page_timeout=10000 + )) + print("Dict proxy test passed!") + print(result.markdown[:200] if result and result.markdown else "No result") + except Exception as e: + print(f"Dict proxy test error (expected): {e}") + + +@pytest.mark.asyncio +async def test_string_proxy(): + """Test function for string proxy format.""" + proxy_str = "23.95.150.145:6114:cfyswbwn:1gs266hoqysi" + proxy_config_obj = ProxyConfig.from_string(proxy_str) + + browser_config = BrowserConfig(headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + result = await crawler.arun(url="https://httpbin.org/ip", config=CrawlerRunConfig( + stream=False, + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config_obj, + page_timeout=10000 + )) + print("String proxy test passed!") + print(result.markdown[:200] if result and result.markdown else "No result") + except Exception as e: + print(f"String proxy test error (expected): {e}") + + +@pytest.mark.asyncio +async def test_env_proxy(): + """Test function for environment variable proxy.""" + # Set environment variable + os.environ['TEST_PROXIES'] = "23.95.150.145:6114:cfyswbwn:1gs266hoqysi" + + proxies = ProxyConfig.from_env('TEST_PROXIES') + if proxies: + proxy_config_obj = proxies[0] # Use first proxy + + browser_config = BrowserConfig(headless=True) + async with AsyncWebCrawler(config=browser_config) as crawler: + try: + result = await crawler.arun(url="https://httpbin.org/ip", config=CrawlerRunConfig( + stream=False, + cache_mode=CacheMode.BYPASS, + proxy_config=proxy_config_obj, + page_timeout=10000 + )) + print("Environment proxy test passed!") + print(result.markdown[:200] if result and result.markdown else "No result") + except Exception as e: + print(f"Environment proxy test error (expected): {e}") + else: + print("No proxies loaded from environment") + + +if __name__ == "__main__": + print("Running comprehensive ProxyConfig tests...") + print("=" * 50) + + # Run the standalone test functions + print("\n1. Testing dict proxy format...") + asyncio.run(test_dict_proxy()) + + print("\n2. Testing string proxy format...") + asyncio.run(test_string_proxy()) + + print("\n3. Testing environment variable proxy format...") + asyncio.run(test_env_proxy()) + + print("\n" + "=" * 50) + print("To run the full pytest suite, use: pytest " + __file__) + print("=" * 50) \ No newline at end of file