"""Comprehensive test suite for ProxyConfig in different forms.

1. String form (ip:port:username:password)
2. Dict form (dictionary with keys)
3. Object form (ProxyConfig instance)
4. Environment variable form (from env vars)

Tests cover all possible scenarios and edge cases using pytest.
"""

import asyncio
import os
import tempfile
import time
from unittest.mock import patch

import pytest

from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.async_configs import CrawlerRunConfig, ProxyConfig
from crawl4ai.cache_context import CacheMode


class TestProxyConfig:
    """Comprehensive test suite for ProxyConfig functionality."""

    # Test data for different scenarios.
    # Placeholder values: get a free proxy server from webshare.io
    # https://www.webshare.io/?referral_code=3sqog0y1fvsl
    TEST_PROXY_DATA = {
        "server": "",
        "username": "",
        "password": "",
        "ip": "",
    }

    def setup_method(self):
        """Setup for each test method."""
        self.test_url = "https://httpbin.org/ip"  # Use httpbin for testing

    # ==================== HELPERS ====================

    def _full_proxy_string(self):
        """Return TEST_PROXY_DATA rendered as ``ip:6114:username:password``."""
        data = self.TEST_PROXY_DATA
        return f"{data['ip']}:6114:{data['username']}:{data['password']}"

    async def _assert_crawler_accepts_proxy(
        self, proxy_config, *, reject_proxy_config_mention=False
    ):
        """Run one crawl through ``proxy_config`` and check it is accepted.

        Connection failures are tolerated (the test proxy may be
        unreachable); only errors whose message looks like a configuration
        problem fail the test.

        Args:
            proxy_config: The ProxyConfig instance handed to CrawlerRunConfig.
            reject_proxy_config_mention: When True, an error message that
                mentions ``proxy_config`` also fails the test.
        """
        browser_config = BrowserConfig(headless=True)
        async with AsyncWebCrawler(config=browser_config) as crawler:
            try:
                # Note: This might fail due to actual proxy connection, but
                # should not fail due to config issues.
                result = await crawler.arun(
                    url=self.test_url,
                    config=CrawlerRunConfig(
                        cache_mode=CacheMode.BYPASS,
                        proxy_config=proxy_config,
                        page_timeout=10000,  # Short timeout for testing
                    ),
                )
            except Exception as e:
                # We expect connection errors with test proxies, but not
                # config errors.
                error_msg = str(e).lower()
                assert "attribute" not in error_msg, f"Config error: {e}"
                if reject_proxy_config_mention:
                    assert "proxy_config" not in error_msg, f"Proxy config error: {e}"
            else:
                # Checked outside the ``try`` so that a failing assertion is
                # not swallowed by the broad ``except Exception`` above
                # (AssertionError is an Exception subclass).
                assert result is not None

    # ==================== OBJECT FORM TESTS ====================

    def test_proxy_config_object_creation_basic(self):
        """Test basic ProxyConfig object creation."""
        proxy = ProxyConfig(server="127.0.0.1:8080")

        assert proxy.server == "127.0.0.1:8080"
        assert proxy.username is None
        assert proxy.password is None
        assert proxy.ip == "127.0.0.1"  # Should auto-extract IP

    def test_proxy_config_object_creation_full(self):
        """Test ProxyConfig object creation with all parameters."""
        data = self.TEST_PROXY_DATA
        proxy = ProxyConfig(
            server=f"http://{data['server']}",
            username=data['username'],
            password=data['password'],
            ip=data['ip'],
        )

        assert proxy.server == f"http://{data['server']}"
        assert proxy.username == data['username']
        assert proxy.password == data['password']
        assert proxy.ip == data['ip']

    def test_proxy_config_object_ip_extraction(self):
        """Test automatic IP extraction from server URL."""
        test_cases = [
            ("http://192.168.1.1:8080", "192.168.1.1"),
            ("https://10.0.0.1:3128", "10.0.0.1"),
            ("192.168.1.100:8080", "192.168.1.100"),
            ("proxy.example.com:8080", "proxy.example.com"),
        ]

        for server, expected_ip in test_cases:
            proxy = ProxyConfig(server=server)
            assert proxy.ip == expected_ip, f"Failed for server: {server}"

    def test_proxy_config_object_invalid_server(self):
        """Test ProxyConfig with invalid server formats."""
        # Should not raise exception but may not extract IP properly
        proxy = ProxyConfig(server="invalid-format")
        assert proxy.server == "invalid-format"
        # IP extraction might fail but object should still be created

    # ==================== DICT FORM TESTS ====================

    def test_proxy_config_from_dict_basic(self):
        """Test creating ProxyConfig from basic dictionary."""
        proxy_dict = {"server": "127.0.0.1:8080"}
        proxy = ProxyConfig.from_dict(proxy_dict)

        assert proxy.server == "127.0.0.1:8080"
        assert proxy.username is None
        assert proxy.password is None

    def test_proxy_config_from_dict_full(self):
        """Test creating ProxyConfig from complete dictionary."""
        data = self.TEST_PROXY_DATA
        proxy_dict = {
            "server": f"http://{data['server']}",
            "username": data['username'],
            "password": data['password'],
            "ip": data['ip'],
        }
        proxy = ProxyConfig.from_dict(proxy_dict)

        assert proxy.server == proxy_dict["server"]
        assert proxy.username == proxy_dict["username"]
        assert proxy.password == proxy_dict["password"]
        assert proxy.ip == proxy_dict["ip"]

    def test_proxy_config_from_dict_missing_keys(self):
        """Test creating ProxyConfig from dictionary with missing keys."""
        proxy_dict = {"server": "127.0.0.1:8080", "username": "user"}
        proxy = ProxyConfig.from_dict(proxy_dict)

        assert proxy.server == "127.0.0.1:8080"
        assert proxy.username == "user"
        assert proxy.password is None
        assert proxy.ip == "127.0.0.1"  # Should auto-extract

    def test_proxy_config_from_dict_empty(self):
        """Test creating ProxyConfig from empty dictionary."""
        proxy_dict = {}
        proxy = ProxyConfig.from_dict(proxy_dict)

        assert proxy.server is None
        assert proxy.username is None
        assert proxy.password is None
        assert proxy.ip is None

    def test_proxy_config_from_dict_none_values(self):
        """Test creating ProxyConfig from dictionary with None values."""
        proxy_dict = {
            "server": "127.0.0.1:8080",
            "username": None,
            "password": None,
            "ip": None,
        }
        proxy = ProxyConfig.from_dict(proxy_dict)

        assert proxy.server == "127.0.0.1:8080"
        assert proxy.username is None
        assert proxy.password is None
        assert proxy.ip == "127.0.0.1"  # Should auto-extract despite None

    # ==================== STRING FORM TESTS ====================

    def test_proxy_config_from_string_full_format(self):
        """Test creating ProxyConfig from full string format (ip:port:username:password)."""
        data = self.TEST_PROXY_DATA
        proxy = ProxyConfig.from_string(self._full_proxy_string())

        assert proxy.server == f"http://{data['ip']}:6114"
        assert proxy.username == data['username']
        assert proxy.password == data['password']
        assert proxy.ip == data['ip']

    def test_proxy_config_from_string_ip_port_only(self):
        """Test creating ProxyConfig from string with only ip:port."""
        proxy_str = "192.168.1.1:8080"
        proxy = ProxyConfig.from_string(proxy_str)

        assert proxy.server == "http://192.168.1.1:8080"
        assert proxy.username is None
        assert proxy.password is None
        assert proxy.ip == "192.168.1.1"

    def test_proxy_config_from_string_invalid_format(self):
        """Test creating ProxyConfig from invalid string formats."""
        invalid_formats = [
            "invalid",
            "ip:port:user",  # Missing password (3 parts)
            "ip:port:user:pass:extra",  # Too many parts (5 parts)
            "",
            "::",  # Empty parts but 3 total (invalid)
            "::::",  # Empty parts but 5 total (invalid)
        ]

        for proxy_str in invalid_formats:
            with pytest.raises(ValueError, match="Invalid proxy string format"):
                ProxyConfig.from_string(proxy_str)

    def test_proxy_config_from_string_edge_cases_that_work(self):
        """Test string formats that should work but might be edge cases."""
        # These cases actually work as valid formats
        edge_cases = [
            (":", "http://:", ""),  # ip:port format with empty values
            (":::", "http://:", ""),  # ip:port:user:pass format with empty values
        ]

        for proxy_str, expected_server, expected_ip in edge_cases:
            proxy = ProxyConfig.from_string(proxy_str)
            assert proxy.server == expected_server
            assert proxy.ip == expected_ip

    def test_proxy_config_from_string_edge_cases(self):
        """Test string parsing edge cases."""
        # Test with different port numbers
        proxy_str = "10.0.0.1:3128:user:pass"
        proxy = ProxyConfig.from_string(proxy_str)
        assert proxy.server == "http://10.0.0.1:3128"

        # Test with special characters in credentials
        proxy_str = "10.0.0.1:8080:user@domain:pass:word"
        with pytest.raises(ValueError):  # Should fail due to extra colon in password
            ProxyConfig.from_string(proxy_str)

    # ==================== ENVIRONMENT VARIABLE TESTS ====================

    def test_proxy_config_from_env_single_proxy(self):
        """Test loading single proxy from environment variable."""
        data = self.TEST_PROXY_DATA
        proxy_str = self._full_proxy_string()

        with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}):
            proxies = ProxyConfig.from_env('TEST_PROXIES')

            assert len(proxies) == 1
            proxy = proxies[0]
            assert proxy.ip == data['ip']
            assert proxy.username == data['username']
            assert proxy.password == data['password']

    def test_proxy_config_from_env_multiple_proxies(self):
        """Test loading multiple proxies from environment variable."""
        proxy_list = [
            "192.168.1.1:8080:user1:pass1",
            "192.168.1.2:8080:user2:pass2",
            "10.0.0.1:3128",  # No auth
        ]
        proxy_str = ",".join(proxy_list)

        with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}):
            proxies = ProxyConfig.from_env('TEST_PROXIES')

            assert len(proxies) == 3

            # Check first proxy
            assert proxies[0].ip == "192.168.1.1"
            assert proxies[0].username == "user1"
            assert proxies[0].password == "pass1"

            # Check second proxy
            assert proxies[1].ip == "192.168.1.2"
            assert proxies[1].username == "user2"
            assert proxies[1].password == "pass2"

            # Check third proxy (no auth)
            assert proxies[2].ip == "10.0.0.1"
            assert proxies[2].username is None
            assert proxies[2].password is None

    def test_proxy_config_from_env_empty_var(self):
        """Test loading from empty environment variable."""
        with patch.dict(os.environ, {'TEST_PROXIES': ''}):
            proxies = ProxyConfig.from_env('TEST_PROXIES')
            assert len(proxies) == 0

    def test_proxy_config_from_env_missing_var(self):
        """Test loading from missing environment variable."""
        # Ensure the env var doesn't exist
        with patch.dict(os.environ, {}, clear=True):
            proxies = ProxyConfig.from_env('NON_EXISTENT_VAR')
            assert len(proxies) == 0

    def test_proxy_config_from_env_with_empty_entries(self):
        """Test loading proxies with empty entries in the list."""
        proxy_str = "192.168.1.1:8080:user:pass,,10.0.0.1:3128,"

        with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}):
            proxies = ProxyConfig.from_env('TEST_PROXIES')

            assert len(proxies) == 2  # Empty entries should be skipped
            assert proxies[0].ip == "192.168.1.1"
            assert proxies[1].ip == "10.0.0.1"

    def test_proxy_config_from_env_with_invalid_entries(self):
        """Test loading proxies with some invalid entries."""
        proxy_str = "192.168.1.1:8080:user:pass,invalid_proxy,10.0.0.1:3128"

        with patch.dict(os.environ, {'TEST_PROXIES': proxy_str}):
            # Should handle errors gracefully and return valid proxies
            proxies = ProxyConfig.from_env('TEST_PROXIES')

            # Depending on implementation, might return partial list or empty
            # This tests error handling
            assert isinstance(proxies, list)

    # ==================== SERIALIZATION TESTS ====================

    def test_proxy_config_to_dict(self):
        """Test converting ProxyConfig to dictionary."""
        data = self.TEST_PROXY_DATA
        proxy = ProxyConfig(
            server=f"http://{data['server']}",
            username=data['username'],
            password=data['password'],
            ip=data['ip'],
        )

        result_dict = proxy.to_dict()

        expected = {
            "server": f"http://{data['server']}",
            "username": data['username'],
            "password": data['password'],
            "ip": data['ip'],
        }
        assert result_dict == expected

    def test_proxy_config_clone(self):
        """Test cloning ProxyConfig with modifications."""
        original = ProxyConfig(
            server="http://127.0.0.1:8080",
            username="user",
            password="pass",
        )

        # Clone with modifications
        cloned = original.clone(username="new_user", password="new_pass")

        # Original should be unchanged
        assert original.username == "user"
        assert original.password == "pass"

        # Clone should have new values
        assert cloned.username == "new_user"
        assert cloned.password == "new_pass"
        assert cloned.server == original.server  # Unchanged value

    def test_proxy_config_roundtrip_serialization(self):
        """Test that ProxyConfig can be serialized and deserialized without loss."""
        data = self.TEST_PROXY_DATA
        original = ProxyConfig(
            server=f"http://{data['server']}",
            username=data['username'],
            password=data['password'],
            ip=data['ip'],
        )

        # Serialize to dict and back
        serialized = original.to_dict()
        deserialized = ProxyConfig.from_dict(serialized)

        assert deserialized.server == original.server
        assert deserialized.username == original.username
        assert deserialized.password == original.password
        assert deserialized.ip == original.ip

    # ==================== INTEGRATION TESTS ====================

    @pytest.mark.asyncio
    async def test_crawler_with_proxy_config_object(self):
        """Test AsyncWebCrawler with ProxyConfig object."""
        data = self.TEST_PROXY_DATA
        proxy_config = ProxyConfig(
            server=f"http://{data['server']}",
            username=data['username'],
            password=data['password'],
        )

        # Test that the crawler accepts the ProxyConfig object without errors
        await self._assert_crawler_accepts_proxy(
            proxy_config, reject_proxy_config_mention=True
        )

    @pytest.mark.asyncio
    async def test_crawler_with_proxy_config_dict(self):
        """Test AsyncWebCrawler with ProxyConfig from dictionary."""
        data = self.TEST_PROXY_DATA
        proxy_dict = {
            "server": f"http://{data['server']}",
            "username": data['username'],
            "password": data['password'],
        }
        proxy_config = ProxyConfig.from_dict(proxy_dict)

        await self._assert_crawler_accepts_proxy(proxy_config)

    @pytest.mark.asyncio
    async def test_crawler_with_proxy_config_from_string(self):
        """Test AsyncWebCrawler with ProxyConfig from string."""
        proxy_config = ProxyConfig.from_string(self._full_proxy_string())

        await self._assert_crawler_accepts_proxy(proxy_config)

    # ==================== EDGE CASES AND ERROR HANDLING ====================

    def test_proxy_config_with_none_server(self):
        """Test ProxyConfig behavior with None server."""
        proxy = ProxyConfig(server=None)
        assert proxy.server is None
        assert proxy.ip is None  # Should not crash

    def test_proxy_config_with_empty_string_server(self):
        """Test ProxyConfig behavior with empty string server."""
        proxy = ProxyConfig(server="")
        assert proxy.server == ""
        assert proxy.ip is None or proxy.ip == ""

    def test_proxy_config_special_characters_in_credentials(self):
        """Test ProxyConfig with special characters in username/password."""
        special_chars_tests = [
            ("user@domain.com", "pass!@#$%"),
            ("user_123", "p@ssw0rd"),
            ("user-test", "pass-word"),
        ]

        for username, password in special_chars_tests:
            proxy = ProxyConfig(
                server="http://127.0.0.1:8080",
                username=username,
                password=password,
            )
            assert proxy.username == username
            assert proxy.password == password

    def test_proxy_config_unicode_handling(self):
        """Test ProxyConfig with unicode characters."""
        proxy = ProxyConfig(
            server="http://127.0.0.1:8080",
            username="ユーザー",  # Japanese characters
            password="пароль",  # Cyrillic characters
        )

        assert proxy.username == "ユーザー"
        assert proxy.password == "пароль"

    # ==================== PERFORMANCE TESTS ====================

    def test_proxy_config_creation_performance(self):
        """Test that ProxyConfig creation is reasonably fast."""
        # perf_counter is monotonic, so wall-clock adjustments cannot skew
        # the measured interval.
        start_time = time.perf_counter()
        for i in range(1000):
            ProxyConfig(
                server=f"http://192.168.1.{i % 255}:8080",
                username=f"user{i}",
                password=f"pass{i}",
            )
        elapsed = time.perf_counter() - start_time

        # Should be able to create 1000 configs in less than 1 second
        assert elapsed < 1.0

    def test_proxy_config_from_env_performance(self):
        """Test that loading many proxies from env is reasonably fast."""
        # Create a large list of proxy strings
        proxy_list = [f"192.168.1.{i}:8080:user{i}:pass{i}" for i in range(100)]
        proxy_str = ",".join(proxy_list)

        with patch.dict(os.environ, {'PERF_TEST_PROXIES': proxy_str}):
            start_time = time.perf_counter()
            proxies = ProxyConfig.from_env('PERF_TEST_PROXIES')
            elapsed = time.perf_counter() - start_time

            assert len(proxies) == 100
            # Should be able to parse 100 proxies in less than 1 second
            assert elapsed < 1.0


# ==================== STANDALONE TEST FUNCTIONS ====================

# NOTE(review): the standalone tests below embed a concrete proxy address and
# credentials in source. Confirm these are disposable; consider loading them
# from the environment instead.


async def _crawl_and_report(proxy_config_obj, label):
    """Best-effort crawl of httpbin through a proxy, printing the outcome.

    Errors raised by the crawl itself are printed rather than raised, since
    the proxy may not be reachable.

    Args:
        proxy_config_obj: ProxyConfig instance to route the request through.
        label: Prefix used in the printed messages (e.g. ``"Dict"``).
    """
    browser_config = BrowserConfig(headless=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        try:
            result = await crawler.arun(
                url="https://httpbin.org/ip",
                config=CrawlerRunConfig(
                    stream=False,
                    cache_mode=CacheMode.BYPASS,
                    proxy_config=proxy_config_obj,
                    page_timeout=10000,
                ),
            )
            print(f"{label} proxy test passed!")
            print(result.markdown[:200] if result and result.markdown else "No result")
        except Exception as e:
            print(f"{label} proxy test error (expected): {e}")


@pytest.mark.asyncio
async def test_dict_proxy():
    """Original test function for dict proxy - kept for backward compatibility."""
    proxy_config = {
        "server": "23.95.150.145:6114",
        "username": "cfyswbwn",
        "password": "1gs266hoqysi",
    }
    proxy_config_obj = ProxyConfig.from_dict(proxy_config)

    await _crawl_and_report(proxy_config_obj, "Dict")


@pytest.mark.asyncio
async def test_string_proxy():
    """Test function for string proxy format."""
    proxy_str = "23.95.150.145:6114:cfyswbwn:1gs266hoqysi"
    proxy_config_obj = ProxyConfig.from_string(proxy_str)

    await _crawl_and_report(proxy_config_obj, "String")


@pytest.mark.asyncio
async def test_env_proxy():
    """Test function for environment variable proxy."""
    # Set the environment variable only for the duration of the lookup so it
    # does not leak into other tests running in the same process.
    env = {'TEST_PROXIES': "23.95.150.145:6114:cfyswbwn:1gs266hoqysi"}
    with patch.dict(os.environ, env):
        proxies = ProxyConfig.from_env('TEST_PROXIES')

    if proxies:
        # Use first proxy
        await _crawl_and_report(proxies[0], "Environment")
    else:
        print("No proxies loaded from environment")


def main():
    """Run the standalone proxy tests directly, outside of pytest."""
    print("Running comprehensive ProxyConfig tests...")
    print("=" * 50)

    # Run the standalone test functions
    print("\n1. Testing dict proxy format...")
    asyncio.run(test_dict_proxy())

    print("\n2. Testing string proxy format...")
    asyncio.run(test_string_proxy())

    print("\n3. Testing environment variable proxy format...")
    asyncio.run(test_env_proxy())

    print("\n" + "=" * 50)
    print("To run the full pytest suite, use: pytest " + __file__)
    print("=" * 50)


if __name__ == "__main__":
    main()