- Implemented `test_adapter_verification.py` to verify correct usage of browser adapters.
- Created `test_all_features.py` for a comprehensive suite covering URL seeding, adaptive crawling, browser adapters, proxy rotation, and dispatchers.
- Developed `test_anti_bot_strategy.py` to validate the functionality of various anti-bot strategies.
- Added `test_antibot_simple.py` for simple testing of anti-bot strategies using async web crawling.
- Introduced `test_bot_detection.py` to assess adapter performance against bot detection mechanisms.
- Compiled `test_final_summary.py` to provide a detailed summary of all tests and their results.
#!/usr/bin/env python3
"""
Proxy Rotation Demo Script

This script demonstrates real-world usage scenarios for the proxy rotation feature.
It simulates actual user workflows and shows how to integrate proxy rotation
into your crawling tasks.

Usage:
    python demo_proxy_rotation.py

Requires: requests, colorama (pip install requests colorama)

Note: Update the proxy configuration with your actual proxy servers for real testing.
"""

import json
import time

import requests
from colorama import Fore, Style, init

# Initialize colorama for colored output
init(autoreset=True)

# Configuration
API_BASE_URL = "http://localhost:11235"

# Import the real proxy configuration if it is available
try:
    from real_proxy_config import REAL_PROXIES, PROXY_POOL_SMALL, PROXY_POOL_MEDIUM, PROXY_POOL_LARGE
    USE_REAL_PROXIES = True
    print(f"{Fore.GREEN}✅ Loaded {len(REAL_PROXIES)} real proxies from configuration{Style.RESET_ALL}")
except ImportError:
    # Fall back to placeholder proxies if real_proxy_config.py is not found
    REAL_PROXIES = [
        {"server": "http://proxy1.example.com:8080", "username": "user1", "password": "pass1"},
        {"server": "http://proxy2.example.com:8080", "username": "user2", "password": "pass2"},
        {"server": "http://proxy3.example.com:8080", "username": "user3", "password": "pass3"},
    ]
    PROXY_POOL_SMALL = REAL_PROXIES[:2]
    PROXY_POOL_MEDIUM = REAL_PROXIES[:2]
    PROXY_POOL_LARGE = REAL_PROXIES
    USE_REAL_PROXIES = False
    print(f"{Fore.YELLOW}⚠️ Using demo proxies (real_proxy_config.py not found){Style.RESET_ALL}")

# Alias for backward compatibility
DEMO_PROXIES = REAL_PROXIES

# To force demo mode (no proxy connections, just shows the API structure) even
# when real_proxy_config.py is present, uncomment the next line:
# USE_REAL_PROXIES = False

# Test URLs that help verify proxy rotation
TEST_URLS = [
    "https://httpbin.org/ip",          # Shows origin IP
    "https://httpbin.org/headers",     # Shows all headers
    "https://httpbin.org/user-agent",  # Shows user agent
]
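
# The script imports four names from `real_proxy_config.py`. A minimal sketch
# of that module, mirroring the fallback structure above (the servers and pool
# sizes are placeholders; use your own proxy inventory):
#
#   REAL_PROXIES = [
#       {"server": "http://your-proxy1.com:8080", "username": "user", "password": "pass"},
#       {"server": "http://your-proxy2.com:8080", "username": "user", "password": "pass"},
#       # ...
#   ]
#   PROXY_POOL_SMALL = REAL_PROXIES[:3]
#   PROXY_POOL_MEDIUM = REAL_PROXIES[:5]
#   PROXY_POOL_LARGE = REAL_PROXIES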


def print_header(text: str):
    """Print a formatted header"""
    print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
    print(f"{Fore.CYAN}{text.center(60)}{Style.RESET_ALL}")
    print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")


def print_success(text: str):
    """Print success message"""
    print(f"{Fore.GREEN}✅ {text}{Style.RESET_ALL}")


def print_info(text: str):
    """Print info message"""
    print(f"{Fore.BLUE}ℹ️ {text}{Style.RESET_ALL}")


def print_warning(text: str):
    """Print warning message"""
    print(f"{Fore.YELLOW}⚠️ {text}{Style.RESET_ALL}")


def print_error(text: str):
    """Print error message"""
    print(f"{Fore.RED}❌ {text}{Style.RESET_ALL}")


def check_server_health() -> bool:
    """Check if the Crawl4AI server is running"""
    try:
        response = requests.get(f"{API_BASE_URL}/health", timeout=5)
        if response.status_code == 200:
            print_success("Crawl4AI server is running")
            return True
        else:
            print_error(f"Server returned status code: {response.status_code}")
            return False
    except Exception as e:
        print_error(f"Cannot connect to server: {e}")
        print_warning("Make sure the Crawl4AI server is running on localhost:11235")
        return False
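
# A quick way to check the same endpoint by hand (assuming the server default
# used above):
#
#   curl http://localhost:11235/health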


def demo_1_basic_round_robin():
    """Demo 1: Basic proxy rotation with the round robin strategy"""
    print_header("Demo 1: Basic Round Robin Rotation")

    print_info("Use case: Even distribution across proxies for general crawling")
    print_info("Strategy: Round Robin - cycles through proxies sequentially\n")

    if USE_REAL_PROXIES:
        payload = {
            "urls": [TEST_URLS[0]],  # Just checking IP
            "proxy_rotation_strategy": "round_robin",
            "proxies": PROXY_POOL_SMALL,  # Use the small pool
            "headless": True,
            "browser_config": {
                "type": "BrowserConfig",
                "params": {"headless": True, "verbose": False}
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {"cache_mode": "bypass", "verbose": False}
            }
        }
    else:
        print_warning("Demo mode: Showing API structure without actual proxy connections")
        payload = {
            "urls": [TEST_URLS[0]],
            "headless": True,
            "browser_config": {
                "type": "BrowserConfig",
                "params": {"headless": True, "verbose": False}
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {"cache_mode": "bypass", "verbose": False}
            }
        }

    print(f"{Fore.YELLOW}Request payload:{Style.RESET_ALL}")
    print(json.dumps(payload, indent=2))

    if USE_REAL_PROXIES:
        print()
        print_info("With real proxies, the request will:")
        print_info("  1. Initialize RoundRobinProxyStrategy")
        print_info("  2. Cycle through proxy1 → proxy2 → ... and back to proxy1")
        print_info("  3. Each request uses the next proxy in sequence")

    try:
        start_time = time.time()
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=30)
        elapsed = time.time() - start_time

        if response.status_code == 200:
            data = response.json()
            print_success(f"Request completed in {elapsed:.2f} seconds")
            print_info(f"Results: {len(data.get('results', []))} URL(s) crawled")

            # Show a summary of the first result
            if data.get("results"):
                result = data["results"][0]
                print_info(f"Success: {result.get('success')}")
                print_info(f"URL: {result.get('url')}")

            if not USE_REAL_PROXIES:
                print()
                print_success("✨ API integration works! Add real proxies to test rotation.")
        else:
            print_error(f"Request failed: {response.status_code}")
            if "PROXY_CONNECTION_FAILED" in response.text:
                print_warning("Proxy connection failed - this is expected with example proxies")
                print_info("Add your real proxies in real_proxy_config.py to test with working proxies")
            else:
                print(response.text)

    except Exception as e:
        print_error(f"Error: {e}")
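
# Conceptually, round robin is just an endless cycle over the pool. A minimal
# illustration of the idea (not Crawl4AI's actual RoundRobinProxyStrategy):
#
#   from itertools import cycle
#
#   pool = cycle(PROXY_POOL_SMALL)
#   proxy = next(pool)  # proxy1, then proxy2, ..., then proxy1 again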


def demo_2_random_stealth():
    """Demo 2: Random proxy rotation with stealth mode"""
    print_header("Demo 2: Random Rotation + Stealth Mode")

    print_info("Use case: Unpredictable traffic patterns with anti-bot evasion")
    print_info("Strategy: Random - unpredictable proxy selection")
    print_info("Feature: Combined with the stealth anti-bot strategy\n")

    payload = {
        "urls": [TEST_URLS[1]],  # Check headers
        "proxy_rotation_strategy": "random",
        "anti_bot_strategy": "stealth",  # Combined with anti-bot
        "proxies": PROXY_POOL_MEDIUM,  # Use the medium pool
        "headless": True,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
                "enable_stealth": True,
                "verbose": False
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"}
        }
    }

    print(f"{Fore.YELLOW}Request payload (key parts):{Style.RESET_ALL}")
    print(json.dumps({
        "urls": payload["urls"],
        "proxy_rotation_strategy": payload["proxy_rotation_strategy"],
        "anti_bot_strategy": payload["anti_bot_strategy"],
        "proxies": f"{len(payload['proxies'])} proxies configured"
    }, indent=2))

    try:
        start_time = time.time()
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=30)
        elapsed = time.time() - start_time

        if response.status_code == 200:
            print_success(f"Request completed in {elapsed:.2f} seconds")
            print_success("Random proxy + stealth mode working together!")
        else:
            print_error(f"Request failed: {response.status_code}")

    except Exception as e:
        print_error(f"Error: {e}")
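
# Conceptually, random rotation is an independent pick per request. A minimal
# illustration (not Crawl4AI's actual implementation):
#
#   import random
#
#   proxy = random.choice(PROXY_POOL_MEDIUM)  # new independent pick each time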


def demo_3_least_used_multiple_urls():
    """Demo 3: Least used strategy with multiple URLs"""
    print_header("Demo 3: Least Used Strategy (Load Balancing)")

    print_info("Use case: Optimal load distribution across multiple requests")
    print_info("Strategy: Least Used - balances load across the proxy pool")
    print_info("Feature: Crawling multiple URLs efficiently\n")

    payload = {
        "urls": TEST_URLS,  # All test URLs
        "proxy_rotation_strategy": "least_used",
        "proxies": PROXY_POOL_LARGE,  # Use the full pool
        "headless": True,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True, "verbose": False}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": "bypass",
                "wait_for_images": False,  # Speed up crawling
                "verbose": False
            }
        }
    }

    print(f"{Fore.YELLOW}Crawling {len(payload['urls'])} URLs with load balancing:{Style.RESET_ALL}")
    for i, url in enumerate(payload["urls"], 1):
        print(f"  {i}. {url}")

    try:
        start_time = time.time()
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=60)
        elapsed = time.time() - start_time

        if response.status_code == 200:
            data = response.json()
            results = data.get('results', [])
            if not results:
                print_error("No results returned")
                return
            print_success(f"Completed {len(results)} URLs in {elapsed:.2f} seconds")
            print_info(f"Average time per URL: {elapsed/len(results):.2f}s")

            # Show the success rate
            successful = sum(1 for r in results if r.get('success'))
            print_info(f"Success rate: {successful}/{len(results)} ({successful/len(results)*100:.1f}%)")
        else:
            print_error(f"Request failed: {response.status_code}")

    except Exception as e:
        print_error(f"Error: {e}")
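
# The idea behind "least used": always hand out the proxy with the lowest use
# count, so load stays even across the pool. A minimal illustration (not
# Crawl4AI's actual implementation):
#
#   usage = {p["server"]: 0 for p in PROXY_POOL_LARGE}
#
#   def next_proxy():
#       server = min(usage, key=usage.get)  # least-used server so far
#       usage[server] += 1
#       return server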


def demo_4_failure_aware_production():
    """Demo 4: Failure-aware strategy for production use"""
    print_header("Demo 4: Failure-Aware Strategy (Production)")

    print_info("Use case: High-availability crawling with automatic recovery")
    print_info("Strategy: Failure Aware - tracks proxy health")
    print_info("Feature: Auto-recovery after failures\n")

    payload = {
        "urls": [TEST_URLS[0]],
        "proxy_rotation_strategy": "failure_aware",
        "proxy_failure_threshold": 2,  # Mark a proxy unhealthy after 2 failures
        "proxy_recovery_time": 120,    # 2-minute recovery time
        "proxies": PROXY_POOL_MEDIUM,  # Use the medium pool
        "headless": True,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True, "verbose": False}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {"cache_mode": "bypass"}
        }
    }

    print(f"{Fore.YELLOW}Configuration:{Style.RESET_ALL}")
    print(f"  Failure threshold: {payload['proxy_failure_threshold']} failures")
    print(f"  Recovery time: {payload['proxy_recovery_time']} seconds")
    print(f"  Proxy pool size: {len(payload['proxies'])} proxies")

    try:
        start_time = time.time()
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=30)
        elapsed = time.time() - start_time

        if response.status_code == 200:
            print_success(f"Request completed in {elapsed:.2f} seconds")
            print_success("Failure-aware strategy initialized successfully")
            print_info("The strategy will now track proxy health automatically")
        else:
            print_error(f"Request failed: {response.status_code}")

    except Exception as e:
        print_error(f"Error: {e}")
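
# The idea behind "failure aware": skip proxies that have failed too often and
# let them back in after a cool-down. A minimal illustration (not Crawl4AI's
# actual implementation), using the threshold and recovery time from above:
#
#   failures = {}  # server -> (failure_count, last_failure_timestamp)
#
#   def is_healthy(server, threshold=2, recovery_time=120):
#       count, last = failures.get(server, (0, 0.0))
#       return count < threshold or time.time() - last > recovery_time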


def demo_5_streaming_with_proxies():
    """Demo 5: Streaming endpoint with proxy rotation"""
    print_header("Demo 5: Streaming with Proxy Rotation")

    print_info("Use case: Real-time results with proxy rotation")
    print_info("Strategy: Random - varies proxies across the stream")
    print_info("Feature: Streaming endpoint support\n")

    payload = {
        "urls": TEST_URLS[:2],  # First 2 URLs
        "proxy_rotation_strategy": "random",
        "proxies": PROXY_POOL_SMALL,  # Use the small pool
        "headless": True,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True, "verbose": False}
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": True,
                "cache_mode": "bypass",
                "verbose": False
            }
        }
    }

    print_info("Streaming 2 URLs with random proxy rotation...")

    try:
        start_time = time.time()
        response = requests.post(
            f"{API_BASE_URL}/crawl/stream",
            json=payload,
            timeout=60,
            stream=True
        )

        if response.status_code == 200:
            results_count = 0
            # The stream is newline-delimited JSON; parse each line as it arrives
            for line in response.iter_lines():
                if line:
                    try:
                        data = json.loads(line.decode('utf-8'))
                        if data.get("status") == "processing":
                            print_info(f"Processing: {data.get('url', 'unknown')}")
                        elif data.get("status") == "completed":
                            results_count += 1
                            print_success(f"Completed: {data.get('url', 'unknown')}")
                    except json.JSONDecodeError:
                        pass  # Skip non-JSON or partial lines

            elapsed = time.time() - start_time
            print_success(f"\nStreaming completed: {results_count} results in {elapsed:.2f}s")
        else:
            print_error(f"Streaming failed: {response.status_code}")

    except Exception as e:
        print_error(f"Error: {e}")
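
# The loop above expects newline-delimited JSON events, one per line, shaped
# roughly like this (field names taken from the handling above; other fields
# may also be present):
#
#   {"status": "processing", "url": "https://httpbin.org/ip"}
#   {"status": "completed", "url": "https://httpbin.org/ip", ...}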


def demo_6_error_handling():
    """Demo 6: Error handling demonstration"""
    print_header("Demo 6: Error Handling")

    print_info("Demonstrating how the system handles errors gracefully\n")

    # Test 1: Invalid strategy name
    print(f"{Fore.YELLOW}Test 1: Invalid strategy name{Style.RESET_ALL}")
    payload = {
        "urls": [TEST_URLS[0]],
        "proxy_rotation_strategy": "invalid_strategy",
        "proxies": [PROXY_POOL_SMALL[0]],  # Use just 1 proxy
        "headless": True
    }

    try:
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=10)
        if response.status_code != 200:
            print_error(f"Expected error: {response.json().get('detail', 'Unknown error')}")
        else:
            print_warning("Unexpected: Request succeeded")
    except Exception as e:
        print_error(f"Error: {e}")

    print()

    # Test 2: Proxy configuration missing the required "server" field
    print(f"{Fore.YELLOW}Test 2: Invalid proxy configuration{Style.RESET_ALL}")
    payload = {
        "urls": [TEST_URLS[0]],
        "proxy_rotation_strategy": "round_robin",
        "proxies": [{"username": "user1"}],  # Missing "server"
        "headless": True
    }

    try:
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=10)
        if response.status_code != 200:
            print_error(f"Expected error: {response.json().get('detail', 'Unknown error')}")
        else:
            print_warning("Unexpected: Request succeeded")
    except Exception as e:
        print_error(f"Error: {e}")

    print()
    print_success("Error handling working as expected!")
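
# Both tests read the error message from a "detail" field, i.e. they assume an
# error body in the common FastAPI shape:
#
#   {"detail": "<human-readable error message>"}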


def demo_7_real_world_scenario():
    """Demo 7: Real-world e-commerce price monitoring scenario"""
    print_header("Demo 7: Real-World Scenario - Price Monitoring")

    print_info("Scenario: Monitoring multiple product pages with high availability")
    print_info("Requirements: Anti-detection + Proxy rotation + Fault tolerance\n")

    # Simulated product URLs (using httpbin for the demo)
    product_urls = [
        "https://httpbin.org/delay/1",  # Simulates a slow page
        "https://httpbin.org/html",     # Simulates a product page
        "https://httpbin.org/json",     # Simulates an API endpoint
    ]

    payload = {
        "urls": product_urls,
        "anti_bot_strategy": "stealth",
        "proxy_rotation_strategy": "failure_aware",
        "proxy_failure_threshold": 2,
        "proxy_recovery_time": 180,
        "proxies": PROXY_POOL_LARGE,  # Use the full pool for high availability
        "headless": True,
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
                "enable_stealth": True,
                "verbose": False
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": "bypass",
                "page_timeout": 30000,
                "wait_for_images": False,
                "verbose": False
            }
        }
    }

    print(f"{Fore.YELLOW}Configuration:{Style.RESET_ALL}")
    print(f"  URLs to monitor: {len(product_urls)}")
    print("  Anti-bot strategy: stealth")
    print("  Proxy strategy: failure_aware")
    print(f"  Proxy pool: {len(payload['proxies'])} proxies")
    print()

    print_info("Starting price monitoring crawl...")

    try:
        start_time = time.time()
        response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=90)
        elapsed = time.time() - start_time

        if response.status_code == 200:
            data = response.json()
            results = data.get('results', [])
            if not results:
                print_error("No results returned")
                return

            print_success(f"Monitoring completed in {elapsed:.2f} seconds\n")

            # Detailed results
            print(f"{Fore.YELLOW}Results Summary:{Style.RESET_ALL}")
            for i, result in enumerate(results, 1):
                url = result.get('url', 'unknown')
                success = result.get('success', False)
                status = "✅ Success" if success else "❌ Failed"
                print(f"  {i}. {status} - {url}")

            successful = sum(1 for r in results if r.get('success'))
            print()
            print_info(f"Success rate: {successful}/{len(results)} ({successful/len(results)*100:.1f}%)")
            print_info(f"Average time per product: {elapsed/len(results):.2f}s")

            print()
            print_success("✨ Real-world scenario completed successfully!")
            print_info("This configuration is production-ready for:")
            print_info("  - E-commerce price monitoring")
            print_info("  - Competitive analysis")
            print_info("  - Market research")
            print_info("  - Any high-availability crawling needs")
        else:
            print_error(f"Request failed: {response.status_code}")
            print(response.text)

    except Exception as e:
        print_error(f"Error: {e}")


def show_python_integration_example():
    """Show a Python integration code example"""
    print_header("Python Integration Example")

    code = '''
import requests

class ProxyCrawler:
    """Example class for integrating proxy rotation into your application"""

    def __init__(self, api_url="http://localhost:11235"):
        self.api_url = api_url
        self.proxies = [
            {"server": "http://proxy1.com:8080", "username": "user", "password": "pass"},
            {"server": "http://proxy2.com:8080", "username": "user", "password": "pass"},
        ]

    def crawl_with_proxies(self, urls, strategy="round_robin"):
        """Crawl URLs with proxy rotation"""
        payload = {
            "urls": urls,
            "proxy_rotation_strategy": strategy,
            "proxies": self.proxies,
            "headless": True,
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {"cache_mode": "bypass"}
            }
        }

        response = requests.post(f"{self.api_url}/crawl", json=payload, timeout=60)
        return response.json()

    def monitor_prices(self, product_urls):
        """Monitor product prices with high availability"""
        payload = {
            "urls": product_urls,
            "anti_bot_strategy": "stealth",
            "proxy_rotation_strategy": "failure_aware",
            "proxy_failure_threshold": 2,
            "proxies": self.proxies,
            "headless": True
        }

        response = requests.post(f"{self.api_url}/crawl", json=payload, timeout=120)
        return response.json()

# Usage
crawler = ProxyCrawler()

# Simple crawling
results = crawler.crawl_with_proxies(
    urls=["https://example.com"],
    strategy="round_robin"
)

# Price monitoring
product_results = crawler.monitor_prices(
    product_urls=["https://shop.example.com/product1", "https://shop.example.com/product2"]
)
'''

    print(f"{Fore.GREEN}{code}{Style.RESET_ALL}")
    print_info("Copy this code to integrate proxy rotation into your application!")


def demo_0_proxy_setup_guide():
    """Demo 0: Guide for setting up real proxies"""
    print_header("Proxy Setup Guide")

    print_info("This demo can run in two modes:\n")

    print(f"{Fore.YELLOW}1. DEMO MODE (default when real_proxy_config.py is absent):{Style.RESET_ALL}")
    print("   - Tests API integration without proxies")
    print("   - Shows request/response structure")
    print("   - Safe to run without proxy servers\n")

    print(f"{Fore.YELLOW}2. REAL PROXY MODE:{Style.RESET_ALL}")
    print("   - Tests actual proxy rotation")
    print("   - Requires valid proxy servers")
    print("   - Shows real proxy switching in action\n")

    print(f"{Fore.GREEN}To enable real proxy testing:{Style.RESET_ALL}")
    print("  1. Create real_proxy_config.py with your actual proxy servers:")
    print()
    print(f"{Fore.CYAN}     REAL_PROXIES = [")
    print("         {'server': 'http://your-proxy1.com:8080', 'username': 'user', 'password': 'pass'},")
    print("         {'server': 'http://your-proxy2.com:8080', 'username': 'user', 'password': 'pass'},")
    print(f"     ]{Style.RESET_ALL}")
    print()
    print("  2. Re-run this script; the proxies are picked up automatically")
    print("     (USE_REAL_PROXIES is set to True when the import succeeds)")
    print()

    print(f"{Fore.YELLOW}Popular Proxy Providers:{Style.RESET_ALL}")
    print("  - Bright Data (formerly Luminati)")
    print("  - Oxylabs")
    print("  - Smartproxy")
    print("  - ProxyMesh")
    print("  - Your own proxy servers")
    print()

    if USE_REAL_PROXIES:
        print_success("Real proxy mode is ENABLED")
        print_info(f"Using {len(DEMO_PROXIES)} configured proxies")
    else:
        print_info("Demo mode is active (real_proxy_config.py not found)")
        print_info("API structure will be demonstrated without actual proxy connections")


def main():
    """Main demo runner"""
    print(f"""
{Fore.CYAN}╔══════════════════════════════════════════════════════════╗
║                                                          ║
║            Crawl4AI Proxy Rotation Demo Suite            ║
║                                                          ║
║    Demonstrating real-world proxy rotation scenarios     ║
║                                                          ║
╚══════════════════════════════════════════════════════════╝{Style.RESET_ALL}
""")

    if USE_REAL_PROXIES:
        print_success(f"✨ Using {len(REAL_PROXIES)} real proxies")
        print_info("📊 Proxy pools configured:")
        print_info(f"  • Small pool: {len(PROXY_POOL_SMALL)} proxies (quick tests)")
        print_info(f"  • Medium pool: {len(PROXY_POOL_MEDIUM)} proxies (balanced)")
        print_info(f"  • Large pool: {len(PROXY_POOL_LARGE)} proxies (high availability)")
    else:
        print_warning("⚠️ Using demo proxy configuration (won't connect)")
        print_info("To use real proxies, create real_proxy_config.py with your proxies")
    print()

    # Check server health
    if not check_server_health():
        print()
        print_error("Please start the Crawl4AI server first:")
        print_info("cd deploy/docker && docker-compose up")
        print_info("or run: ./dev.sh")
        return

    print()
    input(f"{Fore.YELLOW}Press Enter to start the demos...{Style.RESET_ALL}")

    # Run all demos
    demos = [
        demo_0_proxy_setup_guide,
        demo_1_basic_round_robin,
        demo_2_random_stealth,
        demo_3_least_used_multiple_urls,
        demo_4_failure_aware_production,
        demo_5_streaming_with_proxies,
        demo_6_error_handling,
        demo_7_real_world_scenario,
    ]

    for i, demo in enumerate(demos, 1):
        try:
            demo()
            if i < len(demos):
                print()
                input(f"{Fore.YELLOW}Press Enter to continue to the next demo...{Style.RESET_ALL}")
        except KeyboardInterrupt:
            print()
            print_warning("Demo interrupted by user")
            break
        except Exception as e:
            print_error(f"Demo failed: {e}")
            import traceback
            traceback.print_exc()

    # Show the integration example
    print()
    show_python_integration_example()

    # Summary
    print_header("Demo Suite Complete!")
    print_success("You've seen all major proxy rotation features!")
    print()
    print_info("Next steps:")
    print_info("  1. Add your actual proxy servers to real_proxy_config.py")
    print_info("  2. Run: python test_proxy_rotation_strategies.py (full test suite)")
    print_info("  3. Read: PROXY_ROTATION_STRATEGY_DOCS.md (complete documentation)")
    print_info("  4. Integrate into your application using the examples above")
    print()
    print(f"{Fore.CYAN}Happy crawling! 🚀{Style.RESET_ALL}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print()
        print_warning("\nDemo interrupted. Goodbye!")
    except Exception as e:
        print_error(f"\nUnexpected error: {e}")
        import traceback
        traceback.print_exc()