feat(proxy): add proxy rotation support and documentation
Implements dynamic proxy rotation with authentication support and IP verification. Updates include:
- Added a proxy rotation demo to the features example
- Updated proxy configuration handling in BrowserManager
- Added proxy rotation documentation
- Updated the README with the new proxy rotation feature
- Bumped version to 0.4.3b2

This change enables users to dynamically switch between proxies and verify the IP address used for each request.
This commit is contained in:
@@ -491,6 +491,7 @@ async def test_news_crawl():
|
|||||||
- **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction.
|
- **Ollama Support**: Use open-source or self-hosted models for private or cost-effective extraction.
|
||||||
- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
|
- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
|
||||||
- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
|
- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
|
||||||
|
- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence.
|
||||||
- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects.
|
- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects.
|
||||||
- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
|
- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
|
||||||
- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
|
- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.3b1"
|
__version__ = "0.4.3b2"
|
||||||
|
|||||||
@@ -633,9 +633,12 @@ class BrowserManager:
|
|||||||
if crawlerRunConfig.proxy_config:
|
if crawlerRunConfig.proxy_config:
|
||||||
proxy_settings = {
|
proxy_settings = {
|
||||||
"server": crawlerRunConfig.proxy_config.get("server"),
|
"server": crawlerRunConfig.proxy_config.get("server"),
|
||||||
"username": crawlerRunConfig.proxy_config.get("username"),
|
|
||||||
"password": crawlerRunConfig.proxy_config.get("password"),
|
|
||||||
}
|
}
|
||||||
|
if crawlerRunConfig.proxy_config.get("username"):
|
||||||
|
proxy_settings.update({
|
||||||
|
"username": crawlerRunConfig.proxy_config.get("username"),
|
||||||
|
"password": crawlerRunConfig.proxy_config.get("password"),
|
||||||
|
})
|
||||||
context_settings["proxy"] = proxy_settings
|
context_settings["proxy"] = proxy_settings
|
||||||
|
|
||||||
if self.config.text_mode:
|
if self.config.text_mode:
|
||||||
|
|||||||
@@ -233,6 +233,64 @@ async def demo_llm_schema_generation():
|
|||||||
print("Successfully used generated schema for crawling")
|
print("Successfully used generated schema for crawling")
|
||||||
|
|
||||||
|
|
||||||
|
async def get_next_proxy(proxy_file: str = "proxies.txt") -> Optional[Dict]:
    """Pick a random proxy entry from a local text file.

    Each non-blank line of the file is expected to have the form
    ``ip:port:username:password``. Blank lines are ignored so that a stray
    empty line cannot be selected, and everything after the third colon is
    treated as the password so passwords may themselves contain ``:``.

    Args:
        proxy_file: Path of the file listing one proxy per line.

    Returns:
        A dict with the keys ``server`` (``http://ip:port``), ``username``,
        ``password`` and ``ip``, or ``None`` when the file cannot be read,
        contains no entries, or the chosen entry is malformed.
    """
    try:
        with open(proxy_file, encoding="utf-8") as f:
            # Skip blank/whitespace-only lines; they are not valid entries.
            proxies = [line.strip() for line in f if line.strip()]

        if not proxies:
            return None

        # maxsplit=3 keeps any further ':' characters inside the password.
        ip, port, username, password = random.choice(proxies).split(":", 3)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes or
        # an entry with fewer than four ':'-separated fields.
        print(f"Error loading proxy: {e}")
        return None

    return {
        "server": f"http://{ip}:{port}",
        "username": username,
        "password": password,
        "ip": ip,  # Store original IP for verification
    }
|
||||||
|
|
||||||
|
async def demo_proxy_rotation():
    """
    8. Proxy Rotation Demo
    ===================
    Demonstrates how to rotate proxies for each request using Crawl4ai.

    For every request a proxy is obtained from ``get_next_proxy()``, the base
    run config is cloned with that proxy as ``proxy_config``, and the IPv4
    address found in the response HTML is compared with the proxy's own IP.
    Requests are skipped when no proxy is available.
    """
    print("\n=== 8. Proxy Rotation Demo ===")

    # Create 3 test requests to httpbin
    urls = ["https://httpbin.org/ip"] * 3

    browser_config = BrowserConfig(headless=True)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            proxy = await get_next_proxy()
            if not proxy:
                print("No proxy available, skipping...")
                continue

            # Create new config with proxy
            current_config = run_config.clone(proxy_config=proxy)
            result = await crawler.arun(url=url, config=current_config)

            if not result.success:
                print(f"Failed with proxy {proxy['ip']}")
                continue

            ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
            # The response may contain no IP at all; resolve that once here so
            # the comparison below never dereferences a missing match.
            response_ip = ip_match.group(0) if ip_match else None
            print(f"Proxy {proxy['ip']} -> Response IP: {response_ip if response_ip else 'Not found'}")

            if response_ip == proxy['ip']:
                print(f"✅ Proxy working! IP matches: {proxy['ip']}")
            else:
                print("❌ Proxy failed or IP mismatch!")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Run all feature demonstrations."""
|
"""Run all feature demonstrations."""
|
||||||
demo_memory_dispatcher(),
|
demo_memory_dispatcher(),
|
||||||
@@ -247,6 +305,8 @@ async def main():
|
|||||||
print("\n" + "=" * 50 + "\n")
|
print("\n" + "=" * 50 + "\n")
|
||||||
demo_robots_compliance(),
|
demo_robots_compliance(),
|
||||||
print("\n" + "=" * 50 + "\n")
|
print("\n" + "=" * 50 + "\n")
|
||||||
|
demo_proxy_rotation()
|
||||||
|
print("\n" + "=" * 50 + "\n")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
|
|||||||
|
|
||||||
Here's the corrected documentation:
|
Here's the corrected documentation:
|
||||||
|
|
||||||
## Rotating Proxies [COMING SOON]
|
## Rotating Proxies
|
||||||
|
|
||||||
Example using a proxy rotation service dynamically:
|
Example using a proxy rotation service dynamically:
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user