Compare commits

..

2 Commits

Author SHA1 Message Date
AHMET YILMAZ
fe353c4e27 Refactor proxy configuration documentation for clarity and consistency 2025-11-13 11:20:24 +08:00
AHMET YILMAZ
263ac890fd #1591
: Enhance proxy configuration documentation with security features, SSL analysis, and improved examples
2025-11-10 11:42:07 +08:00
3 changed files with 285 additions and 228 deletions

View File

@@ -4,26 +4,14 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # noqa from .bfs_strategy import BFSDeepCrawlStrategy # noqa
from ..types import AsyncWebCrawler, CrawlerRunConfig from ..types import AsyncWebCrawler, CrawlerRunConfig
from ..utils import normalize_url_for_deep_crawl
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy): class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
""" """
Depth-first deep crawling with familiar BFS rules. Depth-First Search (DFS) deep crawling strategy.
We reuse the same filters, scoring, and page limits from :class:`BFSDeepCrawlStrategy`, Inherits URL validation and link discovery from BFSDeepCrawlStrategy.
but walk the graph with a stack so we fully explore one branch before hopping to the Overrides _arun_batch and _arun_stream to use a stack (LIFO) for DFS traversal.
next. DFS also keeps its own ``_dfs_seen`` set so we can drop duplicate links at
discovery time without accidentally marking them as “already crawled”.
""" """
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._dfs_seen: Set[str] = set()
def _reset_seen(self, start_url: str) -> None:
"""Start each crawl with a clean dedupe set seeded with the root URL."""
self._dfs_seen = {start_url}
async def _arun_batch( async def _arun_batch(
self, self,
start_url: str, start_url: str,
@@ -31,19 +19,14 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> List[CrawlResult]: ) -> List[CrawlResult]:
""" """
Crawl level-by-level but emit results at the end. Batch (non-streaming) DFS mode.
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
We keep a stack of ``(url, parent, depth)`` tuples, pop one at a time, and
hand it to ``crawler.arun_many`` with deep crawling disabled so we remain
in control of traversal. Every successful page bumps ``_pages_crawled`` and
seeds new stack items discovered via :meth:`link_discovery`.
""" """
visited: Set[str] = set() visited: Set[str] = set()
# Stack items: (url, parent_url, depth) # Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = [] results: List[CrawlResult] = []
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -88,16 +71,12 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
config: CrawlerRunConfig, config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]: ) -> AsyncGenerator[CrawlResult, None]:
""" """
Same traversal as :meth:`_arun_batch`, but yield pages immediately. Streaming DFS mode.
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
Each popped URL is crawled, its metadata annotated, then the result gets
yielded before we even look at the next stack entry. Successful crawls
still feed :meth:`link_discovery`, keeping DFS order intact.
""" """
visited: Set[str] = set() visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)] stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0} depths: Dict[str, int] = {start_url: 0}
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set(): while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop() url, parent, depth = stack.pop()
@@ -129,92 +108,3 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links): for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1) new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth)) stack.append((new_url, new_parent, new_depth))
async def link_discovery(
self,
result: CrawlResult,
source_url: str,
current_depth: int,
_visited: Set[str],
next_level: List[Tuple[str, Optional[str]]],
depths: Dict[str, int],
) -> None:
"""
Find the next URLs we should push onto the DFS stack.
Parameters
----------
result : CrawlResult
Output of the page we just crawled; its ``links`` block is our raw material.
source_url : str
URL of the parent page; stored so callers can track ancestry.
current_depth : int
Depth of the parent; children naturally sit at ``current_depth + 1``.
_visited : Set[str]
Present to match the BFS signature, but we rely on ``_dfs_seen`` instead.
next_level : list of tuples
The stack buffer supplied by the caller; we append new ``(url, parent)`` items here.
depths : dict
Shared depth map so future metadata tagging knows how deep each URL lives.
Notes
-----
- ``_dfs_seen`` keeps us from pushing duplicates without touching the traversal guard.
- Validation, scoring, and capacity trimming mirror the BFS version so behaviour stays consistent.
"""
next_depth = current_depth + 1
if next_depth > self.max_depth:
return
remaining_capacity = self.max_pages - self._pages_crawled
if remaining_capacity <= 0:
self.logger.info(
f"Max pages limit ({self.max_pages}) reached, stopping link discovery"
)
return
links = result.links.get("internal", [])
if self.include_external:
links += result.links.get("external", [])
seen = self._dfs_seen
valid_links: List[Tuple[str, float]] = []
for link in links:
raw_url = link.get("href")
if not raw_url:
continue
normalized_url = normalize_url_for_deep_crawl(raw_url, source_url)
if not normalized_url or normalized_url in seen:
continue
if not await self.can_process_url(raw_url, next_depth):
self.stats.urls_skipped += 1
continue
score = self.url_scorer.score(normalized_url) if self.url_scorer else 0
if score < self.score_threshold:
self.logger.debug(
f"URL {normalized_url} skipped: score {score} below threshold {self.score_threshold}"
)
self.stats.urls_skipped += 1
continue
seen.add(normalized_url)
valid_links.append((normalized_url, score))
if len(valid_links) > remaining_capacity:
if self.url_scorer:
valid_links.sort(key=lambda x: x[1], reverse=True)
valid_links = valid_links[:remaining_capacity]
self.logger.info(
f"Limiting to {remaining_capacity} URLs due to max_pages limit"
)
for url, score in valid_links:
if score:
result.metadata = result.metadata or {}
result.metadata["score"] = score
next_level.append((url, source_url))
depths[url] = next_depth

View File

@@ -1,39 +0,0 @@
"""
Simple demonstration of the DFS deep crawler visiting multiple pages.
Run with: python docs/examples/dfs_crawl_demo.py
"""
import asyncio
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.cache_context import CacheMode
from crawl4ai.deep_crawling.dfs_strategy import DFSDeepCrawlStrategy
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def main() -> None:
dfs_strategy = DFSDeepCrawlStrategy(
max_depth=3,
max_pages=50,
include_external=False,
)
config = CrawlerRunConfig(
deep_crawl_strategy=dfs_strategy,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(),
stream=True,
)
seed_url = "https://docs.python.org/3/" # Plenty of internal links
async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
async for result in await crawler.arun(url=seed_url, config=config):
depth = result.metadata.get("depth")
status = "SUCCESS" if result.success else "FAILED"
print(f"[{status}] depth={depth} url={result.url}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,98 +1,304 @@
# Proxy # Proxy & Security
This guide covers proxy configuration and security features in Crawl4AI, including SSL certificate analysis and proxy rotation strategies.
## Understanding Proxy Configuration
Crawl4AI recommends configuring proxies per request through `CrawlerRunConfig.proxy_config`. This gives you precise control, enables rotation strategies, and keeps examples simple enough to copy, paste, and run.
## Basic Proxy Setup ## Basic Proxy Setup
Simple proxy configuration with `BrowserConfig`: Configure proxies that apply to each crawl operation:
```python ```python
from crawl4ai.async_configs import BrowserConfig
# Using HTTP proxy
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
# Using SOCKS proxy
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Authenticated Proxy
Use an authenticated proxy with `BrowserConfig`:
```python
from crawl4ai.async_configs import BrowserConfig
browser_config = BrowserConfig(proxy_config={
"server": "http://[host]:[port]",
"username": "[username]",
"password": "[password]",
})
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
```
## Rotating Proxies
Example using a proxy rotation service dynamically:
```python
import re
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CacheMode,
RoundRobinProxyStrategy,
)
import asyncio import asyncio
from crawl4ai import ProxyConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, ProxyConfig
run_config = CrawlerRunConfig(proxy_config=ProxyConfig(server="http://proxy.example.com:8080"))
# run_config = CrawlerRunConfig(proxy_config={"server": "http://proxy.example.com:8080"})
# run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
async def main(): async def main():
# Load proxies and create rotation strategy browser_config = BrowserConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com", config=run_config)
print(f"Success: {result.success} -> {result.url}")
if __name__ == "__main__":
asyncio.run(main())
```
!!! note "Why request-level?"
`CrawlerRunConfig.proxy_config` keeps each request self-contained, so swapping proxies or rotation strategies is just a matter of building a new run configuration.
## Supported Proxy Formats
The `ProxyConfig.from_string()` method supports multiple formats:
```python
from crawl4ai import ProxyConfig
# HTTP proxy with authentication
proxy1 = ProxyConfig.from_string("http://user:pass@192.168.1.1:8080")
# HTTPS proxy
proxy2 = ProxyConfig.from_string("https://proxy.example.com:8080")
# SOCKS5 proxy
proxy3 = ProxyConfig.from_string("socks5://proxy.example.com:1080")
# Simple IP:port format
proxy4 = ProxyConfig.from_string("192.168.1.1:8080")
# IP:port:user:pass format
proxy5 = ProxyConfig.from_string("192.168.1.1:8080:user:pass")
```
## Authenticated Proxies
For proxies requiring authentication:
```python
import asyncio
from crawl4ai import AsyncWebCrawler,BrowserConfig, CrawlerRunConfig, ProxyConfig
run_config = CrawlerRunConfig(
proxy_config=ProxyConfig(
server="http://proxy.example.com:8080",
username="your_username",
password="your_password",
)
)
# Or dictionary style:
# run_config = CrawlerRunConfig(proxy_config={
# "server": "http://proxy.example.com:8080",
# "username": "your_username",
# "password": "your_password",
# })
async def main():
browser_config = BrowserConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com", config=run_config)
print(f"Success: {result.success} -> {result.url}")
if __name__ == "__main__":
asyncio.run(main())
```
## Environment Variable Configuration
Load proxies from environment variables for easy configuration:
```python
import os
from crawl4ai import ProxyConfig, CrawlerRunConfig
# Set environment variable
os.environ["PROXIES"] = "ip1:port1:user1:pass1,ip2:port2:user2:pass2,ip3:port3"
# Load all proxies
proxies = ProxyConfig.from_env()
print(f"Loaded {len(proxies)} proxies")
# Use first proxy
if proxies:
run_config = CrawlerRunConfig(proxy_config=proxies[0])
```
## Rotating Proxies
Crawl4AI supports automatic proxy rotation to distribute requests across multiple proxy servers. Rotation is applied per request using a rotation strategy on `CrawlerRunConfig`.
### Proxy Rotation (recommended)
```python
import asyncio
import re
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, ProxyConfig
from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
async def main():
# Load proxies from environment
proxies = ProxyConfig.from_env() proxies = ProxyConfig.from_env()
#eg: export PROXIES="ip1:port1:username1:password1,ip2:port2:username2:password2"
if not proxies: if not proxies:
print("No proxies found in environment. Set PROXIES env variable!") print("No proxies found! Set PROXIES environment variable.")
return return
# Create rotation strategy
proxy_strategy = RoundRobinProxyStrategy(proxies) proxy_strategy = RoundRobinProxyStrategy(proxies)
# Create configs # Configure per-request with proxy rotation
browser_config = BrowserConfig(headless=True, verbose=False) browser_config = BrowserConfig(headless=True, verbose=False)
run_config = CrawlerRunConfig( run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
proxy_rotation_strategy=proxy_strategy proxy_rotation_strategy=proxy_strategy,
) )
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice urls = ["https://httpbin.org/ip"] * (len(proxies) * 2) # Test each proxy twice
print("\n📈 Initializing crawler with proxy rotation...") print(f"🚀 Testing {len(proxies)} proxies with rotation...")
async with AsyncWebCrawler(config=browser_config) as crawler: results = await crawler.arun_many(urls=urls, config=run_config)
print("\n🚀 Starting batch crawl with proxy rotation...")
results = await crawler.arun_many(
urls=urls,
config=run_config
)
for result in results:
if result.success:
ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
current_proxy = run_config.proxy_config if run_config.proxy_config else None
if current_proxy and ip_match: for i, result in enumerate(results):
print(f"URL {result.url}") if result.success:
print(f"Proxy {current_proxy.server} -> Response IP: {ip_match.group(0)}") # Extract IP from response
verified = ip_match.group(0) == current_proxy.ip ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
if verified: if ip_match:
print(f"✅ Proxy working! IP matches: {current_proxy.ip}") detected_ip = ip_match.group(0)
else: proxy_index = i % len(proxies)
print("❌ Proxy failed or IP mismatch!") expected_ip = proxies[proxy_index].ip
print("---")
asyncio.run(main()) print(f"✅ Request {i+1}: Proxy {proxy_index+1} -> IP {detected_ip}")
if detected_ip == expected_ip:
print(" 🎯 IP matches proxy configuration")
else:
print(f" ⚠️ IP mismatch (expected {expected_ip})")
else:
print(f"❌ Request {i+1}: Could not extract IP from response")
else:
print(f"❌ Request {i+1}: Failed - {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
``` ```
## SSL Certificate Analysis
Combine proxy usage with SSL certificate inspection for enhanced security analysis. SSL certificate fetching is configured per request via `CrawlerRunConfig`.
### Per-Request SSL Certificate Analysis
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
run_config = CrawlerRunConfig(
proxy_config={
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass",
},
fetch_ssl_certificate=True, # Enable SSL certificate analysis for this request
)
async def main():
browser_config = BrowserConfig()
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com", config=run_config)
if result.success:
print(f"✅ Crawled via proxy: {result.url}")
# Analyze SSL certificate
if result.ssl_certificate:
cert = result.ssl_certificate
print("🔒 SSL Certificate Info:")
print(f" Issuer: {cert.issuer}")
print(f" Subject: {cert.subject}")
print(f" Valid until: {cert.valid_until}")
print(f" Fingerprint: {cert.fingerprint}")
# Export certificate
cert.to_json("certificate.json")
print("💾 Certificate exported to certificate.json")
else:
print("⚠️ No SSL certificate information available")
if __name__ == "__main__":
asyncio.run(main())
```
## Security Best Practices
### 1. Proxy Rotation for Anonymity
```python
from crawl4ai import CrawlerRunConfig, ProxyConfig
from crawl4ai.proxy_strategy import RoundRobinProxyStrategy
# Use multiple proxies to avoid IP blocking
proxies = ProxyConfig.from_env("PROXIES")
strategy = RoundRobinProxyStrategy(proxies)
# Configure rotation per request (recommended)
run_config = CrawlerRunConfig(proxy_rotation_strategy=strategy)
# For a fixed proxy across all requests, just reuse the same run_config instance
static_run_config = run_config
```
### 2. SSL Certificate Verification
```python
from crawl4ai import CrawlerRunConfig
# Always verify SSL certificates when possible
# Per-request (affects specific requests)
run_config = CrawlerRunConfig(fetch_ssl_certificate=True)
```
### 3. Environment Variable Security
```bash
# Use environment variables for sensitive proxy credentials
# Avoid hardcoding usernames/passwords in code
export PROXIES="ip1:port1:user1:pass1,ip2:port2:user2:pass2"
```
### 4. SOCKS5 for Enhanced Security
```python
from crawl4ai import CrawlerRunConfig
# Prefer SOCKS5 proxies for better protocol support
run_config = CrawlerRunConfig(proxy_config="socks5://proxy.example.com:1080")
```
## Migration from Deprecated `proxy` Parameter
!!! warning "Deprecation Notice"
The legacy `proxy` argument on `BrowserConfig` is deprecated. Configure proxies through `CrawlerRunConfig.proxy_config` so each request fully describes its network settings.
```python
# Old (deprecated) approach
# from crawl4ai import BrowserConfig
# browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
# New (preferred) approach
from crawl4ai import CrawlerRunConfig
run_config = CrawlerRunConfig(proxy_config="http://proxy.example.com:8080")
```
### Safe Logging of Proxies
```python
from crawl4ai import ProxyConfig
def safe_proxy_repr(proxy: ProxyConfig):
if getattr(proxy, "username", None):
return f"{proxy.server} (auth: ****)"
return proxy.server
```
## Troubleshooting
### Common Issues
???+ question "Proxy connection failed"
- Verify the proxy server is reachable from your network.
- Double-check authentication credentials.
- Ensure the protocol matches (`http`, `https`, or `socks5`).
???+ question "SSL certificate errors"
- Some proxies break SSL inspection; switch proxies if you see repeated failures.
- Consider temporarily disabling certificate fetching to isolate the issue.
???+ question "Environment variables not loading"
- Confirm `PROXIES` (or your custom env var) is set before running the script.
- Check formatting: `ip:port:user:pass,ip:port:user:pass`.
???+ question "Proxy rotation not working"
- Ensure `ProxyConfig.from_env()` actually loaded entries (`len(proxies) > 0`).
- Attach `proxy_rotation_strategy` to `CrawlerRunConfig`.
- Validate the proxy definitions you pass into the strategy.