Automatically detect when crawls are blocked by anti-bot systems (Akamai, Cloudflare, PerimeterX, DataDome, Imperva, etc.) and escalate through configurable retry and fallback strategies. New features on CrawlerRunConfig: - max_retries: retry rounds when blocking is detected - fallback_proxy_configs: list of fallback proxies tried each round - fallback_fetch_function: async last-resort function returning raw HTML New field on ProxyConfig: - is_fallback: skip proxy on first attempt, activate only when blocked Escalation chain per round: main proxy → fallback proxies in order. After all rounds: fallback_fetch_function as last resort. Detection uses tiered heuristics — structural HTML markers (high confidence) trigger on any page, generic patterns only on short error pages to avoid false positives.
316 lines
12 KiB
Python
316 lines
12 KiB
Python
"""
|
|
Unit tests for antibot_detector.is_blocked().
|
|
|
|
Tests are organized into:
|
|
- TRUE POSITIVES: Real block pages that MUST be detected
|
|
- TRUE NEGATIVES: Legitimate pages that MUST NOT be flagged
|
|
- EDGE CASES: Boundary conditions
|
|
"""
|
|
|
|
import sys, os
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))
|
|
|
|
from crawl4ai.antibot_detector import is_blocked
|
|
|
|
# Running tallies for the whole script; updated only by check().
PASS = 0
FAIL = 0


def check(name, result, expected_blocked, expected_substr=None):
    """Record and print the outcome of a single ``is_blocked()`` expectation.

    Args:
        name: Human-readable label printed for this case.
        result: ``(blocked, reason)`` pair to evaluate.
        expected_blocked: The verdict the case is required to produce.
        expected_substr: Optional text that must occur (case-insensitively)
            in ``reason``. It is only enforced when the result says blocked.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter and prints
        one line for a passing case or three lines for a failing one.
    """
    global PASS, FAIL
    blocked, reason = result

    ok = blocked == expected_blocked
    if expected_substr and blocked:
        # Coerce the reason to text first: a None / non-string reason must be
        # reported as a failed case, not crash the run with AttributeError.
        reason_text = str(reason or "")
        ok = ok and expected_substr.lower() in reason_text.lower()

    status = "PASS" if ok else "FAIL"
    if not ok:
        FAIL += 1
        print(f" {status}: {name}")
        print(f" got blocked={blocked}, reason={reason!r}")
        print(f" expected blocked={expected_blocked}" +
              (f", substr={expected_substr!r}" if expected_substr else ""))
    else:
        PASS += 1
        if blocked:
            print(f" {status}: {name} -> {reason}")
        else:
            print(f" {status}: {name} -> not blocked")
|
|
|
|
|
|
# =========================================================================
# TRUE POSITIVES -- block pages that the detector is required to flag
# =========================================================================
print("\n=== TRUE POSITIVES (must detect as blocked) ===\n")

# Each row: (label, HTTP status, response body, substring expected in the
# reported reason -- None when any reason is acceptable).
# Every row in this table is expected to come back as blocked.
_TRUE_POSITIVE_CASES = [
    # --- Akamai ---
    ("Akamai Reference #", 403,
     '<html><body>Access Denied\nYour request was blocked.\nReference #18.2d351ab8.1557333295.a4e16ab</body></html>',
     "Akamai"),
    ("Akamai Pardon Our Interruption", 403,
     '<html><head><title>Pardon Our Interruption</title></head><body><p>Please verify you are human</p></body></html>',
     "Pardon"),
    # Detected via near-empty 403 or Access Denied pattern
    ("Akamai 403 short Access Denied", 403,
     '<html><body><h1>Access Denied</h1></body></html>',
     None),

    # --- Cloudflare ---
    ("Cloudflare challenge form", 403,
     '<html><body>\n'
     '<form id="challenge-form" action="/cdn-cgi/l/chk_jschl?__cf_chl_f_tk=abc123">\n'
     '<input type="hidden" name="jschl_vc" value="test"/>\n'
     '</form></body></html>',
     "Cloudflare challenge"),
    ("Cloudflare error 1020", 403,
     '<html><body>\n'
     '<div class="cf-wrapper"><span class="cf-error-code">1020</span></div>\n'
     '<p>Access denied</p></body></html>',
     "Cloudflare firewall"),
    ("Cloudflare IUAM script", 403,
     '<html><script src="/cdn-cgi/challenge-platform/h/g/orchestrate/jsch/v1"></script></html>',
     "Cloudflare JS challenge"),
    # Detected via near-empty 403 or Cloudflare pattern
    ("Cloudflare Just a moment", 403,
     '<html><head><title>Just a moment...</title></head><body>Checking your browser</body></html>',
     None),
    ("Cloudflare Checking your browser (short 503)", 503,
     '<html><body>Checking your browser before accessing the site.</body></html>',
     "Cloudflare browser check"),

    # --- PerimeterX ---
    ("PerimeterX block page", 403,
     '<html><head><title>Access to This Page Has Been Blocked</title></head>\n'
     '<body><div id="px-captcha"></div>\n'
     "<script>window._pxAppId = 'PX12345';</script></body></html>",
     "PerimeterX"),
    ("PerimeterX captcha CDN", 403,
     '<html><body><script src="https://captcha.px-cdn.net/PX12345/captcha.js"></script></body></html>',
     "PerimeterX captcha"),

    # --- DataDome ---
    ("DataDome captcha delivery", 403,
     '<html><body><script>\n'
     "var dd = {'rt':'i','cid':'AHrlq...','host':'geo.captcha-delivery.com'};\n"
     '</script></body></html>',
     "DataDome"),

    # --- Imperva/Incapsula ---
    ("Imperva Incapsula Resource", 403,
     '<html><body><iframe src="/_Incapsula_Resource?incident_id=123&sess_id=abc"></iframe></body></html>',
     "Imperva"),
    ("Imperva incident ID", 200,
     '<html><body>Request unsuccessful. Incapsula incident ID: 12345-67890</body></html>',
     "Incapsula incident"),

    # --- Sucuri ---
    ("Sucuri firewall", 403,
     '<html><body><h1>Sucuri WebSite Firewall - Access Denied</h1></body></html>',
     "Sucuri"),

    # --- Kasada ---
    ("Kasada challenge", 403,
     '<html><script>KPSDK.scriptStart = KPSDK.now();</script></html>',
     "Kasada"),

    # --- HTTP 429 ---
    ("HTTP 429 rate limit", 429,
     '<html><body>Rate limit exceeded</body></html>',
     "429"),
    ("HTTP 429 empty body", 429, '', "429"),

    # --- Empty 200 ---
    ("HTTP 200 empty page", 200, '', "empty"),
    ("HTTP 200 whitespace only", 200, ' \n\n ', "empty"),

    # --- 403 near-empty ---
    ("HTTP 403 near-empty (10 bytes)", 403, '<html></html>', "403"),
]

# Run the cases in table order so the printed report keeps its sequence.
for _tp_name, _tp_status, _tp_html, _tp_substr in _TRUE_POSITIVE_CASES:
    check(_tp_name, is_blocked(_tp_status, _tp_html), True, _tp_substr)
|
|
|
|
|
|
# =========================================================================
# TRUE NEGATIVES -- legitimate pages that the detector must leave alone
# =========================================================================
print("\n=== TRUE NEGATIVES (must NOT detect as blocked) ===\n")

# Each row: (label, HTTP status, response body).
# Every row in this table is expected to come back as NOT blocked.
# Bodies are assembled with "".join so the repeated filler (which pads a page
# to a realistic size) is an ordinary element of the fragment list.
_TRUE_NEGATIVE_CASES = [
    # --- Normal pages ---
    ("Normal 200 page (example.com size)", 200, "".join([
        '<html><head><title>Example</title></head><body>',
        'x' * 500,
        '</body></html>',
    ])),
    ("Normal 200 large page", 200, "".join([
        '<html><body>',
        '<p>Some content here.</p>\n' * 5000,
        '</body></html>',
    ])),

    # --- Security articles (false positive trap!) ---
    ("Article about bot detection (large page)", 200, "".join([
        '<html><head><title>How to Detect Bots</title></head><body>',
        '<h1>How to Detect Bots on Your Website</h1>',
        '<p>Anti-bot solutions like DataDome, PerimeterX, and Cloudflare ',
        'help detect and block bot traffic. When a bot is detected, ',
        'services show a CAPTCHA or Access Denied page. ',
        'Common signals include blocked by security warnings.</p>',
        '<p>The g-recaptcha and h-captcha widgets are used for challenges.</p>',
        '<p>',
        'More article content. ' * 500,
        '</p>',
        '</body></html>',
    ])),
    ("DataDome marketing page (large)", 200, "".join([
        '<html><body><h1>DataDome Bot Protection</h1>',
        '<p>DataDome protects websites from bot attacks. ',
        'Our solution detects automated traffic using advanced fingerprinting. ',
        'Competitors like PerimeterX use window._pxAppId for tracking.</p>',
        '<p>',
        'Marketing content. ' * 1000,
        '</p>',
        '</body></html>',
    ])),

    # --- Login pages with CAPTCHA (not a block!) ---
    ("Login page with reCAPTCHA (large page)", 200, "".join([
        '<html><head><title>Sign In</title></head><body>',
        '<nav>Home | Products | Contact</nav>',
        '<form action="/login" method="POST">',
        '<input name="email" type="email"/>',
        '<input name="password" type="password"/>',
        '<div class="g-recaptcha" data-sitekey="abc123"></div>',
        '<button type="submit">Sign In</button>',
        '</form>',
        '<footer>Copyright 2024</footer>',
        '<p>',
        'Page content. ' * 500,
        '</p>',
        '</body></html>',
    ])),
    ("Signup page with hCaptcha (large page)", 200, "".join([
        '<html><body>',
        '<h1>Create Account</h1>',
        '<form><div class="h-captcha" data-sitekey="xyz"></div></form>',
        '<p>',
        'Registration info. ' * 500,
        '</p>',
        '</body></html>',
    ])),

    # --- Legitimate 403 pages (not anti-bot) ---
    ("Apache directory listing denied (403, large-ish)", 403, "".join([
        '<html><head><title>403 Forbidden</title></head><body>',
        '<h1>Forbidden</h1>',
        "<p>You don't have permission to access this resource on this server.</p>",
        '<hr><address>Apache/2.4.41 (Ubuntu) Server at example.com Port 80</address>',
        '<p>',
        'Server info. ' * 500,
        '</p>',
        '</body></html>',
    ])),
    ("Nginx 403 (large page)", 403, "".join([
        '<html><head><title>403 Forbidden</title></head><body>',
        '<center><h1>403 Forbidden</h1></center>',
        '<hr><center>nginx/1.18.0</center>',
        '<p>',
        'Content. ' * 500,
        '</p>',
        '</body></html>',
    ])),
    ("API 403 auth required (JSON)", 403,
     '{"error": "Forbidden", "message": "Invalid API key", "code": 403}'),

    # --- Cloudflare-served normal pages (not blocked!) ---
    ("Cloudflare-served normal page with footer", 200, "".join([
        '<html><body>',
        '<h1>Welcome to Our Site</h1>',
        '<p>This is a normal page served through Cloudflare CDN.</p>',
        '<footer>Performance & security by Cloudflare</footer>',
        '<p>',
        'Normal content. ' * 500,
        '</p>',
        '</body></html>',
    ])),

    # --- Small but legitimate pages ---
    ("Small valid 200 page (150 bytes)", 200,
     '<html><head><title>OK</title></head><body><p>Your request was processed successfully.</p></body></html>'),
    ("Small JSON 200 response", 200,
     '{"status": "ok", "data": {"id": 123, "name": "test"}, "timestamp": "2024-01-01T00:00:00Z"}'),
    ("Redirect page 200", 200,
     '<html><head><meta http-equiv="refresh" content="0;url=/dashboard"></head><body>Redirecting...</body></html>'),

    # --- 503 legitimate server errors ---
    ("Legitimate 503 maintenance (large)", 503, "".join([
        '<html><body><h1>Service Temporarily Unavailable</h1>',
        '<p>We are performing scheduled maintenance. Please try again later.</p>',
        '<p>',
        'Maintenance info. ' * 500,
        '</p>',
        '</body></html>',
    ])),

    # --- 200 with short but real content ---
    ("Short thank you page (200, 120 bytes)", 200,
     '<html><body><h1>Thank You!</h1><p>Your order has been placed. Confirmation email sent.</p></body></html>'),
]

# Run the cases in table order so the printed report keeps its sequence.
for _tn_name, _tn_status, _tn_html in _TRUE_NEGATIVE_CASES:
    check(_tn_name, is_blocked(_tn_status, _tn_html), False)
|
|
|
|
|
|
# =========================================================================
# EDGE CASES -- boundary conditions around status codes and size cut-offs
# =========================================================================
print("\n=== EDGE CASES ===\n")

# Each row: (label, HTTP status, response body, expected verdict, substring
# expected in the reason -- None when no reason check applies).
#
# NOTE(review): the byte counts quoted in the labels are nominal; e.g. the
# "4999 bytes" payload is 25 + 4950 + 14 = 4989 characters. The payloads still
# sit on the intended side of each size -- confirm before tightening.
_EDGE_CASES = [
    ("None status code + empty html", None, '', False, None),
    ("None status code + block content", None,
     '<html><body>Reference #18.2d351ab8.1557333295.a4e16ab</body></html>',
     True, "Akamai"),
    ("200 + tier1 pattern (Imperva deceptive 200)", 200,
     '<html><body>Request unsuccessful. Incapsula incident ID: 555-999</body></html>',
     True, "Incapsula"),
    ("403 + 4999 bytes (just under threshold)", 403,
     "".join(['<html><body>Access Denied', 'x' * 4950, '</body></html>']),
     True, "Access Denied"),
    ("403 + 5001 bytes (just over threshold, no tier2 match)", 403,
     "".join(['<html><body>Some error page', 'x' * 4960, '</body></html>']),
     False, None),
    ("403 + 9999 bytes with generic block text", 403,
     "".join(['<html><body>blocked by security', 'x' * 9950, '</body></html>']),
     True, "Blocked by security"),
    ("403 + 10001 bytes with generic block text (too big for tier2)", 403,
     "".join(['<html><body>blocked by security', 'x' * 9970, '</body></html>']),
     False, None),
    # NOTE(review): the label says "above threshold", yet this case expects a
    # blocked/"empty" verdict and the next case expects 100 stripped
    # characters to pass -- the wording looks inverted; confirm.
    ("200 + whitespace-padded but 89 bytes content (above threshold for meaningful)", 200,
     "".join([' ' * 10, 'x' * 89, ' ' * 10]),
     True, "empty"),
    ("200 + exactly 100 bytes stripped (at threshold)", 200,
     'x' * 100,
     False, None),
]

# Run the cases in table order so the printed report keeps its sequence.
for _ec_name, _ec_status, _ec_html, _ec_expected, _ec_substr in _EDGE_CASES:
    check(_ec_name, is_blocked(_ec_status, _ec_html), _ec_expected, _ec_substr)
|
|
|
|
|
|
# =========================================================================
# SUMMARY -- print the totals and turn any failure into a non-zero exit
# =========================================================================
_summary_rule = "=" * 60
_summary_total = PASS + FAIL

print("\n" + _summary_rule)
print(f"RESULTS: {PASS} passed, {FAIL} failed out of {_summary_total} tests")
print(_summary_rule)

# Exit status 1 signals at least one failed case to the calling process.
_summary_exit_code = 1 if FAIL > 0 else 0
print("SOME TESTS FAILED!" if _summary_exit_code else "ALL TESTS PASSED!")
if _summary_exit_code:
    sys.exit(_summary_exit_code)
|