Files
crawl4ai/tests/proxy/test_antibot_detector.py
unclecode 72b546c48d Add anti-bot detection, retry, and fallback system
Automatically detect when crawls are blocked by anti-bot systems
(Akamai, Cloudflare, PerimeterX, DataDome, Imperva, etc.) and
escalate through configurable retry and fallback strategies.

New features on CrawlerRunConfig:
- max_retries: retry rounds when blocking is detected
- fallback_proxy_configs: list of fallback proxies tried each round
- fallback_fetch_function: async last-resort function returning raw HTML

New field on ProxyConfig:
- is_fallback: skip proxy on first attempt, activate only when blocked

Escalation chain per round: main proxy → fallback proxies in order.
After all rounds: fallback_fetch_function as last resort.

Detection uses tiered heuristics — structural HTML markers (high
confidence) trigger on any page, generic patterns only on short
error pages to avoid false positives.
2026-02-14 05:24:07 +00:00

316 lines
12 KiB
Python

"""
Unit tests for antibot_detector.is_blocked().
Tests are organized into:
- TRUE POSITIVES: Real block pages that MUST be detected
- TRUE NEGATIVES: Legitimate pages that MUST NOT be flagged
- EDGE CASES: Boundary conditions
"""
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../.."))
from crawl4ai.antibot_detector import is_blocked
# Running tallies for the whole script; updated only by check().
PASS = 0
FAIL = 0


def check(name, result, expected_blocked, expected_substr=None):
    """Compare one detector result with its expectation, print it, and tally it.

    Args:
        name: Label printed for this case.
        result: ``(blocked, reason)`` pair as returned by the detector.
        expected_blocked: The ``blocked`` flag the case must produce.
        expected_substr: Optional text that must appear (case-insensitively)
            in ``reason``. It is only checked when the detector reported a
            block; a falsy value disables the check.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter and prints a
        one-line verdict (three lines on failure).
    """
    global PASS, FAIL
    blocked, reason = result
    ok = blocked == expected_blocked
    if expected_substr and blocked:
        # A missing or non-string reason must be recorded as a failed case
        # rather than raising and aborting every remaining check.
        reason_text = "" if reason is None else str(reason)
        ok = ok and expected_substr.lower() in reason_text.lower()
    status = "PASS" if ok else "FAIL"
    if not ok:
        FAIL += 1
        print(f" {status}: {name}")
        print(f" got blocked={blocked}, reason={reason!r}")
        print(f" expected blocked={expected_blocked}" +
              (f", substr={expected_substr!r}" if expected_substr else ""))
    else:
        PASS += 1
        if blocked:
            print(f" {status}: {name} -> {reason}")
        else:
            print(f" {status}: {name} -> not blocked")
# =========================================================================
# TRUE POSITIVES — block pages the detector is required to flag
# =========================================================================
print("\n=== TRUE POSITIVES (must detect as blocked) ===\n")

# One row per case: (label, HTTP status, response body, text that must occur
# in the reported reason). A None in the last column means any reason is
# accepted as long as the page is flagged.
_BLOCKED_CASES = [
    # --- Akamai ---
    ("Akamai Reference #", 403,
     '<html><body>Access Denied\nYour request was blocked.\nReference #18.2d351ab8.1557333295.a4e16ab</body></html>',
     "Akamai"),
    ("Akamai Pardon Our Interruption", 403,
     '<html><head><title>Pardon Our Interruption</title></head><body><p>Please verify you are human</p></body></html>',
     "Pardon"),
    # Expected to be caught by the near-empty 403 rule or the Access Denied pattern.
    ("Akamai 403 short Access Denied", 403,
     '<html><body><h1>Access Denied</h1></body></html>',
     None),
    # --- Cloudflare ---
    ("Cloudflare challenge form", 403,
     '<html><body>\n'
     '<form id="challenge-form" action="/cdn-cgi/l/chk_jschl?__cf_chl_f_tk=abc123">\n'
     '<input type="hidden" name="jschl_vc" value="test"/>\n'
     '</form></body></html>',
     "Cloudflare challenge"),
    ("Cloudflare error 1020", 403,
     '<html><body>\n'
     '<div class="cf-wrapper"><span class="cf-error-code">1020</span></div>\n'
     '<p>Access denied</p></body></html>',
     "Cloudflare firewall"),
    ("Cloudflare IUAM script", 403,
     '<html><script src="/cdn-cgi/challenge-platform/h/g/orchestrate/jsch/v1"></script></html>',
     "Cloudflare JS challenge"),
    # Expected to be caught by the near-empty 403 rule or a Cloudflare pattern.
    ("Cloudflare Just a moment", 403,
     '<html><head><title>Just a moment...</title></head><body>Checking your browser</body></html>',
     None),
    ("Cloudflare Checking your browser (short 503)", 503,
     '<html><body>Checking your browser before accessing the site.</body></html>',
     "Cloudflare browser check"),
    # --- PerimeterX ---
    ("PerimeterX block page", 403,
     '<html><head><title>Access to This Page Has Been Blocked</title></head>\n'
     '<body><div id="px-captcha"></div>\n'
     "<script>window._pxAppId = 'PX12345';</script></body></html>",
     "PerimeterX"),
    ("PerimeterX captcha CDN", 403,
     '<html><body><script src="https://captcha.px-cdn.net/PX12345/captcha.js"></script></body></html>',
     "PerimeterX captcha"),
    # --- DataDome ---
    ("DataDome captcha delivery", 403,
     '<html><body><script>\n'
     "var dd = {'rt':'i','cid':'AHrlq...','host':'geo.captcha-delivery.com'};\n"
     '</script></body></html>',
     "DataDome"),
    # --- Imperva/Incapsula ---
    ("Imperva Incapsula Resource", 403,
     '<html><body><iframe src="/_Incapsula_Resource?incident_id=123&sess_id=abc"></iframe></body></html>',
     "Imperva"),
    ("Imperva incident ID", 200,
     '<html><body>Request unsuccessful. Incapsula incident ID: 12345-67890</body></html>',
     "Incapsula incident"),
    # --- Sucuri ---
    ("Sucuri firewall", 403,
     '<html><body><h1>Sucuri WebSite Firewall - Access Denied</h1></body></html>',
     "Sucuri"),
    # --- Kasada ---
    ("Kasada challenge", 403,
     '<html><script>KPSDK.scriptStart = KPSDK.now();</script></html>',
     "Kasada"),
    # --- HTTP 429 ---
    ("HTTP 429 rate limit", 429,
     '<html><body>Rate limit exceeded</body></html>',
     "429"),
    ("HTTP 429 empty body", 429, '', "429"),
    # --- Empty 200 ---
    ("HTTP 200 empty page", 200, '', "empty"),
    ("HTTP 200 whitespace only", 200, ' \n\n ', "empty"),
    # --- 403 near-empty ---
    ("HTTP 403 near-empty (10 bytes)", 403, '<html></html>', "403"),
]

# The detector is invoked inside the loop so each verdict is printed right
# after its own is_blocked() call, in table order.
for _label, _code, _markup, _needle in _BLOCKED_CASES:
    check(_label, is_blocked(_code, _markup), True, _needle)
# =========================================================================
# TRUE NEGATIVES — legitimate pages the detector must leave alone
# =========================================================================
print("\n=== TRUE NEGATIVES (must NOT detect as blocked) ===\n")

# One row per case: (label, HTTP status, response body). Every row is
# expected to come back as "not blocked".
_ALLOWED_CASES = [
    # --- Normal pages ---
    ("Normal 200 page (example.com size)", 200,
     '<html><head><title>Example</title></head><body>'
     + 'x' * 500
     + '</body></html>'),
    ("Normal 200 large page", 200,
     '<html><body>'
     + '<p>Some content here.</p>\n' * 5000
     + '</body></html>'),
    # --- Security articles: vendor names and block phrases appear as prose ---
    ("Article about bot detection (large page)", 200,
     '<html><head><title>How to Detect Bots</title></head><body>'
     '<h1>How to Detect Bots on Your Website</h1>'
     '<p>Anti-bot solutions like DataDome, PerimeterX, and Cloudflare '
     'help detect and block bot traffic. When a bot is detected, '
     'services show a CAPTCHA or Access Denied page. '
     'Common signals include blocked by security warnings.</p>'
     '<p>The g-recaptcha and h-captcha widgets are used for challenges.</p>'
     '<p>'
     + 'More article content. ' * 500
     + '</p>'
     '</body></html>'),
    ("DataDome marketing page (large)", 200,
     '<html><body><h1>DataDome Bot Protection</h1>'
     '<p>DataDome protects websites from bot attacks. '
     'Our solution detects automated traffic using advanced fingerprinting. '
     'Competitors like PerimeterX use window._pxAppId for tracking.</p>'
     '<p>'
     + 'Marketing content. ' * 1000
     + '</p>'
     '</body></html>'),
    # --- Login/signup pages that embed a CAPTCHA widget (not a block) ---
    ("Login page with reCAPTCHA (large page)", 200,
     '<html><head><title>Sign In</title></head><body>'
     '<nav>Home | Products | Contact</nav>'
     '<form action="/login" method="POST">'
     '<input name="email" type="email"/>'
     '<input name="password" type="password"/>'
     '<div class="g-recaptcha" data-sitekey="abc123"></div>'
     '<button type="submit">Sign In</button>'
     '</form>'
     '<footer>Copyright 2024</footer>'
     '<p>'
     + 'Page content. ' * 500
     + '</p>'
     '</body></html>'),
    ("Signup page with hCaptcha (large page)", 200,
     '<html><body>'
     '<h1>Create Account</h1>'
     '<form><div class="h-captcha" data-sitekey="xyz"></div></form>'
     '<p>'
     + 'Registration info. ' * 500
     + '</p>'
     '</body></html>'),
    # --- Ordinary 403 responses that are not anti-bot pages ---
    ("Apache directory listing denied (403, large-ish)", 403,
     '<html><head><title>403 Forbidden</title></head><body>'
     '<h1>Forbidden</h1>'
     "<p>You don't have permission to access this resource on this server.</p>"
     '<hr><address>Apache/2.4.41 (Ubuntu) Server at example.com Port 80</address>'
     '<p>'
     + 'Server info. ' * 500
     + '</p>'
     '</body></html>'),
    ("Nginx 403 (large page)", 403,
     '<html><head><title>403 Forbidden</title></head><body>'
     '<center><h1>403 Forbidden</h1></center>'
     '<hr><center>nginx/1.18.0</center>'
     '<p>'
     + 'Content. ' * 500
     + '</p>'
     '</body></html>'),
    ("API 403 auth required (JSON)", 403,
     '{"error": "Forbidden", "message": "Invalid API key", "code": 403}'),
    # --- Normal page that merely mentions Cloudflare in its footer ---
    ("Cloudflare-served normal page with footer", 200,
     '<html><body>'
     '<h1>Welcome to Our Site</h1>'
     '<p>This is a normal page served through Cloudflare CDN.</p>'
     '<footer>Performance & security by Cloudflare</footer>'
     '<p>'
     + 'Normal content. ' * 500
     + '</p>'
     '</body></html>'),
    # --- Small but legitimate pages ---
    ("Small valid 200 page (150 bytes)", 200,
     '<html><head><title>OK</title></head><body><p>Your request was processed successfully.</p></body></html>'),
    ("Small JSON 200 response", 200,
     '{"status": "ok", "data": {"id": 123, "name": "test"}, "timestamp": "2024-01-01T00:00:00Z"}'),
    ("Redirect page 200", 200,
     '<html><head><meta http-equiv="refresh" content="0;url=/dashboard"></head><body>Redirecting...</body></html>'),
    # --- Genuine 503 server errors ---
    ("Legitimate 503 maintenance (large)", 503,
     '<html><body><h1>Service Temporarily Unavailable</h1>'
     '<p>We are performing scheduled maintenance. Please try again later.</p>'
     '<p>'
     + 'Maintenance info. ' * 500
     + '</p>'
     '</body></html>'),
    # --- 200 with short but real content ---
    ("Short thank you page (200, 120 bytes)", 200,
     '<html><body><h1>Thank You!</h1><p>Your order has been placed. Confirmation email sent.</p></body></html>'),
]

# The detector is invoked inside the loop so each verdict is printed right
# after its own is_blocked() call, in table order.
for _label, _code, _markup in _ALLOWED_CASES:
    check(_label, is_blocked(_code, _markup), False)
# =========================================================================
# EDGE CASES — boundary conditions around status codes and page sizes
# =========================================================================
print("\n=== EDGE CASES ===\n")

# One row per case: (label, HTTP status, response body, expected blocked flag,
# text that must occur in the reason or None). The byte counts quoted in the
# labels are approximate; the real payload lengths are noted beside each row.
_BOUNDARY_CASES = [
    ("None status code + empty html", None, '', False, None),
    ("None status code + block content", None,
     '<html><body>Reference #18.2d351ab8.1557333295.a4e16ab</body></html>',
     True, "Akamai"),
    ("200 + tier1 pattern (Imperva deceptive 200)", 200,
     '<html><body>Request unsuccessful. Incapsula incident ID: 555-999</body></html>',
     True, "Incapsula"),
    # 25 + 4950 + 14 = 4989 characters.
    ("403 + 4999 bytes (just under threshold)", 403,
     '<html><body>Access Denied' + 'x' * 4950 + '</body></html>',
     True, "Access Denied"),
    # 27 + 4960 + 14 = 5001 characters.
    ("403 + 5001 bytes (just over threshold, no tier2 match)", 403,
     '<html><body>Some error page' + 'x' * 4960 + '</body></html>',
     False, None),
    # 31 + 9950 + 14 = 9995 characters.
    ("403 + 9999 bytes with generic block text", 403,
     '<html><body>blocked by security' + 'x' * 9950 + '</body></html>',
     True, "Blocked by security"),
    # 31 + 9970 + 14 = 10015 characters.
    ("403 + 10001 bytes with generic block text (too big for tier2)", 403,
     '<html><body>blocked by security' + 'x' * 9970 + '</body></html>',
     False, None),
    # 109 characters in total, of which 89 are non-whitespace; the case
    # expects the page to be reported as empty.
    ("200 + whitespace-padded but 89 bytes content (above threshold for meaningful)", 200,
     ' ' * 10 + 'x' * 89 + ' ' * 10,
     True, "empty"),
    ("200 + exactly 100 bytes stripped (at threshold)", 200,
     'x' * 100,
     False, None),
]

# The detector is invoked inside the loop so each verdict is printed right
# after its own is_blocked() call, in table order.
for _label, _code, _markup, _want_blocked, _needle in _BOUNDARY_CASES:
    check(_label, is_blocked(_code, _markup), _want_blocked, _needle)
# =========================================================================
# SUMMARY — print the totals and turn any failure into a non-zero exit code
# =========================================================================
_banner = "=" * 60
print(f"\n{_banner}")
print(f"RESULTS: {PASS} passed, {FAIL} failed out of {PASS + FAIL} tests")
print(_banner)
if FAIL == 0:
    print("ALL TESTS PASSED!")
else:
    print("SOME TESTS FAILED!")
    # Exit status 1 lets a calling shell or CI job see the failure.
    sys.exit(1)