""" Unit tests for antibot_detector.is_blocked(). Tests are organized into: - TRUE POSITIVES: Real block pages that MUST be detected - TRUE NEGATIVES: Legitimate pages that MUST NOT be flagged - EDGE CASES: Boundary conditions """ import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) from crawl4ai.antibot_detector import is_blocked PASS = 0 FAIL = 0 def check(name, result, expected_blocked, expected_substr=None): global PASS, FAIL blocked, reason = result ok = blocked == expected_blocked if expected_substr and blocked: ok = ok and expected_substr.lower() in reason.lower() status = "PASS" if ok else "FAIL" if not ok: FAIL += 1 print(f" {status}: {name}") print(f" got blocked={blocked}, reason={reason!r}") print(f" expected blocked={expected_blocked}" + (f", substr={expected_substr!r}" if expected_substr else "")) else: PASS += 1 if blocked: print(f" {status}: {name} -> {reason}") else: print(f" {status}: {name} -> not blocked") # ========================================================================= # TRUE POSITIVES — real block pages that MUST be detected # ========================================================================= print("\n=== TRUE POSITIVES (must detect as blocked) ===\n") # --- Akamai --- check("Akamai Reference #", is_blocked(403, 'Access Denied\nYour request was blocked.\nReference #18.2d351ab8.1557333295.a4e16ab'), True, "Akamai") check("Akamai Pardon Our Interruption", is_blocked(403, 'Pardon Our Interruption

Please verify you are human

'), True, "Pardon") check("Akamai 403 short Access Denied", is_blocked(403, '

Access Denied

'), True) # Detected via near-empty 403 or Access Denied pattern # --- Cloudflare --- check("Cloudflare challenge form", is_blocked(403, '''
'''), True, "Cloudflare challenge") check("Cloudflare error 1020", is_blocked(403, '''
1020

Access denied

'''), True, "Cloudflare firewall") check("Cloudflare IUAM script", is_blocked(403, ''), True, "Cloudflare JS challenge") check("Cloudflare Just a moment", is_blocked(403, 'Just a moment...Checking your browser'), True) # Detected via near-empty 403 or Cloudflare pattern check("Cloudflare Checking your browser (short 503)", is_blocked(503, 'Checking your browser before accessing the site.'), True, "Cloudflare browser check") # --- PerimeterX --- check("PerimeterX block page", is_blocked(403, '''Access to This Page Has Been Blocked
'''), True, "PerimeterX") check("PerimeterX captcha CDN", is_blocked(403, ''), True, "PerimeterX captcha") # --- DataDome --- check("DataDome captcha delivery", is_blocked(403, ''''''), True, "DataDome") # --- Imperva/Incapsula --- check("Imperva Incapsula Resource", is_blocked(403, ''), True, "Imperva") check("Imperva incident ID", is_blocked(200, 'Request unsuccessful. Incapsula incident ID: 12345-67890'), True, "Incapsula incident") # --- Sucuri --- check("Sucuri firewall", is_blocked(403, '

Sucuri WebSite Firewall - Access Denied

'), True, "Sucuri") # --- Kasada --- check("Kasada challenge", is_blocked(403, ''), True, "Kasada") # --- HTTP 429 --- check("HTTP 429 rate limit", is_blocked(429, 'Rate limit exceeded'), True, "429") check("HTTP 429 empty body", is_blocked(429, ''), True, "429") # --- Empty 200 --- check("HTTP 200 empty page", is_blocked(200, ''), True, "empty") check("HTTP 200 whitespace only", is_blocked(200, ' \n\n '), True, "empty") # --- 403 near-empty --- check("HTTP 403 near-empty (10 bytes)", is_blocked(403, ''), True, "403") # ========================================================================= # TRUE NEGATIVES — legitimate pages that MUST NOT be flagged # ========================================================================= print("\n=== TRUE NEGATIVES (must NOT detect as blocked) ===\n") # --- Normal pages --- check("Normal 200 page (example.com size)", is_blocked(200, 'Example' + 'x' * 500 + ''), False) check("Normal 200 large page", is_blocked(200, '' + '

Some content here.

\n' * 5000 + ''), False) # --- Security articles (false positive trap!) --- check("Article about bot detection (large page)", is_blocked(200, 'How to Detect Bots' + '

How to Detect Bots on Your Website

' + '

Anti-bot solutions like DataDome, PerimeterX, and Cloudflare ' + 'help detect and block bot traffic. When a bot is detected, ' + 'services show a CAPTCHA or Access Denied page. ' + 'Common signals include blocked by security warnings.

' + '

The g-recaptcha and h-captcha widgets are used for challenges.

' + '

' + 'More article content. ' * 500 + '

' + ''), False) check("DataDome marketing page (large)", is_blocked(200, '

DataDome Bot Protection

' + '

DataDome protects websites from bot attacks. ' + 'Our solution detects automated traffic using advanced fingerprinting. ' + 'Competitors like PerimeterX use window._pxAppId for tracking.

' + '

' + 'Marketing content. ' * 1000 + '

' + ''), False) # --- Login pages with CAPTCHA (not a block!) --- check("Login page with reCAPTCHA (large page)", is_blocked(200, 'Sign In' + '' + '
' + '' + '' + '
' + '' + '
' + '' + '

' + 'Page content. ' * 500 + '

' + ''), False) check("Signup page with hCaptcha (large page)", is_blocked(200, '' + '

Create Account

' + '
' + '

' + 'Registration info. ' * 500 + '

' + ''), False) # --- Legitimate 403 pages (not anti-bot) --- check("Apache directory listing denied (403, large-ish)", is_blocked(403, '403 Forbidden' + '

Forbidden

' + '

You don\'t have permission to access this resource on this server.

' + '
Apache/2.4.41 (Ubuntu) Server at example.com Port 80
' + '

' + 'Server info. ' * 500 + '

' + ''), False) check("Nginx 403 (large page)", is_blocked(403, '403 Forbidden' + '

403 Forbidden

' + '
nginx/1.18.0
' + '

' + 'Content. ' * 500 + '

' + ''), False) check("API 403 auth required (JSON)", is_blocked(403, '{"error": "Forbidden", "message": "Invalid API key", "code": 403}'), False) # --- Cloudflare-served normal pages (not blocked!) --- check("Cloudflare-served normal page with footer", is_blocked(200, '' + '

Welcome to Our Site

' + '

This is a normal page served through Cloudflare CDN.

' + '' + '

' + 'Normal content. ' * 500 + '

' + ''), False) # --- Small but legitimate pages --- check("Small valid 200 page (150 bytes)", is_blocked(200, 'OK

Your request was processed successfully.

'), False) check("Small JSON 200 response", is_blocked(200, '{"status": "ok", "data": {"id": 123, "name": "test"}, "timestamp": "2024-01-01T00:00:00Z"}'), False) check("Redirect page 200", is_blocked(200, 'Redirecting...'), False) # --- 503 legitimate server errors --- check("Legitimate 503 maintenance (large)", is_blocked(503, '

Service Temporarily Unavailable

' + '

We are performing scheduled maintenance. Please try again later.

' + '

' + 'Maintenance info. ' * 500 + '

' + ''), False) # --- 200 with short but real content --- check("Short thank you page (200, 120 bytes)", is_blocked(200, '

Thank You!

Your order has been placed. Confirmation email sent.

'), False) # ========================================================================= # EDGE CASES # ========================================================================= print("\n=== EDGE CASES ===\n") check("None status code + empty html", is_blocked(None, ''), False) check("None status code + block content", is_blocked(None, 'Reference #18.2d351ab8.1557333295.a4e16ab'), True, "Akamai") check("200 + tier1 pattern (Imperva deceptive 200)", is_blocked(200, 'Request unsuccessful. Incapsula incident ID: 555-999'), True, "Incapsula") check("403 + 4999 bytes (just under threshold)", is_blocked(403, 'Access Denied' + 'x' * 4950 + ''), True, "Access Denied") check("403 + 5001 bytes (just over threshold, no tier2 match)", is_blocked(403, 'Some error page' + 'x' * 4960 + ''), False) check("403 + 9999 bytes with generic block text", is_blocked(403, 'blocked by security' + 'x' * 9950 + ''), True, "Blocked by security") check("403 + 10001 bytes with generic block text (too big for tier2)", is_blocked(403, 'blocked by security' + 'x' * 9970 + ''), False) check("200 + whitespace-padded but 89 bytes content (above threshold for meaningful)", is_blocked(200, ' ' * 10 + 'x' * 89 + ' ' * 10), True, "empty") check("200 + exactly 100 bytes stripped (at threshold)", is_blocked(200, 'x' * 100), False) # ========================================================================= # SUMMARY # ========================================================================= print(f"\n{'=' * 60}") print(f"RESULTS: {PASS} passed, {FAIL} failed out of {PASS + FAIL} tests") print(f"{'=' * 60}") if FAIL > 0: print("SOME TESTS FAILED!") sys.exit(1) else: print("ALL TESTS PASSED!")