""" Unit tests for antibot_detector.is_blocked(). Tests are organized into: - TRUE POSITIVES: Real block pages that MUST be detected - TRUE NEGATIVES: Legitimate pages that MUST NOT be flagged - EDGE CASES: Boundary conditions """ import sys, os sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../..")) from crawl4ai.antibot_detector import is_blocked PASS = 0 FAIL = 0 def check(name, result, expected_blocked, expected_substr=None): global PASS, FAIL blocked, reason = result ok = blocked == expected_blocked if expected_substr and blocked: ok = ok and expected_substr.lower() in reason.lower() status = "PASS" if ok else "FAIL" if not ok: FAIL += 1 print(f" {status}: {name}") print(f" got blocked={blocked}, reason={reason!r}") print(f" expected blocked={expected_blocked}" + (f", substr={expected_substr!r}" if expected_substr else "")) else: PASS += 1 if blocked: print(f" {status}: {name} -> {reason}") else: print(f" {status}: {name} -> not blocked") # ========================================================================= # TRUE POSITIVES — real block pages that MUST be detected # ========================================================================= print("\n=== TRUE POSITIVES (must detect as blocked) ===\n") # --- Akamai --- check("Akamai Reference #", is_blocked(403, '
Access Denied\nYour request was blocked.\nReference #18.2d351ab8.1557333295.a4e16ab'), True, "Akamai") check("Akamai Pardon Our Interruption", is_blocked(403, 'Please verify you are human
'), True, "Pardon") check("Akamai 403 short Access Denied", is_blocked(403, 'Access denied
'''), True, "Cloudflare firewall") check("Cloudflare IUAM script", is_blocked(403, ''), True, "Cloudflare JS challenge") check("Cloudflare Just a moment", is_blocked(403, 'Some content here.
\n' * 5000 + ''), False) # --- Security articles (false positive trap!) --- check("Article about bot detection (large page)", is_blocked(200, 'Anti-bot solutions like DataDome, PerimeterX, and Cloudflare ' + 'help detect and block bot traffic. When a bot is detected, ' + 'services show a CAPTCHA or Access Denied page. ' + 'Common signals include blocked by security warnings.
' + 'The g-recaptcha and h-captcha widgets are used for challenges.
' + '' + 'More article content. ' * 500 + '
' + ''), False) check("DataDome marketing page (large)", is_blocked(200, 'DataDome protects websites from bot attacks. ' + 'Our solution detects automated traffic using advanced fingerprinting. ' + 'Competitors like PerimeterX use window._pxAppId for tracking.
' + '' + 'Marketing content. ' * 1000 + '
' + ''), False) # --- Login pages with CAPTCHA (not a block!) --- check("Login page with reCAPTCHA (large page)", is_blocked(200, '' + 'Page content. ' * 500 + '
' + ''), False) check("Signup page with hCaptcha (large page)", is_blocked(200, '' + '' + 'Registration info. ' * 500 + '
' + ''), False) # --- Legitimate 403 pages (not anti-bot) --- check("Apache directory listing denied (403, large-ish)", is_blocked(403, 'You don\'t have permission to access this resource on this server.
' + '' + 'Server info. ' * 500 + '
' + ''), False) check("Nginx 403 (large page)", is_blocked(403, '' + 'Content. ' * 500 + '
' + ''), False) check("API 403 auth required (JSON)", is_blocked(403, '{"error": "Forbidden", "message": "Invalid API key", "code": 403}'), False) # --- Cloudflare-served normal pages (not blocked!) --- check("Cloudflare-served normal page with footer", is_blocked(200, '' + 'This is a normal page served through Cloudflare CDN.
' + '' + '' + 'Normal content. ' * 500 + '
' + ''), False) # --- Small but legitimate pages --- check("Small valid 200 page (150 bytes)", is_blocked(200, 'Your request was processed successfully.
'), False) check("Small JSON 200 response", is_blocked(200, '{"status": "ok", "data": {"id": 123, "name": "test"}, "timestamp": "2024-01-01T00:00:00Z"}'), False) check("Redirect page 200", is_blocked(200, 'Redirecting...'), False) # --- 503 legitimate server errors --- check("Legitimate 503 maintenance (large)", is_blocked(503, 'We are performing scheduled maintenance. Please try again later.
' + '' + 'Maintenance info. ' * 500 + '
' + ''), False) # --- 200 with short but real content --- check("Short thank you page (200, 120 bytes)", is_blocked(200, 'Your order has been placed. Confirmation email sent.
'), False)


# =========================================================================
# EDGE CASES
# =========================================================================
print("\n=== EDGE CASES ===\n")

# Missing status code with an empty body must not be reported as blocked.
check("None status code + empty html", is_blocked(None, ''), False)

# Missing status code, but the body carries a reference string; the case
# expects a block verdict whose reason mentions "Akamai".
check("None status code + block content", is_blocked(None, 'Reference #18.2d351ab8.1557333295.a4e16ab'), True, "Akamai")

# A 200 response whose body carries an Incapsula incident message must still
# be reported as blocked, with "Incapsula" in the reason.
check("200 + tier1 pattern (Imperva deceptive 200)", is_blocked(200, 'Request unsuccessful. Incapsula incident ID: 555-999'), True, "Incapsula")

# Size-boundary cases for 403 responses.
# NOTE(review): the byte counts quoted in the four case names below are larger
# than the literals as written here (e.g. 13 + 4950 characters for the first
# one, not 4999) — presumably markup is missing from these literals; confirm
# the fixtures really sit on the intended side of each size boundary.
check("403 + 4999 bytes (just under threshold)", is_blocked(403, 'Access Denied' + 'x' * 4950 + ''), True, "Access Denied")
check("403 + 5001 bytes (just over threshold, no tier2 match)", is_blocked(403, 'Some error page' + 'x' * 4960 + ''), False)
check("403 + 9999 bytes with generic block text", is_blocked(403, 'blocked by security' + 'x' * 9950 + ''), True, "Blocked by security")
check("403 + 10001 bytes with generic block text (too big for tier2)", is_blocked(403, 'blocked by security' + 'x' * 9970 + ''), False)

# Near-empty 200 responses.
# NOTE(review): this case name says "above threshold", yet it expects a block
# verdict with "empty" in the reason, and the following case treats 100 bytes
# as the threshold that is NOT blocked — so 89 bytes looks like it is *below*
# the threshold. Confirm and reword the case name if so.
check("200 + whitespace-padded but 89 bytes content (above threshold for meaningful)", is_blocked(200, ' ' * 10 + 'x' * 89 + ' ' * 10), True, "empty")
check("200 + exactly 100 bytes stripped (at threshold)", is_blocked(200, 'x' * 100), False)


# =========================================================================
# SUMMARY
# =========================================================================
print(f"\n{'=' * 60}")
print(f"RESULTS: {PASS} passed, {FAIL} failed out of {PASS + FAIL} tests")
print(f"{'=' * 60}")

# Exit with status 1 when any case failed, so a caller of this script can
# detect the failure from the exit code.
if FAIL > 0:
    print("SOME TESTS FAILED!")
    sys.exit(1)
else:
    print("ALL TESTS PASSED!")