From 13048a106bd1e43299be7740576d9ccb3e6977cf Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 18 Feb 2026 06:59:22 +0000
Subject: [PATCH] Add Tier 3 structural integrity check to anti-bot detector

Catches silent blocks, anti-bot redirects, and incomplete renders that
pass pattern-based detection (Tiers 1/2) but are structurally broken:

- No <body> tag on pages under 50KB
- Minimal visible text after stripping scripts/styles/tags
- No semantic content elements (p, h1-6, article, section, li, td, a)
- Script-heavy shells with scripts but no real content

A missing <body> blocks immediately. The remaining checks use signal
scoring: 2+ signals = blocked, 1 signal on a small page (<5KB) = blocked.
Skips large pages (>50KB) and JSON/XML data responses.
---
 crawl4ai/antibot_detector.py | 73 ++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
diff --git a/crawl4ai/antibot_detector.py b/crawl4ai/antibot_detector.py
index e2831e46..07dac28f 100644
--- a/crawl4ai/antibot_detector.py
+++ b/crawl4ai/antibot_detector.py
@@ -87,6 +87,20 @@ _TIER2_PATTERNS = [
 
 _TIER2_MAX_SIZE = 10000  # Only check tier 2 patterns on pages under 10KB
 
+# ---------------------------------------------------------------------------
+# Tier 3: Structural integrity — catches silent blocks, anti-bot redirects, and
+# incomplete renders that pass pattern detection but are structurally broken
+# ---------------------------------------------------------------------------
+_STRUCTURAL_MAX_SIZE = 50000  # Only check pages under 50KB
+_CONTENT_ELEMENTS_RE = re.compile(  # opening tags treated as real content
+    r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.IGNORECASE
+)
+_SCRIPT_TAG_RE = re.compile(r'<script\b', re.IGNORECASE)  # a <script ...> opening tag
+_STYLE_TAG_RE = re.compile(r'<style\b[\s\S]*?</style>', re.IGNORECASE)  # whole <style> block
+_SCRIPT_BLOCK_RE = re.compile(r'<script\b[\s\S]*?</script>', re.IGNORECASE)  # whole <script> block
+_TAG_RE = re.compile(r'<[^>]+>')  # any single tag; removing these leaves the text
+_BODY_RE = re.compile(r'<body\b', re.IGNORECASE)  # a <body ...> opening tag
+
 # ---------------------------------------------------------------------------
 # Thresholds
 # ---------------------------------------------------------------------------
@@ -102,6 +116,59 @@ def _looks_like_data(html: str) -> bool:
     return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('<html') and not stripped.startswith('<!') and not stripped.startswith('<HTML')
 
 
+def _structural_integrity_check(html: str) -> Tuple[bool, str]:
+    """
+    Tier 3: flag pages that pass pattern detection but are structurally broken
+    (incomplete renders, anti-bot redirects, empty script shells).
+
+    Pages longer than _STRUCTURAL_MAX_SIZE, and responses _looks_like_data()
+    accepts, are never flagged. A missing <body> blocks outright; otherwise
+    2+ signals block, or 1 signal when the page is under 5000 characters.
+
+    Args:
+        html: Raw page markup to inspect.
+
+    Returns:
+        Tuple of (is_blocked, reason); reason is "" when not blocked.
+    """
+    html_len = len(html)
+
+    # Skip large pages (unlikely to be block pages) and data responses
+    if html_len > _STRUCTURAL_MAX_SIZE or _looks_like_data(html):
+        return False, ""
+
+    # Signal 1: No <body> tag — definitive structural failure
+    if not _BODY_RE.search(html):
+        return True, f"Structural: no <body> tag ({html_len} bytes)"
+
+    # Body markup; a truncated render without </body> still yields its body
+    # instead of falling back to the whole document (and its <title> text).
+    body_match = re.search(
+        r'<body\b[^>]*>(?:([\s\S]*)</body>|([\s\S]*))', html, re.IGNORECASE
+    )
+    body = (body_match.group(1) or body_match.group(2) or "") if body_match else html
+    markup = _STYLE_TAG_RE.sub('', _SCRIPT_BLOCK_RE.sub('', body))
+    visible_len = len(_TAG_RE.sub('', markup).strip())
+    # Count elements outside <script>/<style> so tags in JS strings are ignored
+    content_elements = len(_CONTENT_ELEMENTS_RE.findall(markup))
+
+    signals = []
+    if visible_len < 50:  # Signal 2: minimal visible text
+        signals.append("minimal_text")
+    if content_elements == 0:  # Signal 3: no semantic content elements
+        signals.append("no_content_elements")
+    # Signal 4: Script-heavy shell — scripts present but no content
+    if _SCRIPT_TAG_RE.search(html) and content_elements == 0 and visible_len < 100:
+        signals.append("script_heavy_shell")
+
+    # Scoring: two signals always block; one is enough only on a small page
+    if len(signals) >= 2:
+        return True, f"Structural: {', '.join(signals)} ({html_len} bytes, {visible_len} chars visible)"
+    if signals and html_len < 5000:
+        return True, f"Structural: {signals[0]} on small page ({html_len} bytes, {visible_len} chars visible)"
+    return False, ""
+
+
 def is_blocked(
     status_code: Optional[int],
     html: str,
@@ -113,6 +180,7 @@ def is_blocked(
     Uses layered detection to maximize coverage while minimizing false positives:
     - Tier 1 patterns (structural markers) trigger on any page size
     - Tier 2 patterns (generic terms) only trigger on short pages (< 10KB)
+    - Tier 3 structural integrity catches silent blocks and empty shells
     - Status-code checks require corroborating content signals
 
     Args:
@@ -161,4 +229,9 @@ def is_blocked(
         if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html):
             return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200"
 
+    # --- Tier 3: Structural integrity (catches silent blocks, redirects, incomplete renders) ---
+    _blocked, _reason = _structural_integrity_check(html)
+    if _blocked:
+        return True, _reason
+
     return False, ""