From 13048a106bd1e43299be7740576d9ccb3e6977cf Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 18 Feb 2026 06:59:22 +0000 Subject: [PATCH] Add Tier 3 structural integrity check to anti-bot detector Catches silent blocks, anti-bot redirects, and incomplete renders that pass pattern-based detection (Tiers 1/2) but are structurally broken: - No tag on pages under 50KB - Minimal visible text after stripping scripts/styles/tags - No semantic content elements (p, h1-6, article, section, li, td, a) - Script-heavy shells with scripts but no real content Uses signal scoring: 2+ signals = blocked, 1 signal on small page (<5KB) = blocked. Skips large pages and JSON/XML data responses. --- crawl4ai/antibot_detector.py | 73 ++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/crawl4ai/antibot_detector.py b/crawl4ai/antibot_detector.py index e2831e46..07dac28f 100644 --- a/crawl4ai/antibot_detector.py +++ b/crawl4ai/antibot_detector.py @@ -87,6 +87,20 @@ _TIER2_PATTERNS = [ _TIER2_MAX_SIZE = 10000 # Only check tier 2 patterns on pages under 10KB +# --------------------------------------------------------------------------- +# Tier 3: Structural integrity — catches silent blocks, anti-bot redirects, +# incomplete renders that pass pattern detection but are structurally broken +# --------------------------------------------------------------------------- +_STRUCTURAL_MAX_SIZE = 50000 # Only check pages under 50KB +_CONTENT_ELEMENTS_RE = re.compile( + r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.IGNORECASE +) +_SCRIPT_TAG_RE = re.compile(r'', re.IGNORECASE) +_SCRIPT_BLOCK_RE = re.compile(r'', re.IGNORECASE) +_TAG_RE = re.compile(r'<[^>]+>') +_BODY_RE = re.compile(r' bool: return stripped[0] in ('{', '[', '<' ) and not stripped.startswith(' Tuple[bool, str]: + """ + Tier 3: Structural integrity check for pages that pass pattern detection + but are structurally broken — incomplete renders, anti-bot redirects, empty shells. + + Only applies to pages < 50KB that aren't JSON/XML. + + Returns: + Tuple of (is_blocked, reason). + """ + html_len = len(html) + + # Skip large pages (unlikely to be block pages) and data responses + if html_len > _STRUCTURAL_MAX_SIZE or _looks_like_data(html): + return False, "" + + signals = [] + + # Signal 1: No tag — definitive structural failure + if not _BODY_RE.search(html): + return True, f"Structural: no tag ({html_len} bytes)" + + # Signal 2: Minimal visible text after stripping scripts/styles/tags + body_match = re.search(r']*>([\s\S]*)', html, re.IGNORECASE) + body_content = body_match.group(1) if body_match else html + stripped = _SCRIPT_BLOCK_RE.sub('', body_content) + stripped = _STYLE_TAG_RE.sub('', stripped) + visible_text = _TAG_RE.sub('', stripped).strip() + visible_len = len(visible_text) + if visible_len < 50: + signals.append("minimal_text") + + # Signal 3: No content elements (semantic HTML) + content_elements = len(_CONTENT_ELEMENTS_RE.findall(html)) + if content_elements == 0: + signals.append("no_content_elements") + + # Signal 4: Script-heavy shell — scripts present but no content + script_count = len(_SCRIPT_TAG_RE.findall(html)) + if script_count > 0 and content_elements == 0 and visible_len < 100: + signals.append("script_heavy_shell") + + # Scoring + signal_count = len(signals) + if signal_count >= 2: + return True, f"Structural: {', '.join(signals)} ({html_len} bytes, {visible_len} chars visible)" + + if signal_count == 1 and html_len < 5000: + return True, f"Structural: {signals[0]} on small page ({html_len} bytes, {visible_len} chars visible)" + + return False, "" + + def is_blocked( status_code: Optional[int], html: str, @@ -113,6 +180,7 @@ def is_blocked( Uses layered detection to maximize coverage while minimizing false positives: - Tier 1 patterns (structural markers) trigger on any page size - Tier 2 patterns (generic terms) only trigger on short pages (< 10KB) + - Tier 3 structural integrity catches silent blocks and empty shells - Status-code checks require corroborating content signals Args: @@ -161,4 +229,9 @@ def is_blocked( if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html): return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200" + # --- Tier 3: Structural integrity (catches silent blocks, redirects, incomplete renders) --- + _blocked, _reason = _structural_integrity_check(html) + if _blocked: + return True, _reason + return False, ""