Add Tier 3 structural integrity check to anti-bot detector

Catches silent blocks, anti-bot redirects, and incomplete renders that
pass pattern-based detection (Tiers 1/2) but are structurally broken:

- No <body> tag on pages under 50KB
- Minimal visible text after stripping scripts/styles/tags
- No semantic content elements (p, h1-6, article, section, li, td, a)
- Script-heavy shells with scripts but no real content

Uses signal scoring: a missing <body> tag blocks immediately; otherwise
2+ signals = blocked, and 1 signal on a small page (<5KB) = blocked.
Skips large pages and JSON/XML data responses.
This commit is contained in:
unclecode
2026-02-18 06:59:22 +00:00
parent c9cb0160cf
commit 13048a106b

View File

@@ -87,6 +87,20 @@ _TIER2_PATTERNS = [
_TIER2_MAX_SIZE = 10000 # Only check tier 2 patterns on pages under 10KB
# ---------------------------------------------------------------------------
# Tier 3: Structural integrity — catches silent blocks, anti-bot redirects,
# incomplete renders that pass pattern detection but are structurally broken
# ---------------------------------------------------------------------------
# Pages whose len(html) exceeds this are skipped by the Tier 3 check.
_STRUCTURAL_MAX_SIZE = 50_000

# Opening tags treated as real content: p, h1-h6, article, section, li, td, a.
# The trailing \b keeps <p from matching <pre and <li from matching <link.
_CONTENT_ELEMENTS_RE = re.compile(r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.I)

# Opening <script ...> tag only (used for counting scripts).
_SCRIPT_TAG_RE = re.compile(r'<script\b', re.I)

# Whole <style>...</style> block, non-greedy so each block is matched separately.
_STYLE_TAG_RE = re.compile(r'<style\b[\s\S]*?</style>', re.I)

# Whole <script>...</script> block, non-greedy so each block is matched separately.
_SCRIPT_BLOCK_RE = re.compile(r'<script\b[\s\S]*?</script>', re.I)

# Any single tag-like span: '<' up to the next '>'.
_TAG_RE = re.compile(r'<[^>]+>')

# Opening <body ...> tag.
_BODY_RE = re.compile(r'<body\b', re.I)
# ---------------------------------------------------------------------------
# Thresholds
# ---------------------------------------------------------------------------
@@ -102,6 +116,59 @@ def _looks_like_data(html: str) -> bool:
return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('<html') and not stripped.startswith('<!') and not stripped.startswith('<HTML')
def _structural_integrity_check(html: str) -> Tuple[bool, str]:
    """
    Tier 3: structural integrity check for pages that pass pattern detection
    but are structurally broken — incomplete renders, anti-bot redirects,
    empty shells.

    Only applies to pages no longer than ``_STRUCTURAL_MAX_SIZE`` characters
    that ``_looks_like_data`` does not classify as a data response.

    A page with no ``<body`` tag is blocked outright. Otherwise signals are
    collected and scored: two or more signals block the page; a single signal
    blocks it only when the page is shorter than 5000 characters.

    Args:
        html: Raw page markup.

    Returns:
        Tuple of (is_blocked, reason). ``reason`` is an empty string when the
        page is not blocked.
    """
    # NOTE: sizes below are len(html) in characters, although the reason
    # strings label them "bytes".
    html_len = len(html)

    # Skip large pages (unlikely to be block pages) and data responses
    if html_len > _STRUCTURAL_MAX_SIZE or _looks_like_data(html):
        return False, ""

    # No <body> tag at all — definitive structural failure, no scoring needed
    if not _BODY_RE.search(html):
        return True, f"Structural: no <body> tag ({html_len} bytes)"

    signals = []

    # Signal: minimal visible text after stripping scripts/styles/tags
    body_match = re.search(r'<body\b[^>]*>([\s\S]*)</body>', html, re.IGNORECASE)
    if body_match is None:
        # The </body> end tag is optional in HTML and is missing from truncated
        # renders. Take everything after the opening tag so that <head> text
        # (e.g. <title>) is not miscounted as visible body text.
        body_match = re.search(r'<body\b[^>]*>([\s\S]*)', html, re.IGNORECASE)
    # Last resort (e.g. the opening <body tag itself is cut off before '>'):
    # measure the whole document.
    body_content = body_match.group(1) if body_match else html

    without_scripts = _SCRIPT_BLOCK_RE.sub('', body_content)
    without_styles = _STYLE_TAG_RE.sub('', without_scripts)
    visible_text = _TAG_RE.sub('', without_styles).strip()
    visible_len = len(visible_text)
    if visible_len < 50:
        signals.append("minimal_text")

    # Signal: no content elements (semantic HTML) anywhere in the document
    content_elements = len(_CONTENT_ELEMENTS_RE.findall(html))
    if content_elements == 0:
        signals.append("no_content_elements")

    # Signal: script-heavy shell — scripts present but no content.
    # This requires content_elements == 0, so it always accompanies
    # "no_content_elements" and therefore always reaches the 2-signal threshold.
    script_count = len(_SCRIPT_TAG_RE.findall(html))
    if script_count > 0 and content_elements == 0 and visible_len < 100:
        signals.append("script_heavy_shell")

    # Scoring
    signal_count = len(signals)
    if signal_count >= 2:
        return True, f"Structural: {', '.join(signals)} ({html_len} bytes, {visible_len} chars visible)"
    if signal_count == 1 and html_len < 5000:
        return True, f"Structural: {signals[0]} on small page ({html_len} bytes, {visible_len} chars visible)"
    return False, ""
def is_blocked(
status_code: Optional[int],
html: str,
@@ -113,6 +180,7 @@ def is_blocked(
Uses layered detection to maximize coverage while minimizing false positives:
- Tier 1 patterns (structural markers) trigger on any page size
- Tier 2 patterns (generic terms) only trigger on short pages (< 10KB)
- Tier 3 structural integrity catches silent blocks and empty shells
- Status-code checks require corroborating content signals
Args:
@@ -161,4 +229,9 @@ def is_blocked(
if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html):
return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200"
# --- Tier 3: Structural integrity (catches silent blocks, redirects, incomplete renders) ---
_blocked, _reason = _structural_integrity_check(html)
if _blocked:
return True, _reason
return False, ""