Add Tier 3 structural integrity check to anti-bot detector
Catches silent blocks, anti-bot redirects, and incomplete renders that pass pattern-based detection (Tiers 1/2) but are structurally broken:
- No <body> tag on pages under 50KB
- Minimal visible text after stripping scripts/styles/tags
- No semantic content elements (p, h1-6, article, section, li, td, a)
- Script-heavy shells with scripts but no real content

Uses signal scoring: 2+ signals = blocked, 1 signal on a small page (<5KB) = blocked. Skips large pages and JSON/XML data responses.
This commit is contained in:
@@ -87,6 +87,20 @@ _TIER2_PATTERNS = [
|
||||
|
||||
_TIER2_MAX_SIZE = 10000  # Only check tier 2 patterns on pages under 10KB

# ---------------------------------------------------------------------------
# Tier 3: Structural integrity — catches silent blocks, anti-bot redirects,
# incomplete renders that pass pattern detection but are structurally broken
# ---------------------------------------------------------------------------
_STRUCTURAL_MAX_SIZE = 50000  # Only check pages under 50KB

# Opening tags treated as evidence of real page content. The trailing \b keeps
# longer tag names that merely share a prefix (e.g. <link>, <pre>) from matching.
_CONTENT_ELEMENTS_RE = re.compile(
    r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.IGNORECASE
)

# Any opening <script> tag (inline or external).
_SCRIPT_TAG_RE = re.compile(r'<script\b', re.IGNORECASE)

# Whole <style>/<script> blocks, removed before measuring visible text.
# The end tag may carry whitespace before '>' ("</script >"), and a block that
# is never closed (truncated render) runs to the end of the document; both
# must be stripped, otherwise CSS/JS source would be counted as visible text.
_STYLE_TAG_RE = re.compile(
    r'<style\b[\s\S]*?(?:</style\s*>|\Z)', re.IGNORECASE
)
_SCRIPT_BLOCK_RE = re.compile(
    r'<script\b[\s\S]*?(?:</script\s*>|\Z)', re.IGNORECASE
)

# Any remaining markup tag.
_TAG_RE = re.compile(r'<[^>]+>')

# Presence of an opening <body> tag.
_BODY_RE = re.compile(r'<body\b', re.IGNORECASE)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Thresholds
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -102,6 +116,59 @@ def _looks_like_data(html: str) -> bool:
|
||||
return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('<html') and not stripped.startswith('<!') and not stripped.startswith('<HTML')
|
||||
|
||||
|
||||
def _structural_integrity_check(html: str) -> Tuple[bool, str]:
    """
    Tier 3: Structural integrity check for pages that pass pattern detection
    but are structurally broken — incomplete renders, anti-bot redirects,
    empty shells.

    Pages longer than ``_STRUCTURAL_MAX_SIZE`` and responses accepted by
    ``_looks_like_data`` are never flagged. A document without a ``<body>``
    tag is blocked immediately. Otherwise three signals are scored:

    - ``minimal_text``: fewer than 50 characters of visible body text after
      script blocks, style blocks and tags are removed and whitespace runs
      are collapsed
    - ``no_content_elements``: ``_CONTENT_ELEMENTS_RE`` finds nothing
    - ``script_heavy_shell``: at least one ``<script>`` tag, no content
      elements, and fewer than 100 visible characters

    Two or more signals block the page; a single signal blocks it only when
    the page is shorter than 5000 characters.

    Args:
        html: Response body to inspect.

    Returns:
        Tuple of (is_blocked, reason). ``reason`` is empty when not blocked.
    """
    html_len = len(html)

    # Skip large pages (unlikely to be block pages) and data responses
    if html_len > _STRUCTURAL_MAX_SIZE or _looks_like_data(html):
        return False, ""

    # Signal 1: No <body> tag — definitive structural failure
    if not _BODY_RE.search(html):
        return True, f"Structural: no <body> tag ({html_len} bytes)"

    # Signal 2: Minimal visible text after stripping scripts/styles/tags.
    # Prefer the span between <body ...> and the last </body>. A truncated
    # render may never reach </body>; in that case use everything after the
    # opening tag so <head> text such as <title> is not counted as visible.
    body_match = re.search(r'<body\b[^>]*>([\s\S]*)</body>', html, re.IGNORECASE)
    if body_match is None:
        body_match = re.search(r'<body\b[^>]*>([\s\S]*)', html, re.IGNORECASE)
    body_content = body_match.group(1) if body_match else html

    without_scripts = _SCRIPT_BLOCK_RE.sub('', body_content)
    without_styles = _STYLE_TAG_RE.sub('', without_scripts)
    # Collapse whitespace runs: indentation and newlines left between sparse
    # text fragments are layout, not content, and must not inflate the count.
    visible_text = ' '.join(_TAG_RE.sub('', without_styles).split())
    visible_len = len(visible_text)

    signals = []
    if visible_len < 50:
        signals.append("minimal_text")

    # Signal 3: No content elements (semantic HTML)
    has_content_elements = _CONTENT_ELEMENTS_RE.search(html) is not None
    if not has_content_elements:
        signals.append("no_content_elements")

    # Signal 4: Script-heavy shell — scripts present but no content
    has_scripts = _SCRIPT_TAG_RE.search(html) is not None
    if has_scripts and not has_content_elements and visible_len < 100:
        signals.append("script_heavy_shell")

    # Scoring
    signal_count = len(signals)
    if signal_count >= 2:
        return True, f"Structural: {', '.join(signals)} ({html_len} bytes, {visible_len} chars visible)"

    # A lone signal is only trusted on small pages
    if signal_count == 1 and html_len < 5000:
        return True, f"Structural: {signals[0]} on small page ({html_len} bytes, {visible_len} chars visible)"

    return False, ""
|
||||
|
||||
|
||||
def is_blocked(
|
||||
status_code: Optional[int],
|
||||
html: str,
|
||||
@@ -113,6 +180,7 @@ def is_blocked(
|
||||
Uses layered detection to maximize coverage while minimizing false positives:
|
||||
- Tier 1 patterns (structural markers) trigger on any page size
|
||||
- Tier 2 patterns (generic terms) only trigger on short pages (< 10KB)
|
||||
- Tier 3 structural integrity catches silent blocks and empty shells
|
||||
- Status-code checks require corroborating content signals
|
||||
|
||||
Args:
|
||||
@@ -161,4 +229,9 @@ def is_blocked(
|
||||
if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html):
|
||||
return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200"
|
||||
|
||||
# --- Tier 3: Structural integrity (catches silent blocks, redirects, incomplete renders) ---
|
||||
_blocked, _reason = _structural_integrity_check(html)
|
||||
if _blocked:
|
||||
return True, _reason
|
||||
|
||||
return False, ""
|
||||
|
||||
Reference in New Issue
Block a user