Add Tier 3 structural integrity check to anti-bot detector
Catches silent blocks, anti-bot redirects, and incomplete renders that pass pattern-based detection (Tiers 1/2) but are structurally broken:
- No <body> tag on pages under 50KB
- Minimal visible text after stripping scripts/styles/tags
- No semantic content elements (p, h1-6, article, section, li, td, a)
- Script-heavy shells: scripts are present but there is no real content

Uses signal scoring: 2+ signals = blocked; 1 signal on a small page (<5KB) = blocked. Skips large pages and JSON/XML data responses.
This commit is contained in:
@@ -87,6 +87,20 @@ _TIER2_PATTERNS = [
|
|||||||
|
|
||||||
_TIER2_MAX_SIZE = 10000 # Only check tier 2 patterns on pages under 10KB
|
_TIER2_MAX_SIZE = 10000 # Only check tier 2 patterns on pages under 10KB
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tier 3: Structural integrity — catches silent blocks, anti-bot redirects,
|
||||||
|
# incomplete renders that pass pattern detection but are structurally broken
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Size cut-off (in characters of HTML) for the structural check; pages longer
# than this are skipped because block pages are unlikely to be that large.
_STRUCTURAL_MAX_SIZE = 50000  # Only check pages under 50KB

# Opening tags that count as "real" content. The trailing \b keeps <a> from
# matching <abbr>, <p> from matching <pre>, and so on.
_CONTENT_ELEMENTS_RE = re.compile(
    r'<(?:p|h[1-6]|article|section|li|td|a)\b', re.IGNORECASE
)

# Any opening <script> tag (used only for counting).
_SCRIPT_TAG_RE = re.compile(r'<script\b', re.IGNORECASE)

# Whole <style>...</style> and <script>...</script> blocks, matched lazily so
# each block ends at its own closing tag. The closing tag tolerates whitespace
# (or stray characters) before '>' because HTML accepts "</script >"; requiring
# the literal "</script>" would leave such blocks in place and let their
# source be counted as visible text.
_STYLE_TAG_RE = re.compile(r'<style\b[\s\S]*?</style\b[^>]*>', re.IGNORECASE)
_SCRIPT_BLOCK_RE = re.compile(r'<script\b[\s\S]*?</script\b[^>]*>', re.IGNORECASE)

# Any remaining tag, removed to approximate the visible text.
_TAG_RE = re.compile(r'<[^>]+>')

# Presence of an opening <body> tag.
_BODY_RE = re.compile(r'<body\b', re.IGNORECASE)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Thresholds
|
# Thresholds
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -102,6 +116,59 @@ def _looks_like_data(html: str) -> bool:
|
|||||||
return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('<html') and not stripped.startswith('<!') and not stripped.startswith('<HTML')
|
return stripped[0] in ('{', '[', '<' ) and not stripped.startswith('<html') and not stripped.startswith('<!') and not stripped.startswith('<HTML')
|
||||||
|
|
||||||
|
|
||||||
|
def _structural_integrity_check(html: str) -> Tuple[bool, str]:
    """
    Tier 3: Structural integrity check for pages that pass pattern detection
    but are structurally broken — incomplete renders, anti-bot redirects, empty shells.

    Only applies to pages no longer than _STRUCTURAL_MAX_SIZE characters that
    _looks_like_data() does not classify as a data response.

    A missing <body> tag blocks immediately. Otherwise three signals are scored:
        - minimal_text: fewer than 50 characters of visible body text
        - no_content_elements: no p/h1-6/article/section/li/td/a tags in the markup
        - script_heavy_shell: scripts present, no content elements, and
          fewer than 100 characters of visible text
    Two or more signals block the page; a single signal blocks it only when
    the page is shorter than 5000 characters.

    Args:
        html: Raw response body to inspect.

    Returns:
        Tuple of (is_blocked, reason). reason is "" when the page is not blocked.
    """
    html_len = len(html)

    # Skip large pages (unlikely to be block pages) and data responses
    if html_len > _STRUCTURAL_MAX_SIZE or _looks_like_data(html):
        return False, ""

    # Signal 1: No <body> tag — definitive structural failure
    if not _BODY_RE.search(html):
        return True, f"Structural: no <body> tag ({html_len} bytes)"

    # Signal 2: Minimal visible text after stripping scripts/styles/tags.
    # Start right after the opening <body ...> tag. A truncated render may
    # never reach </body>; in that case measure everything after the opening
    # tag rather than the whole document, so <head>/<title> text is not
    # mistaken for visible content.
    body_open = re.search(r'<body\b[^>]*>', html, re.IGNORECASE)
    if body_open:
        body_content = html[body_open.end():]
        # Greedy match: cut at the last closing </body> tag, if there is one.
        body_closed = re.match(r'([\s\S]*)</body\b[^>]*>', body_content, re.IGNORECASE)
        if body_closed:
            body_content = body_closed.group(1)
    else:
        # "<body" was seen but the opening tag never closes; use everything.
        body_content = html

    without_scripts = _SCRIPT_BLOCK_RE.sub('', body_content)
    without_styles = _STYLE_TAG_RE.sub('', without_scripts)
    # Collapse whitespace runs: indentation and newlines between tags are not
    # visible text and must not push an empty shell over the threshold.
    visible_text = ' '.join(_TAG_RE.sub('', without_styles).split())
    visible_len = len(visible_text)

    signals = []
    if visible_len < 50:
        signals.append("minimal_text")

    # Signal 3: No content elements (semantic HTML).
    # Count on markup with script/style blocks removed so that tags appearing
    # inside JavaScript string literals are not treated as page content.
    markup = _STYLE_TAG_RE.sub('', _SCRIPT_BLOCK_RE.sub('', html))
    content_elements = len(_CONTENT_ELEMENTS_RE.findall(markup))
    if content_elements == 0:
        signals.append("no_content_elements")

    # Signal 4: Script-heavy shell — scripts present but no content
    script_count = len(_SCRIPT_TAG_RE.findall(html))
    if script_count > 0 and content_elements == 0 and visible_len < 100:
        signals.append("script_heavy_shell")

    # Scoring
    signal_count = len(signals)
    if signal_count >= 2:
        return True, f"Structural: {', '.join(signals)} ({html_len} bytes, {visible_len} chars visible)"

    if signal_count == 1 and html_len < 5000:
        return True, f"Structural: {signals[0]} on small page ({html_len} bytes, {visible_len} chars visible)"

    return False, ""
|
||||||
|
|
||||||
|
|
||||||
def is_blocked(
|
def is_blocked(
|
||||||
status_code: Optional[int],
|
status_code: Optional[int],
|
||||||
html: str,
|
html: str,
|
||||||
@@ -113,6 +180,7 @@ def is_blocked(
|
|||||||
Uses layered detection to maximize coverage while minimizing false positives:
|
Uses layered detection to maximize coverage while minimizing false positives:
|
||||||
- Tier 1 patterns (structural markers) trigger on any page size
|
- Tier 1 patterns (structural markers) trigger on any page size
|
||||||
- Tier 2 patterns (generic terms) only trigger on short pages (< 10KB)
|
- Tier 2 patterns (generic terms) only trigger on short pages (< 10KB)
|
||||||
|
- Tier 3 structural integrity catches silent blocks and empty shells
|
||||||
- Status-code checks require corroborating content signals
|
- Status-code checks require corroborating content signals
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -161,4 +229,9 @@ def is_blocked(
|
|||||||
if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html):
|
if len(stripped) < _EMPTY_CONTENT_THRESHOLD and not _looks_like_data(html):
|
||||||
return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200"
|
return True, f"Near-empty content ({len(stripped)} bytes) with HTTP 200"
|
||||||
|
|
||||||
|
# --- Tier 3: Structural integrity (catches silent blocks, redirects, incomplete renders) ---
|
||||||
|
_blocked, _reason = _structural_integrity_check(html)
|
||||||
|
if _blocked:
|
||||||
|
return True, _reason
|
||||||
|
|
||||||
return False, ""
|
return False, ""
|
||||||
|
|||||||
Reference in New Issue
Block a user