Strip markdown code fences (```json ... ```) from LLM responses before
json.loads() in agenerate_schema(). Anthropic models wrap JSON output
in markdown fences when litellm silently drops the unsupported
response_format parameter, causing json.loads("") parse failures.
- Add _strip_markdown_fences() helper to extraction_strategy.py
- Apply fence stripping + empty response check in agenerate_schema()
- Handle JSONDecodeError separately for clearer error messages
- Add 34 tests: unit, real API integration (Anthropic/OpenAI/Groq
against quotes.toscrape.com), and parametrized regression tests
322 lines
12 KiB
Python
322 lines
12 KiB
Python
"""
|
|
Tests for _strip_markdown_fences helper and agenerate_schema() JSON parsing fix.
|
|
|
|
Covers:
|
|
- Unit tests for _strip_markdown_fences (pure logic, no API calls)
|
|
- Real integration tests calling Anthropic/OpenAI/Groq against quotes.toscrape.com
|
|
- Regression tests ensuring clean JSON is never corrupted
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import pytest
|
|
|
|
from crawl4ai.extraction_strategy import (
|
|
_strip_markdown_fences,
|
|
JsonCssExtractionStrategy,
|
|
JsonXPathExtractionStrategy,
|
|
)
|
|
from crawl4ai.async_configs import LLMConfig
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Sample schemas for unit tests
# ---------------------------------------------------------------------------

# Minimal schema: one base selector and two text fields.
SIMPLE_SCHEMA = {
    "name": "Quotes",
    "baseSelector": ".quote",
    "fields": [
        {"name": "text", "selector": ".text", "type": "text"},
        {"name": "author", "selector": ".author", "type": "text"},
    ],
}

# Larger schema that also carries "baseFields" and attribute-type fields.
NESTED_SCHEMA = {
    "name": "Products",
    "baseSelector": ".product-card",
    "baseFields": [
        {"name": "id", "selector": "", "type": "attribute", "attribute": "data-id"},
    ],
    "fields": [
        {"name": "title", "selector": "h2.title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"},
        {
            "name": "image",
            "selector": "img.product-img",
            "type": "attribute",
            "attribute": "src",
        },
    ],
}

# URL handed to agenerate_schema() by the real-API integration tests.
TEST_URL = "https://quotes.toscrape.com/"
|
|
|
|
|
|
# ===========================================================================
|
|
# Unit tests for _strip_markdown_fences
|
|
# ===========================================================================
|
|
|
|
|
|
class TestStripMarkdownFences:
    """Direct unit tests for the _strip_markdown_fences helper.

    Every test here is pure string manipulation; none makes an API call.
    """

    def test_clean_json_passthrough(self):
        """Clean JSON (no fences) must pass through unchanged."""
        raw = json.dumps(SIMPLE_SCHEMA)
        assert _strip_markdown_fences(raw) == raw

    def test_json_fence(self):
        """```json ... ``` wrapping is stripped correctly."""
        raw = '```json\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_bare_fence(self):
        """``` ... ``` (no language tag) is stripped correctly."""
        raw = '```\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    # Parametrized (rather than looped) so each language tag is reported as
    # its own test case and one failing tag does not hide the others.
    @pytest.mark.parametrize(
        "lang",
        ["json", "JSON", "javascript", "js", "text", "jsonc"],
    )
    def test_fence_with_language_variants(self, lang):
        """Various language tags after ``` are stripped."""
        raw = f'```{lang}\n{{"a": 1}}\n```'
        result = _strip_markdown_fences(raw)
        assert json.loads(result) == {"a": 1}, f"Failed for language tag: {lang}"

    def test_leading_trailing_whitespace(self):
        """Whitespace around fenced content is stripped."""
        raw = ' \n ```json\n{"key": "value"}\n``` \n '
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_no_fences_with_whitespace(self):
        """Plain JSON with surrounding whitespace is handled."""
        raw = ' \n {"key": "value"} \n '
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_nested_code_block_in_value(self):
        """JSON with a string value containing ``` is not corrupted."""
        inner = {"code": "Use ```python\\nprint()\\n``` for code blocks"}
        raw = f'```json\n{json.dumps(inner)}\n```'
        parsed = json.loads(_strip_markdown_fences(raw))
        # Full equality, not just a substring check: the inner fences must
        # survive and nothing else in the payload may be altered.
        assert parsed == inner

    def test_complex_schema(self):
        """A real-world multi-field schema wrapped in fences parses correctly."""
        raw = f"```json\n{json.dumps(NESTED_SCHEMA, indent=2)}\n```"
        result = _strip_markdown_fences(raw)
        assert json.loads(result) == NESTED_SCHEMA

    def test_empty_string(self):
        """Empty string returns empty string."""
        assert _strip_markdown_fences("") == ""

    def test_only_whitespace(self):
        """Whitespace-only string returns empty string."""
        assert _strip_markdown_fences(" \n\n ") == ""

    def test_only_fences(self):
        """Bare fences with nothing inside return empty string."""
        assert _strip_markdown_fences("```json\n```") == ""

    def test_multiline_json(self):
        """Multiline pretty-printed JSON inside fences."""
        pretty = json.dumps(SIMPLE_SCHEMA, indent=4)
        raw = f"```json\n{pretty}\n```"
        assert json.loads(_strip_markdown_fences(raw)) == SIMPLE_SCHEMA

    def test_already_clean_does_not_mutate(self):
        """Passing already-clean JSON multiple times is idempotent."""
        raw = json.dumps(SIMPLE_SCHEMA)
        once = _strip_markdown_fences(raw)
        twice = _strip_markdown_fences(once)
        assert once == twice == raw
|
|
|
|
|
|
# ===========================================================================
|
|
# Real integration tests — actual LLM API calls against quotes.toscrape.com
|
|
# ===========================================================================
|
|
|
|
|
|
def _validate_schema(schema: dict) -> None:
    """Assert that a generated schema has the expected structure.

    Checks that ``schema`` is a dict containing "name", "baseSelector" and
    "fields", that "fields" is a non-empty list, and that every entry in it
    is a dict containing "name", "selector" and "type".

    Args:
        schema: The object returned by schema generation.

    Raises:
        AssertionError: If any structural check fails.
    """
    assert isinstance(schema, dict), f"Schema must be a dict, got {type(schema)}"
    for key in ("name", "baseSelector", "fields"):
        assert key in schema, f"Schema must have a '{key}' field"

    fields = schema["fields"]
    assert isinstance(fields, list), "'fields' must be a list"
    assert len(fields) > 0, "'fields' must not be empty"

    for field in fields:
        # Guard first: on a str entry the `in` checks below would be
        # substring tests, so "name selector type" would wrongly pass.
        assert isinstance(field, dict), f"Each field must be a dict: {field!r}"
        for key in ("name", "selector", "type"):
            assert key in field, f"Each field must have a '{key}': {field}"
|
|
|
|
|
|
# One class-level skip replaces the identical per-test decorators: every
# test in this class needs the same Anthropic key.
@pytest.mark.skipif(
    not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
    reason="CRAWL4AI_ANTHROPIC_KEY not set",
)
class TestRealAnthropicSchemaGeneration:
    """Real API calls to Anthropic models — the exact scenario from the bug report."""

    @staticmethod
    def _llm_config():
        """Return the LLMConfig shared by every test in this class.

        The token is read from CRAWL4AI_ANTHROPIC_KEY when the test runs.
        """
        return LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )

    @pytest.mark.asyncio
    async def test_anthropic_haiku_css_schema(self):
        """Reproduce the original bug: anthropic/claude-haiku-4-5 + CSS schema."""
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._llm_config(),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku CSS] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    async def test_anthropic_haiku_xpath_schema(self):
        """Anthropic haiku with XPath schema type."""
        schema = await JsonXPathExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="XPATH",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._llm_config(),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku XPath] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    async def test_anthropic_no_query(self):
        """Anthropic with no query — should auto-detect schema from page structure."""
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            llm_config=self._llm_config(),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku no-query] Generated schema: {json.dumps(schema, indent=2)}")
|
|
|
|
|
|
class TestRealOpenAISchemaGeneration:
    """OpenAI models — should still work as before (regression check)."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_OPENAI_KEY"),
        reason="CRAWL4AI_OPENAI_KEY not set",
    )
    async def test_openai_gpt4o_mini_css_schema(self):
        """OpenAI gpt-4o-mini with CSS — this already worked, must not regress."""
        # Build the provider config first, then hand it to schema generation.
        openai_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv("CRAWL4AI_OPENAI_KEY"),
        )
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=openai_config,
        )
        _validate_schema(generated)
        rendered = json.dumps(generated, indent=2)
        print(f"\n[OpenAI gpt-4o-mini CSS] Generated schema: {rendered}")
|
|
|
|
|
|
class TestRealGroqSchemaGeneration:
    """Groq with the updated model name."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not (os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")),
        reason="No Groq API key set",
    )
    async def test_groq_llama33_css_schema(self):
        """Groq with llama-3.3-70b-versatile (replacement for decommissioned 3.1)."""
        # Prefer the project-specific variable, falling back to GROQ_API_KEY.
        groq_token = os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")
        groq_config = LLMConfig(
            provider="groq/llama-3.3-70b-versatile",
            api_token=groq_token,
        )
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=groq_config,
        )
        _validate_schema(generated)
        rendered = json.dumps(generated, indent=2)
        print(f"\n[Groq llama-3.3] Generated schema: {rendered}")
|
|
|
|
|
|
# ===========================================================================
|
|
# Regression: ensure _strip_markdown_fences doesn't break valid JSON
|
|
# ===========================================================================
|
|
|
|
|
|
class TestRegressionNoBreakage:
    """Ensure the fix doesn't break any currently-working JSON formats."""

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param('{"nested": {"deep": {"value": 42}}}', id="deeply_nested"),
            pytest.param(
                '{"unicode": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c"}',
                id="unicode_content",
            ),
            pytest.param('{"special": "line1\\nline2\\ttab"}', id="escape_sequences"),
            pytest.param('{"url": "https://example.com/path?q=1&b=2"}', id="url_in_value"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA), id="nested_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema_indent2"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=4), id="nested_schema_indent4"),
        ],
    )
    def test_clean_json_unchanged(self, raw_json):
        """Already-clean JSON must parse identically after stripping."""
        expected = json.loads(raw_json)
        assert json.loads(_strip_markdown_fences(raw_json)) == expected

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema"),
        ],
    )
    def test_fenced_json_matches_clean(self, raw_json):
        """Fenced version of any JSON must parse to the same value as clean."""
        expected = json.loads(raw_json)
        # Wrap the payload in a ```json fence, as the affected models do.
        wrapped = "```json\n" + raw_json + "\n```"
        assert json.loads(_strip_markdown_fences(wrapped)) == expected
|