Fix agenerate_schema() JSON parsing for Anthropic models

Strip markdown code fences (```json ... ```) from LLM responses before
json.loads() in agenerate_schema(). Anthropic models wrap JSON output
in markdown fences when litellm silently drops the unsupported
response_format parameter, causing json.loads("") parse failures.

- Add _strip_markdown_fences() helper to extraction_strategy.py
- Apply fence stripping + empty response check in agenerate_schema()
- Separate JSONDecodeError for clearer error messages
- Add 34 tests: unit tests, real API integration tests (Anthropic/OpenAI/Groq
  against quotes.toscrape.com), and parametrized regression tests
This commit is contained in:
unclecode
2026-01-29 11:38:53 +00:00
parent 0a17fe8f19
commit 911bbce8b1
2 changed files with 335 additions and 1 deletions

View File

@@ -47,6 +47,14 @@ from bs4 import BeautifulSoup
from lxml import html, etree
def _strip_markdown_fences(text: str) -> str:
"""Strip markdown code fences (e.g. ```json ... ```) from LLM responses."""
text = text.strip()
return re.sub(
r"^```(?:[a-zA-Z0-9_-]+)?\s*|```$", "", text, flags=re.MULTILINE
).strip()
class ExtractionStrategy(ABC):
"""
Abstract base class for all extraction strategies.
@@ -1513,7 +1521,12 @@ In this scenario, use your best judgment to generate the schema. You need to exa
base_url=llm_config.base_url,
extra_args=kwargs
)
return json.loads(response.choices[0].message.content)
raw = response.choices[0].message.content
if not raw or not raw.strip():
raise ValueError("LLM returned an empty response")
return json.loads(_strip_markdown_fences(raw))
except json.JSONDecodeError as e:
raise Exception(f"Failed to parse schema JSON: {str(e)}")
except Exception as e:
raise Exception(f"Failed to generate schema: {str(e)}")

View File

@@ -0,0 +1,321 @@
"""
Tests for _strip_markdown_fences helper and agenerate_schema() JSON parsing fix.
Covers:
- Unit tests for _strip_markdown_fences (pure logic, no API calls)
- Real integration tests calling Anthropic/OpenAI/Groq against quotes.toscrape.com
- Regression tests ensuring clean JSON is never corrupted
"""
import json
import os
import pytest
from crawl4ai.extraction_strategy import (
_strip_markdown_fences,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
)
from crawl4ai.async_configs import LLMConfig
# ---------------------------------------------------------------------------
# Sample schemas for unit tests
# ---------------------------------------------------------------------------
# Minimal flat schema: one base selector plus two text fields, mirroring the
# markup of quotes.toscrape.com.
SIMPLE_SCHEMA = {
    "name": "Quotes",
    "baseSelector": ".quote",
    "fields": [
        {"name": "text", "selector": ".text", "type": "text"},
        {"name": "author", "selector": ".author", "type": "text"},
    ],
}
# Richer schema exercising baseFields and attribute-type fields; used to
# verify that multi-field schemas survive fence stripping intact.
NESTED_SCHEMA = {
    "name": "Products",
    "baseSelector": ".product-card",
    "baseFields": [{"name": "id", "selector": "", "type": "attribute", "attribute": "data-id"}],
    "fields": [
        {"name": "title", "selector": "h2.title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"},
        {"name": "image", "selector": "img.product-img", "type": "attribute", "attribute": "src"},
    ],
}
# Live page targeted by the real-API integration tests below.
TEST_URL = "https://quotes.toscrape.com/"
# ===========================================================================
# Unit tests for _strip_markdown_fences
# ===========================================================================
class TestStripMarkdownFences:
    """Direct unit tests for the _strip_markdown_fences helper."""

    def test_clean_json_passthrough(self):
        """Unfenced JSON must come back byte-for-byte identical."""
        payload = json.dumps(SIMPLE_SCHEMA)
        assert _strip_markdown_fences(payload) == payload

    def test_json_fence(self):
        """A ```json ... ``` wrapper is removed and the body still parses."""
        fenced = '```json\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_bare_fence(self):
        """A fence with no language tag is removed as well."""
        fenced = '```\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_fence_with_language_variants(self):
        """Any alphanumeric language tag after ``` is tolerated."""
        for lang in ("json", "JSON", "javascript", "js", "text", "jsonc"):
            fenced = "```" + lang + '\n{"a": 1}\n```'
            stripped = _strip_markdown_fences(fenced)
            assert json.loads(stripped) == {"a": 1}, f"Failed for language tag: {lang}"

    def test_leading_trailing_whitespace(self):
        """Outer whitespace around a fenced block is discarded."""
        fenced = ' \n ```json\n{"key": "value"}\n``` \n '
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_no_fences_with_whitespace(self):
        """Bare JSON padded with whitespace is simply trimmed."""
        padded = ' \n {"key": "value"} \n '
        assert json.loads(_strip_markdown_fences(padded)) == {"key": "value"}

    def test_nested_code_block_in_value(self):
        """A ``` sequence inside a JSON string value is left intact."""
        payload = {"code": "Use ```python\\nprint()\\n``` for code blocks"}
        fenced = f'```json\n{json.dumps(payload)}\n```'
        decoded = json.loads(_strip_markdown_fences(fenced))
        assert "```python" in decoded["code"]

    def test_complex_schema(self):
        """A realistic multi-field schema survives a round trip through fences."""
        fenced = f"```json\n{json.dumps(NESTED_SCHEMA, indent=2)}\n```"
        assert json.loads(_strip_markdown_fences(fenced)) == NESTED_SCHEMA

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        assert _strip_markdown_fences("") == ""

    def test_only_whitespace(self):
        """A whitespace-only input collapses to the empty string."""
        assert _strip_markdown_fences(" \n\n ") == ""

    def test_only_fences(self):
        """Fences enclosing nothing yield the empty string."""
        assert _strip_markdown_fences("```json\n```") == ""

    def test_multiline_json(self):
        """Pretty-printed JSON spanning many lines parses after stripping."""
        fenced = "```json\n" + json.dumps(SIMPLE_SCHEMA, indent=4) + "\n```"
        assert json.loads(_strip_markdown_fences(fenced)) == SIMPLE_SCHEMA

    def test_already_clean_does_not_mutate(self):
        """Repeated application to clean JSON is a no-op (idempotence)."""
        payload = json.dumps(SIMPLE_SCHEMA)
        first_pass = _strip_markdown_fences(payload)
        second_pass = _strip_markdown_fences(first_pass)
        assert first_pass == second_pass == payload
# ===========================================================================
# Real integration tests — actual LLM API calls against quotes.toscrape.com
# ===========================================================================
def _validate_schema(schema: dict):
"""Validate that a generated schema has the expected structure."""
assert isinstance(schema, dict), f"Schema must be a dict, got {type(schema)}"
assert "name" in schema, "Schema must have a 'name' field"
assert "baseSelector" in schema, "Schema must have a 'baseSelector' field"
assert "fields" in schema, "Schema must have a 'fields' field"
assert isinstance(schema["fields"], list), "'fields' must be a list"
assert len(schema["fields"]) > 0, "'fields' must not be empty"
for field in schema["fields"]:
assert "name" in field, f"Each field must have a 'name': {field}"
assert "selector" in field, f"Each field must have a 'selector': {field}"
assert "type" in field, f"Each field must have a 'type': {field}"
class TestRealAnthropicSchemaGeneration:
    """Real API calls to Anthropic models — the exact scenario from the bug report."""

    @staticmethod
    def _haiku_config() -> LLMConfig:
        # Shared provider configuration for every Anthropic test case.
        return LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_css_schema(self):
        """Reproduce the original bug: anthropic/claude-haiku-4-5 + CSS schema."""
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku CSS] Generated schema: {json.dumps(generated, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_xpath_schema(self):
        """Anthropic haiku with XPath schema type."""
        generated = await JsonXPathExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="XPATH",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku XPath] Generated schema: {json.dumps(generated, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_no_query(self):
        """Anthropic with no query — should auto-detect schema from page structure."""
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku no-query] Generated schema: {json.dumps(generated, indent=2)}")
class TestRealOpenAISchemaGeneration:
    """OpenAI models — should still work as before (regression check)."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_OPENAI_KEY"),
        reason="CRAWL4AI_OPENAI_KEY not set",
    )
    async def test_openai_gpt4o_mini_css_schema(self):
        """OpenAI gpt-4o-mini with CSS — this already worked, must not regress."""
        config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv("CRAWL4AI_OPENAI_KEY"),
        )
        result = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=config,
        )
        _validate_schema(result)
        print(f"\n[OpenAI gpt-4o-mini CSS] Generated schema: {json.dumps(result, indent=2)}")
class TestRealGroqSchemaGeneration:
    """Groq with the updated model name."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_GROQ_KEY") and not os.getenv("GROQ_API_KEY"),
        reason="No Groq API key set",
    )
    async def test_groq_llama33_css_schema(self):
        """Groq with llama-3.3-70b-versatile (replacement for decommissioned 3.1)."""
        # Either env var may carry the key; prefer the crawl4ai-specific one.
        token = os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")
        result = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="groq/llama-3.3-70b-versatile",
                api_token=token,
            ),
        )
        _validate_schema(result)
        print(f"\n[Groq llama-3.3] Generated schema: {json.dumps(result, indent=2)}")
# ===========================================================================
# Regression: ensure _strip_markdown_fences doesn't break valid JSON
# ===========================================================================
class TestRegressionNoBreakage:
    """Ensure the fix doesn't break any currently-working JSON formats."""

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param('{"nested": {"deep": {"value": 42}}}', id="deeply_nested"),
            pytest.param('{"unicode": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c"}', id="unicode_content"),
            pytest.param('{"special": "line1\\nline2\\ttab"}', id="escape_sequences"),
            pytest.param('{"url": "https://example.com/path?q=1&b=2"}', id="url_in_value"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA), id="nested_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema_indent2"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=4), id="nested_schema_indent4"),
        ],
    )
    def test_clean_json_unchanged(self, raw_json):
        """Already-clean JSON must parse identically after stripping."""
        expected = json.loads(raw_json)
        assert json.loads(_strip_markdown_fences(raw_json)) == expected

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema"),
        ],
    )
    def test_fenced_json_matches_clean(self, raw_json):
        """Fenced version of any JSON must parse to the same value as clean."""
        expected = json.loads(raw_json)
        wrapped = "```json\n" + raw_json + "\n```"
        assert json.loads(_strip_markdown_fences(wrapped)) == expected