Fix agenerate_schema() JSON parsing for Anthropic models
Strip markdown code fences (```json ... ```) from LLM responses before
json.loads() in agenerate_schema(). Anthropic models wrap JSON output
in markdown fences when litellm silently drops the unsupported
response_format parameter, causing json.loads("") parse failures.
- Add _strip_markdown_fences() helper to extraction_strategy.py
- Apply fence stripping + empty response check in agenerate_schema()
- Separate JSONDecodeError for clearer error messages
- Add 34 tests: unit tests, real API integration tests (Anthropic/OpenAI/Groq
  against quotes.toscrape.com), and parametrized regression tests
This commit is contained in:
@@ -47,6 +47,14 @@ from bs4 import BeautifulSoup
|
||||
from lxml import html, etree
|
||||
|
||||
|
||||
def _strip_markdown_fences(text: str) -> str:
|
||||
"""Strip markdown code fences (e.g. ```json ... ```) from LLM responses."""
|
||||
text = text.strip()
|
||||
return re.sub(
|
||||
r"^```(?:[a-zA-Z0-9_-]+)?\s*|```$", "", text, flags=re.MULTILINE
|
||||
).strip()
|
||||
|
||||
|
||||
class ExtractionStrategy(ABC):
|
||||
"""
|
||||
Abstract base class for all extraction strategies.
|
||||
@@ -1513,7 +1521,12 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
return json.loads(response.choices[0].message.content)
|
||||
raw = response.choices[0].message.content
|
||||
if not raw or not raw.strip():
|
||||
raise ValueError("LLM returned an empty response")
|
||||
return json.loads(_strip_markdown_fences(raw))
|
||||
except json.JSONDecodeError as e:
|
||||
raise Exception(f"Failed to parse schema JSON: {str(e)}")
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
|
||||
|
||||
321
tests/general/test_strip_markdown_fences.py
Normal file
321
tests/general/test_strip_markdown_fences.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""
|
||||
Tests for _strip_markdown_fences helper and agenerate_schema() JSON parsing fix.
|
||||
|
||||
Covers:
|
||||
- Unit tests for _strip_markdown_fences (pure logic, no API calls)
|
||||
- Real integration tests calling Anthropic/OpenAI/Groq against quotes.toscrape.com
|
||||
- Regression tests ensuring clean JSON is never corrupted
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
|
||||
from crawl4ai.extraction_strategy import (
|
||||
_strip_markdown_fences,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy,
|
||||
)
|
||||
from crawl4ai.async_configs import LLMConfig
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Shared fixtures: sample schemas and the live test URL
# ---------------------------------------------------------------------------

# Minimal two-field schema exercised by the pure-logic unit tests.
SIMPLE_SCHEMA = {
    "name": "Quotes",
    "baseSelector": ".quote",
    "fields": [
        {"name": "text", "selector": ".text", "type": "text"},
        {"name": "author", "selector": ".author", "type": "text"},
    ],
}

# Richer schema (baseFields + attribute extraction) used for regression checks.
NESTED_SCHEMA = {
    "name": "Products",
    "baseSelector": ".product-card",
    "baseFields": [
        {"name": "id", "selector": "", "type": "attribute", "attribute": "data-id"}
    ],
    "fields": [
        {"name": "title", "selector": "h2.title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"},
        {
            "name": "image",
            "selector": "img.product-img",
            "type": "attribute",
            "attribute": "src",
        },
    ],
}

# Public, stable scraping sandbox targeted by the live-API integration tests.
TEST_URL = "https://quotes.toscrape.com/"
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Unit tests for _strip_markdown_fences
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestStripMarkdownFences:
    """Pure-logic unit tests for the _strip_markdown_fences helper (no API calls)."""

    def test_clean_json_passthrough(self):
        """JSON with no fences is returned verbatim."""
        payload = json.dumps(SIMPLE_SCHEMA)
        assert _strip_markdown_fences(payload) == payload

    def test_json_fence(self):
        """A ```json fenced block unwraps to parseable JSON."""
        wrapped = '```json\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(wrapped)) == {"key": "value"}

    def test_bare_fence(self):
        """A fence with no language tag unwraps as well."""
        wrapped = '```\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(wrapped)) == {"key": "value"}

    def test_fence_with_language_variants(self):
        """Any alphanumeric language tag after the opening fence is stripped."""
        for tag in ["json", "JSON", "javascript", "js", "text", "jsonc"]:
            wrapped = f"```{tag}\n{{\"a\": 1}}\n```"
            cleaned = _strip_markdown_fences(wrapped)
            assert json.loads(cleaned) == {"a": 1}, f"Failed for language tag: {tag}"

    def test_leading_trailing_whitespace(self):
        """Whitespace surrounding the fenced block is removed."""
        wrapped = ' \n ```json\n{"key": "value"}\n``` \n '
        assert json.loads(_strip_markdown_fences(wrapped)) == {"key": "value"}

    def test_no_fences_with_whitespace(self):
        """Unfenced JSON padded with whitespace still parses after stripping."""
        wrapped = ' \n {"key": "value"} \n '
        assert json.loads(_strip_markdown_fences(wrapped)) == {"key": "value"}

    def test_nested_code_block_in_value(self):
        """Backticks embedded inside a JSON string value survive stripping."""
        inner = {"code": "Use ```python\\nprint()\\n``` for code blocks"}
        wrapped = f'```json\n{json.dumps(inner)}\n```'
        parsed = json.loads(_strip_markdown_fences(wrapped))
        assert "```python" in parsed["code"]

    def test_complex_schema(self):
        """A realistic multi-field schema round-trips through the helper."""
        wrapped = f"```json\n{json.dumps(NESTED_SCHEMA, indent=2)}\n```"
        assert json.loads(_strip_markdown_fences(wrapped)) == NESTED_SCHEMA

    def test_empty_string(self):
        """Empty input yields empty output."""
        assert _strip_markdown_fences("") == ""

    def test_only_whitespace(self):
        """Whitespace-only input collapses to the empty string."""
        assert _strip_markdown_fences(" \n\n ") == ""

    def test_only_fences(self):
        """Fences wrapping nothing yield the empty string."""
        assert _strip_markdown_fences("```json\n```") == ""

    def test_multiline_json(self):
        """Indented, multi-line JSON inside fences parses correctly."""
        wrapped = f"```json\n{json.dumps(SIMPLE_SCHEMA, indent=4)}\n```"
        assert json.loads(_strip_markdown_fences(wrapped)) == SIMPLE_SCHEMA

    def test_already_clean_does_not_mutate(self):
        """Applying the helper repeatedly to clean JSON is a no-op."""
        payload = json.dumps(SIMPLE_SCHEMA)
        first_pass = _strip_markdown_fences(payload)
        second_pass = _strip_markdown_fences(first_pass)
        assert first_pass == second_pass == payload
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Real integration tests — actual LLM API calls against quotes.toscrape.com
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
def _validate_schema(schema: dict):
|
||||
"""Validate that a generated schema has the expected structure."""
|
||||
assert isinstance(schema, dict), f"Schema must be a dict, got {type(schema)}"
|
||||
assert "name" in schema, "Schema must have a 'name' field"
|
||||
assert "baseSelector" in schema, "Schema must have a 'baseSelector' field"
|
||||
assert "fields" in schema, "Schema must have a 'fields' field"
|
||||
assert isinstance(schema["fields"], list), "'fields' must be a list"
|
||||
assert len(schema["fields"]) > 0, "'fields' must not be empty"
|
||||
for field in schema["fields"]:
|
||||
assert "name" in field, f"Each field must have a 'name': {field}"
|
||||
assert "selector" in field, f"Each field must have a 'selector': {field}"
|
||||
assert "type" in field, f"Each field must have a 'type': {field}"
|
||||
|
||||
|
||||
class TestRealAnthropicSchemaGeneration:
    """Live Anthropic API tests reproducing the exact bug-report scenario."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_css_schema(self):
        """Reproduce the original bug: anthropic/claude-haiku-4-5 + CSS schema."""
        config = LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=config,
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku CSS] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_xpath_schema(self):
        """Same model, but generating an XPath-based schema."""
        config = LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )
        schema = await JsonXPathExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="XPATH",
            query="Extract all quotes with their text, author, and tags",
            llm_config=config,
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku XPath] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_no_query(self):
        """Omitting the query should auto-detect a schema from page structure."""
        config = LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            llm_config=config,
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku no-query] Generated schema: {json.dumps(schema, indent=2)}")
|
||||
|
||||
|
||||
class TestRealOpenAISchemaGeneration:
    """Regression check: OpenAI models worked before the fix and must still work."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_OPENAI_KEY"),
        reason="CRAWL4AI_OPENAI_KEY not set",
    )
    async def test_openai_gpt4o_mini_css_schema(self):
        """OpenAI gpt-4o-mini with CSS — this already worked, must not regress."""
        config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv("CRAWL4AI_OPENAI_KEY"),
        )
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=config,
        )
        _validate_schema(schema)
        print(f"\n[OpenAI gpt-4o-mini CSS] Generated schema: {json.dumps(schema, indent=2)}")
|
||||
|
||||
|
||||
class TestRealGroqSchemaGeneration:
    """Groq with the updated model name."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_GROQ_KEY") and not os.getenv("GROQ_API_KEY"),
        reason="No Groq API key set",
    )
    async def test_groq_llama33_css_schema(self):
        """Groq with llama-3.3-70b-versatile (replacement for decommissioned 3.1)."""
        # Either env var may carry the key; prefer the crawl4ai-specific one.
        token = os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="groq/llama-3.3-70b-versatile",
                api_token=token,
            ),
        )
        _validate_schema(schema)
        print(f"\n[Groq llama-3.3] Generated schema: {json.dumps(schema, indent=2)}")
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Regression: ensure _strip_markdown_fences doesn't break valid JSON
|
||||
# ===========================================================================
|
||||
|
||||
|
||||
class TestRegressionNoBreakage:
    """Ensure the fence-stripping fix never corrupts already-valid JSON."""

    @pytest.mark.parametrize(
        "raw_json",
        [
            '{"simple": true}',
            '[]',
            '[{"a": 1}, {"a": 2}]',
            '{"nested": {"deep": {"value": 42}}}',
            '{"unicode": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c"}',
            '{"special": "line1\\nline2\\ttab"}',
            '{"url": "https://example.com/path?q=1&b=2"}',
            json.dumps(SIMPLE_SCHEMA),
            json.dumps(NESTED_SCHEMA),
            json.dumps(NESTED_SCHEMA, indent=2),
            json.dumps(NESTED_SCHEMA, indent=4),
        ],
        ids=[
            "simple_object",
            "empty_array",
            "array_of_objects",
            "deeply_nested",
            "unicode_content",
            "escape_sequences",
            "url_in_value",
            "simple_schema_compact",
            "nested_schema_compact",
            "nested_schema_indent2",
            "nested_schema_indent4",
        ],
    )
    def test_clean_json_unchanged(self, raw_json):
        """Already-clean JSON must parse identically after stripping."""
        expected = json.loads(raw_json)
        assert json.loads(_strip_markdown_fences(raw_json)) == expected

    @pytest.mark.parametrize(
        "raw_json",
        [
            '{"simple": true}',
            '[]',
            '[{"a": 1}, {"a": 2}]',
            json.dumps(SIMPLE_SCHEMA),
            json.dumps(NESTED_SCHEMA, indent=2),
        ],
        ids=[
            "simple_object",
            "empty_array",
            "array_of_objects",
            "simple_schema",
            "nested_schema",
        ],
    )
    def test_fenced_json_matches_clean(self, raw_json):
        """Fenced version of any JSON must parse to the same value as clean."""
        expected = json.loads(raw_json)
        fenced = f"```json\n{raw_json}\n```"
        assert json.loads(_strip_markdown_fences(fenced)) == expected
|
||||
Reference in New Issue
Block a user