Fix agenerate_schema() JSON parsing for Anthropic models

Strip markdown code fences (```json ... ```) from LLM responses before
json.loads() in agenerate_schema(). Anthropic models wrap JSON output
in markdown fences when litellm silently drops the unsupported
response_format parameter, causing json.loads("") parse failures.

- Add _strip_markdown_fences() helper to extraction_strategy.py
- Apply fence stripping + empty response check in agenerate_schema()
- Separate JSONDecodeError for clearer error messages
- Add 34 tests: unit tests, real API integration tests (Anthropic/OpenAI/Groq
  against quotes.toscrape.com), and parametrized regression tests
This commit is contained in:
unclecode
2026-01-29 11:38:53 +00:00
parent 0a17fe8f19
commit 911bbce8b1
2 changed files with 335 additions and 1 deletions

View File

@@ -47,6 +47,14 @@ from bs4 import BeautifulSoup
from lxml import html, etree
def _strip_markdown_fences(text: str) -> str:
"""Strip markdown code fences (e.g. ```json ... ```) from LLM responses."""
text = text.strip()
return re.sub(
r"^```(?:[a-zA-Z0-9_-]+)?\s*|```$", "", text, flags=re.MULTILINE
).strip()
class ExtractionStrategy(ABC):
"""
Abstract base class for all extraction strategies.
@@ -1513,7 +1521,12 @@ In this scenario, use your best judgment to generate the schema. You need to exa
base_url=llm_config.base_url,
extra_args=kwargs
)
return json.loads(response.choices[0].message.content)
raw = response.choices[0].message.content
if not raw or not raw.strip():
raise ValueError("LLM returned an empty response")
return json.loads(_strip_markdown_fences(raw))
except json.JSONDecodeError as e:
raise Exception(f"Failed to parse schema JSON: {str(e)}")
except Exception as e:
raise Exception(f"Failed to generate schema: {str(e)}")

View File

@@ -0,0 +1,321 @@
"""
Tests for _strip_markdown_fences helper and agenerate_schema() JSON parsing fix.
Covers:
- Unit tests for _strip_markdown_fences (pure logic, no API calls)
- Real integration tests calling Anthropic/OpenAI/Groq against quotes.toscrape.com
- Regression tests ensuring clean JSON is never corrupted
"""
import json
import os
import pytest
from crawl4ai.extraction_strategy import (
_strip_markdown_fences,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
)
from crawl4ai.async_configs import LLMConfig
# ---------------------------------------------------------------------------
# Sample schemas for unit tests
# ---------------------------------------------------------------------------
# Minimal flat schema: one base selector plus two text fields, mirroring the
# markup of quotes.toscrape.com.
SIMPLE_SCHEMA = {
    "name": "Quotes",
    "baseSelector": ".quote",
    "fields": [
        {"name": "text", "selector": ".text", "type": "text"},
        {"name": "author", "selector": ".author", "type": "text"},
    ],
}
# Richer schema exercising baseFields and attribute-type fields; used to
# verify that multi-field schemas survive fence stripping intact.
NESTED_SCHEMA = {
    "name": "Products",
    "baseSelector": ".product-card",
    "baseFields": [{"name": "id", "selector": "", "type": "attribute", "attribute": "data-id"}],
    "fields": [
        {"name": "title", "selector": "h2.title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"},
        {"name": "image", "selector": "img.product-img", "type": "attribute", "attribute": "src"},
    ],
}
# Live page targeted by the real-API integration tests below.
TEST_URL = "https://quotes.toscrape.com/"
# ===========================================================================
# Unit tests for _strip_markdown_fences
# ===========================================================================
class TestStripMarkdownFences:
    """Direct unit tests for the _strip_markdown_fences helper."""

    def test_clean_json_passthrough(self):
        """Unfenced JSON must come back byte-for-byte identical."""
        payload = json.dumps(SIMPLE_SCHEMA)
        assert _strip_markdown_fences(payload) == payload

    def test_json_fence(self):
        """A ```json ... ``` wrapper is removed and the body still parses."""
        fenced = '```json\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_bare_fence(self):
        """A fence with no language tag is removed as well."""
        fenced = '```\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_fence_with_language_variants(self):
        """Any alphanumeric language tag after ``` is tolerated."""
        for lang in ("json", "JSON", "javascript", "js", "text", "jsonc"):
            fenced = "```" + lang + '\n{"a": 1}\n```'
            stripped = _strip_markdown_fences(fenced)
            assert json.loads(stripped) == {"a": 1}, f"Failed for language tag: {lang}"

    def test_leading_trailing_whitespace(self):
        """Outer whitespace around a fenced block is discarded."""
        fenced = ' \n ```json\n{"key": "value"}\n``` \n '
        assert json.loads(_strip_markdown_fences(fenced)) == {"key": "value"}

    def test_no_fences_with_whitespace(self):
        """Bare JSON padded with whitespace is simply trimmed."""
        padded = ' \n {"key": "value"} \n '
        assert json.loads(_strip_markdown_fences(padded)) == {"key": "value"}

    def test_nested_code_block_in_value(self):
        """A ``` sequence inside a JSON string value is left intact."""
        payload = {"code": "Use ```python\\nprint()\\n``` for code blocks"}
        fenced = f'```json\n{json.dumps(payload)}\n```'
        decoded = json.loads(_strip_markdown_fences(fenced))
        assert "```python" in decoded["code"]

    def test_complex_schema(self):
        """A realistic multi-field schema survives a round trip through fences."""
        fenced = f"```json\n{json.dumps(NESTED_SCHEMA, indent=2)}\n```"
        assert json.loads(_strip_markdown_fences(fenced)) == NESTED_SCHEMA

    def test_empty_string(self):
        """The empty string maps to the empty string."""
        assert _strip_markdown_fences("") == ""

    def test_only_whitespace(self):
        """A whitespace-only input collapses to the empty string."""
        assert _strip_markdown_fences(" \n\n ") == ""

    def test_only_fences(self):
        """Fences enclosing nothing yield the empty string."""
        assert _strip_markdown_fences("```json\n```") == ""

    def test_multiline_json(self):
        """Pretty-printed JSON spanning many lines parses after stripping."""
        fenced = "```json\n" + json.dumps(SIMPLE_SCHEMA, indent=4) + "\n```"
        assert json.loads(_strip_markdown_fences(fenced)) == SIMPLE_SCHEMA

    def test_already_clean_does_not_mutate(self):
        """Repeated application to clean JSON is a no-op (idempotence)."""
        payload = json.dumps(SIMPLE_SCHEMA)
        first_pass = _strip_markdown_fences(payload)
        second_pass = _strip_markdown_fences(first_pass)
        assert first_pass == second_pass == payload
# ===========================================================================
# Real integration tests — actual LLM API calls against quotes.toscrape.com
# ===========================================================================
def _validate_schema(schema: dict):
"""Validate that a generated schema has the expected structure."""
assert isinstance(schema, dict), f"Schema must be a dict, got {type(schema)}"
assert "name" in schema, "Schema must have a 'name' field"
assert "baseSelector" in schema, "Schema must have a 'baseSelector' field"
assert "fields" in schema, "Schema must have a 'fields' field"
assert isinstance(schema["fields"], list), "'fields' must be a list"
assert len(schema["fields"]) > 0, "'fields' must not be empty"
for field in schema["fields"]:
assert "name" in field, f"Each field must have a 'name': {field}"
assert "selector" in field, f"Each field must have a 'selector': {field}"
assert "type" in field, f"Each field must have a 'type': {field}"
class TestRealAnthropicSchemaGeneration:
    """Real API calls to Anthropic models — the exact scenario from the bug report."""

    @staticmethod
    def _haiku_config() -> LLMConfig:
        # Shared provider configuration for every Anthropic test case.
        return LLMConfig(
            provider="anthropic/claude-haiku-4-5",
            api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        )

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_css_schema(self):
        """Reproduce the original bug: anthropic/claude-haiku-4-5 + CSS schema."""
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku CSS] Generated schema: {json.dumps(generated, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_xpath_schema(self):
        """Anthropic haiku with XPath schema type."""
        generated = await JsonXPathExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="XPATH",
            query="Extract all quotes with their text, author, and tags",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku XPath] Generated schema: {json.dumps(generated, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_no_query(self):
        """Anthropic with no query — should auto-detect schema from page structure."""
        generated = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            llm_config=self._haiku_config(),
        )
        _validate_schema(generated)
        print(f"\n[Anthropic Haiku no-query] Generated schema: {json.dumps(generated, indent=2)}")
class TestRealOpenAISchemaGeneration:
    """OpenAI models — should still work as before (regression check)."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_OPENAI_KEY"),
        reason="CRAWL4AI_OPENAI_KEY not set",
    )
    async def test_openai_gpt4o_mini_css_schema(self):
        """OpenAI gpt-4o-mini with CSS — this already worked, must not regress."""
        config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token=os.getenv("CRAWL4AI_OPENAI_KEY"),
        )
        result = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=config,
        )
        _validate_schema(result)
        print(f"\n[OpenAI gpt-4o-mini CSS] Generated schema: {json.dumps(result, indent=2)}")
class TestRealGroqSchemaGeneration:
    """Groq with the updated model name."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_GROQ_KEY") and not os.getenv("GROQ_API_KEY"),
        reason="No Groq API key set",
    )
    async def test_groq_llama33_css_schema(self):
        """Groq with llama-3.3-70b-versatile (replacement for decommissioned 3.1)."""
        # Either env var may carry the key; prefer the crawl4ai-specific one.
        token = os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")
        result = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="groq/llama-3.3-70b-versatile",
                api_token=token,
            ),
        )
        _validate_schema(result)
        print(f"\n[Groq llama-3.3] Generated schema: {json.dumps(result, indent=2)}")
# ===========================================================================
# Regression: ensure _strip_markdown_fences doesn't break valid JSON
# ===========================================================================
class TestRegressionNoBreakage:
    """Ensure the fix doesn't break any currently-working JSON formats."""

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param('{"nested": {"deep": {"value": 42}}}', id="deeply_nested"),
            pytest.param('{"unicode": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c"}', id="unicode_content"),
            pytest.param('{"special": "line1\\nline2\\ttab"}', id="escape_sequences"),
            pytest.param('{"url": "https://example.com/path?q=1&b=2"}', id="url_in_value"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA), id="nested_schema_compact"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema_indent2"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=4), id="nested_schema_indent4"),
        ],
    )
    def test_clean_json_unchanged(self, raw_json):
        """Already-clean JSON must parse identically after stripping."""
        expected = json.loads(raw_json)
        assert json.loads(_strip_markdown_fences(raw_json)) == expected

    @pytest.mark.parametrize(
        "raw_json",
        [
            pytest.param('{"simple": true}', id="simple_object"),
            pytest.param('[]', id="empty_array"),
            pytest.param('[{"a": 1}, {"a": 2}]', id="array_of_objects"),
            pytest.param(json.dumps(SIMPLE_SCHEMA), id="simple_schema"),
            pytest.param(json.dumps(NESTED_SCHEMA, indent=2), id="nested_schema"),
        ],
    )
    def test_fenced_json_matches_clean(self, raw_json):
        """Fenced version of any JSON must parse to the same value as clean."""
        expected = json.loads(raw_json)
        wrapped = "```json\n" + raw_json + "\n```"
        assert json.loads(_strip_markdown_fences(wrapped)) == expected