crawl4ai/tests/general/test_strip_markdown_fences.py

"""
Tests for _strip_markdown_fences helper and agenerate_schema() JSON parsing fix.

Covers:
- Unit tests for _strip_markdown_fences (pure logic, no API calls)
- Real integration tests calling Anthropic/OpenAI/Groq against quotes.toscrape.com
- Regression tests ensuring clean JSON is never corrupted
"""

import json
import os
import pytest

from crawl4ai.extraction_strategy import (
    _strip_markdown_fences,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
)
from crawl4ai.async_configs import LLMConfig


# ---------------------------------------------------------------------------
# Sample schemas for unit tests
# ---------------------------------------------------------------------------

SIMPLE_SCHEMA = {
    "name": "Quotes",
    "baseSelector": ".quote",
    "fields": [
        {"name": "text", "selector": ".text", "type": "text"},
        {"name": "author", "selector": ".author", "type": "text"},
    ],
}

NESTED_SCHEMA = {
    "name": "Products",
    "baseSelector": ".product-card",
    "baseFields": [{"name": "id", "selector": "", "type": "attribute", "attribute": "data-id"}],
    "fields": [
        {"name": "title", "selector": "h2.title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".desc", "type": "text"},
        {"name": "image", "selector": "img.product-img", "type": "attribute", "attribute": "src"},
    ],
}

TEST_URL = "https://quotes.toscrape.com/"


# ===========================================================================
# Unit tests for _strip_markdown_fences
# ===========================================================================


class TestStripMarkdownFences:
    """Direct unit tests for the _strip_markdown_fences helper."""

    def test_clean_json_passthrough(self):
        """Clean JSON (no fences) must pass through unchanged."""
        raw = json.dumps(SIMPLE_SCHEMA)
        assert _strip_markdown_fences(raw) == raw

    def test_json_fence(self):
        """```json ... ``` wrapping is stripped correctly."""
        raw = '```json\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_bare_fence(self):
        """``` ... ``` (no language tag) is stripped correctly."""
        raw = '```\n{"key": "value"}\n```'
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_fence_with_language_variants(self):
        """Various language tags after ``` are stripped."""
        for lang in ["json", "JSON", "javascript", "js", "text", "jsonc"]:
            raw = f"```{lang}\n{{\"a\": 1}}\n```"
            result = _strip_markdown_fences(raw)
            assert json.loads(result) == {"a": 1}, f"Failed for language tag: {lang}"

    def test_leading_trailing_whitespace(self):
        """Whitespace around fenced content is stripped."""
        raw = '  \n  ```json\n{"key": "value"}\n```  \n  '
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_no_fences_with_whitespace(self):
        """Plain JSON with surrounding whitespace is handled."""
        raw = '  \n  {"key": "value"}  \n  '
        assert json.loads(_strip_markdown_fences(raw)) == {"key": "value"}

    def test_nested_code_block_in_value(self):
        """JSON with a string value containing ``` is not corrupted."""
        inner = {"code": "Use ```python\\nprint()\\n``` for code blocks"}
        raw = f'```json\n{json.dumps(inner)}\n```'
        result = _strip_markdown_fences(raw)
        parsed = json.loads(result)
        assert "```python" in parsed["code"]

    def test_complex_schema(self):
        """A real-world multi-field schema wrapped in fences parses correctly."""
        raw = f"```json\n{json.dumps(NESTED_SCHEMA, indent=2)}\n```"
        result = _strip_markdown_fences(raw)
        assert json.loads(result) == NESTED_SCHEMA

    def test_empty_string(self):
        """Empty string returns empty string."""
        assert _strip_markdown_fences("") == ""

    def test_only_whitespace(self):
        """Whitespace-only string returns empty string."""
        assert _strip_markdown_fences("   \n\n  ") == ""

    def test_only_fences(self):
        """Bare fences with nothing inside return empty string."""
        assert _strip_markdown_fences("```json\n```") == ""

    def test_multiline_json(self):
        """Multiline pretty-printed JSON inside fences."""
        pretty = json.dumps(SIMPLE_SCHEMA, indent=4)
        raw = f"```json\n{pretty}\n```"
        assert json.loads(_strip_markdown_fences(raw)) == SIMPLE_SCHEMA

    def test_already_clean_does_not_mutate(self):
        """Passing already-clean JSON multiple times is idempotent."""
        raw = json.dumps(SIMPLE_SCHEMA)
        once = _strip_markdown_fences(raw)
        twice = _strip_markdown_fences(once)
        assert once == twice == raw


# ===========================================================================
# Real integration tests — actual LLM API calls against quotes.toscrape.com
# ===========================================================================


def _validate_schema(schema: dict):
    """Validate that a generated schema has the expected structure."""
    assert isinstance(schema, dict), f"Schema must be a dict, got {type(schema)}"
    assert "name" in schema, "Schema must have a 'name' field"
    assert "baseSelector" in schema, "Schema must have a 'baseSelector' field"
    assert "fields" in schema, "Schema must have a 'fields' field"
    assert isinstance(schema["fields"], list), "'fields' must be a list"
    assert len(schema["fields"]) > 0, "'fields' must not be empty"
    for field in schema["fields"]:
        assert "name" in field, f"Each field must have a 'name': {field}"
        assert "selector" in field, f"Each field must have a 'selector': {field}"
        assert "type" in field, f"Each field must have a 'type': {field}"


class TestRealAnthropicSchemaGeneration:
    """Real API calls to Anthropic models — the exact scenario from the bug report."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_css_schema(self):
        """Reproduce the original bug: anthropic/claude-haiku-4-5 + CSS schema."""
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="anthropic/claude-haiku-4-5",
                api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
            ),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku CSS] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_haiku_xpath_schema(self):
        """Anthropic haiku with XPath schema type."""
        schema = await JsonXPathExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="XPATH",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="anthropic/claude-haiku-4-5",
                api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
            ),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku XPath] Generated schema: {json.dumps(schema, indent=2)}")

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
        reason="CRAWL4AI_ANTHROPIC_KEY not set",
    )
    async def test_anthropic_no_query(self):
        """Anthropic with no query — should auto-detect schema from page structure."""
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            llm_config=LLMConfig(
                provider="anthropic/claude-haiku-4-5",
                api_token=os.getenv("CRAWL4AI_ANTHROPIC_KEY"),
            ),
        )
        _validate_schema(schema)
        print(f"\n[Anthropic Haiku no-query] Generated schema: {json.dumps(schema, indent=2)}")


class TestRealOpenAISchemaGeneration:
    """OpenAI models — should still work as before (regression check)."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_OPENAI_KEY"),
        reason="CRAWL4AI_OPENAI_KEY not set",
    )
    async def test_openai_gpt4o_mini_css_schema(self):
        """OpenAI gpt-4o-mini with CSS — this already worked, must not regress."""
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="openai/gpt-4o-mini",
                api_token=os.getenv("CRAWL4AI_OPENAI_KEY"),
            ),
        )
        _validate_schema(schema)
        print(f"\n[OpenAI gpt-4o-mini CSS] Generated schema: {json.dumps(schema, indent=2)}")


class TestRealGroqSchemaGeneration:
    """Groq with the updated model name."""

    @pytest.mark.asyncio
    @pytest.mark.skipif(
        not os.getenv("CRAWL4AI_GROQ_KEY") and not os.getenv("GROQ_API_KEY"),
        reason="No Groq API key set",
    )
    async def test_groq_llama33_css_schema(self):
        """Groq with llama-3.3-70b-versatile (replacement for decommissioned 3.1)."""
        api_key = os.getenv("CRAWL4AI_GROQ_KEY") or os.getenv("GROQ_API_KEY")
        schema = await JsonCssExtractionStrategy.agenerate_schema(
            url=TEST_URL,
            schema_type="CSS",
            query="Extract all quotes with their text, author, and tags",
            llm_config=LLMConfig(
                provider="groq/llama-3.3-70b-versatile",
                api_token=api_key,
            ),
        )
        _validate_schema(schema)
        print(f"\n[Groq llama-3.3] Generated schema: {json.dumps(schema, indent=2)}")


# ===========================================================================
# Regression: ensure _strip_markdown_fences doesn't break valid JSON
# ===========================================================================


class TestRegressionNoBreakage:
    """Ensure the fix doesn't break any currently-working JSON formats."""

    @pytest.mark.parametrize(
        "raw_json",
        [
            '{"simple": true}',
            '[]',
            '[{"a": 1}, {"a": 2}]',
            '{"nested": {"deep": {"value": 42}}}',
            '{"unicode": "\u3053\u3093\u306b\u3061\u306f\u4e16\u754c"}',
            '{"special": "line1\\nline2\\ttab"}',
            '{"url": "https://example.com/path?q=1&b=2"}',
            json.dumps(SIMPLE_SCHEMA),
            json.dumps(NESTED_SCHEMA),
            json.dumps(NESTED_SCHEMA, indent=2),
            json.dumps(NESTED_SCHEMA, indent=4),
        ],
        ids=[
            "simple_object",
            "empty_array",
            "array_of_objects",
            "deeply_nested",
            "unicode_content",
            "escape_sequences",
            "url_in_value",
            "simple_schema_compact",
            "nested_schema_compact",
            "nested_schema_indent2",
            "nested_schema_indent4",
        ],
    )
    def test_clean_json_unchanged(self, raw_json):
        """Already-clean JSON must parse identically after stripping."""
        original = json.loads(raw_json)
        after_strip = json.loads(_strip_markdown_fences(raw_json))
        assert after_strip == original

    @pytest.mark.parametrize(
        "raw_json",
        [
            '{"simple": true}',
            '[]',
            '[{"a": 1}, {"a": 2}]',
            json.dumps(SIMPLE_SCHEMA),
            json.dumps(NESTED_SCHEMA, indent=2),
        ],
        ids=[
            "simple_object",
            "empty_array",
            "array_of_objects",
            "simple_schema",
            "nested_schema",
        ],
    )
    def test_fenced_json_matches_clean(self, raw_json):
        """Fenced version of any JSON must parse to the same value as clean."""
        original = json.loads(raw_json)
        fenced = f"```json\n{raw_json}\n```"
        after_strip = json.loads(_strip_markdown_fences(fenced))
        assert after_strip == original