feat(extraction_strategy): Enhance schema generation with improved validation and task description handling
fix(prompts): Update GENERATE_SCRIPT_PROMPT to raw string for better formatting
docs: Add missing import for GENERATE_SCRIPT_PROMPT in hello_world example
This commit is contained in:
@@ -1088,111 +1088,147 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
@staticmethod
|
||||
def generate_schema(
|
||||
html: str,
|
||||
schema_type: str = "CSS", # or XPATH
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llm_config: 'LLMConfig' = create_llm_config(),
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
**kwargs
|
||||
*,
|
||||
schema_type: str = "CSS", # "CSS" or "XPATH"
|
||||
query: str | None = None,
|
||||
target_json_example: str | None = None,
|
||||
last_instruction: str | None = None, # extra “IMPORTANT” notes
|
||||
llm_config: "LLMConfig" = create_llm_config(),
|
||||
token_usages: Optional[list["TokenUsage"]] = None,
|
||||
prompt: str | None = None,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
"""
|
||||
Generate extraction schema from HTML content and optional query.
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to analyze
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
provider (str): Legacy Parameter. LLM provider to use
|
||||
api_token (str): Legacy Parameter. API token for LLM provider
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
prompt (str, optional): Custom prompt template to use
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
Produce a JSON extraction schema from raw HTML.
|
||||
|
||||
- If `query` is given, the task section echoes it.
|
||||
- If no `query` but `target_json_example` exists,
|
||||
we instruct the model to fit the schema to that example.
|
||||
- If neither is provided, we ask the model to detect
|
||||
the most obvious repeating data and build a schema.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A schema compliant with JsonElementExtractionStrategy.
|
||||
"""
|
||||
from .prompts import JSON_SCHEMA_BUILDER
|
||||
import json, re, textwrap
|
||||
from .prompts import JSON_SCHEMA_BUILDER, JSON_SCHEMA_BUILDER_XPATH
|
||||
from .utils import perform_completion_with_backoff
|
||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals()[name] is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||
|
||||
# Use default or custom prompt
|
||||
prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
|
||||
|
||||
# Build the prompt
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
|
||||
|
||||
Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
|
||||
# ─── basic validation ────────────────────────────────────
|
||||
if not html or not html.strip():
|
||||
raise ValueError("html must be non-empty")
|
||||
if schema_type not in {"CSS", "XPATH"}:
|
||||
raise ValueError("schema_type must be 'CSS' or 'XPATH'")
|
||||
for name, msg in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals().get(name) is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {msg}")
|
||||
|
||||
# Schema main keys:
|
||||
- name: This is the name of the schema.
|
||||
- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
|
||||
- baseFields: This is a list of fields that you extract from the base element itself.
|
||||
- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute".
|
||||
|
||||
# Extra Context:
|
||||
In this context, the following items may or may not be present:
|
||||
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
|
||||
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
|
||||
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.
|
||||
|
||||
# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
|
||||
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.
|
||||
|
||||
# What are the instructions and details for this schema generation?
|
||||
{prompt_template}"""
|
||||
}
|
||||
|
||||
user_message = {
|
||||
"role": "user",
|
||||
"content": f"""
|
||||
HTML to analyze:
|
||||
```html
|
||||
{html}
|
||||
```
|
||||
"""
|
||||
}
|
||||
# ─── prompt selection ────────────────────────────────────
|
||||
prompt_template = (
|
||||
prompt
|
||||
if prompt is not None
|
||||
else (JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH)
|
||||
)
|
||||
|
||||
# ─── derive task description ─────────────────────────────
|
||||
if query:
|
||||
user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
|
||||
if target_json_example:
|
||||
user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
|
||||
|
||||
if query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
|
||||
elif not query and target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
|
||||
elif not query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
||||
|
||||
user_message["content"] += """IMPORTANT:
|
||||
0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
||||
1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
|
||||
2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
|
||||
3/ Do not use Regex as much as possible.
|
||||
|
||||
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
||||
"""
|
||||
|
||||
try:
|
||||
# Call LLM with backoff handling
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||
json_response = True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
task_line = query.strip()
|
||||
elif target_json_example:
|
||||
task_line = (
|
||||
"Use the example JSON below to infer all required fields, "
|
||||
"then generate a schema that extracts matching data."
|
||||
)
|
||||
|
||||
# Extract and return schema
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
else:
|
||||
task_line = (
|
||||
"Detect the most obvious repeating data on this page and "
|
||||
"generate a schema that captures it completely."
|
||||
)
|
||||
|
||||
# ─── build user prompt body ──────────────────────────────
|
||||
html_clean = re.sub(r"\s{2,}", " ", textwrap.dedent(html).strip())
|
||||
|
||||
parts: list[str] = [
|
||||
f"{prompt_template}",
|
||||
"\n\n## Extracted HTML\n"
|
||||
"==================== Beginning of Html ====================\n",
|
||||
html_clean,
|
||||
"\n==================== End of Html ====================\n",
|
||||
]
|
||||
|
||||
if target_json_example:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Example of end result\n",
|
||||
target_json_example.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
if last_instruction:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Important\n",
|
||||
last_instruction.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Task:\n",
|
||||
task_line,
|
||||
]
|
||||
)
|
||||
|
||||
user_message = {"role": "user", "content": "".join(parts)}
|
||||
|
||||
# slim system message, JSON_SCHEMA_BUILDER already holds heavy guidance
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You generate reliable JSON schemas for structured extraction. "
|
||||
"Return valid JSON only."
|
||||
),
|
||||
}
|
||||
|
||||
# ─── call LLM ─────────────────────────────────────────────
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join(
|
||||
[system_message["content"], user_message["content"]]
|
||||
),
|
||||
json_response=True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs,
|
||||
)
|
||||
|
||||
# ─── token usage accounting ──────────────────────────────
|
||||
if token_usages is not None and hasattr(response, "usage"):
|
||||
token_usages.append(
|
||||
TokenUsage(
|
||||
completion_tokens=getattr(response.usage, "completion_tokens", 0),
|
||||
prompt_tokens=getattr(response.usage, "prompt_tokens", 0),
|
||||
total_tokens=getattr(response.usage, "total_tokens", 0),
|
||||
)
|
||||
)
|
||||
|
||||
# ─── parse and validate JSON answer ──────────────────────
|
||||
try:
|
||||
schema = json.loads(response.choices[0].message.content)
|
||||
except Exception as exc:
|
||||
raise ValueError(f"LLM returned invalid JSON: {exc}") from exc
|
||||
|
||||
required = {"name", "baseSelector", "fields"}
|
||||
if not required.issubset(schema):
|
||||
missing = required - set(schema)
|
||||
raise ValueError(f"Generated schema missing required keys: {missing}")
|
||||
|
||||
return schema
|
||||
|
||||
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
|
||||
@@ -1056,7 +1056,7 @@ Your output must:
|
||||
</output_requirements>
|
||||
"""
|
||||
|
||||
GENERATE_SCRIPT_PROMPT = """You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
GENERATE_SCRIPT_PROMPT = r"""You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction.
|
||||
|
||||
Your scripts run **before the crawl** to handle dynamic content, user interactions, and other obstacles. You are a master of two tools: raw **JavaScript** and the high-level **Crawl4ai Script (c4a)**.
|
||||
|
||||
|
||||
@@ -8,6 +8,8 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
from crawl4ai.prompts import GENERATE_SCRIPT_PROMPT
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(
|
||||
|
||||
Reference in New Issue
Block a user