feat(extraction): add LLM-powered schema generation utility

Adds new static method generate_schema() to JsonElementExtractionStrategy classes
that can automatically generate extraction schemas using LLM (OpenAI or Ollama).
This provides a convenient way to bootstrap extraction schemas while maintaining
the performance benefits of selector-based extraction.

Key changes:
- Added generate_schema() static method to base extraction strategy
- Added support for both CSS and XPath schema generation
- Updated documentation with examples and best practices
- Added new prompt templates for schema generation
This commit is contained in:
UncleCode
2025-01-20 17:28:00 +08:00
parent 4b1309cbf2
commit 2cec527a22
6 changed files with 1052 additions and 3 deletions

View File

@@ -5,7 +5,7 @@ import json
import time
import os
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
from .config import (
DEFAULT_PROVIDER, PROVIDER_MODELS,
CHUNK_TOKEN_THRESHOLD,
@@ -1060,6 +1060,72 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
"""Get attribute value from element"""
pass
@staticmethod
def generate_schema(
html: str,
schema_type: str = "CSS", # or XPATH
query: str = None,
provider: str = "gpt-4o",
api_token: str = os.getenv("OPENAI_API_KEY"),
**kwargs
) -> dict:
"""
Generate extraction schema from HTML content and optional query.
Args:
html (str): The HTML content to analyze
query (str, optional): Natural language description of what data to extract
provider (str): LLM provider to use
api_token (str): API token for LLM provider
prompt (str, optional): Custom prompt template to use
**kwargs: Additional args passed to perform_completion_with_backoff
Returns:
dict: Generated schema following the JsonElementExtractionStrategy format
"""
from .prompts import JSON_SCHEMA_BUILDER
from .utils import perform_completion_with_backoff
# Use default or custom prompt
prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
# Build the prompt
system_message = {
"role": "system",
"content": "You are a specialized HTML schema generator. Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else."
}
user_message = {
"role": "user",
"content": f"""
Instructions:
{prompt_template}
HTML to analyze:
```html
{html}
```
{"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
"""
}
try:
# Call LLM with backoff handling
response = perform_completion_with_backoff(
provider=provider,
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
json_response = True,
api_token=api_token,
**kwargs
)
# Extract and return schema
return json.loads(response.choices[0].message.content)
except Exception as e:
raise Exception(f"Failed to generate schema: {str(e)}")
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
"""
@@ -1171,3 +1237,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)