Refactor extraction strategy internals and improve error handling

This commit is contained in:
unclecode
2026-01-24 03:08:41 +00:00
parent 777d0878f2
commit b0b3ca1222

View File

@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
@staticmethod
def generate_schema(
    html: str = None,
    schema_type: str = "CSS",
    query: str = None,
    target_json_example: str = None,
    llm_config: 'LLMConfig' = None,
    provider: str = None,
    api_token: str = None,
    url: str = None,
    **kwargs
) -> dict:
    """
    Generate extraction schema from HTML content or URL (sync version).

    This is a synchronous wrapper around :meth:`agenerate_schema`. All
    argument validation (including the "either html or url" check and the
    deprecated-parameter check) happens in the async implementation.

    Args:
        html (str, optional): The HTML content to analyze. If not provided,
            url must be set.
        schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
        query (str, optional): Natural language description of what data to
            extract.
        target_json_example (str, optional): Example of desired JSON output.
        llm_config (LLMConfig, optional): LLM configuration object. If None,
            a default config is created inside agenerate_schema. (Lazy
            creation here avoids evaluating create_llm_config() once at
            import time and sharing that instance across every call.)
        provider (str): Legacy Parameter. LLM provider to use.
        api_token (str): Legacy Parameter. API token for LLM provider.
        url (str, optional): URL to fetch HTML from. If provided, html
            parameter is ignored.
        **kwargs: Additional args passed to LLM processor.

    Returns:
        dict: Generated schema following the JsonElementExtractionStrategy
            format.

    Raises:
        ValueError: If neither html nor url is provided.
    """
    import asyncio

    # Detect whether we are already inside a running event loop. If so,
    # asyncio.run() would raise, so we must delegate to a fresh loop in a
    # worker thread instead.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    coro = JsonElementExtractionStrategy.agenerate_schema(
        html=html,
        schema_type=schema_type,
        query=query,
        target_json_example=target_json_example,
        llm_config=llm_config,
        provider=provider,
        api_token=api_token,
        url=url,
        **kwargs
    )

    if running_loop is None:
        # No loop running in this thread: safe to drive the coroutine here.
        return asyncio.run(coro)

    # Called from inside an event loop (e.g. a sync handler in an async
    # framework): run the coroutine on a private loop in a worker thread
    # and block until it completes.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(asyncio.run, coro)
        return future.result()
@staticmethod @staticmethod
async def agenerate_schema( async def agenerate_schema(
html: str, html: str = None,
schema_type: str = "CSS", schema_type: str = "CSS",
query: str = None, query: str = None,
target_json_example: str = None, target_json_example: str = None,
llm_config: 'LLMConfig' = None, llm_config: 'LLMConfig' = None,
provider: str = None,
api_token: str = None,
url: str = None,
**kwargs **kwargs
) -> dict: ) -> dict:
""" """
Generate extraction schema from HTML content (async version). Generate extraction schema from HTML content or URL (async version).
Use this method when calling from async contexts (e.g., FastAPI) to avoid Use this method when calling from async contexts (e.g., FastAPI) to avoid
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
async execution. async execution.
Args: Args:
html (str): The HTML content to analyze html (str, optional): The HTML content to analyze. If not provided, url must be set.
schema_type (str): "CSS" or "XPATH" schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
query (str, optional): Natural language description of what data to extract query (str, optional): Natural language description of what data to extract.
target_json_example (str, optional): Example of desired JSON output target_json_example (str, optional): Example of desired JSON output.
llm_config (LLMConfig): LLM configuration object llm_config (LLMConfig): LLM configuration object.
**kwargs: Additional args passed to LLM processor provider (str): Legacy Parameter. LLM provider to use.
api_token (str): Legacy Parameter. API token for LLM provider.
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
**kwargs: Additional args passed to LLM processor.
Returns: Returns:
dict: Generated schema following the JsonElementExtractionStrategy format dict: Generated schema following the JsonElementExtractionStrategy format.
Raises:
ValueError: If neither html nor url is provided.
""" """
from .utils import aperform_completion_with_backoff from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
# Validate inputs
if html is None and url is None:
raise ValueError("Either 'html' or 'url' must be provided")
# Check deprecated parameters
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
if locals()[name] is not None:
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
if llm_config is None: if llm_config is None:
llm_config = create_llm_config() llm_config = create_llm_config()
# Fetch HTML from URL if provided
if url is not None:
from .async_webcrawler import AsyncWebCrawler
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
browser_config = BrowserConfig(
headless=True,
text_mode=True,
light_mode=True,
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=crawler_config)
if not result.success:
raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
if result.status_code >= 400:
raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
html = result.html
# Preprocess HTML for schema generation (higher text_threshold for better LLM context)
html = preprocess_html_for_schema(
html_content=html,
text_threshold=2000,
attr_value_threshold=500,
max_size=500_000
)
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example) prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
try: try: