Refactor extraction strategy internals and improve error handling
This commit is contained in:
@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def generate_schema(
|
def generate_schema(
|
||||||
html: str,
|
html: str = None,
|
||||||
schema_type: str = "CSS",
|
schema_type: str = "CSS",
|
||||||
query: str = None,
|
query: str = None,
|
||||||
target_json_example: str = None,
|
target_json_example: str = None,
|
||||||
llm_config: 'LLMConfig' = create_llm_config(),
|
llm_config: 'LLMConfig' = create_llm_config(),
|
||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
|
url: str = None,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Generate extraction schema from HTML content and optional query (sync version).
|
Generate extraction schema from HTML content or URL (sync version).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
html (str): The HTML content to analyze
|
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
||||||
query (str, optional): Natural language description of what data to extract
|
schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
|
||||||
provider (str): Legacy Parameter. LLM provider to use
|
query (str, optional): Natural language description of what data to extract.
|
||||||
api_token (str): Legacy Parameter. API token for LLM provider
|
target_json_example (str, optional): Example of desired JSON output.
|
||||||
llm_config (LLMConfig): LLM configuration object
|
llm_config (LLMConfig): LLM configuration object.
|
||||||
**kwargs: Additional args passed to LLM processor
|
provider (str): Legacy Parameter. LLM provider to use.
|
||||||
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
|
url (str, optional): URL to fetch HTML from. If provided, the `html` parameter is ignored.
|
||||||
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
dict: Generated schema following the JsonElementExtractionStrategy format.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If neither html nor url is provided.
|
||||||
"""
|
"""
|
||||||
from .utils import perform_completion_with_backoff
|
import asyncio
|
||||||
|
|
||||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
|
||||||
if locals()[name] is not None:
|
|
||||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
|
||||||
|
|
||||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = perform_completion_with_backoff(
|
loop = asyncio.get_running_loop()
|
||||||
provider=llm_config.provider,
|
except RuntimeError:
|
||||||
prompt_with_variables=prompt,
|
loop = None
|
||||||
json_response=True,
|
|
||||||
api_token=llm_config.api_token,
|
coro = JsonElementExtractionStrategy.agenerate_schema(
|
||||||
base_url=llm_config.base_url,
|
html=html,
|
||||||
extra_args=kwargs
|
schema_type=schema_type,
|
||||||
)
|
query=query,
|
||||||
return json.loads(response.choices[0].message.content)
|
target_json_example=target_json_example,
|
||||||
except Exception as e:
|
llm_config=llm_config,
|
||||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
provider=provider,
|
||||||
|
api_token=api_token,
|
||||||
|
url=url,
|
||||||
|
**kwargs
|
||||||
|
)
|
||||||
|
|
||||||
|
if loop is None:
|
||||||
|
return asyncio.run(coro)
|
||||||
|
else:
|
||||||
|
import concurrent.futures
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||||
|
future = executor.submit(asyncio.run, coro)
|
||||||
|
return future.result()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def agenerate_schema(
|
async def agenerate_schema(
|
||||||
html: str,
|
html: str = None,
|
||||||
schema_type: str = "CSS",
|
schema_type: str = "CSS",
|
||||||
query: str = None,
|
query: str = None,
|
||||||
target_json_example: str = None,
|
target_json_example: str = None,
|
||||||
llm_config: 'LLMConfig' = None,
|
llm_config: 'LLMConfig' = None,
|
||||||
|
provider: str = None,
|
||||||
|
api_token: str = None,
|
||||||
|
url: str = None,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Generate extraction schema from HTML content (async version).
|
Generate extraction schema from HTML content or URL (async version).
|
||||||
|
|
||||||
Use this method when calling from async contexts (e.g., FastAPI) to avoid
|
Use this method when calling from async contexts (e.g., FastAPI) to avoid
|
||||||
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
|
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
|
||||||
async execution.
|
async execution.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
html (str): The HTML content to analyze
|
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
||||||
schema_type (str): "CSS" or "XPATH"
|
schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
|
||||||
query (str, optional): Natural language description of what data to extract
|
query (str, optional): Natural language description of what data to extract.
|
||||||
target_json_example (str, optional): Example of desired JSON output
|
target_json_example (str, optional): Example of desired JSON output.
|
||||||
llm_config (LLMConfig): LLM configuration object
|
llm_config (LLMConfig): LLM configuration object.
|
||||||
**kwargs: Additional args passed to LLM processor
|
provider (str): Legacy Parameter. LLM provider to use.
|
||||||
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
|
url (str, optional): URL to fetch HTML from. If provided, the `html` parameter is ignored.
|
||||||
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
dict: Generated schema following the JsonElementExtractionStrategy format.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If neither html nor url is provided.
|
||||||
"""
|
"""
|
||||||
from .utils import aperform_completion_with_backoff
|
from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
|
||||||
|
|
||||||
|
# Validate inputs
|
||||||
|
if html is None and url is None:
|
||||||
|
raise ValueError("Either 'html' or 'url' must be provided")
|
||||||
|
|
||||||
|
# Check deprecated parameters
|
||||||
|
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||||
|
if locals()[name] is not None:
|
||||||
|
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||||
|
|
||||||
if llm_config is None:
|
if llm_config is None:
|
||||||
llm_config = create_llm_config()
|
llm_config = create_llm_config()
|
||||||
|
|
||||||
|
# Fetch HTML from URL if provided
|
||||||
|
if url is not None:
|
||||||
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
|
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
headless=True,
|
||||||
|
text_mode=True,
|
||||||
|
light_mode=True,
|
||||||
|
)
|
||||||
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url=url, config=crawler_config)
|
||||||
|
if not result.success:
|
||||||
|
raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
|
||||||
|
if result.status_code >= 400:
|
||||||
|
raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
|
||||||
|
html = result.html
|
||||||
|
|
||||||
|
# Preprocess HTML for schema generation (higher text_threshold for better LLM context)
|
||||||
|
html = preprocess_html_for_schema(
|
||||||
|
html_content=html,
|
||||||
|
text_threshold=2000,
|
||||||
|
attr_value_threshold=500,
|
||||||
|
max_size=500_000
|
||||||
|
)
|
||||||
|
|
||||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user