Refactor extraction strategy internals and improve error handling
This commit is contained in:
@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
||||
|
||||
@staticmethod
|
||||
def generate_schema(
|
||||
html: str,
|
||||
html: str = None,
|
||||
schema_type: str = "CSS",
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llm_config: 'LLMConfig' = create_llm_config(),
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
url: str = None,
|
||||
**kwargs
|
||||
) -> dict:
|
||||
"""
|
||||
Generate extraction schema from HTML content and optional query (sync version).
|
||||
Generate extraction schema from HTML content or URL (sync version).
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to analyze
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
provider (str): Legacy Parameter. LLM provider to use
|
||||
api_token (str): Legacy Parameter. API token for LLM provider
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
||||
schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
|
||||
query (str, optional): Natural language description of what data to extract.
|
||||
target_json_example (str, optional): Example of desired JSON output.
|
||||
llm_config (LLMConfig): LLM configuration object.
|
||||
provider (str): Legacy Parameter. LLM provider to use.
|
||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
|
||||
**kwargs: Additional args passed to LLM processor.
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format.
|
||||
|
||||
Raises:
|
||||
ValueError: If neither html nor url is provided.
|
||||
"""
|
||||
from .utils import perform_completion_with_backoff
|
||||
|
||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals()[name] is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||
|
||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
||||
import asyncio
|
||||
|
||||
try:
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables=prompt,
|
||||
json_response=True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
return json.loads(response.choices[0].message.content)
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
loop = None
|
||||
|
||||
coro = JsonElementExtractionStrategy.agenerate_schema(
|
||||
html=html,
|
||||
schema_type=schema_type,
|
||||
query=query,
|
||||
target_json_example=target_json_example,
|
||||
llm_config=llm_config,
|
||||
provider=provider,
|
||||
api_token=api_token,
|
||||
url=url,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
if loop is None:
|
||||
return asyncio.run(coro)
|
||||
else:
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
||||
future = executor.submit(asyncio.run, coro)
|
||||
return future.result()
|
||||
|
||||
@staticmethod
|
||||
async def agenerate_schema(
|
||||
html: str,
|
||||
html: str = None,
|
||||
schema_type: str = "CSS",
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llm_config: 'LLMConfig' = None,
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
url: str = None,
|
||||
**kwargs
|
||||
) -> dict:
|
||||
"""
|
||||
Generate extraction schema from HTML content (async version).
|
||||
Generate extraction schema from HTML content or URL (async version).
|
||||
|
||||
Use this method when calling from async contexts (e.g., FastAPI) to avoid
|
||||
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
|
||||
async execution.
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to analyze
|
||||
schema_type (str): "CSS" or "XPATH"
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
target_json_example (str, optional): Example of desired JSON output
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
||||
schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
|
||||
query (str, optional): Natural language description of what data to extract.
|
||||
target_json_example (str, optional): Example of desired JSON output.
|
||||
llm_config (LLMConfig): LLM configuration object.
|
||||
provider (str): Legacy Parameter. LLM provider to use.
|
||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
|
||||
**kwargs: Additional args passed to LLM processor.
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format.
|
||||
|
||||
Raises:
|
||||
ValueError: If neither html nor url is provided.
|
||||
"""
|
||||
from .utils import aperform_completion_with_backoff
|
||||
from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
|
||||
|
||||
# Validate inputs
|
||||
if html is None and url is None:
|
||||
raise ValueError("Either 'html' or 'url' must be provided")
|
||||
|
||||
# Check deprecated parameters
|
||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals()[name] is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||
|
||||
if llm_config is None:
|
||||
llm_config = create_llm_config()
|
||||
|
||||
# Fetch HTML from URL if provided
|
||||
if url is not None:
|
||||
from .async_webcrawler import AsyncWebCrawler
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
text_mode=True,
|
||||
light_mode=True,
|
||||
)
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
if not result.success:
|
||||
raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
|
||||
if result.status_code >= 400:
|
||||
raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
|
||||
html = result.html
|
||||
|
||||
# Preprocess HTML for schema generation (higher text_threshold for better LLM context)
|
||||
html = preprocess_html_for_schema(
|
||||
html_content=html,
|
||||
text_threshold=2000,
|
||||
attr_value_threshold=500,
|
||||
max_size=500_000
|
||||
)
|
||||
|
||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user