Add support for parallel URL processing in extraction utilities
This commit is contained in:
@@ -1349,11 +1349,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
llm_config: 'LLMConfig' = create_llm_config(),
|
llm_config: 'LLMConfig' = create_llm_config(),
|
||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
url: str = None,
|
url: Union[str, List[str]] = None,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Generate extraction schema from HTML content or URL (sync version).
|
Generate extraction schema from HTML content or URL(s) (sync version).
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
html (str, optional): The HTML content to analyze. If not provided, url must be set.
|
||||||
@@ -1363,7 +1363,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
llm_config (LLMConfig): LLM configuration object.
|
llm_config (LLMConfig): LLM configuration object.
|
||||||
provider (str): Legacy Parameter. LLM provider to use.
|
provider (str): Legacy Parameter. LLM provider to use.
|
||||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
|
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
||||||
|
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
||||||
**kwargs: Additional args passed to LLM processor.
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1408,11 +1409,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
llm_config: 'LLMConfig' = None,
|
llm_config: 'LLMConfig' = None,
|
||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
url: str = None,
|
url: Union[str, List[str]] = None,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
Generate extraction schema from HTML content or URL (async version).
|
Generate extraction schema from HTML content or URL(s) (async version).
|
||||||
|
|
||||||
Use this method when calling from async contexts (e.g., FastAPI) to avoid
|
Use this method when calling from async contexts (e.g., FastAPI) to avoid
|
||||||
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
|
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
|
||||||
@@ -1426,7 +1427,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
llm_config (LLMConfig): LLM configuration object.
|
llm_config (LLMConfig): LLM configuration object.
|
||||||
provider (str): Legacy Parameter. LLM provider to use.
|
provider (str): Legacy Parameter. LLM provider to use.
|
||||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
|
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
||||||
|
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
||||||
**kwargs: Additional args passed to LLM processor.
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1438,7 +1440,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
|
from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
|
||||||
|
|
||||||
# Validate inputs
|
# Validate inputs
|
||||||
if html is None and url is None:
|
if html is None and (url is None or (isinstance(url, list) and len(url) == 0)):
|
||||||
raise ValueError("Either 'html' or 'url' must be provided")
|
raise ValueError("Either 'html' or 'url' must be provided")
|
||||||
|
|
||||||
# Check deprecated parameters
|
# Check deprecated parameters
|
||||||
@@ -1449,7 +1451,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
if llm_config is None:
|
if llm_config is None:
|
||||||
llm_config = create_llm_config()
|
llm_config = create_llm_config()
|
||||||
|
|
||||||
# Fetch HTML from URL if provided
|
# Fetch HTML from URL(s) if provided
|
||||||
if url is not None:
|
if url is not None:
|
||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||||
@@ -1461,21 +1463,42 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
)
|
)
|
||||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
# Normalize to list
|
||||||
result = await crawler.arun(url=url, config=crawler_config)
|
urls = [url] if isinstance(url, str) else url
|
||||||
if not result.success:
|
|
||||||
raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
|
|
||||||
if result.status_code >= 400:
|
|
||||||
raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
|
|
||||||
html = result.html
|
|
||||||
|
|
||||||
# Preprocess HTML for schema generation (higher text_threshold for better LLM context)
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
html = preprocess_html_for_schema(
|
if len(urls) == 1:
|
||||||
html_content=html,
|
result = await crawler.arun(url=urls[0], config=crawler_config)
|
||||||
text_threshold=2000,
|
if not result.success:
|
||||||
attr_value_threshold=500,
|
raise Exception(f"Failed to fetch URL '{urls[0]}': {result.error_message}")
|
||||||
max_size=500_000
|
if result.status_code >= 400:
|
||||||
)
|
raise Exception(f"HTTP {result.status_code} error for URL '{urls[0]}'")
|
||||||
|
html = result.html
|
||||||
|
else:
|
||||||
|
results = await crawler.arun_many(urls=urls, config=crawler_config)
|
||||||
|
html_parts = []
|
||||||
|
for i, result in enumerate(results, 1):
|
||||||
|
if not result.success:
|
||||||
|
raise Exception(f"Failed to fetch URL '{result.url}': {result.error_message}")
|
||||||
|
if result.status_code >= 400:
|
||||||
|
raise Exception(f"HTTP {result.status_code} error for URL '{result.url}'")
|
||||||
|
cleaned = preprocess_html_for_schema(
|
||||||
|
html_content=result.html,
|
||||||
|
text_threshold=2000,
|
||||||
|
attr_value_threshold=500,
|
||||||
|
max_size=500_000
|
||||||
|
)
|
||||||
|
html_parts.append(f"'''html example {i}\n{cleaned}\n'''")
|
||||||
|
html = "\n\n".join(html_parts)
|
||||||
|
|
||||||
|
# Preprocess HTML for schema generation (skip if already preprocessed from multiple URLs)
|
||||||
|
if url is None or isinstance(url, str):
|
||||||
|
html = preprocess_html_for_schema(
|
||||||
|
html_content=html,
|
||||||
|
text_threshold=2000,
|
||||||
|
attr_value_threshold=500,
|
||||||
|
max_size=500_000
|
||||||
|
)
|
||||||
|
|
||||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user