From b0b3ca122202f60d35c20485db5fd1287a4dcbfe Mon Sep 17 00:00:00 2001
From: unclecode
Date: Sat, 24 Jan 2026 03:08:41 +0000
Subject: [PATCH] Refactor extraction strategy internals and improve error handling

---
 crawl4ai/extraction_strategy.py | 133 +++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 37 deletions(-)

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 6be1c7c7..60a6e7d6 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
     @staticmethod
     def generate_schema(
-        html: str,
+        html: str = None,
         schema_type: str = "CSS",
         query: str = None,
         target_json_example: str = None,
         llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
+        url: str = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content and optional query (sync version).
+        Generate extraction schema from HTML content or URL (sync version).
 
         Args:
-            html (str): The HTML content to analyze
-            query (str, optional): Natural language description of what data to extract
-            provider (str): Legacy Parameter. LLM provider to use
-            api_token (str): Legacy Parameter. API token for LLM provider
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.
 
         Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
         """
-        from .utils import perform_completion_with_backoff
-
-        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
-            if locals()[name] is not None:
-                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
-
-        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
+        import asyncio
 
         try:
-            response = perform_completion_with_backoff(
-                provider=llm_config.provider,
-                prompt_with_variables=prompt,
-                json_response=True,
-                api_token=llm_config.api_token,
-                base_url=llm_config.base_url,
-                extra_args=kwargs
-            )
-            return json.loads(response.choices[0].message.content)
-        except Exception as e:
-            raise Exception(f"Failed to generate schema: {str(e)}")
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        coro = JsonElementExtractionStrategy.agenerate_schema(
+            html=html,
+            schema_type=schema_type,
+            query=query,
+            target_json_example=target_json_example,
+            llm_config=llm_config,
+            provider=provider,
+            api_token=api_token,
+            url=url,
+            **kwargs
+        )
+
+        if loop is None:
+            return asyncio.run(coro)
+        else:
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()
 
     @staticmethod
     async def agenerate_schema(
-        html: str,
+        html: str = None,
         schema_type: str = "CSS",
         query: str = None,
         target_json_example: str = None,
         llm_config: 'LLMConfig' = None,
+        provider: str = None,
+        api_token: str = None,
+        url: str = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content (async version).
+        Generate extraction schema from HTML content or URL (async version).
 
         Use this method when calling from async contexts (e.g., FastAPI) to avoid
         issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
         async execution.
 
         Args:
-            html (str): The HTML content to analyze
-            schema_type (str): "CSS" or "XPATH"
-            query (str, optional): Natural language description of what data to extract
-            target_json_example (str, optional): Example of desired JSON output
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.
 
         Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
         """
-        from .utils import aperform_completion_with_backoff
+        from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
+
+        # Validate inputs
+        if html is None and url is None:
+            raise ValueError("Either 'html' or 'url' must be provided")
+
+        # Check deprecated parameters
+        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
+            if locals()[name] is not None:
+                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
 
         if llm_config is None:
             llm_config = create_llm_config()
 
+        # Fetch HTML from URL if provided
+        if url is not None:
+            from .async_webcrawler import AsyncWebCrawler
+            from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
+
+            browser_config = BrowserConfig(
+                headless=True,
+                text_mode=True,
+                light_mode=True,
+            )
+            crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                result = await crawler.arun(url=url, config=crawler_config)
+                if not result.success:
+                    raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
+                if result.status_code >= 400:
+                    raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
+                html = result.html
+
+        # Preprocess HTML for schema generation (higher text_threshold for better LLM context)
+        html = preprocess_html_for_schema(
+            html_content=html,
+            text_threshold=2000,
+            attr_value_threshold=500,
+            max_size=500_000
+        )
+
         prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
 
         try: