Refactor extraction strategy internals and improve error handling

This commit is contained in:
unclecode
2026-01-24 03:08:41 +00:00
parent 777d0878f2
commit b0b3ca1222

View File

@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
@staticmethod
def generate_schema(
    html: str = None,
    schema_type: str = "CSS",
    query: str = None,
    target_json_example: str = None,
    llm_config: 'LLMConfig' = None,
    provider: str = None,
    api_token: str = None,
    url: str = None,
    **kwargs
) -> dict:
    """
    Generate extraction schema from HTML content or URL (sync version).

    This is a synchronous wrapper around :meth:`agenerate_schema`. All
    argument validation (including the "either html or url" check and the
    deprecated-parameter check) happens in the async implementation.

    Args:
        html (str, optional): The HTML content to analyze. If not provided,
            url must be set.
        schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
        query (str, optional): Natural language description of what data to
            extract.
        target_json_example (str, optional): Example of desired JSON output.
        llm_config (LLMConfig, optional): LLM configuration object. If None,
            a default config is created inside agenerate_schema. (Lazy
            creation here avoids evaluating create_llm_config() once at
            import time and sharing that instance across every call.)
        provider (str): Legacy Parameter. LLM provider to use.
        api_token (str): Legacy Parameter. API token for LLM provider.
        url (str, optional): URL to fetch HTML from. If provided, html
            parameter is ignored.
        **kwargs: Additional args passed to LLM processor.

    Returns:
        dict: Generated schema following the JsonElementExtractionStrategy
            format.

    Raises:
        ValueError: If neither html nor url is provided.
    """
    import asyncio

    # Detect whether we are already inside a running event loop. If so,
    # asyncio.run() would raise, so we must delegate to a fresh loop in a
    # worker thread instead.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    coro = JsonElementExtractionStrategy.agenerate_schema(
        html=html,
        schema_type=schema_type,
        query=query,
        target_json_example=target_json_example,
        llm_config=llm_config,
        provider=provider,
        api_token=api_token,
        url=url,
        **kwargs
    )

    if running_loop is None:
        # No loop running in this thread: safe to drive the coroutine here.
        return asyncio.run(coro)

    # Called from inside an event loop (e.g. a sync handler in an async
    # framework): run the coroutine on a private loop in a worker thread
    # and block until it completes.
    import concurrent.futures
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(asyncio.run, coro)
        return future.result()
@staticmethod @staticmethod
async def agenerate_schema( async def agenerate_schema(
html: str, html: str = None,
schema_type: str = "CSS", schema_type: str = "CSS",
query: str = None, query: str = None,
target_json_example: str = None, target_json_example: str = None,
llm_config: 'LLMConfig' = None, llm_config: 'LLMConfig' = None,
provider: str = None,
api_token: str = None,
url: str = None,
**kwargs **kwargs
) -> dict: ) -> dict:
""" """
Generate extraction schema from HTML content (async version). Generate extraction schema from HTML content or URL (async version).
Use this method when calling from async contexts (e.g., FastAPI) to avoid Use this method when calling from async contexts (e.g., FastAPI) to avoid
issues with certain LLM providers (e.g., Gemini/Vertex AI) that require issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
async execution. async execution.
Args: Args:
html (str): The HTML content to analyze html (str, optional): The HTML content to analyze. If not provided, url must be set.
schema_type (str): "CSS" or "XPATH" schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
query (str, optional): Natural language description of what data to extract query (str, optional): Natural language description of what data to extract.
target_json_example (str, optional): Example of desired JSON output target_json_example (str, optional): Example of desired JSON output.
llm_config (LLMConfig): LLM configuration object llm_config (LLMConfig): LLM configuration object.
**kwargs: Additional args passed to LLM processor provider (str): Legacy Parameter. LLM provider to use.
api_token (str): Legacy Parameter. API token for LLM provider.
url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
**kwargs: Additional args passed to LLM processor.
Returns: Returns:
dict: Generated schema following the JsonElementExtractionStrategy format dict: Generated schema following the JsonElementExtractionStrategy format.
Raises:
ValueError: If neither html nor url is provided.
""" """
from .utils import aperform_completion_with_backoff from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
# Validate inputs
if html is None and url is None:
raise ValueError("Either 'html' or 'url' must be provided")
# Check deprecated parameters
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
if locals()[name] is not None:
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
if llm_config is None: if llm_config is None:
llm_config = create_llm_config() llm_config = create_llm_config()
# Fetch HTML from URL if provided
if url is not None:
from .async_webcrawler import AsyncWebCrawler
from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
browser_config = BrowserConfig(
headless=True,
text_mode=True,
light_mode=True,
)
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url=url, config=crawler_config)
if not result.success:
raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
if result.status_code >= 400:
raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
html = result.html
# Preprocess HTML for schema generation (higher text_threshold for better LLM context)
html = preprocess_html_for_schema(
html_content=html,
text_threshold=2000,
attr_value_threshold=500,
max_size=500_000
)
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example) prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
try: try: