Add URL input to schema generation; make generate_schema delegate to agenerate_schema and validate inputs

2026-01-24 03:08:41 +00:00
parent 777d0878f2
commit b0b3ca1222
1 changed files with 96 additions and 37 deletions
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa

    @staticmethod
    def generate_schema(
-        html: str,
+        html: str = None,
        schema_type: str = "CSS",
        query: str = None,
        target_json_example: str = None,
        llm_config: 'LLMConfig' = create_llm_config(),
        provider: str = None,
        api_token: str = None,
+        url: str = None,
        **kwargs
    ) -> dict:
        """
-        Generate extraction schema from HTML content and optional query (sync version).
+        Generate extraction schema from HTML content or URL (sync version).

        Args:
-            html (str): The HTML content to analyze
-            query (str, optional): Natural language description of what data to extract
-            provider (str): Legacy Parameter. LLM provider to use
-            api_token (str): Legacy Parameter. API token for LLM provider
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.

        Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
        """
-        from .utils import perform_completion_with_backoff
-
-        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
-            if locals()[name] is not None:
-                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
-
-        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
+        import asyncio

        try:
-            response = perform_completion_with_backoff(
-                provider=llm_config.provider,
-                prompt_with_variables=prompt,
-                json_response=True,
-                api_token=llm_config.api_token,
-                base_url=llm_config.base_url,
-                extra_args=kwargs
-            )
-            return json.loads(response.choices[0].message.content)
-        except Exception as e:
-            raise Exception(f"Failed to generate schema: {str(e)}")
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        coro = JsonElementExtractionStrategy.agenerate_schema(
+            html=html,
+            schema_type=schema_type,
+            query=query,
+            target_json_example=target_json_example,
+            llm_config=llm_config,
+            provider=provider,
+            api_token=api_token,
+            url=url,
+            **kwargs
+        )
+
+        if loop is None:
+            return asyncio.run(coro)
+        else:
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()

    @staticmethod
    async def agenerate_schema(
-        html: str,
+        html: str = None,
        schema_type: str = "CSS",
        query: str = None,
        target_json_example: str = None,
        llm_config: 'LLMConfig' = None,
+        provider: str = None,
+        api_token: str = None,
+        url: str = None,
        **kwargs
    ) -> dict:
        """
-        Generate extraction schema from HTML content (async version).
+        Generate extraction schema from HTML content or URL (async version).

        Use this method when calling from async contexts (e.g., FastAPI) to avoid
        issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
        async execution.

        Args:
-            html (str): The HTML content to analyze
-            schema_type (str): "CSS" or "XPATH"
-            query (str, optional): Natural language description of what data to extract
-            target_json_example (str, optional): Example of desired JSON output
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.

        Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
        """
-        from .utils import aperform_completion_with_backoff
+        from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
+
+        # Validate inputs
+        if html is None and url is None:
+            raise ValueError("Either 'html' or 'url' must be provided")
+
+        # Check deprecated parameters
+        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
+            if locals()[name] is not None:
+                raise AttributeError(f"Setting '{name}' is deprecated. {message}")

        if llm_config is None:
            llm_config = create_llm_config()

+        # Fetch HTML from URL if provided
+        if url is not None:
+            from .async_webcrawler import AsyncWebCrawler
+            from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
+
+            browser_config = BrowserConfig(
+                headless=True,
+                text_mode=True,
+                light_mode=True,
+            )
+            crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                result = await crawler.arun(url=url, config=crawler_config)
+                if not result.success:
+                    raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
+                if result.status_code >= 400:
+                    raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
+                html = result.html
+
+        # Preprocess HTML for schema generation (higher text_threshold for better LLM context)
+        html = preprocess_html_for_schema(
+            html_content=html,
+            text_threshold=2000,
+            attr_value_threshold=500,
+            max_size=500_000
+        )
+
        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)

        try: