From b0b3ca122202f60d35c20485db5fd1287a4dcbfe Mon Sep 17 00:00:00 2001
From: unclecode
Date: Sat, 24 Jan 2026 03:08:41 +0000
Subject: [PATCH] Refactor extraction strategy internals and improve error handling

---
 crawl4ai/extraction_strategy.py | 133 +++++++++++++++++++++++---------
 1 file changed, 96 insertions(+), 37 deletions(-)

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 6be1c7c7..60a6e7d6 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1342,82 +1342,141 @@ In this scenario, use your best judgment to generate the schema. You need to exa
     @staticmethod
     def generate_schema(
-        html: str,
+        html: str = None,
         schema_type: str = "CSS",
         query: str = None,
         target_json_example: str = None,
         llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
+        url: str = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content and optional query (sync version).
+        Generate extraction schema from HTML content or URL (sync version).
 
         Args:
-            html (str): The HTML content to analyze
-            query (str, optional): Natural language description of what data to extract
-            provider (str): Legacy Parameter. LLM provider to use
-            api_token (str): Legacy Parameter. API token for LLM provider
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.
 
         Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
         """
-        from .utils import perform_completion_with_backoff
-
-        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
-            if locals()[name] is not None:
-                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
-
-        prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
+        import asyncio
 
         try:
-            response = perform_completion_with_backoff(
-                provider=llm_config.provider,
-                prompt_with_variables=prompt,
-                json_response=True,
-                api_token=llm_config.api_token,
-                base_url=llm_config.base_url,
-                extra_args=kwargs
-            )
-            return json.loads(response.choices[0].message.content)
-        except Exception as e:
-            raise Exception(f"Failed to generate schema: {str(e)}")
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+        coro = JsonElementExtractionStrategy.agenerate_schema(
+            html=html,
+            schema_type=schema_type,
+            query=query,
+            target_json_example=target_json_example,
+            llm_config=llm_config,
+            provider=provider,
+            api_token=api_token,
+            url=url,
+            **kwargs
+        )
+
+        if loop is None:
+            return asyncio.run(coro)
+        else:
+            import concurrent.futures
+            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+                future = executor.submit(asyncio.run, coro)
+                return future.result()
 
     @staticmethod
     async def agenerate_schema(
-        html: str,
+        html: str = None,
         schema_type: str = "CSS",
         query: str = None,
         target_json_example: str = None,
         llm_config: 'LLMConfig' = None,
+        provider: str = None,
+        api_token: str = None,
+        url: str = None,
         **kwargs
     ) -> dict:
         """
-        Generate extraction schema from HTML content (async version).
+        Generate extraction schema from HTML content or URL (async version).
 
         Use this method when calling from async contexts (e.g., FastAPI) to avoid
         issues with certain LLM providers (e.g., Gemini/Vertex AI) that require
         async execution.
 
         Args:
-            html (str): The HTML content to analyze
-            schema_type (str): "CSS" or "XPATH"
-            query (str, optional): Natural language description of what data to extract
-            target_json_example (str, optional): Example of desired JSON output
-            llm_config (LLMConfig): LLM configuration object
-            **kwargs: Additional args passed to LLM processor
+            html (str, optional): The HTML content to analyze. If not provided, url must be set.
+            schema_type (str): "CSS" or "XPATH". Defaults to "CSS".
+            query (str, optional): Natural language description of what data to extract.
+            target_json_example (str, optional): Example of desired JSON output.
+            llm_config (LLMConfig): LLM configuration object.
+            provider (str): Legacy Parameter. LLM provider to use.
+            api_token (str): Legacy Parameter. API token for LLM provider.
+            url (str, optional): URL to fetch HTML from. If provided, html parameter is ignored.
+            **kwargs: Additional args passed to LLM processor.
 
         Returns:
-            dict: Generated schema following the JsonElementExtractionStrategy format
+            dict: Generated schema following the JsonElementExtractionStrategy format.
+
+        Raises:
+            ValueError: If neither html nor url is provided.
         """
-        from .utils import aperform_completion_with_backoff
+        from .utils import aperform_completion_with_backoff, preprocess_html_for_schema
+
+        # Validate inputs
+        if html is None and url is None:
+            raise ValueError("Either 'html' or 'url' must be provided")
+
+        # Check deprecated parameters
+        for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
+            if locals()[name] is not None:
+                raise AttributeError(f"Setting '{name}' is deprecated. {message}")
 
         if llm_config is None:
             llm_config = create_llm_config()
 
+        # Fetch HTML from URL if provided
+        if url is not None:
+            from .async_webcrawler import AsyncWebCrawler
+            from .async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
+
+            browser_config = BrowserConfig(
+                headless=True,
+                text_mode=True,
+                light_mode=True,
+            )
+            crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+            async with AsyncWebCrawler(config=browser_config) as crawler:
+                result = await crawler.arun(url=url, config=crawler_config)
+                if not result.success:
+                    raise Exception(f"Failed to fetch URL '{url}': {result.error_message}")
+                if result.status_code >= 400:
+                    raise Exception(f"HTTP {result.status_code} error for URL '{url}'")
+                html = result.html
+
+        # Preprocess HTML for schema generation (higher text_threshold for better LLM context)
+        html = preprocess_html_for_schema(
+            html_content=html,
+            text_threshold=2000,
+            attr_value_threshold=500,
+            max_size=500_000
+        )
+
         prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
 
         try: