Commit Message:
- Added examples for Amazon product data extraction methods
- Updated configuration options and enhanced documentation
- Minor refactoring for improved performance and readability
- Cleaned up version control settings
@@ -62,29 +62,66 @@ class ExtractionStrategy(ABC):
        return extracted_content


class NoExtractionStrategy(ExtractionStrategy):
    """
    A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
    """
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Return the entire HTML as a single content block.
        """
        return [{"index": 0, "content": html}]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]


#######################################################
# Strategies using LLM-based extraction for text data #
#######################################################


class LLMExtractionStrategy(ExtractionStrategy):
    """
    A strategy that uses an LLM to extract meaningful content from the HTML.

    Attributes:
        provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
        api_token: The API token for the provider.
        instruction: The instruction to use for the LLM model.
        schema: Pydantic model schema for structured data.
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word-to-token conversion rate.
        apply_chunking: Whether to apply chunking.
        base_url: The base URL for the API request.
        api_base: Alternative parameter name for the base URL of the API request.
        extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
        verbose: Whether to print verbose output.
        usages: List of individual token usages.
        total_usage: Accumulated token usage.
    """

    def __init__(self,
                 provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
                 instruction: str = None, schema: Dict = None, extraction_type="block", **kwargs):
        """
        Initialize the LLM extraction strategy.

        Args:
            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
            api_token: The API token for the provider.
            instruction: The instruction to use for the LLM model.
            schema: Pydantic model schema for structured data.
            extraction_type: "block" or "schema".
            chunk_token_threshold: Maximum tokens per chunk.
            overlap_rate: Overlap between chunks.
            word_token_rate: Word-to-token conversion rate.
            apply_chunking: Whether to apply chunking.
            base_url: The base URL for the API request.
            api_base: Alternative parameter name for the base URL of the API request.
            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
            verbose: Whether to print verbose output.
            usages: List of individual token usages.
            total_usage: Accumulated token usage.
        """
        super().__init__(**kwargs)
        self.provider = provider
@@ -114,6 +151,22 @@ class LLMExtractionStrategy(ExtractionStrategy):


    def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
        """
        Extract meaningful blocks or chunks from the given HTML using an LLM.

        How it works:
        1. Construct a prompt with variables.
        2. Make a request to the LLM using the prompt.
        3. Parse the response and extract blocks or chunks.

        Args:
            url: The URL of the webpage.
            ix: Index of the block.
            html: The HTML content of the webpage.

        Returns:
            A list of extracted blocks or chunks.
        """
        if self.verbose:
            # print("[LOG] Extracting blocks from URL:", url)
            print(f"[LOG] Call LLM for {url} - block index: {ix}")
@@ -180,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
        return blocks

    def _merge(self, documents, chunk_token_threshold, overlap):
        """
        Merge documents into sections based on chunk_token_threshold and overlap.
        """
        chunks = []
        sections = []
        total_tokens = 0
@@ -229,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Process sections sequentially, delaying between calls to mitigate rate limiting; specific to LLMExtractionStrategy.

        Args:
            url: The URL of the webpage.
            sections: List of sections (strings) to process.

        Returns:
            A list of extracted blocks or chunks.
        """

        merged_sections = self._merge(
@@ -285,12 +348,30 @@ class LLMExtractionStrategy(ExtractionStrategy):
        for i, usage in enumerate(self.usages, 1):
            print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")


#######################################################
# Strategies using clustering for text data extraction #
#######################################################

class CosineStrategy(ExtractionStrategy):
    """
    Extract meaningful blocks or chunks from the given HTML using cosine similarity.

    How it works:
    1. Pre-filter documents using embeddings and semantic_filter.
    2. Perform clustering using cosine similarity.
    3. Organize texts by their cluster labels, retaining order.
    4. Filter clusters by word count.
    5. Extract meaningful blocks or chunks from the filtered clusters.

    Attributes:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract.
        model_name (str): The name of the sentence-transformers model.
        sim_threshold (float): The similarity threshold for clustering.
    """
    def __init__(self, semantic_filter=None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='sentence-transformers/all-MiniLM-L6-v2', sim_threshold=0.3, **kwargs):
        """
        Initialize the strategy with clustering parameters.
@@ -368,11 +449,13 @@ class CosineStrategy(ExtractionStrategy):
        """
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

        Args:
            documents (List[str]): A list of document texts.
            semantic_filter (str): A keyword filter for document filtering.
            threshold (float): Cosine similarity threshold for filtering documents.
            at_least_k (int): The minimum number of documents to return.

        Returns:
            List[str]: A list of filtered and sorted document texts, ensuring at least `at_least_k` documents.
        """

        if not semantic_filter:
@@ -410,8 +493,11 @@ class CosineStrategy(ExtractionStrategy):
        """
        Get BERT embeddings for a list of sentences.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of embeddings.
        """
        # if self.buffer_embeddings.any() and not bypass_buffer:
        #     return self.buffer_embeddings
@@ -455,8 +541,11 @@ class CosineStrategy(ExtractionStrategy):
        """
        Perform hierarchical clustering on sentences and return cluster labels.

        Args:
            sentences (List[str]): A list of text chunks (sentences).

        Returns:
            NumPy array of cluster labels.
        """
        # Get embeddings
        from scipy.cluster.hierarchy import linkage, fcluster
@@ -472,12 +561,15 @@ class CosineStrategy(ExtractionStrategy):
        labels = fcluster(linked, self.max_dist, criterion='distance')
        return labels

    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
        """
        Filter clusters to remove those with a word count below the threshold.

        Args:
            clusters (Dict[int, List[str]]): Dictionary of clusters.

        Returns:
            Dict[int, List[str]]: Filtered dictionary of clusters.
        """
        filtered_clusters = {}
        for cluster_id, texts in clusters.items():
@@ -496,9 +588,12 @@ class CosineStrategy(ExtractionStrategy):
        """
        Extract clusters from HTML content using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            html (str): The HTML content of the webpage.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # Assume `html` is a list of text chunks for this strategy
        t = time.time()
@@ -560,159 +655,85 @@ class CosineStrategy(ExtractionStrategy):
        """
        Process sections using hierarchical clustering.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): List of sections (strings) to process.

        Returns:
            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # This strategy processes all sections together

        return self.extract(url, self.DEL.join(sections), **kwargs)
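

# Usage sketch (illustrative): CosineStrategy clusters text chunks and keeps the
# top clusters; the filter keyword and section texts below are placeholders.
def _demo_cosine_extraction():
    strategy = CosineStrategy(
        semantic_filter="product reviews",  # pre-filters chunks by embedding similarity
        word_count_threshold=10,            # drops clusters below this word count
        top_k=3,
    )
    sections = ["Great battery life and a sharp screen.", "Subscribe to our newsletter!"]
    return strategy.run("https://example.com", sections)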


#######################################################
# Strategies based on the extraction of specific types #
#######################################################

class TopicExtractionStrategy(ExtractionStrategy):
    def __init__(self, num_keywords: int = 3, **kwargs):
        """
        Initialize the topic extraction strategy with parameters for topic segmentation.

        :param num_keywords: Number of keywords to represent each topic segment.
        """
        import nltk
        super().__init__(**kwargs)
        self.num_keywords = num_keywords
        self.tokenizer = nltk.TextTilingTokenizer()

    def extract_keywords(self, text: str) -> List[str]:
        """
        Extract keywords from a given text segment using simple frequency analysis.

        :param text: The text segment from which to extract keywords.
        :return: A list of keyword strings.
        """
        import nltk
        # Tokenize the text and compute word frequency
        words = nltk.word_tokenize(text)
        freq_dist = nltk.FreqDist(words)
        # Get the most common words as keywords
        keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)]
        return keywords

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract topics from HTML content using TextTiling for segmentation and keyword extraction.

        :param url: The URL of the webpage.
        :param html: The HTML content of the webpage.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of dictionaries representing the topics.
        """
        # Segment the text into topics; the input is expected to be delimited by self.DEL
        segmented_topics = html.split(self.DEL)  # Split by lines or paragraphs as needed

        # Prepare the output as a list of dictionaries
        topic_list = []
        for i, segment in enumerate(segmented_topics):
            # Extract keywords for each segment
            keywords = self.extract_keywords(segment)
            topic_list.append({
                "index": i,
                "content": segment,
                "keywords": keywords
            })

        return topic_list

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Process sections using topic segmentation and keyword extraction.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of processed JSON blocks.
        """
        # Concatenate sections into a single text for coherent topic segmentation
        return self.extract(url, self.DEL.join(sections), **kwargs)
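

# Usage sketch (illustrative): run() joins sections with DEL and extract() splits
# them back into segments, attaching the top num_keywords keywords to each.
# Assumes the required NLTK data (e.g., punkt) is already downloaded.
def _demo_topic_extraction():
    strategy = TopicExtractionStrategy(num_keywords=3)
    sections = [
        "Python is a programming language used for scripting and data science.",
        "The stock market fluctuated sharply amid economic uncertainty.",
    ]
    return strategy.run("https://example.com", sections)
    # -> [{"index": 0, "content": ..., "keywords": [...]}, {"index": 1, ...}]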

class ContentSummarizationStrategy(ExtractionStrategy):
    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Initialize the content summarization strategy with a specific model.

        :param model_name: The model to use for summarization.
        """
        super().__init__(**kwargs)
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, str]:
        """
        Summarize a single section of text.

        :param url: The URL of the webpage.
        :param text: A section of text to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A dictionary with the summary.
        """
        try:
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            print(f"Error summarizing text: {e}")
            return {"summary": text}  # Fall back to the original text if summarization fails

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Process each section in parallel to produce summaries.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to summarize.
        :param provider: The provider to be used for extraction (not used here).
        :param api_token: Optional API token for the provider (not used here).
        :return: A list of dictionaries with summaries for each section.
        """
        # Use a ThreadPoolExecutor to summarize in parallel
        summaries = []
        with ThreadPoolExecutor() as executor:
            # Create a future for each section's summarization
            future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
            for future in as_completed(future_to_section):
                section_index = future_to_section[future]
                try:
                    summary_result = future.result()
                    summaries.append((section_index, summary_result))
                except Exception as e:
                    print(f"Error processing section {section_index}: {e}")
                    summaries.append((section_index, {"summary": sections[section_index]}))  # Fall back to the original text

        # Sort summaries by the original section index to maintain order
        summaries.sort(key=lambda x: x[0])
        return [summary for _, summary in summaries]
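

# Usage sketch (illustrative): sections are summarized in parallel and returned
# in their original order. Assumes the summarization model can be downloaded;
# the section texts are placeholders.
def _demo_summarization():
    strategy = ContentSummarizationStrategy()  # defaults to sshleifer/distilbart-cnn-12-6
    sections = ["A long article section about solar panels ...", "Another long section ..."]
    return strategy.run("https://example.com", sections)
    # -> [{"summary": "..."}, {"summary": "..."}]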


#######################################################
# New extraction strategies for JSON-based extraction #
#######################################################

class JsonElementExtractionStrategy(ExtractionStrategy):
    """
    Abstract base class for extracting structured JSON from HTML content.

    How it works:
    1. Parses HTML content using the `_parse_html` method.
    2. Uses a schema to define base selectors, fields, and transformations.
    3. Extracts data hierarchically, supporting nested fields and lists.
    4. Handles computed fields with expressions or functions.

    Attributes:
        DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
        _extract_item(element, fields): Extracts fields from a single element.
        _extract_single_field(element, field): Extracts a single field based on its type.
        _apply_transform(value, transform): Applies a transformation to a value.
        _compute_field(item, field): Computes a field value using an expression or function.
        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.

    Abstract Methods:
        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
        _get_elements(element, selector): Retrieves child elements using a selector.
        _get_element_text(element): Extracts text content from an element.
        _get_element_html(element): Extracts raw HTML from an element.
        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
    """

    DEL = '\n'

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Initialize the JSON element extraction strategy with a schema.

        Args:
            schema (Dict[str, Any]): The schema defining the extraction rules.
        """
        super().__init__(**kwargs)
        self.schema = schema
        self.verbose = kwargs.get('verbose', False)

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract structured data from HTML content.

        How it works:
        1. Parses the HTML content using the `_parse_html` method.
        2. Identifies base elements using the schema's base selector.
        3. Extracts fields from each base element using `_extract_item`.

        Args:
            url (str): The URL of the page being processed.
            html_content (str): The raw HTML content to parse and extract.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
        """

        parsed_html = self._parse_html(html_content)
        base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])

@@ -772,6 +793,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
            return field.get('default')

    def _extract_single_field(self, element, field):
        """
        Extract a single field based on its type.

        How it works:
        1. Selects the target element using the field's selector.
        2. Extracts the field value based on its type (e.g., text, attribute, regex).
        3. Applies transformations if defined in the schema.

        Args:
            element: The base element to extract the field from.
            field (Dict[str, Any]): The field definition in the schema.

        Returns:
            Any: The extracted field value.
        """

        if 'selector' in field:
            selected = self._get_elements(element, field['selector'])
            if not selected:
@@ -806,6 +843,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        return item

    def _extract_item(self, element, fields):
        """
        Extract fields from a given element.

        How it works:
        1. Iterates through the fields defined in the schema.
        2. Handles computed, single, and nested field types.
        3. Updates the item dictionary with extracted field values.

        Args:
            element: The base element to extract fields from.
            fields (List[Dict[str, Any]]): The list of fields to extract.

        Returns:
            Dict[str, Any]: A dictionary representing the extracted item.
        """

        item = {}
        for field in fields:
            if field['type'] == 'computed':
@@ -817,6 +870,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        return item

    def _apply_transform(self, value, transform):
        """
        Apply a transformation to a value.

        How it works:
        1. Checks the transformation type (e.g., `lowercase`, `strip`).
        2. Applies the transformation to the value.
        3. Returns the transformed value.

        Args:
            value (str): The value to transform.
            transform (str): The type of transformation to apply.

        Returns:
            str: The transformed value.
        """

        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
@@ -837,6 +906,23 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Run the extraction strategy on combined HTML content.

        How it works:
        1. Combines multiple HTML sections using the `DEL` delimiter.
        2. Calls the `extract` method with the combined HTML.

        Args:
            url (str): The URL of the page being processed.
            sections (List[str]): A list of HTML sections.
            *q: Additional positional arguments.
            **kwargs: Additional keyword arguments for custom extraction.

        Returns:
            List[Dict[str, Any]]: A list of extracted items.
        """

        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)

@@ -856,6 +942,27 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        pass
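

# Schema sketch (illustrative): the keys mirror those consumed by the methods
# above (baseSelector, fields, type, selector, attribute, transform, computed).
# The field names and selectors are hypothetical.
_EXAMPLE_SCHEMA = {
    "baseSelector": "div.item",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text", "transform": "strip"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
        # Computed fields are evaluated against the already-extracted item dict:
        {"name": "label", "type": "computed", "expression": "title.lower()", "default": ""},
    ],
}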


class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
        _get_elements(element, selector): Selects child elements using a CSS selector.
        _get_element_text(element): Extracts text content from a BeautifulSoup element.
        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)
@@ -880,6 +987,28 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
        return element.get(attribute)
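

# Usage sketch (illustrative) matching the commit's Amazon product examples: a
# CSS-based schema for search-result pages. The selectors are assumptions about
# Amazon's markup and may need adjusting against the live page.
def _demo_amazon_css_extraction(html_text: str):
    schema = {
        "baseSelector": "div[data-component-type='s-search-result']",
        "fields": [
            {"name": "title", "selector": "h2 a span", "type": "text"},
            {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
            {"name": "price", "selector": "span.a-price span.a-offscreen", "type": "text"},
        ],
    }
    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    return strategy.extract("https://www.amazon.com/s?k=laptop", html_text)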


class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.

    How it works:
    1. Parses HTML content into an lxml tree.
    2. Selects elements using XPath expressions.
    3. Converts CSS selectors to XPath when needed.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into an lxml tree.
        _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
        _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
        _get_elements(element, selector): Selects child elements using an XPath selector.
        _get_element_text(element): Extracts text content from an lxml element.
        _get_element_html(element): Extracts the raw HTML content of an lxml element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)
@@ -921,259 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    def _get_element_attribute(self, element, attribute: str):
        return element.get(attribute)
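

# Usage sketch (illustrative): here baseSelector is an XPath expression, and
# field selectors containing '/' are used as XPath directly. The selectors and
# attribute name are hypothetical.
def _demo_xpath_extraction(html_text: str):
    schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
            {"name": "name", "selector": ".//h2", "type": "text"},
            {"name": "sku", "selector": ".//span[@class='sku']", "type": "attribute", "attribute": "data-sku"},
        ],
    }
    return JsonXPathExtractionStrategy(schema).extract("https://example.com", html_text)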


class _JsonCssExtractionStrategy(ExtractionStrategy):
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        soup = BeautifulSoup(html, 'html.parser')
        base_elements = soup.select(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            results.append(item)

        return results

    def _extract_field(self, element, field):
        try:
            if field['type'] == 'nested':
                nested_element = element.select_one(field['selector'])
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = element.select(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = element.select(field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        if 'selector' in field:
            selected = element.select_one(field['selector'])
            if not selected:
                return field.get('default')
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = selected.get_text(strip=True)
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = str(selected)
        elif field['type'] == 'regex':
            text = selected.get_text(strip=True)
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        try:
            if 'expression' in field:
                # Evaluated with the partially built item dict as the namespace
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)


class _JsonXPathExtractionStrategy(ExtractionStrategy):
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        tree = html.fromstring(html_content)
        base_xpath = self.schema['baseSelector']
        base_elements = tree.xpath(base_xpath)

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            results.append(item)

        return results

    def _css_to_xpath(self, css_selector: str) -> str:
        """Convert a CSS selector to XPath if needed."""
        if '/' in css_selector:  # Already an XPath
            return css_selector
        else:
            # Fall back to basic conversion for common cases
            return self._basic_css_to_xpath(css_selector)

    def _basic_css_to_xpath(self, css_selector: str) -> str:
        """Basic CSS-to-XPath conversion for common cases."""
        # Handle basic cases
        if ' > ' in css_selector:
            parts = css_selector.split(' > ')
            return '//' + '/'.join(parts)
        if ' ' in css_selector:
            parts = css_selector.split(' ')
            return '//' + '//'.join(parts)
        return '//' + css_selector
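
    # Illustrative conversions performed by the basic fallback above (note that
    # class and id selectors are not handled here):
    #   "div > span" -> "//div/span"
    #   "ul li"      -> "//ul//li"
    #   "article"    -> "//article"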

    def _extract_field(self, element, field):
        try:
            if field['type'] == 'nested':
                xpath = self._css_to_xpath(field['selector'])
                nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
                return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}

            if field['type'] == 'list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        if 'selector' in field:
            xpath = self._css_to_xpath(field['selector'])
            selected = element.xpath(xpath)
            if not selected:
                return field.get('default')
            selected = selected[0]
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = ''.join(selected.xpath('.//text()')).strip()
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = etree.tostring(selected, encoding='unicode')
        elif field['type'] == 'regex':
            text = ''.join(selected.xpath('.//text()')).strip()
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        if 'transform' in field:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        try:
            if 'expression' in field:
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)