Update .gitignore to include test_env/ and tmp/ directories

refactor: Update extraction strategy to handle schema extraction with non-empty schema
This change updates the `LLMExtractionStrategy` class so that schema extraction runs only when a non-empty schema is provided. Previously, schema extraction was triggered whenever `extract_type` was set to "schema", regardless of whether a schema was provided. With this update, schema extraction is performed only if `extract_type` is "schema" and a non-empty schema is provided. This ensures that the extraction strategy behaves correctly and avoids unnecessary schema extraction when it is not needed. Also, "numpy" is removed from the list of packages excluded from the default installation requirements in `setup.py`, so it is now part of the default install.
2024-09-28 00:12:58 +08:00 · 2024-08-19 15:37:07 +08:00
4 changed files with 5 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -189,4 +189,6 @@ a.txt
 .lambda_function.py
 ec2*
-update_changelog.sh
+update_changelog.sh
 test_env/
 tmp/
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -101,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
            variable_values["REQUEST"] = self.instruction
            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
-        if self.extract_type == "schema":
+        if self.extract_type == "schema" and self.schema:
            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -834,7 +834,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
    return sum(all_blocks, [])
 def merge_chunks_based_on_token_threshold(chunks, token_threshold):
    """
    Merges small chunks into larger ones based on the total token threshold.
@@ -880,7 +879,6 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) ->
    return extracted_content
 def wrap_text(draw, text, font, max_width):
    # Wrap the text to fit within the specified width
    lines = []
@@ -892,7 +890,6 @@ def wrap_text(draw, text, font, max_width):
        lines.append(line)
    return '\n'.join(lines)
 def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.prettify()
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
    requirements = f.read().splitlines()
 # Define the requirements for different environments
-default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
+default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))]
 torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
 transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]