diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 080229f4..d04ec298 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -101,7 +101,7 @@ class LLMExtractionStrategy(ExtractionStrategy): variable_values["REQUEST"] = self.instruction prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION - if self.extract_type == "schema": + if self.extract_type == "schema" and self.schema: variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 07832888..b6f97223 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -834,7 +834,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke return sum(all_blocks, []) - def merge_chunks_based_on_token_threshold(chunks, token_threshold): """ Merges small chunks into larger ones based on the total token threshold. @@ -880,7 +879,6 @@ def process_sections(url: str, sections: list, provider: str, api_token: str) -> return extracted_content - def wrap_text(draw, text, font, max_width): # Wrap the text to fit within the specified width lines = [] @@ -892,7 +890,6 @@ def wrap_text(draw, text, font, max_width): lines.append(line) return '\n'.join(lines) - def format_html(html_string): soup = BeautifulSoup(html_string, 'html.parser') return soup.prettify() diff --git a/setup.py b/setup.py index af929125..841f85a8 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ with open("requirements.txt") as f: requirements = f.read().splitlines() # Define the requirements for different environments -default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))] +default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))] torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))] transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]