diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 937ae4eb..edcb4b4e 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,6 +1,7 @@
 import os
 from .config import (
     DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
     MIN_WORD_THRESHOLD,
     IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
     PROVIDER_MODELS,
@@ -1080,7 +1081,7 @@ class LLMConfig:
             self.api_token = os.getenv(api_token[4:])
         else:
             self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
-                "OPENAI_API_KEY"
+                DEFAULT_PROVIDER_API_KEY
             )
         self.base_url = base_url
 
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 790ba6d0..866c7dc0 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -4,7 +4,8 @@ from dotenv import load_dotenv
 load_dotenv()  # Load environment variables from .env file
 
 # Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
-DEFAULT_PROVIDER = "openai/gpt-4o-mini"
+DEFAULT_PROVIDER = "openai/gpt-4o"
+DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
 MODEL_REPO_BRANCH = "new-release-0.0.2"
 # Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
 PROVIDER_MODELS = {
diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py
index cae5f81d..e1288de1 100644
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -1,6 +1,6 @@
 from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
 from crawl4ai.hub import BaseCrawler
-from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from pathlib import Path
 import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
         home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
         os.makedirs(f"{home_dir}/schema", exist_ok=True)
 
-        cleaned_html = optimize_html(html, threshold=100)
+        # cleaned_html = optimize_html(html, threshold=100)
+        cleaned_html = preprocess_html_for_schema(html)
 
         organic_schema = None
         if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
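Reviewer note on the two config hunks above: `DEFAULT_PROVIDER_API_KEY` removes the hard-coded `"OPENAI_API_KEY"` string from `LLMConfig`, so the default env-var name now lives next to `DEFAULT_PROVIDER` in `config.py`. A minimal sketch of the resulting resolution order, assuming `PROVIDER_MODELS` maps provider names to tokens (the dict contents below are illustrative):

```python
import os

DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
PROVIDER_MODELS = {"groq/llama3-70b-8192": os.getenv("GROQ_API_KEY")}  # illustrative

def resolve_api_token(provider: str, api_token: str | None = None) -> str | None:
    # Mirrors the branch in LLMConfig.__init__: an "env:VAR" token reads the
    # named variable; otherwise fall back to the provider table, then to the
    # default env var.
    if api_token and api_token.startswith("env:"):
        return os.getenv(api_token[4:])
    return PROVIDER_MODELS.get(provider, "no-token") or os.getenv(DEFAULT_PROVIDER_API_KEY)
```

Note that the `"no-token"` default is truthy, so the `DEFAULT_PROVIDER_API_KEY` fallback only fires when a known provider maps to an empty value, not for unknown providers.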
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 97512bf3..0e0300fb 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -34,7 +34,7 @@ from .model_loader import (
     calculate_batch_size
 )
 
-from .types import LLMConfig
+from .types import LLMConfig, create_llm_config
 from functools import partial
 
 import numpy as np
@@ -757,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
 #######################################################
 # New extraction strategies for JSON-based extraction #
 #######################################################
-
-
 class JsonElementExtractionStrategy(ExtractionStrategy):
     """
     Abstract base class for extracting structured JSON from HTML content.
@@ -1049,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
         schema_type: str = "CSS", # or XPATH
         query: str = None,
         target_json_example: str = None,
-        llm_config: 'LLMConfig' = None,
+        llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
         **kwargs
@@ -1140,7 +1138,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
 
-
 class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
     """
     Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
diff --git a/crawl4ai/types.py b/crawl4ai/types.py
index 2f689e1c..63fd45ba 100644
--- a/crawl4ai/types.py
+++ b/crawl4ai/types.py
@@ -178,4 +178,10 @@ if TYPE_CHECKING:
         BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
         DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
         DeepCrawlDecorator as DeepCrawlDecoratorType,
-    )
\ No newline at end of file
+    )
+
+
+
+def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
+    from .async_configs import LLMConfig
+    return LLMConfig(*args, **kwargs)
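The `create_llm_config` helper added to `types.py` is a lazy-import factory: `LLMConfig` is imported inside the function body, at call time, so importing `types.py` never pulls in `async_configs.py` and the circular dependency between the two modules is avoided. A stripped-down sketch of the pattern (module names are illustrative):

```python
# schema_types.py -- illustrative stand-in for crawl4ai/types.py
def create_llm_config(*args, **kwargs):
    # Deferred import: configs.py is only touched on the first call,
    # not when schema_types.py itself is imported.
    from configs import LLMConfig
    return LLMConfig(*args, **kwargs)
```

One caveat worth flagging: using `create_llm_config()` as the default for `llm_config` in `generate_schema` evaluates the factory once, when the class body executes at import time. That re-introduces an import-time call into `async_configs` and shares a single `LLMConfig` instance across all calls; defaulting to `None` and calling the factory inside the method body would keep the import fully lazy.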
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 146ce06c..acaf7933 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -26,7 +26,7 @@ import cProfile
 import pstats
 from functools import wraps
 import asyncio
-
+from lxml import etree, html as lhtml
 import sqlite3
 import hashlib
 
@@ -2617,3 +2617,116 @@ class HeadPeekr:
     def get_title(head_content: str):
         title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
         return title_match.group(1) if title_match else None
+
+def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
+    """
+    Preprocess HTML to reduce size while preserving structure for schema generation.
+
+    Args:
+        html_content (str): Raw HTML content
+        text_threshold (int): Maximum length for text nodes before truncation
+        attr_value_threshold (int): Maximum length for attribute values before truncation
+        max_size (int): Target maximum size for output HTML
+
+    Returns:
+        str: Preprocessed HTML content
+    """
+    try:
+        # Parse HTML with error recovery
+        parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
+        tree = lhtml.fromstring(html_content, parser=parser)
+
+        # 1. Remove HEAD section (keep only BODY)
+        head_elements = tree.xpath('//head')
+        for head in head_elements:
+            if head.getparent() is not None:
+                head.getparent().remove(head)
+
+        # 2. Define tags to remove completely
+        tags_to_remove = [
+            'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
+            'video', 'audio', 'source', 'track', 'map', 'area'
+        ]
+
+        # Remove unwanted elements
+        for tag in tags_to_remove:
+            elements = tree.xpath(f'//{tag}')
+            for element in elements:
+                if element.getparent() is not None:
+                    element.getparent().remove(element)
+
+        # 3. Process remaining elements to clean attributes and truncate text
+        for element in tree.iter():
+            # Skip if we're at the root level
+            if element.getparent() is None:
+                continue
+
+            # Keep only structural attributes. This is more aggressive than
+            # the previous version, which also preserved href/src:
+            # attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
+            attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
+
+            # Attributes exempt from truncation. Deliberately empty: if a
+            # selector attribute is too long, truncate it anyway; schema
+            # generation should find a shorter, more stable CSS selector.
+            # attributes_hates_truncate = ['id', 'class', "data-"]
+            attributes_hates_truncate = []
+
+            # Process each attribute
+            for attrib in list(element.attrib.keys()):
+                # Keep if it's essential or starts with data-
+                if not (attrib in attribs_to_keep or attrib.startswith('data-')):
+                    element.attrib.pop(attrib)
+                # Truncate long attribute values except for selectors
+                elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
+                    element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'
+
+            # Truncate text content if it's too long
+            if element.text and len(element.text.strip()) > text_threshold:
+                element.text = element.text.strip()[:text_threshold] + '...'
+
+            # Also truncate tail text if present
+            if element.tail and len(element.tail.strip()) > text_threshold:
+                element.tail = element.tail.strip()[:text_threshold] + '...'
+
+        # 4. Find repeated patterns and keep only a few examples. This is a
+        # simplistic approach; the XPath below matches every element, and
+        # elements without a class are skipped inside the loop.
+        pattern_elements = {}
+        for element in tree.xpath('//*[contains(@class, "")]'):
+            parent = element.getparent()
+            if parent is None:
+                continue
+
+            # Create a signature based on tag and classes
+            classes = element.get('class', '')
+            if not classes:
+                continue
+            signature = f"{element.tag}.{classes}"
+
+            if signature in pattern_elements:
+                pattern_elements[signature].append(element)
+            else:
+                pattern_elements[signature] = [element]
+
+        # Keep only 3 examples of each repeating pattern
+        for signature, elements in pattern_elements.items():
+            if len(elements) > 3:
+                # Keep the first two and the last element
+                for element in elements[2:-1]:
+                    if element.getparent() is not None:
+                        element.getparent().remove(element)
+
+        # 5. Convert back to string
+        result = etree.tostring(tree, encoding='unicode', method='html')
+
+        # If still over the size limit, apply more aggressive truncation
+        if len(result) > max_size:
+            return result[:max_size] + "..."
+
+        return result
+
+    except Exception:
+        # Fallback for parsing errors
+        return html_content[:max_size] if len(html_content) > max_size else html_content
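A quick usage sketch for the new helper (the sample markup is illustrative): the HEAD and style block disappear, the long paragraph text is truncated at `text_threshold`, and the forty identical `div.job` cards are cut down to three examples, which is what makes the output small enough for an LLM schema-generation prompt:

```python
from crawl4ai.utils import preprocess_html_for_schema

card = '<div class="job"><h2>Engineer</h2><p>%s</p></div>'
raw_html = (
    "<html><head><style>.job{color:red}</style></head><body>"
    + "".join(card % ("description " * 50) for _ in range(40))
    + "</body></html>"
)

compact = preprocess_html_for_schema(raw_html, text_threshold=100, max_size=100_000)
print(len(raw_html), "->", len(compact))  # roughly 26 KB down to well under 1 KB
```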
diff --git a/pyproject.toml b/pyproject.toml
index b4fb392f..c3f03bfd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,9 @@ dependencies = [
     "pyperclip>=1.8.2",
     "faust-cchardet>=2.1.19",
     "aiohttp>=3.11.11",
-    "humanize>=4.10.0"
+    "humanize>=4.10.0",
+    "zstandard>=0.23.0",
+    "msgpack>=1.1.0"
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
diff --git a/tests/20241401/test_schema_builder.py b/tests/20241401/test_schema_builder.py
index 431fb001..46d0e240 100644
--- a/tests/20241401/test_schema_builder.py
+++ b/tests/20241401/test_schema_builder.py
@@ -10,6 +10,7 @@ import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai.utils import preprocess_html_for_schema
 import json
 
 # Test HTML - A complex job board with companies, departments, and positions
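Taken together, the intended flow is: shrink raw HTML with `preprocess_html_for_schema`, feed it to `generate_schema` (which now falls back to a default `LLMConfig` when none is passed), and run the resulting schema with `JsonCssExtractionStrategy`, mirroring what the Google Search crawler does with its cached schema. A hedged end-to-end sketch (file names and the query string are illustrative):

```python
import json
from crawl4ai.utils import preprocess_html_for_schema
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

with open("search_results.html") as f:  # illustrative input
    cleaned = preprocess_html_for_schema(f.read())

# Relies on OPENAI_API_KEY being set, per the DEFAULT_PROVIDER_API_KEY fallback.
schema = JsonCssExtractionStrategy.generate_schema(
    cleaned,
    query="organic results with title, link and snippet",  # illustrative
)

with open("organic_schema.json", "w") as f:  # cache it, as the crawler does
    json.dump(schema, f, indent=2)

strategy = JsonCssExtractionStrategy(schema)  # ready for a CrawlerRunConfig
```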