diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 937ae4eb..edcb4b4e 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,6 +1,7 @@
import os
from .config import (
DEFAULT_PROVIDER,
+ DEFAULT_PROVIDER_API_KEY,
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
PROVIDER_MODELS,
@@ -1080,7 +1081,7 @@ class LLMConfig:
self.api_token = os.getenv(api_token[4:])
else:
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
- "OPENAI_API_KEY"
+ DEFAULT_PROVIDER_API_KEY
)
self.base_url = base_url
diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 790ba6d0..866c7dc0 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -4,7 +4,8 @@ from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
-DEFAULT_PROVIDER = "openai/gpt-4o-mini"
+DEFAULT_PROVIDER = "openai/gpt-4o"
+DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
MODEL_REPO_BRANCH = "new-release-0.0.2"
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
PROVIDER_MODELS = {
diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py
index cae5f81d..e1288de1 100644
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -1,6 +1,6 @@
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.hub import BaseCrawler
-from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from pathlib import Path
import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
os.makedirs(f"{home_dir}/schema", exist_ok=True)
- cleaned_html = optimize_html(html, threshold=100)
+ # cleaned_html = optimize_html(html, threshold=100)
+ cleaned_html = preprocess_html_for_schema(html)
organic_schema = None
if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 97512bf3..0e0300fb 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -34,7 +34,7 @@ from .model_loader import (
calculate_batch_size
)
-from .types import LLMConfig
+from .types import LLMConfig, create_llm_config
from functools import partial
import numpy as np
@@ -757,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
#######################################################
# New extraction strategies for JSON-based extraction #
#######################################################
-
-
class JsonElementExtractionStrategy(ExtractionStrategy):
"""
Abstract base class for extracting structured JSON from HTML content.
@@ -1049,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
schema_type: str = "CSS", # or XPATH
query: str = None,
target_json_example: str = None,
- llm_config: 'LLMConfig' = None,
+        llm_config: 'LLMConfig' = None,
provider: str = None,
api_token: str = None,
**kwargs
@@ -1140,7 +1138,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
except Exception as e:
raise Exception(f"Failed to generate schema: {str(e)}")
-
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
"""
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
diff --git a/crawl4ai/types.py b/crawl4ai/types.py
index 2f689e1c..63fd45ba 100644
--- a/crawl4ai/types.py
+++ b/crawl4ai/types.py
@@ -178,4 +178,10 @@ if TYPE_CHECKING:
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
DeepCrawlDecorator as DeepCrawlDecoratorType,
- )
\ No newline at end of file
+ )
+
+
+
+def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
+ from .async_configs import LLMConfig
+ return LLMConfig(*args, **kwargs)
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 146ce06c..acaf7933 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -26,7 +26,7 @@ import cProfile
import pstats
from functools import wraps
import asyncio
-
+from lxml import etree, html as lhtml
import sqlite3
import hashlib
@@ -2617,3 +2617,116 @@ class HeadPeekr:
def get_title(head_content: str):
        title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
return title_match.group(1) if title_match else None
+
+def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
+ """
+ Preprocess HTML to reduce size while preserving structure for schema generation.
+
+ Args:
+ html_content (str): Raw HTML content
+ text_threshold (int): Maximum length for text nodes before truncation
+ attr_value_threshold (int): Maximum length for attribute values before truncation
+ max_size (int): Target maximum size for output HTML
+
+ Returns:
+ str: Preprocessed HTML content
+ """
+ try:
+ # Parse HTML with error recovery
+ parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
+ tree = lhtml.fromstring(html_content, parser=parser)
+
+ # 1. Remove HEAD section (keep only BODY)
+ head_elements = tree.xpath('//head')
+ for head in head_elements:
+ if head.getparent() is not None:
+ head.getparent().remove(head)
+
+ # 2. Define tags to remove completely
+ tags_to_remove = [
+ 'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
+ 'video', 'audio', 'source', 'track', 'map', 'area'
+ ]
+
+ # Remove unwanted elements
+ for tag in tags_to_remove:
+ elements = tree.xpath(f'//{tag}')
+ for element in elements:
+ if element.getparent() is not None:
+ element.getparent().remove(element)
+
+ # 3. Process remaining elements to clean attributes and truncate text
+ for element in tree.iter():
+ # Skip if we're at the root level
+ if element.getparent() is None:
+ continue
+
+ # Clean non-essential attributes but preserve structural ones
+ # attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
+
+ # This is more aggressive than the previous version
+ attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
+
+ # attributes_hates_truncate = ['id', 'class', "data-"]
+
+ # This means, I don't care, if an attribute is too long, truncate it, go and find a better css selector to build a schema
+ attributes_hates_truncate = []
+
+ # Process each attribute
+ for attrib in list(element.attrib.keys()):
+ # Keep if it's essential or starts with data-
+ if not (attrib in attribs_to_keep or attrib.startswith('data-')):
+ element.attrib.pop(attrib)
+ # Truncate long attribute values except for selectors
+ elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
+ element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'
+
+ # Truncate text content if it's too long
+ if element.text and len(element.text.strip()) > text_threshold:
+ element.text = element.text.strip()[:text_threshold] + '...'
+
+ # Also truncate tail text if present
+ if element.tail and len(element.tail.strip()) > text_threshold:
+ element.tail = element.tail.strip()[:text_threshold] + '...'
+
+ # 4. Find repeated patterns and keep only a few examples
+ # This is a simplistic approach - more sophisticated pattern detection could be implemented
+ pattern_elements = {}
+ for element in tree.xpath('//*[contains(@class, "")]'):
+ parent = element.getparent()
+ if parent is None:
+ continue
+
+ # Create a signature based on tag and classes
+ classes = element.get('class', '')
+ if not classes:
+ continue
+ signature = f"{element.tag}.{classes}"
+
+ if signature in pattern_elements:
+ pattern_elements[signature].append(element)
+ else:
+ pattern_elements[signature] = [element]
+
+ # Keep only 3 examples of each repeating pattern
+ for signature, elements in pattern_elements.items():
+ if len(elements) > 3:
+ # Keep the first 2 and last elements
+ for element in elements[2:-1]:
+ if element.getparent() is not None:
+ element.getparent().remove(element)
+
+ # 5. Convert back to string
+ result = etree.tostring(tree, encoding='unicode', method='html')
+
+ # If still over the size limit, apply more aggressive truncation
+ if len(result) > max_size:
+ return result[:max_size] + "..."
+
+ return result
+
+ except Exception as e:
+ # Fallback for parsing errors
+ return html_content[:max_size] if len(html_content) > max_size else html_content
+
+
diff --git a/pyproject.toml b/pyproject.toml
index b4fb392f..c3f03bfd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,9 @@ dependencies = [
"pyperclip>=1.8.2",
"faust-cchardet>=2.1.19",
"aiohttp>=3.11.11",
- "humanize>=4.10.0"
+ "humanize>=4.10.0",
+ "zstandard>=0.23.0",
+ "msgpack>=1.1.0"
]
classifiers = [
"Development Status :: 4 - Beta",
diff --git a/tests/20241401/test_schema_builder.py b/tests/20241401/test_schema_builder.py
index 431fb001..46d0e240 100644
--- a/tests/20241401/test_schema_builder.py
+++ b/tests/20241401/test_schema_builder.py
@@ -10,6 +10,7 @@ import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai.utils import preprocess_html_for_schema
import json
# Test HTML - A complex job board with companies, departments, and positions