feat(schema): improve HTML preprocessing for schema generation

Add a new preprocess_html_for_schema utility function to better handle HTML cleaning
for schema generation. It replaces the previous optimize_html call in the
GoogleSearchCrawler and adds smarter attribute handling and repeated-pattern detection.

Other changes:
- Update default provider to gpt-4o
- Add DEFAULT_PROVIDER_API_KEY constant
- Make LLMConfig creation more flexible with create_llm_config helper
- Add new dependencies: zstandard and msgpack

This change improves schema generation reliability while reducing noise in the
processed HTML.
Author: UncleCode
Date:   2025-03-12 22:40:46 +08:00
parent 1630fbdafe
commit dc36997a08
8 changed files with 134 additions and 12 deletions

View File

@@ -1,6 +1,7 @@
 import os
 from .config import (
     DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
     MIN_WORD_THRESHOLD,
     IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
     PROVIDER_MODELS,
@@ -1080,7 +1081,7 @@ class LLMConfig:
             self.api_token = os.getenv(api_token[4:])
         else:
             self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
-                "OPENAI_API_KEY"
+                DEFAULT_PROVIDER_API_KEY
             )
         self.base_url = base_url
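
The fallback chain this hunk touches can be shown in isolation. The sketch below is a standalone approximation of the token resolution, not the library's code: the "env:" prefix handling is inferred from the api_token[4:] slice above, and the two constants stand in for the real ones in config.py.

    import os

    PROVIDER_MODELS = {"openai/gpt-4o": None}       # stand-in for the real provider table
    DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"     # the new constant from config.py

    def resolve_api_token(provider, api_token=None):
        # Assumed condition: an "env:NAME" token means "read NAME from the environment"
        if api_token and api_token.startswith("env:"):
            return os.getenv(api_token[4:])
        # Otherwise: provider table first, then the default env var
        # (previously hard-coded as "OPENAI_API_KEY", now DEFAULT_PROVIDER_API_KEY)
        return PROVIDER_MODELS.get(provider, "no-token") or os.getenv(DEFAULT_PROVIDER_API_KEY)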

View File

@@ -4,7 +4,8 @@ from dotenv import load_dotenv
 load_dotenv()  # Load environment variables from .env file

 # Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
-DEFAULT_PROVIDER = "openai/gpt-4o-mini"
+DEFAULT_PROVIDER = "openai/gpt-4o"
+DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
 MODEL_REPO_BRANCH = "new-release-0.0.2"
 # Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
 PROVIDER_MODELS = {

View File

@@ -1,6 +1,6 @@
 from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
 from crawl4ai.hub import BaseCrawler
-from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from pathlib import Path
 import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
         home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
         os.makedirs(f"{home_dir}/schema", exist_ok=True)
-        cleaned_html = optimize_html(html, threshold=100)
+        # cleaned_html = optimize_html(html, threshold=100)
+        cleaned_html = preprocess_html_for_schema(html)
         organic_schema = None
         if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
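
For context, the surrounding (unchanged) logic caches a generated schema on disk so the LLM is consulted only once. A minimal sketch of that load-or-generate pattern, assuming generate_schema takes the cleaned HTML as its first argument (its full signature appears in a later hunk):

    import json, os
    from crawl4ai.utils import get_home_folder, preprocess_html_for_schema
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

    def load_or_generate_schema(html: str, name: str = "organic_schema") -> dict:
        home_dir = get_home_folder()
        os.makedirs(f"{home_dir}/schema", exist_ok=True)
        schema_path = f"{home_dir}/schema/{name}.json"
        if os.path.exists(schema_path):
            with open(schema_path) as f:        # cache hit: no LLM call needed
                return json.load(f)
        cleaned_html = preprocess_html_for_schema(html)
        schema = JsonCssExtractionStrategy.generate_schema(cleaned_html)
        with open(schema_path, "w") as f:       # cache the result for next time
            json.dump(schema, f)
        return schema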

View File

@@ -34,7 +34,7 @@ from .model_loader import (
     calculate_batch_size
 )
-from .types import LLMConfig
+from .types import LLMConfig, create_llm_config
 from functools import partial
 import numpy as np
@@ -757,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
 #######################################################
 # New extraction strategies for JSON-based extraction #
 #######################################################
-
-
 class JsonElementExtractionStrategy(ExtractionStrategy):
     """
     Abstract base class for extracting structured JSON from HTML content.
@@ -1049,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
         schema_type: str = "CSS",  # or XPATH
         query: str = None,
         target_json_example: str = None,
-        llm_config: 'LLMConfig' = None,
+        llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
         **kwargs
@@ -1140,7 +1138,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")
-
 class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
     """
     Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
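
One Python subtlety worth flagging in this hunk: a call placed in a default argument runs once, when the def is evaluated at import time, and every subsequent call shares that one object. A self-contained toy demonstrating the semantics (not crawl4ai code):

    def make_config():
        print("make_config called")     # printed once, at definition time
        return {"provider": "openai/gpt-4o"}

    def generate(llm_config=make_config()):  # evaluated here, not per call
        return llm_config

    a = generate()
    b = generate()
    assert a is b                       # both calls received the same default object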

View File

@@ -179,3 +179,9 @@ if TYPE_CHECKING:
     DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
     DeepCrawlDecorator as DeepCrawlDecoratorType,
 )
+
+
+def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
+    from .async_configs import LLMConfig
+
+    return LLMConfig(*args, **kwargs)
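
The import inside the function body is deliberate: it defers loading async_configs until the helper is first called, a standard way to break a circular import between two modules. Callers can then build a config without importing async_configs themselves; the arguments below are illustrative, passed through unchanged to LLMConfig:

    from crawl4ai.types import create_llm_config

    # Same keyword arguments LLMConfig itself accepts (provider/api_token per the first hunk)
    cfg = create_llm_config(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY")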

View File

@@ -26,7 +26,7 @@ import cProfile
 import pstats
 from functools import wraps
 import asyncio
+from lxml import etree, html as lhtml
 import sqlite3
 import hashlib
@@ -2617,3 +2617,116 @@ class HeadPeekr:
     def get_title(head_content: str):
         title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
         return title_match.group(1) if title_match else None
+
+
+def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
+    """
+    Preprocess HTML to reduce size while preserving structure for schema generation.
+
+    Args:
+        html_content (str): Raw HTML content
+        text_threshold (int): Maximum length for text nodes before truncation
+        attr_value_threshold (int): Maximum length for attribute values before truncation
+        max_size (int): Target maximum size for output HTML
+
+    Returns:
+        str: Preprocessed HTML content
+    """
+    try:
+        # Parse HTML with error recovery
+        parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
+        tree = lhtml.fromstring(html_content, parser=parser)
+
+        # 1. Remove HEAD section (keep only BODY)
+        head_elements = tree.xpath('//head')
+        for head in head_elements:
+            if head.getparent() is not None:
+                head.getparent().remove(head)
+
+        # 2. Define tags to remove completely
+        tags_to_remove = [
+            'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
+            'video', 'audio', 'source', 'track', 'map', 'area'
+        ]
+
+        # Remove unwanted elements
+        for tag in tags_to_remove:
+            elements = tree.xpath(f'//{tag}')
+            for element in elements:
+                if element.getparent() is not None:
+                    element.getparent().remove(element)
+
+        # 3. Process remaining elements to clean attributes and truncate text
+        for element in tree.iter():
+            # Skip if we're at the root level
+            if element.getparent() is None:
+                continue
+
+            # Clean non-essential attributes but preserve structural ones
+            # attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
+            # This is more aggressive than the previous version
+            attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
+
+            # attributes_hates_truncate = ['id', 'class', "data-"]
+            # This means: I don't care if an attribute is too long, truncate it; go and find a better CSS selector to build a schema
+            attributes_hates_truncate = []
+
+            # Process each attribute
+            for attrib in list(element.attrib.keys()):
+                # Keep if it's essential or starts with data-
+                if not (attrib in attribs_to_keep or attrib.startswith('data-')):
+                    element.attrib.pop(attrib)
+                # Truncate long attribute values except for selectors
+                elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
+                    element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'
+
+            # Truncate text content if it's too long
+            if element.text and len(element.text.strip()) > text_threshold:
+                element.text = element.text.strip()[:text_threshold] + '...'
+
+            # Also truncate tail text if present
+            if element.tail and len(element.tail.strip()) > text_threshold:
+                element.tail = element.tail.strip()[:text_threshold] + '...'
+
+        # 4. Find repeated patterns and keep only a few examples
+        # This is a simplistic approach - more sophisticated pattern detection could be implemented
+        pattern_elements = {}
+        for element in tree.xpath('//*[contains(@class, "")]'):
+            parent = element.getparent()
+            if parent is None:
+                continue
+
+            # Create a signature based on tag and classes
+            classes = element.get('class', '')
+            if not classes:
+                continue
+
+            signature = f"{element.tag}.{classes}"
+            if signature in pattern_elements:
+                pattern_elements[signature].append(element)
+            else:
+                pattern_elements[signature] = [element]
+
+        # Keep only 3 examples of each repeating pattern
+        for signature, elements in pattern_elements.items():
+            if len(elements) > 3:
+                # Keep the first 2 and last elements
+                for element in elements[2:-1]:
+                    if element.getparent() is not None:
+                        element.getparent().remove(element)
+
+        # 5. Convert back to string
+        result = etree.tostring(tree, encoding='unicode', method='html')
+
+        # If still over the size limit, apply more aggressive truncation
+        if len(result) > max_size:
+            return result[:max_size] + "..."
+
+        return result
+
+    except Exception as e:
+        # Fallback for parsing errors
+        return html_content[:max_size] if len(html_content) > max_size else html_content
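
A quick, made-up illustration of the function's effect (exact output depends on lxml's serialization): the head and script disappear, the style attribute is dropped because it is not in attribs_to_keep, data-* attributes survive, and with four identical .card elements the "keep the first 2 and the last" rule removes the third:

    sample = """
    <html><head><style>.card{color:red}</style></head><body>
      <div class="card" style="color:red" data-id="1">One</div>
      <div class="card" data-id="2">Two</div>
      <div class="card" data-id="3">Three</div>
      <div class="card" data-id="4">Four</div>
      <script>console.log("stripped")</script>
    </body></html>
    """
    print(preprocess_html_for_schema(sample))
    # -> roughly: the four cards minus "Three", with no style= attribute and no head/script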

View File

@@ -42,7 +42,9 @@ dependencies = [
     "pyperclip>=1.8.2",
     "faust-cchardet>=2.1.19",
     "aiohttp>=3.11.11",
-    "humanize>=4.10.0"
+    "humanize>=4.10.0",
+    "zstandard>=0.23.0",
+    "msgpack>=1.1.0"
 ]
 classifiers = [
     "Development Status :: 4 - Beta",

View File

@@ -10,6 +10,7 @@ import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
 import json

 # Test HTML - A complex job board with companies, departments, and positions