feat(schema): improve HTML preprocessing for schema generation
Add a new preprocess_html_for_schema utility function to better handle HTML cleaning for schema generation. It replaces the previous optimize_html function in the GoogleSearchCrawler and includes smarter attribute handling and repeated-pattern detection.

Other changes:
- Update default provider to gpt-4o
- Add DEFAULT_PROVIDER_API_KEY constant
- Make LLMConfig creation more flexible with a create_llm_config helper
- Add new dependencies: zstandard and msgpack

This change improves schema generation reliability while reducing noise in the processed HTML.
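For orientation, a minimal usage sketch of the new utility as the diff below wires it up (the sample HTML string is made up; per the import hunk below, the function is exported from crawl4ai.utils):

from crawl4ai.utils import preprocess_html_for_schema

# Made-up input; any raw page HTML works.
raw_html = "<html><head><script>t()</script></head><body><div class='result'>Hello</div></body></html>"

# Drops <head>, scripts/styles, non-essential attributes, over-long text,
# and most repeated siblings, keeping the structure a schema generator needs.
cleaned_html = preprocess_html_for_schema(raw_html)
print(cleaned_html)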
@@ -1,6 +1,7 @@
 import os
 from .config import (
     DEFAULT_PROVIDER,
+    DEFAULT_PROVIDER_API_KEY,
     MIN_WORD_THRESHOLD,
     IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
     PROVIDER_MODELS,
@@ -1080,7 +1081,7 @@ class LLMConfig:
             self.api_token = os.getenv(api_token[4:])
         else:
             self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
-                "OPENAI_API_KEY"
+                DEFAULT_PROVIDER_API_KEY
             )
         self.base_url = base_url

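The fallback chain above reads: a prefixed token is resolved from the environment (the api_token[4:] slice strips a 4-character prefix, presumably "env:"), otherwise the provider's PROVIDER_MODELS entry is used, and failing that the env var named by the new DEFAULT_PROVIDER_API_KEY constant. A standalone sketch of that logic under those assumptions (the guard condition is not shown in the hunk; the table below is an illustrative stand-in):

import os

PROVIDER_MODELS = {"openai/gpt-4o": None}   # illustrative stand-in
DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"

def resolve_api_token(api_token, provider):
    # Assumed convention: "env:MY_VAR" -> read MY_VAR ("env:" is 4 chars,
    # matching the api_token[4:] slice in the hunk above).
    if api_token and api_token.startswith("env:"):
        return os.getenv(api_token[4:])
    # Otherwise the provider table, then the default env var.
    return PROVIDER_MODELS.get(provider, "no-token") or os.getenv(DEFAULT_PROVIDER_API_KEY)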
@@ -4,7 +4,8 @@ from dotenv import load_dotenv
 load_dotenv()  # Load environment variables from .env file

 # Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
-DEFAULT_PROVIDER = "openai/gpt-4o-mini"
+DEFAULT_PROVIDER = "openai/gpt-4o"
+DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
 MODEL_REPO_BRANCH = "new-release-0.0.2"
 # Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
 PROVIDER_MODELS = {
@@ -1,6 +1,6 @@
 from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
 from crawl4ai.hub import BaseCrawler
-from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from pathlib import Path
 import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
         home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
         os.makedirs(f"{home_dir}/schema", exist_ok=True)

-        cleaned_html = optimize_html(html, threshold=100)
+        # cleaned_html = optimize_html(html, threshold=100)
+        cleaned_html = preprocess_html_for_schema(html)

         organic_schema = None
         if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
@@ -34,7 +34,7 @@ from .model_loader import (
     calculate_batch_size
 )

-from .types import LLMConfig
+from .types import LLMConfig, create_llm_config

 from functools import partial
 import numpy as np
@@ -757,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
 #######################################################
 # New extraction strategies for JSON-based extraction #
 #######################################################
-
-
 class JsonElementExtractionStrategy(ExtractionStrategy):
     """
     Abstract base class for extracting structured JSON from HTML content.
@@ -1049,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
         schema_type: str = "CSS",  # or XPATH
         query: str = None,
         target_json_example: str = None,
-        llm_config: 'LLMConfig' = None,
+        llm_config: 'LLMConfig' = create_llm_config(),
         provider: str = None,
         api_token: str = None,
         **kwargs
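One subtlety in the new default: Python evaluates a default-argument expression once, when the def statement runs, so create_llm_config() here yields a single shared instance rather than a fresh config per call. A small self-contained illustration (stub factory and function names are hypothetical):

def create_llm_config(**kwargs):   # stub for illustration only
    return dict(kwargs)

def generate_shared(llm_config=create_llm_config()):   # evaluated once, at def time
    return llm_config

def generate_fresh(llm_config=None):                   # evaluated per call
    if llm_config is None:
        llm_config = create_llm_config()
    return llm_config

assert generate_shared() is generate_shared()       # same object every call
assert generate_fresh() is not generate_fresh()     # new object every call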
@@ -1140,7 +1138,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
         except Exception as e:
             raise Exception(f"Failed to generate schema: {str(e)}")

-
 class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
     """
     Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
@@ -179,3 +179,9 @@ if TYPE_CHECKING:
     DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
     DeepCrawlDecorator as DeepCrawlDecoratorType,
 )
+
+
+
+def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
+    from .async_configs import LLMConfig
+    return LLMConfig(*args, **kwargs)
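The helper imports LLMConfig inside the function body, so the dependency is resolved at call time rather than module-load time, presumably to avoid a circular import between types.py and async_configs.py. The same deferred-import factory pattern in a runnable, self-contained form (using a stdlib module as the stand-in dependency):

def make_decimal(*args, **kwargs):
    # The import runs on first call, not when this module loads, which is
    # what breaks an import cycle between two mutually-importing modules.
    from decimal import Decimal
    return Decimal(*args, **kwargs)

print(make_decimal("1.5"))   # Decimal('1.5')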
@@ -26,7 +26,7 @@ import cProfile
 import pstats
 from functools import wraps
 import asyncio
+from lxml import etree, html as lhtml
 import sqlite3
 import hashlib

@@ -2617,3 +2617,116 @@ class HeadPeekr:
     def get_title(head_content: str):
         title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
         return title_match.group(1) if title_match else None
+
+
+def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
+    """
+    Preprocess HTML to reduce size while preserving structure for schema generation.
+
+    Args:
+        html_content (str): Raw HTML content
+        text_threshold (int): Maximum length for text nodes before truncation
+        attr_value_threshold (int): Maximum length for attribute values before truncation
+        max_size (int): Target maximum size for output HTML
+
+    Returns:
+        str: Preprocessed HTML content
+    """
+    try:
+        # Parse HTML with error recovery
+        parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
+        tree = lhtml.fromstring(html_content, parser=parser)
+
+        # 1. Remove HEAD section (keep only BODY)
+        head_elements = tree.xpath('//head')
+        for head in head_elements:
+            if head.getparent() is not None:
+                head.getparent().remove(head)
+
+        # 2. Define tags to remove completely
+        tags_to_remove = [
+            'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
+            'video', 'audio', 'source', 'track', 'map', 'area'
+        ]
+
+        # Remove unwanted elements
+        for tag in tags_to_remove:
+            elements = tree.xpath(f'//{tag}')
+            for element in elements:
+                if element.getparent() is not None:
+                    element.getparent().remove(element)
+
+        # 3. Process remaining elements to clean attributes and truncate text
+        for element in tree.iter():
+            # Skip if we're at the root level
+            if element.getparent() is None:
+                continue
+
+            # Clean non-essential attributes but preserve structural ones.
+            # attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
+            # This is more aggressive than the previous version:
+            attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
+
+            # attributes_hates_truncate = ['id', 'class', "data-"]
+            # Empty list means: truncate any attribute that is too long; if a
+            # selector needs the full value, find a better CSS selector for the schema.
+            attributes_hates_truncate = []
+
+            # Process each attribute
+            for attrib in list(element.attrib.keys()):
+                # Keep if it's essential or starts with data-
+                if not (attrib in attribs_to_keep or attrib.startswith('data-')):
+                    element.attrib.pop(attrib)
+                # Truncate long attribute values except for selectors
+                elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
+                    element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'
+
+            # Truncate text content if it's too long
+            if element.text and len(element.text.strip()) > text_threshold:
+                element.text = element.text.strip()[:text_threshold] + '...'
+
+            # Also truncate tail text if present
+            if element.tail and len(element.tail.strip()) > text_threshold:
+                element.tail = element.tail.strip()[:text_threshold] + '...'
+
+        # 4. Find repeated patterns and keep only a few examples.
+        # This is a simplistic approach - more sophisticated pattern detection could be implemented.
+        pattern_elements = {}
+        for element in tree.xpath('//*[contains(@class, "")]'):
+            parent = element.getparent()
+            if parent is None:
+                continue
+
+            # Create a signature based on tag and classes
+            classes = element.get('class', '')
+            if not classes:
+                continue
+            signature = f"{element.tag}.{classes}"
+
+            if signature in pattern_elements:
+                pattern_elements[signature].append(element)
+            else:
+                pattern_elements[signature] = [element]
+
+        # Keep only 3 examples of each repeating pattern
+        for signature, elements in pattern_elements.items():
+            if len(elements) > 3:
+                # Keep the first 2 and the last element
+                for element in elements[2:-1]:
+                    if element.getparent() is not None:
+                        element.getparent().remove(element)
+
+        # 5. Convert back to string
+        result = etree.tostring(tree, encoding='unicode', method='html')
+
+        # If still over the size limit, apply more aggressive truncation
+        if len(result) > max_size:
+            return result[:max_size] + "..."
+
+        return result
+
+    except Exception as e:
+        # Fallback for parsing errors
+        return html_content[:max_size] if len(html_content) > max_size else html_content
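A quick end-to-end check of the function above (run in the same module, or after importing it from crawl4ai.utils; the HTML is invented): five identical div.job siblings collapse to three, style is stripped while data-id survives, and the <head> disappears entirely.

sample = """
<html>
  <head><script>track()</script><title>Jobs</title></head>
  <body>
    <div class="job" data-id="1" style="color:red">Engineer</div>
    <div class="job" data-id="2" style="color:red">Designer</div>
    <div class="job" data-id="3" style="color:red">Writer</div>
    <div class="job" data-id="4" style="color:red">Analyst</div>
    <div class="job" data-id="5" style="color:red">Manager</div>
  </body>
</html>
"""

out = preprocess_html_for_schema(sample)
print(out)   # expect 3 of the 5 div.job elements, no <head>, no style attributes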
@@ -42,7 +42,9 @@ dependencies = [
     "pyperclip>=1.8.2",
     "faust-cchardet>=2.1.19",
     "aiohttp>=3.11.11",
-    "humanize>=4.10.0"
+    "humanize>=4.10.0",
+    "zstandard>=0.23.0",
+    "msgpack>=1.1.0"
 ]
 classifiers = [
     "Development Status :: 4 - Beta",
@@ -10,6 +10,7 @@ import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
+from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
 import json

 # Test HTML - A complex job board with companies, departments, and positions