feat(schema): improve HTML preprocessing for schema generation

Add a new preprocess_html_for_schema utility function to better handle HTML cleaning for schema generation. It replaces the previous optimize_html call in GoogleSearchCrawler and includes smarter attribute handling and pattern detection.

Other changes:
- Update default provider to gpt-4o
- Add DEFAULT_PROVIDER_API_KEY constant
- Make LLMConfig creation more flexible with create_llm_config helper
- Add new dependencies: zstandard and msgpack

This change improves schema generation reliability while reducing noise in the processed HTML.
2025-03-12 22:40:46 +08:00
parent 1630fbdafe
commit dc36997a08
8 changed files with 134 additions and 12 deletions
--- a/crawl4ai/crawlers/google_search/crawler.py
+++ b/crawl4ai/crawlers/google_search/crawler.py
@@ -1,6 +1,6 @@
 from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
 from crawl4ai.hub import BaseCrawler
-from crawl4ai.utils import optimize_html, get_home_folder
+from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from pathlib import Path
 import json
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
        home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
        os.makedirs(f"{home_dir}/schema", exist_ok=True)

-        cleaned_html = optimize_html(html, threshold=100)
+        # cleaned_html = optimize_html(html, threshold=100)
+        cleaned_html = preprocess_html_for_schema(html) 

        organic_schema = None
        if os.path.exists(f"{home_dir}/schema/organic_schema.json"):