From 79ebfce9131da1445189123c0d64a28e441b20ae Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 24 Jan 2026 04:19:50 +0000 Subject: [PATCH] Refactor HTML block delimiter to use config constant --- crawl4ai/config.py | 3 +++ crawl4ai/extraction_strategy.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 08f56b83..c868a99d 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -102,6 +102,9 @@ SCREENSHOT_HEIGHT_TRESHOLD = 10000 PAGE_TIMEOUT = 60000 DOWNLOAD_PAGE_TIMEOUT = 60000 +# Header template placed before each HTML example when several are concatenated for schema generation; fill it with .format(index=...) +HTML_EXAMPLE_DELIMITER = "=== HTML EXAMPLE {index} ===" + # Global user settings with descriptions and default values USER_SETTINGS = { "DEFAULT_LLM_PROVIDER": { diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index b3b51568..e2392e90 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -13,6 +13,7 @@ from .config import ( CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, WORD_TOKEN_RATE, + HTML_EXAMPLE_DELIMITER, ) from .utils import * # noqa: F403 @@ -1488,7 +1489,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa attr_value_threshold=500, max_size=500_000 ) - html_parts.append(f"'''html example {i}\n{cleaned}\n'''") + header = HTML_EXAMPLE_DELIMITER.format(index=i) + html_parts.append(f"{header}\n{cleaned}") html = "\n\n".join(html_parts) # Preprocess HTML for schema generation (skip if already preprocessed from multiple URLs)