From 413595542ab5440867cf202456a5ae87762b9d0d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 17 Jun 2024 15:14:34 +0800 Subject: [PATCH] Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #24 --- CHANGELOG.md | 5 +++++ README.md | 5 ++++- crawl4ai/utils.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 793353b7..ea932dfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.2.5] - 2024-06-17 +### Added +- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLMs. + + ## [0.2.4] - 2024-06-17 ### Fixed - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs \ No newline at end of file diff --git a/README.md b/README.md index ad070606..a6856e2e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Recent Changes -### v0.2.34 +### v0.2.5 +- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +

+### v0.2.4 - 🐞 Resolve the issue with the long url. 
(Issue #22) ### v0.2.3 diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index f201ba0b..f5ebd256 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text): super().handle_tag(tag, attrs, start) +def replace_inline_tags(soup, tags): + tag_replacements = { + 'b': lambda tag: f"**{tag.text}**", + 'i': lambda tag: f"*{tag.text}*", + 'u': lambda tag: f"__{tag.text}__", + 'span': lambda tag: f"{tag.text}", + 'del': lambda tag: f"~~{tag.text}~~", + 'ins': lambda tag: f"++{tag.text}++", + 'sub': lambda tag: f"~{tag.text}~", + 'sup': lambda tag: f"^^{tag.text}^^", + 'strong': lambda tag: f"**{tag.text}**", + 'em': lambda tag: f"*{tag.text}*", + 'code': lambda tag: f"`{tag.text}`", + 'kbd': lambda tag: f"`{tag.text}`", + 'var': lambda tag: f"_{tag.text}_", + 's': lambda tag: f"~~{tag.text}~~", + 'q': lambda tag: f'"{tag.text}"', + 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})", + 'cite': lambda tag: f"_{tag.text}_", + 'dfn': lambda tag: f"_{tag.text}_", + 'time': lambda tag: f"{tag.text}", + 'small': lambda tag: f"{tag.text}", + 'mark': lambda tag: f"=={tag.text}==" + } + + for tag_name in tags: + for tag in soup.find_all(tag_name): + replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag) + tag.replace_with(replacement_text) + + return soup + def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None): try: if not html: @@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, # Replace all "pre" tags with their inner text body = replace_pre_tags_with_text(body) + + # Replace inline tags with their text content + body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']) # Recursively remove empty elements, their parent elements, and elements with word count below threshold def 
remove_empty_and_low_word_count_elements(node, word_count_threshold):