From 413595542ab5440867cf202456a5ae87762b9d0d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 17 Jun 2024 15:14:34 +0800 Subject: [PATCH] Enhancement: Replaced inline HTML tags with textual format for better LLM context handling #24 --- CHANGELOG.md | 5 +++++ README.md | 5 ++++- crawl4ai/utils.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 793353b7..ea932dfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.2.5] - 2024-06-17 +### Added +- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLMs. + + ## [0.2.4] - 2024-06-17 ### Fixed - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs \ No newline at end of file diff --git a/README.md b/README.md index ad070606..a6856e2e 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information ## Recent Changes -### v0.2.34 +### v0.2.5 +- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +

+### v0.2.4 - 🐞 Resolve the issue with the long url. 
(Issue #22) ### v0.2.3 diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index f201ba0b..f5ebd256 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text): super().handle_tag(tag, attrs, start) +def replace_inline_tags(soup, tags): + tag_replacements = { + 'b': lambda tag: f"**{tag.text}**", + 'i': lambda tag: f"*{tag.text}*", + 'u': lambda tag: f"__{tag.text}__", + 'span': lambda tag: f"{tag.text}", + 'del': lambda tag: f"~~{tag.text}~~", + 'ins': lambda tag: f"++{tag.text}++", + 'sub': lambda tag: f"~{tag.text}~", + 'sup': lambda tag: f"^^{tag.text}^^", + 'strong': lambda tag: f"**{tag.text}**", + 'em': lambda tag: f"*{tag.text}*", + 'code': lambda tag: f"`{tag.text}`", + 'kbd': lambda tag: f"`{tag.text}`", + 'var': lambda tag: f"_{tag.text}_", + 's': lambda tag: f"~~{tag.text}~~", + 'q': lambda tag: f'"{tag.text}"', + 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})", + 'cite': lambda tag: f"_{tag.text}_", + 'dfn': lambda tag: f"_{tag.text}_", + 'time': lambda tag: f"{tag.text}", + 'small': lambda tag: f"{tag.text}", + 'mark': lambda tag: f"=={tag.text}==" + } + + for tag_name in tags: + for tag in soup.find_all(tag_name): + replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag) + tag.replace_with(replacement_text) + + return soup + def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None): try: if not html: @@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, # Replace all "pre" tags with their inner text body = replace_pre_tags_with_text(body) + + # Replace inline tags with their text content + body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']) # Recursively remove empty elements, their parent elements, and elements with word count below threshold def 
remove_empty_and_low_word_count_elements(node, word_count_threshold):