Compare commits

...

2 Commits

5 changed files with 46 additions and 3 deletions

View File

@@ -1,5 +1,10 @@
# Changelog
## [0.2.5] - 2024-06-17
### Added
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs

View File

@@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Recent Changes
### v0.2.34
### v0.2.5
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
### v0.2.4
- 🐞 Resolved the issue with long URLs. (Issue #22)
### v0.2.3

View File

@@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start)
def replace_inline_tags(soup, tags):
    """Replace inline HTML tags with Markdown-style textual equivalents.

    Preserves the semantic meaning of inline formatting (bold, emphasis,
    deletions, abbreviations, ...) as plain text so it survives
    HTML-to-markdown conversion and stays readable for an LLM.

    Args:
        soup: A BeautifulSoup node; its descendants are searched in place.
        tags: Iterable of tag names (e.g. ['b', 'abbr']) to replace.

    Returns:
        The same ``soup`` object, mutated in place.
    """
    def _abbr(tag):
        # Only append the expansion when a title is actually present;
        # previously a missing title produced a dangling "text ()".
        title = tag.get('title', '')
        return f"{tag.text} ({title})" if title else tag.text

    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        'abbr': _abbr,
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"<small>{tag.text}</small>",
        'mark': lambda tag: f"=={tag.text}=="
    }
    for tag_name in tags:
        # Hoist the replacer lookup out of the inner loop; it is invariant
        # per tag name. Unknown tag names fall back to the plain text.
        replacer = tag_replacements.get(tag_name, lambda t: t.text)
        for tag in soup.find_all(tag_name):
            tag.replace_with(replacer(tag))
    return soup
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
@@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
# Replace all "pre" tags with their inner text
body = replace_pre_tags_with_text(body)
# Replace inline tags with their text content
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
def remove_empty_and_low_word_count_elements(node, word_count_threshold):

View File

@@ -25,7 +25,7 @@
<header class="bg-zinc-950 text-lime-500 py-4 flex">
<div class="mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
</div>
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Processed</span>

View File

@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
setup(
name="Crawl4AI",
version="0.2.4",
version="0.2.5",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",