Merge branch 'format-inline-tags'

2024-06-19 00:48:38 +08:00
parent 21e2538e57 2f246d19f4
commit 3f0e265baf
5 changed files with 56 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
  - before_return_html: Called when the data is parsed and ready.
  - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
 - Added an example in `quickstart.py` in the example folder under the docs.
+- Enhancement (issue #24): Inline HTML tags (e.g., DEL, INS, SUB, ABBR) are now replaced with a textual format so that their context is preserved for LLMs.
+

 ## [0.2.4] - 2024-06-17
 ### Fixed
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
  - 🟠 before_return_html: Called when the data is parsed and ready.
  - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
 - 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs.
+- ✨ Preserves the semantic context of inline tags (e.g., ABBR, DEL, INS) for improved LLM-friendliness.

 ### v0.2.4
 - 🐞 Resolve the issue with the long url. (Issue #22)
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -151,7 +151,42 @@ class CustomHTML2Text(HTML2Text):

        super().handle_tag(tag, attrs, start)

-def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
+def replace_inline_tags(soup, tags, only_text=False):
+    tag_replacements = {
+        'b': lambda tag: f"**{tag.text}**",
+        'i': lambda tag: f"*{tag.text}*",
+        'u': lambda tag: f"__{tag.text}__",
+        'span': lambda tag: f"{tag.text}",
+        'del': lambda tag: f"~~{tag.text}~~",
+        'ins': lambda tag: f"++{tag.text}++",
+        'sub': lambda tag: f"~{tag.text}~",
+        'sup': lambda tag: f"^^{tag.text}^^",
+        'strong': lambda tag: f"**{tag.text}**",
+        'em': lambda tag: f"*{tag.text}*",
+        'code': lambda tag: f"`{tag.text}`",
+        'kbd': lambda tag: f"`{tag.text}`",
+        'var': lambda tag: f"_{tag.text}_",
+        's': lambda tag: f"~~{tag.text}~~",
+        'q': lambda tag: f'"{tag.text}"',
+        'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
+        'cite': lambda tag: f"_{tag.text}_",
+        'dfn': lambda tag: f"_{tag.text}_",
+        'time': lambda tag: f"{tag.text}",
+        'small': lambda tag: f"<small>{tag.text}</small>",
+        'mark': lambda tag: f"=={tag.text}=="
+    }
+
+    for tag_name in tags:
+        for tag in soup.find_all(tag_name):
+            if not only_text:
+                replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
+                tag.replace_with(replacement_text)
+            else:
+                tag.replace_with(tag.text)
+
+    return soup
+
+def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
    try:
        if not html:
            return None
@@ -250,6 +285,13 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
        # Replace all "pre" tags with their inner text
        body = replace_pre_tags_with_text(body)
        
+        # Replace inline tags with a Markdown-like textual form (or with their plain text when only_text is set)
+        body = replace_inline_tags(
+            body, 
+            ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'],
+            only_text=kwargs.get('only_text', False)
+        )
+
        # Recursively remove empty elements, their parent elements, and elements with word count below threshold
        def remove_empty_and_low_word_count_elements(node, word_count_threshold):
            for child in node.contents:
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -176,7 +176,7 @@ class WebCrawler:
            t = time.time()
            # Extract content from HTML
            try:
-                result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
+                result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
                metadata = extract_metadata(html)
                if result is None:
                    raise ValueError(f"Failed to extract content from the website: {url}")
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -35,7 +35,13 @@ def cprint(message, press_any_key=False):

 def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
-    result = crawler.run(url="https://www.nbcnews.com/business")
+    result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
+    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
+    print_result(result)
+
+def basic_usage_some_params(crawler):
+    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
+    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

@@ -260,7 +266,9 @@ def main():

    crawler = create_crawler()

+    crawler.always_by_pass_cache = True
    basic_usage(crawler)
+    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)
    
    crawler.always_by_pass_cache = True