diff --git a/CHANGELOG.md b/CHANGELOG.md index eb854b1d..df96beab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ - before_return_html: Called when the data is parsed and ready. - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. - Added an example in `quickstart.py` in the example folder under the docs. +- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM. + ## [0.2.4] - 2024-06-17 ### Fixed diff --git a/README.md b/README.md index b0b12510..b951d468 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information - 🟠 before_return_html: Called when the data is parsed and ready. - 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. - 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs. +- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. ### v0.2.4 - 🐞 Resolve the issue with the long url. 
def replace_inline_tags(soup, tags, only_text=False):
    """Replace inline HTML tags with a textual (Markdown-like) equivalent.

    Preserves the semantic meaning of inline markup (emphasis, deletions,
    insertions, abbreviations, ...) as plain text so downstream LLM
    consumers keep context that would otherwise be lost when tags are
    simply stripped.

    Args:
        soup: BeautifulSoup element whose matching descendants are
            rewritten in place.
        tags: Iterable of tag names (e.g. ['b', 'em', 'abbr']) to replace.
        only_text: When True, every matched tag is replaced by its bare
            text content, bypassing the textual-formatting table.

    Returns:
        The same soup object, mutated in place.
    """
    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        # Fall back to bare text when <abbr> carries no title, avoiding a
        # dangling empty "()" suffix in the output.
        'abbr': lambda tag: f"{tag.text} ({tag['title']})" if tag.get('title') else tag.text,
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"{tag.text}",
        'mark': lambda tag: f"=={tag.text}=="
    }

    for tag_name in tags:
        # The replacement rule depends only on the tag *name*, so resolve it
        # once per name instead of once per matched element.
        make_replacement = tag_replacements.get(tag_name, lambda t: t.text)
        for tag in soup.find_all(tag_name):
            if only_text:
                tag.replace_with(tag.text)
            else:
                tag.replace_with(make_replacement(tag))

    return soup
def basic_usage(crawler):
    """Demonstrate the simplest crawl: just a URL, text-only extraction."""
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def basic_usage_some_params(crawler):
    """Demonstrate a basic crawl while tuning extraction parameters.

    Passes word_count_threshold=1 so even very short text nodes are kept,
    alongside only_text=True for plain-text inline-tag handling.
    """
    # Banner reflects what this demo actually shows (it previously reused
    # basic_usage's message verbatim, which was misleading in the log).
    cprint("🛠️ [bold cyan]Basic Usage with parameters: a URL plus word_count_threshold and only_text![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)
understanding_parameters(crawler) crawler.always_by_pass_cache = True