Compare commits

...

2 Commits

5 changed files with 46 additions and 3 deletions

View File

@@ -1,5 +1,10 @@
# Changelog
## [0.2.5] - 2024-06-17
### Added
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
## [0.2.4] - 2024-06-17
### Fixed
- Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs

View File

@@ -13,7 +13,10 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
## Recent Changes
### v0.2.34
### v0.2.5
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
### v0.2.4
- 🐞 Resolved the issue with long URLs. (Issue #22)
### v0.2.3

View File

@@ -151,6 +151,38 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start)
def replace_inline_tags(soup, tags):
    """Replace inline HTML tags with Markdown-style textual equivalents.

    Preserves the semantic meaning of inline formatting (bold, emphasis,
    deletions, abbreviations, ...) as plain text so it survives
    HTML-to-markdown conversion and stays readable for an LLM.

    Args:
        soup: A BeautifulSoup node; its descendants are searched in place.
        tags: Iterable of tag names (e.g. ['b', 'abbr']) to replace.

    Returns:
        The same ``soup`` object, mutated in place.
    """
    def _abbr(tag):
        # Only append the expansion when a title is actually present;
        # previously a missing title produced a dangling "text ()".
        title = tag.get('title', '')
        return f"{tag.text} ({title})" if title else tag.text

    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
        'u': lambda tag: f"__{tag.text}__",
        'span': lambda tag: f"{tag.text}",
        'del': lambda tag: f"~~{tag.text}~~",
        'ins': lambda tag: f"++{tag.text}++",
        'sub': lambda tag: f"~{tag.text}~",
        'sup': lambda tag: f"^^{tag.text}^^",
        'strong': lambda tag: f"**{tag.text}**",
        'em': lambda tag: f"*{tag.text}*",
        'code': lambda tag: f"`{tag.text}`",
        'kbd': lambda tag: f"`{tag.text}`",
        'var': lambda tag: f"_{tag.text}_",
        's': lambda tag: f"~~{tag.text}~~",
        'q': lambda tag: f'"{tag.text}"',
        'abbr': _abbr,
        'cite': lambda tag: f"_{tag.text}_",
        'dfn': lambda tag: f"_{tag.text}_",
        'time': lambda tag: f"{tag.text}",
        'small': lambda tag: f"<small>{tag.text}</small>",
        'mark': lambda tag: f"=={tag.text}=="
    }
    for tag_name in tags:
        # Hoist the replacer lookup out of the inner loop; it is invariant
        # per tag name. Unknown tag names fall back to the plain text.
        replacer = tag_replacements.get(tag_name, lambda t: t.text)
        for tag in soup.find_all(tag_name):
            tag.replace_with(replacer(tag))
    return soup
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
@@ -249,6 +281,9 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
# Replace all "pre" tags with their inner text
body = replace_pre_tags_with_text(body)
# Replace inline tags with their text content
body = replace_inline_tags(body, ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'])
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
def remove_empty_and_low_word_count_elements(node, word_count_threshold):

View File

@@ -25,7 +25,7 @@
<header class="bg-zinc-950 text-lime-500 py-4 flex">
<div class="mx-auto px-4">
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.4</h1>
<h1 class="text-2xl font-bold">🔥🕷️ Crawl4AI: Web Data for your Thoughts v0.2.5</h1>
</div>
<div class="mx-auto px-4 flex font-bold text-xl gap-2">
<span>📊 Total Website Processed</span>

View File

@@ -26,7 +26,7 @@ class CustomInstallCommand(install):
setup(
name="Crawl4AI",
version="0.2.4",
version="0.2.5",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",