Merge branch 'format-inline-tags'
This commit is contained in:
@@ -9,6 +9,8 @@
|
||||
- before_return_html: Called when the data is parsed and ready.
|
||||
- on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
|
||||
- Added an example in `quickstart.py` in the example folder under the docs.
|
||||
- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM.
|
||||
|
||||
|
||||
## [0.2.4] - 2024-06-17
|
||||
### Fixed
|
||||
|
||||
@@ -21,6 +21,7 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
|
||||
- 🟠 before_return_html: Called when the data is parsed and ready.
|
||||
- 🟡 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize.
|
||||
- 📄 Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs.
|
||||
- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness.
|
||||
|
||||
### v0.2.4
|
||||
- 🐞 Resolve the issue with the long url. (Issue #22)
|
||||
|
||||
@@ -151,7 +151,42 @@ class CustomHTML2Text(HTML2Text):
|
||||
|
||||
super().handle_tag(tag, attrs, start)
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
|
||||
def replace_inline_tags(soup, tags, only_text=False):
|
||||
tag_replacements = {
|
||||
'b': lambda tag: f"**{tag.text}**",
|
||||
'i': lambda tag: f"*{tag.text}*",
|
||||
'u': lambda tag: f"__{tag.text}__",
|
||||
'span': lambda tag: f"{tag.text}",
|
||||
'del': lambda tag: f"~~{tag.text}~~",
|
||||
'ins': lambda tag: f"++{tag.text}++",
|
||||
'sub': lambda tag: f"~{tag.text}~",
|
||||
'sup': lambda tag: f"^^{tag.text}^^",
|
||||
'strong': lambda tag: f"**{tag.text}**",
|
||||
'em': lambda tag: f"*{tag.text}*",
|
||||
'code': lambda tag: f"`{tag.text}`",
|
||||
'kbd': lambda tag: f"`{tag.text}`",
|
||||
'var': lambda tag: f"_{tag.text}_",
|
||||
's': lambda tag: f"~~{tag.text}~~",
|
||||
'q': lambda tag: f'"{tag.text}"',
|
||||
'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})",
|
||||
'cite': lambda tag: f"_{tag.text}_",
|
||||
'dfn': lambda tag: f"_{tag.text}_",
|
||||
'time': lambda tag: f"{tag.text}",
|
||||
'small': lambda tag: f"<small>{tag.text}</small>",
|
||||
'mark': lambda tag: f"=={tag.text}=="
|
||||
}
|
||||
|
||||
for tag_name in tags:
|
||||
for tag in soup.find_all(tag_name):
|
||||
if not only_text:
|
||||
replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag)
|
||||
tag.replace_with(replacement_text)
|
||||
else:
|
||||
tag.replace_with(tag.text)
|
||||
|
||||
return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
|
||||
try:
|
||||
if not html:
|
||||
return None
|
||||
@@ -250,6 +285,13 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD,
|
||||
# Replace all "pre" tags with their inner text
|
||||
body = replace_pre_tags_with_text(body)
|
||||
|
||||
# Replace inline tags with their text content
|
||||
body = replace_inline_tags(
|
||||
body,
|
||||
['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'],
|
||||
only_text=kwargs.get('only_text', False)
|
||||
)
|
||||
|
||||
# Recursively remove empty elements, their parent elements, and elements with word count below threshold
|
||||
def remove_empty_and_low_word_count_elements(node, word_count_threshold):
|
||||
for child in node.contents:
|
||||
|
||||
@@ -176,7 +176,7 @@ class WebCrawler:
|
||||
t = time.time()
|
||||
# Extract content from HTML
|
||||
try:
|
||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
|
||||
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||
metadata = extract_metadata(html)
|
||||
if result is None:
|
||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||
|
||||
@@ -35,7 +35,13 @@ def cprint(message, press_any_key=False):
|
||||
|
||||
def basic_usage(crawler):
|
||||
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", only_text = True)
|
||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
def basic_usage_some_params(crawler):
|
||||
cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
|
||||
result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True)
|
||||
cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
|
||||
print_result(result)
|
||||
|
||||
@@ -260,7 +266,9 @@ def main():
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
basic_usage(crawler)
|
||||
# basic_usage_some_params(crawler)
|
||||
understanding_parameters(crawler)
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
|
||||
Reference in New Issue
Block a user