- Test all methods

- Update index.html
- Update Readme
- Resolve some bugs
This commit is contained in:
unclecode
2024-05-14 21:27:41 +08:00
parent 5fea6c064b
commit f6e59157bf
17 changed files with 1004 additions and 402 deletions

View File

@@ -10,6 +10,8 @@ from html2text import HTML2Text
from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import *
class InvalidCSSSelectorError(Exception):
    """Signal that a CSS selector could not be used to extract page content.

    Raised with a message naming the offending selector, for example when
    the selector matches no elements in the parsed document.
    """
def beautify_html(escaped_html):
"""
@@ -140,13 +142,25 @@ class CustomHTML2Text(HTML2Text):
super().handle_tag(tag, attrs, start)
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None):
try:
if not html:
return None
# Parse HTML content with BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
# Get the content within the <body> tag
body = soup.body
# If css_selector is provided, extract content based on the selector
if css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
div_tag = soup.new_tag('div')
for el in selected_elements:
div_tag.append(el)
body = div_tag
# Remove script, style, and other tags that don't carry useful content from body
for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
@@ -255,7 +269,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
# Remove comments
for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
comment.extract()
# Remove consecutive empty newlines and replace multiple spaces with a single space
@@ -281,7 +295,7 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD):
except Exception as e:
print('Error processing HTML content:', str(e))
return None
raise InvalidCSSSelectorError(f"Invalid CSS selector: {css_selector}") from e
def extract_xml_tags(string):
tags = re.findall(r'<(\w+)>', string)