refactor: Update image description minimum word threshold in get_content_of_website_optimized

This commit is contained in:
unclecode
2024-08-02 14:53:11 +08:00
parent 8ae6c43ca4
commit 9ee988753d
2 changed files with 4 additions and 1 deletions

View File

@@ -27,6 +27,7 @@ WORD_TOKEN_RATE = 1.3
# Minimum number of words an HTML tag's text must contain for the tag to be considered
MIN_WORD_THRESHOLD = 1
# Minimum number of words a tag's text must contain to be returned as an image description;
# default for the `image_description_min_word_threshold` kwarg of get_content_of_website_optimized
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
# Threshold for the Image extraction - Range is 1 to 6
# Images are scored using a point-based system so they can be filtered by usefulness. Points are assigned

View File

@@ -438,6 +438,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
soup = BeautifulSoup(html, 'html.parser')
body = soup.body
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
if css_selector:
selected_elements = body.select(css_selector)
@@ -530,7 +532,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
if current_tag:
text_content = current_tag.get_text(separator=' ',strip=True)
# Check if the text content has at least image_description_min_word_threshold words
if len(text_content.split()) >= word_count_threshold:
if len(text_content.split()) >= image_description_min_word_threshold:
return text_content
return None