refactor: Update image description minimum word threshold in get_content_of_website_optimized
This commit is contained in:
@@ -27,6 +27,7 @@ WORD_TOKEN_RATE = 1.3
|
|||||||
|
|
||||||
# Threshold for the minimum number of words in an HTML tag to be considered
|
# Threshold for the minimum number of words in an HTML tag to be considered
|
||||||
MIN_WORD_THRESHOLD = 1
|
MIN_WORD_THRESHOLD = 1
|
||||||
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
|
||||||
|
|
||||||
# Threshold for the Image extraction - Range is 1 to 6
|
# Threshold for the Image extraction - Range is 1 to 6
|
||||||
# Images are scored using a point-based system to filter them by usefulness. Points are assigned
|
# Images are scored using a point-based system to filter them by usefulness. Points are assigned
|
||||||
|
|||||||
@@ -439,6 +439,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
body = soup.body
|
body = soup.body
|
||||||
|
|
||||||
|
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||||
|
|
||||||
if css_selector:
|
if css_selector:
|
||||||
selected_elements = body.select(css_selector)
|
selected_elements = body.select(css_selector)
|
||||||
if not selected_elements:
|
if not selected_elements:
|
||||||
@@ -530,7 +532,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
if current_tag:
|
if current_tag:
|
||||||
text_content = current_tag.get_text(separator=' ',strip=True)
|
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||||
# Check if the text content has at least word_count_threshold
|
# Check if the text content has at least image_description_min_word_threshold words
|
||||||
if len(text_content.split()) >= word_count_threshold:
|
if len(text_content.split()) >= image_description_min_word_threshold:
|
||||||
return text_content
|
return text_content
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user