From 9ee988753d19e440c96cb14085ef8494ba36b8c8 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 2 Aug 2024 14:53:11 +0800 Subject: [PATCH] refactor: Update image description minimum word threshold in get_content_of_website_optimized --- crawl4ai/config.py | 1 + crawl4ai/utils.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 4d9edc69..00b1eb46 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -27,6 +27,7 @@ WORD_TOKEN_RATE = 1.3 # Threshold for the minimum number of word in a HTML tag to be considered MIN_WORD_THRESHOLD = 1 +IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1 # Threshold for the Image extraction - Range is 1 to 6 # Images are scored based on point based system, to filter based on usefulness. Points are assigned diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index e0d5d1ea..07832888 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -438,6 +438,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: soup = BeautifulSoup(html, 'html.parser') body = soup.body + + image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD) if css_selector: selected_elements = body.select(css_selector) @@ -530,7 +532,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: if current_tag: text_content = current_tag.get_text(separator=' ',strip=True) # Check if the text content has at least word_count_threshold - if len(text_content.split()) >= word_count_threshold: + if len(text_content.split()) >= image_description_min_word_threshold: return text_content return None