From 9ee988753d19e440c96cb14085ef8494ba36b8c8 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Fri, 2 Aug 2024 14:53:11 +0800
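Subject: [PATCH] refactor: Use a dedicated minimum word threshold for image
 descriptions in get_content_of_website_optimized

Add IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (default 1) to crawl4ai/config.py.
get_content_of_website_optimized now reads an optional
'image_description_min_word_threshold' keyword argument, falling back to
that constant, and uses it instead of word_count_threshold when checking
whether a tag's text content is long enough to be returned.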

---
 crawl4ai/config.py | 1 +
 crawl4ai/utils.py  | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/config.py b/crawl4ai/config.py
index 4d9edc69..00b1eb46 100644
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -27,6 +27,7 @@ WORD_TOKEN_RATE = 1.3
 
 # Threshold for the minimum number of word in a HTML tag to be considered 
 MIN_WORD_THRESHOLD = 1
+IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1
 
 # Threshold for the Image extraction - Range is 1 to 6
 # Images are scored based on point based system, to filter based on usefulness. Points are assigned
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index e0d5d1ea..07832888 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -438,6 +438,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
 
     soup = BeautifulSoup(html, 'html.parser')
     body = soup.body
+    
+    image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
 
     if css_selector:
         selected_elements = body.select(css_selector)
@@ -530,7 +532,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                     if current_tag:
                         text_content = current_tag.get_text(separator=' ',strip=True)
                         # Check if the text content has at least word_count_threshold
-                        if len(text_content.split()) >= word_count_threshold:
+                        if len(text_content.split()) >= image_description_min_word_threshold:
                             return text_content
                 return None