refactor: Update image description minimum word threshold in get_content_of_website_optimized

2024-08-02 14:53:11 +08:00
parent 8ae6c43ca4
commit 9ee988753d
2 changed files with 4 additions and 1 deletions
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -27,6 +27,7 @@ WORD_TOKEN_RATE = 1.3

 # Threshold for the minimum number of words in an HTML tag for it to be considered
 MIN_WORD_THRESHOLD = 1
+IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD = 1

 # Threshold for the Image extraction - Range is 1 to 6
 # Images are scored based on point based system, to filter based on usefulness. Points are assigned
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -438,6 +438,8 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:

    soup = BeautifulSoup(html, 'html.parser')
    body = soup.body
+    
+    image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)

    if css_selector:
        selected_elements = body.select(css_selector)
@@ -530,7 +532,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                    if current_tag:
                        text_content = current_tag.get_text(separator=' ',strip=True)
                         # Check if the text content has at least image_description_min_word_threshold words
-                        if len(text_content.split()) >= word_count_threshold:
+                        if len(text_content.split()) >= image_description_min_word_threshold:
                            return text_content
                return None