Moved the image score threshold to config.py and changed the separator passed to tag.get_text in the find_closest_parent_with_useful_text function from a period ('. ') to a space (' ') to keep the extracted text more neutral.

2024-07-21 15:18:23 +05:30
parent e5ecf291f3
commit cf6c835e18
2 changed files with 12 additions and 2 deletions
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3

 # Threshold for the minimum number of word in a HTML tag to be considered 
 MIN_WORD_THRESHOLD = 1
+
+# Threshold for image extraction - range is 1 to 6; images scoring at or below it are discarded
+# Images are scored using a point-based system so they can be filtered by usefulness. One point
+# is assigned to an image for each of the following criteria it meets:
+# - either its height or its width exceeds 150px
+# - the image size is greater than 10Kb
+# - the alt attribute is set
+# - the image format is jpg, png or webp
+# - the image is in the first half of all the images extracted from the page
+IMAGE_SCORE_THRESHOLD = 2
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -526,7 +526,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                    current_tag = current_tag.parent
                    # Get the text content of the parent tag
                    if current_tag:
-                        text_content = current_tag.get_text(separator='. ',strip=True)
+                        text_content = current_tag.get_text(separator=' ',strip=True)
                        # Check if the text content has at least word_count_threshold
                        if len(text_content.split()) >= word_count_threshold:
                            return text_content
@@ -535,7 +535,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
            if not is_valid_image(img, img.parent, img.parent.get('class', [])):
                return None
            score = score_image_for_usefulness(img, url, index, total_images)
-            if score <= 2:
+            if score <= IMAGE_SCORE_THRESHOLD:
                return None
            return {
                'src': img.get('src', ''),