From cf6c835e18c3e6e579d11f21728eb9999cbe8bc4 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sun, 21 Jul 2024 15:18:23 +0530 Subject: [PATCH] moved score threshold to config.py & replaced the separator for tag.get_text in find_closest_parent_with_useful_text fn from period(.) to space( ) to keep the text more neutral. --- crawl4ai/config.py | 10 ++++++++++ crawl4ai/utils.py | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 77273b78..4d9edc69 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3 # Threshold for the minimum number of word in a HTML tag to be considered MIN_WORD_THRESHOLD = 1 + +# Threshold for the Image extraction - Range is 1 to 6 +# Images are scored based on point based system, to filter based on usefulness. Points are assigned +# to each image based on the following aspects. +# If either height or width exceeds 150px +# If image size is greater than 10Kb +# If alt property is set +# If image format is in jpg, png or webp +# If image is in the first half of the total images extracted from the page +IMAGE_SCORE_THRESHOLD = 2 diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 338cc77e..e50adb72 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -526,7 +526,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: current_tag = current_tag.parent # Get the text content of the parent tag if current_tag: - text_content = current_tag.get_text(separator='. ',strip=True) + text_content = current_tag.get_text(separator=' ',strip=True) # Check if the text content has at least word_count_threshold if len(text_content.split()) >= word_count_threshold: return text_content @@ -535,7 +535,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: if not is_valid_image(img, img.parent, img.parent.get('class', [])): return None score = score_image_for_usefulness(img, url, index, total_images) - if score <= 2: + if score <= IMAGE_SCORE_THRESHOLD: return None return { 'src': img.get('src', ''),