Compare commits

...

5 Commits

Author SHA1 Message Date
Aravind Karnam
cf6c835e18 moved score threshold to config.py & replaced the separator for tag.get_text in find_closest_parent_with_useful_text fn from period(.) to space( ) to keep the text more neutral. 2024-07-21 15:18:23 +05:30
Aravind Karnam
e5ecf291f3 Implemented filtering for images and grabbing the contextual text from nearest parent 2024-07-21 15:03:17 +05:30
Aravind Karnam
9d0cafcfa6 fixed import error in model_loader.py 2024-07-21 14:55:58 +05:30
unclecode
7715623430 chore: Fix typos and update .gitignore
These changes fix typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability. Additionally, the `.test_pads/` directory is removed from the `.gitignore` file to keep the repository clean and organized.
2024-07-19 17:42:39 +08:00
unclecode
f5a4e80e2c chore: Fix typo in chunking_strategy.py and crawler_strategy.py
The commit fixes a typo in the `chunking_strategy.py` file where `nl.toknize.TextTilingTokenizer()` was corrected to `nl.tokenize.TextTilingTokenizer()`. Additionally, in the `crawler_strategy.py` file, the commit converts the screenshot image to RGB mode before saving it as a JPEG. This ensures consistent image quality and compression.
2024-07-19 17:40:31 +08:00
9 changed files with 147 additions and 10 deletions

View File

@@ -1,5 +1,14 @@
# Changelog # Changelog
## [v0.2.75] - 2024-07-19
Minor improvements for a more maintainable codebase:
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
## [v0.2.74] - 2024-07-08 ## [v0.2.74] - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉 A slew of exciting updates to improve the crawler's stability and robustness! 🎉

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.74 🕷️🤖 # Crawl4AI v0.2.75 🕷️🤖
[![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
[![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)

View File

@@ -55,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
def __init__(self, num_keywords=3, **kwargs): def __init__(self, num_keywords=3, **kwargs):
import nltk as nl import nltk as nl
self.tokenizer = nl.toknize.TextTilingTokenizer() self.tokenizer = nl.tokenize.TextTilingTokenizer()
self.num_keywords = num_keywords self.num_keywords = num_keywords
def chunk(self, text: str) -> list: def chunk(self, text: str) -> list:

View File

@@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3
# Threshold for the minimum number of word in a HTML tag to be considered # Threshold for the minimum number of word in a HTML tag to be considered
MIN_WORD_THRESHOLD = 1 MIN_WORD_THRESHOLD = 1
# Threshold for image extraction. Image scores range from 0 to 6.
# Images are scored with a point-based system so they can be filtered by usefulness; an image is
# kept only if its score is strictly greater than this threshold. One point is awarded for each of:
#   - height exceeds 150px (or 30 when given in a relative unit such as %)
#   - width exceeds 150px (or 30 when given in a relative unit such as %)
#   - image file size is greater than 10 KB
#   - alt property is set
#   - image format is jpg, png or webp
#   - image is in the first half of the images extracted from the page
IMAGE_SCORE_THRESHOLD = 2

View File

@@ -292,15 +292,22 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
# Open the screenshot with PIL # Open the screenshot with PIL
image = Image.open(BytesIO(screenshot)) image = Image.open(BytesIO(screenshot))
# Convert image to RGB mode
rgb_image = image.convert('RGB')
# Convert to JPEG and compress # Convert to JPEG and compress
buffered = BytesIO() buffered = BytesIO()
image.save(buffered, format="JPEG", quality=85) rgb_image.save(buffered, format="JPEG", quality=85)
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8') img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
if self.verbose: if self.verbose:
print(f"[LOG] 📸 Screenshot taken and converted to base64") print(f"[LOG] 📸 Screenshot taken and converted to base64")
return img_base64 return img_base64
except Exception as e:
if self.verbose:
print(f"[ERROR] Failed to take screenshot: {str(e)}")
return ""
except Exception as e: except Exception as e:
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}") error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")

View File

@@ -3,7 +3,7 @@ from pathlib import Path
import subprocess, os import subprocess, os
import shutil import shutil
import tarfile import tarfile
from crawl4ai.config import MODEL_REPO_BRANCH from .model_loader import *
import argparse import argparse
import urllib.request import urllib.request
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

View File

@@ -11,6 +11,9 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
from .config import * from .config import *
from pathlib import Path from pathlib import Path
from typing import Dict, Any from typing import Dict, Any
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
class InvalidCSSSelectorError(Exception): class InvalidCSSSelectorError(Exception):
pass pass
@@ -447,6 +450,101 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
links = {'internal': [], 'external': []} links = {'internal': [], 'external': []}
media = {'images': [], 'videos': [], 'audios': []} media = {'images': [], 'videos': [], 'audios': []}
def process_image(img, url, index, total_images):
    """Filter and score a single ``<img>`` tag and build its media record.

    Args:
        img: Image tag as yielded by ``body.find_all('img')``.
        url: Page URL, used to resolve a relative ``src`` into a full URL.
        index: Zero-based position of this image among the page's images.
        total_images: Total number of images found on the page.

    Returns:
        A dict with the keys ``src``, ``alt``, ``desc``, ``score`` and
        ``type``, or ``None`` when the image is hidden/decorative or its
        score does not exceed ``IMAGE_SCORE_THRESHOLD``.
    """
    relative_units = ('%', 'vh', 'vw', 'vmin', 'vmax')
    useful_formats = ('jpg', 'jpeg', 'png', 'webp')
    head_timeout = 5  # seconds; keeps one slow host from stalling the crawl

    # Check that an image is displayed and not inside undesired html elements
    def is_valid_image(img, parent, parent_classes):
        style = img.get('style', '')
        src = img.get('src', '')
        classes_to_check = ['button', 'icon', 'logo']
        tags_to_check = ['button', 'input']
        # Drop all whitespace so "display: none" is caught like "display:none".
        compact_style = ''.join(style.split()).lower()
        if 'display:none' in compact_style or not src:
            return False
        candidates = [src, img.get('alt', ''), *parent_classes]
        if any(marker in text for text in candidates for marker in classes_to_check):
            return False
        return parent is None or parent.name not in tags_to_check

    # Score an image for its usefulness
    def score_image_for_usefulness(img, base_url, index, images_count):
        # Parse an image height/width attribute into (number, unit)
        def parse_dimension(dimension):
            if not dimension:
                return None, None
            match = re.match(r"\s*(\d+)(?:\.\d+)?\s*([a-zA-Z%]*)", str(dimension))
            if not match:
                return None, None
            # Default unit is 'px' if not specified
            return int(match.group(1)), (match.group(2).lower() or 'px')

        # Ask the server for the image's Content-Length without downloading it
        def fetch_image_file_size(img, base_url):
            # If src is a relative path construct the full URL; otherwise
            # urljoin leaves an absolute (e.g. CDN) URL untouched.
            img_url = urljoin(base_url, img.get('src'))
            try:
                response = requests.head(
                    img_url, timeout=head_timeout, allow_redirects=True
                )
            except requests.exceptions.RequestException:
                # Covers invalid schemas (e.g. data: URIs), timeouts and
                # connection errors; size is simply unknown in those cases.
                return None
            if response.status_code != 200:
                print(f"Failed to retrieve file size for {img_url}")
                return None
            return response.headers.get('Content-Length', None)

        try:
            image_size = int(fetch_image_file_size(img, base_url) or 0)
        except (TypeError, ValueError):
            # A malformed Content-Length header must not abort scoring.
            image_size = 0

        # Strip query string / fragment before taking the extension, and drop
        # the leading dot that os.path.splitext keeps.
        src_path = img.get('src', '').partition('?')[0].partition('#')[0]
        image_format = os.path.splitext(src_path)[1].lower().lstrip('.')

        score = 0
        dimensions = (
            parse_dimension(img.get('height')),
            parse_dimension(img.get('width')),
        )
        for value, unit in dimensions:
            if not value:
                continue
            if unit == 'px' and value > 150:
                score += 1
            elif unit in relative_units and value > 30:
                score += 1
        if image_size > 10000:
            score += 1
        # A missing alt attribute (None) must not earn the point.
        if img.get('alt'):
            score += 1
        if image_format in useful_formats:
            score += 1
        if images_count and index / images_count < 0.5:
            score += 1
        return score

    # Extract meaningful text for images from the closest parent
    def find_closest_parent_with_useful_text(tag):
        ancestor = tag.parent
        while ancestor is not None:
            text_content = ancestor.get_text(separator=' ', strip=True)
            # word_count_threshold comes from the enclosing function's scope.
            if len(text_content.split()) >= word_count_threshold:
                return text_content
            ancestor = ancestor.parent
        return None

    parent = img.parent
    parent_classes = parent.get('class', []) if parent is not None else []
    if not is_valid_image(img, parent, parent_classes):
        return None
    score = score_image_for_usefulness(img, url, index, total_images)
    if score <= IMAGE_SCORE_THRESHOLD:
        return None
    return {
        'src': img.get('src', ''),
        'alt': img.get('alt', ''),
        'desc': find_closest_parent_with_useful_text(img),
        'score': score,
        'type': 'image'
    }
def process_element(element: element.PageElement) -> bool: def process_element(element: element.PageElement) -> bool:
try: try:
if isinstance(element, NavigableString): if isinstance(element, NavigableString):
@@ -471,11 +569,6 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
keep_element = True keep_element = True
elif element.name == 'img': elif element.name == 'img':
media['images'].append({
'src': element.get('src'),
'alt': element.get('alt'),
'type': 'image'
})
return True # Always keep image elements return True # Always keep image elements
elif element.name in ['video', 'audio']: elif element.name in ['video', 'audio']:
@@ -518,6 +611,14 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
print('Error processing element:', str(e)) print('Error processing element:', str(e))
return False return False
#process images by filtering and extracting contextual text from the page
imgs = body.find_all('img')
media['images'] = [
result for result in
(process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
if result is not None
]
process_element(body) process_element(body)
def flatten_nested_elements(node): def flatten_nested_elements(node):

View File

@@ -1,5 +1,15 @@
# Changelog # Changelog
## [v0.2.75] - 2024-07-19
Minor improvements for a more maintainable codebase:
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
## v0.2.74 - 2024-07-08 ## v0.2.74 - 2024-07-08
A slew of exciting updates to improve the crawler's stability and robustness! 🎉 A slew of exciting updates to improve the crawler's stability and robustness! 🎉

View File

@@ -1,4 +1,4 @@
# Crawl4AI v0.2.74 # Crawl4AI v0.2.75
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.