Compare commits
5 Commits
main-img-c
...
image-desc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cf6c835e18 | ||
|
|
e5ecf291f3 | ||
|
|
9d0cafcfa6 | ||
|
|
7715623430 | ||
|
|
f5a4e80e2c |
@@ -1,5 +1,14 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.75] - 2024-07-19
|
||||||
|
|
||||||
|
Minor improvements for a more maintainable codebase:
|
||||||
|
|
||||||
|
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||||
|
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||||
|
|
||||||
|
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||||
|
|
||||||
## [v0.2.74] - 2024-07-08
|
## [v0.2.74] - 2024-07-08
|
||||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.74 🕷️🤖
|
# Crawl4AI v0.2.75 🕷️🤖
|
||||||
|
|
||||||
[](https://github.com/unclecode/crawl4ai/stargazers)
|
[](https://github.com/unclecode/crawl4ai/stargazers)
|
||||||
[](https://github.com/unclecode/crawl4ai/network/members)
|
[](https://github.com/unclecode/crawl4ai/network/members)
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
|||||||
|
|
||||||
def __init__(self, num_keywords=3, **kwargs):
|
def __init__(self, num_keywords=3, **kwargs):
|
||||||
import nltk as nl
|
import nltk as nl
|
||||||
self.tokenizer = nl.toknize.TextTilingTokenizer()
|
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||||
self.num_keywords = num_keywords
|
self.num_keywords = num_keywords
|
||||||
|
|
||||||
def chunk(self, text: str) -> list:
|
def chunk(self, text: str) -> list:
|
||||||
|
|||||||
@@ -27,3 +27,13 @@ WORD_TOKEN_RATE = 1.3
|
|||||||
|
|
||||||
# Threshold for the minimum number of words in an HTML tag to be considered
|
# Threshold for the minimum number of words in an HTML tag to be considered
|
||||||
MIN_WORD_THRESHOLD = 1
|
MIN_WORD_THRESHOLD = 1
|
||||||
|
|
||||||
|
# Threshold for the Image extraction - Range is 1 to 6
|
||||||
|
# Images are scored using a point-based system so they can be filtered by usefulness. Points are assigned
|
||||||
|
# to each image based on the following aspects.
|
||||||
|
# If either height or width exceeds 150px
|
||||||
|
# If image size is greater than 10 KB
|
||||||
|
# If alt property is set
|
||||||
|
# If image format is in jpg, png or webp
|
||||||
|
# If image is in the first half of the total images extracted from the page
|
||||||
|
IMAGE_SCORE_THRESHOLD = 2
|
||||||
|
|||||||
@@ -292,15 +292,22 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
|||||||
# Open the screenshot with PIL
|
# Open the screenshot with PIL
|
||||||
image = Image.open(BytesIO(screenshot))
|
image = Image.open(BytesIO(screenshot))
|
||||||
|
|
||||||
|
# Convert image to RGB mode
|
||||||
|
rgb_image = image.convert('RGB')
|
||||||
|
|
||||||
# Convert to JPEG and compress
|
# Convert to JPEG and compress
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
image.save(buffered, format="JPEG", quality=85)
|
rgb_image.save(buffered, format="JPEG", quality=85)
|
||||||
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
print(f"[LOG] 📸 Screenshot taken and converted to base64")
|
||||||
|
|
||||||
return img_base64
|
return img_base64
|
||||||
|
except Exception as e:
|
||||||
|
if self.verbose:
|
||||||
|
print(f"[ERROR] Failed to take screenshot: {str(e)}")
|
||||||
|
return ""
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
import subprocess, os
|
import subprocess, os
|
||||||
import shutil
|
import shutil
|
||||||
import tarfile
|
import tarfile
|
||||||
from crawl4ai.config import MODEL_REPO_BRANCH
|
from .model_loader import *
|
||||||
import argparse
|
import argparse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||||
|
|||||||
@@ -11,6 +11,9 @@ from .prompts import PROMPT_EXTRACT_BLOCKS
|
|||||||
from .config import *
|
from .config import *
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
from urllib.parse import urljoin
|
||||||
|
import requests
|
||||||
|
from requests.exceptions import InvalidSchema
|
||||||
|
|
||||||
class InvalidCSSSelectorError(Exception):
|
class InvalidCSSSelectorError(Exception):
|
||||||
pass
|
pass
|
||||||
@@ -447,6 +450,101 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
links = {'internal': [], 'external': []}
|
links = {'internal': [], 'external': []}
|
||||||
media = {'images': [], 'videos': [], 'audios': []}
|
media = {'images': [], 'videos': [], 'audios': []}
|
||||||
|
|
||||||
|
def process_image(img, url, index, total_images):
|
||||||
|
# Check that an image is displayed and is not inside undesired HTML elements
|
||||||
|
def is_valid_image(img, parent, parent_classes):
|
||||||
|
style = img.get('style', '')
|
||||||
|
src = img.get('src', '')
|
||||||
|
classes_to_check = ['button', 'icon', 'logo']
|
||||||
|
tags_to_check = ['button', 'input']
|
||||||
|
return all([
|
||||||
|
'display:none' not in style,
|
||||||
|
src,
|
||||||
|
not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check),
|
||||||
|
parent.name not in tags_to_check
|
||||||
|
])
|
||||||
|
|
||||||
|
# Score an image for its usefulness
|
||||||
|
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||||
|
# Function to parse image height/width value and units
|
||||||
|
def parse_dimension(dimension):
|
||||||
|
if dimension:
|
||||||
|
match = re.match(r"(\d+)(\D*)", dimension)
|
||||||
|
if match:
|
||||||
|
number = int(match.group(1))
|
||||||
|
unit = match.group(2) or 'px' # Default unit is 'px' if not specified
|
||||||
|
return number, unit
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
# Fetch image file metadata to extract size and extension
|
||||||
|
def fetch_image_file_size(img, base_url):
|
||||||
|
# If src is a relative path, build the full URL from base_url; an absolute src (e.g. a CDN URL) is kept as-is
|
||||||
|
img_url = urljoin(base_url,img.get('src'))
|
||||||
|
try:
|
||||||
|
response = requests.head(img_url)
|
||||||
|
if response.status_code == 200:
|
||||||
|
return response.headers.get('Content-Length',None)
|
||||||
|
else:
|
||||||
|
print(f"Failed to retrieve file size for {img_url}")
|
||||||
|
return None
|
||||||
|
except InvalidSchema as e:
|
||||||
|
return None
|
||||||
|
finally:
|
||||||
|
return
|
||||||
|
|
||||||
|
image_height = img.get('height')
|
||||||
|
height_value, height_unit = parse_dimension(image_height)
|
||||||
|
image_width = img.get('width')
|
||||||
|
width_value, width_unit = parse_dimension(image_width)
|
||||||
|
image_size = int(fetch_image_file_size(img,base_url) or 0)
|
||||||
|
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||||
|
score = 0
|
||||||
|
if height_value:
|
||||||
|
if height_unit == 'px' and height_value > 150:
|
||||||
|
score += 1
|
||||||
|
if height_unit in ['%','vh','vmin','vmax'] and height_value >30:
|
||||||
|
score += 1
|
||||||
|
if width_value:
|
||||||
|
if width_unit == 'px' and width_value > 150:
|
||||||
|
score += 1
|
||||||
|
if width_unit in ['%','vh','vmin','vmax'] and width_value >30:
|
||||||
|
score += 1
|
||||||
|
if image_size > 10000:
|
||||||
|
score += 1
|
||||||
|
if img.get('alt') != '':
|
||||||
|
score+=1
|
||||||
|
if any(image_format==format for format in ['jpg','png','webp']):
|
||||||
|
score+=1
|
||||||
|
if index/images_count<0.5:
|
||||||
|
score+=1
|
||||||
|
return score
|
||||||
|
|
||||||
|
# Extract meaningful text for images from closest parent
|
||||||
|
def find_closest_parent_with_useful_text(tag):
|
||||||
|
current_tag = tag
|
||||||
|
while current_tag:
|
||||||
|
current_tag = current_tag.parent
|
||||||
|
# Get the text content of the parent tag
|
||||||
|
if current_tag:
|
||||||
|
text_content = current_tag.get_text(separator=' ',strip=True)
|
||||||
|
# Check if the text content has at least word_count_threshold words
|
||||||
|
if len(text_content.split()) >= word_count_threshold:
|
||||||
|
return text_content
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not is_valid_image(img, img.parent, img.parent.get('class', [])):
|
||||||
|
return None
|
||||||
|
score = score_image_for_usefulness(img, url, index, total_images)
|
||||||
|
if score <= IMAGE_SCORE_THRESHOLD:
|
||||||
|
return None
|
||||||
|
return {
|
||||||
|
'src': img.get('src', ''),
|
||||||
|
'alt': img.get('alt', ''),
|
||||||
|
'desc': find_closest_parent_with_useful_text(img),
|
||||||
|
'score': score,
|
||||||
|
'type': 'image'
|
||||||
|
}
|
||||||
|
|
||||||
def process_element(element: element.PageElement) -> bool:
|
def process_element(element: element.PageElement) -> bool:
|
||||||
try:
|
try:
|
||||||
if isinstance(element, NavigableString):
|
if isinstance(element, NavigableString):
|
||||||
@@ -471,11 +569,6 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
keep_element = True
|
keep_element = True
|
||||||
|
|
||||||
elif element.name == 'img':
|
elif element.name == 'img':
|
||||||
media['images'].append({
|
|
||||||
'src': element.get('src'),
|
|
||||||
'alt': element.get('alt'),
|
|
||||||
'type': 'image'
|
|
||||||
})
|
|
||||||
return True # Always keep image elements
|
return True # Always keep image elements
|
||||||
|
|
||||||
elif element.name in ['video', 'audio']:
|
elif element.name in ['video', 'audio']:
|
||||||
@@ -518,6 +611,14 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
|||||||
print('Error processing element:', str(e))
|
print('Error processing element:', str(e))
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Process images by filtering them and extracting contextual text from the page
|
||||||
|
imgs = body.find_all('img')
|
||||||
|
media['images'] = [
|
||||||
|
result for result in
|
||||||
|
(process_image(img, url, i, len(imgs)) for i, img in enumerate(imgs))
|
||||||
|
if result is not None
|
||||||
|
]
|
||||||
|
|
||||||
process_element(body)
|
process_element(body)
|
||||||
|
|
||||||
def flatten_nested_elements(node):
|
def flatten_nested_elements(node):
|
||||||
|
|||||||
@@ -1,5 +1,15 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [v0.2.75] - 2024-07-19
|
||||||
|
|
||||||
|
Minor improvements for a more maintainable codebase:
|
||||||
|
|
||||||
|
- 🔄 Fixed typos in `chunking_strategy.py` and `crawler_strategy.py` to improve code readability
|
||||||
|
- 🔄 Removed `.test_pads/` directory from `.gitignore` to keep our repository clean and organized
|
||||||
|
|
||||||
|
These changes may seem small, but they contribute to a more stable and sustainable codebase. By fixing typos and updating our `.gitignore` settings, we're ensuring that our code is easier to maintain and scale in the long run.
|
||||||
|
|
||||||
|
|
||||||
## v0.2.74 - 2024-07-08
|
## v0.2.74 - 2024-07-08
|
||||||
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
A slew of exciting updates to improve the crawler's stability and robustness! 🎉
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
# Crawl4AI v0.2.74
|
# Crawl4AI v0.2.75
|
||||||
|
|
||||||
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user