Fix Base64 image parsing in WebScrappingStrategy (issue 182)
- Add support for extracting Base64-encoded images
- Improve image format detection to include Base64 images
- Enhance compatibility with locally saved HTML files that use Base64 image encoding
This commit is contained in:
@@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
## [v0.3.72] - 2024-10-20
|
## [v0.3.72] - 2024-10-20
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Added support for parsing Base64 encoded images in WebScrappingStrategy
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
- Forked and integrated a customized version of the html2text library for more control over Markdown generation
|
||||||
- New configuration options for controlling external content:
|
- New configuration options for controlling external content:
|
||||||
|
|||||||
@@ -127,7 +127,11 @@ class WebScrappingStrategy(ContentScrappingStrategy):
|
|||||||
image_width = img.get('width')
|
image_width = img.get('width')
|
||||||
width_value, width_unit = parse_dimension(image_width)
|
width_value, width_unit = parse_dimension(image_width)
|
||||||
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
image_size = 0 #int(fetch_image_file_size(img,base_url) or 0)
|
||||||
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
image_src = img.get('src','')
|
||||||
|
if "data:image/" in image_src:
|
||||||
|
image_format = image_src.split(',')[0].split(';')[0].split('/')[1]
|
||||||
|
else:
|
||||||
|
image_format = os.path.splitext(img.get('src',''))[1].lower()
|
||||||
# Remove . from format
|
# Remove . from format
|
||||||
image_format = image_format.strip('.').split('?')[0]
|
image_format = image_format.strip('.').split('?')[0]
|
||||||
score = 0
|
score = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user