- Extract all media tags
- Take screenshot of the page
This commit is contained in:
unclecode
2024-06-07 15:23:13 +08:00
parent aead6de888
commit 0533aeb814
8 changed files with 90 additions and 22 deletions

View File

@@ -180,6 +180,35 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
if tag.name != 'img':
tag.attrs = {}
# Extract all img tgas inti [{src: '', alt: ''}]
media = {
'images': [],
'videos': [],
'audios': []
}
for img in body.find_all('img'):
media['images'].append({
'src': img.get('src'),
'alt': img.get('alt'),
"type": "image"
})
# Extract all video tags into [{src: '', alt: ''}]
for video in body.find_all('video'):
media['videos'].append({
'src': video.get('src'),
'alt': video.get('alt'),
"type": "video"
})
# Extract all audio tags into [{src: '', alt: ''}]
for audio in body.find_all('audio'):
media['audios'].append({
'src': audio.get('src'),
'alt': audio.get('alt'),
"type": "audio"
})
# Replace images with their alt text or remove them if no alt text is available
for img in body.find_all('img'):
alt_text = img.get('alt')
@@ -299,7 +328,8 @@ def get_content_of_website(html, word_count_threshold = MIN_WORD_THRESHOLD, css_
return{
'markdown': markdown,
'cleaned_html': cleaned_html,
'success': True
'success': True,
'media': media
}
except Exception as e: