Merge pull request #229 from bizrockman/main

Prevent "'NoneType' object has no attribute 'get'" errors
This commit is contained in:
UncleCode
2024-11-06 07:31:07 +01:00
committed by GitHub
5 changed files with 10 additions and 7 deletions

View File

@@ -706,9 +706,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
body = flatten_nested_elements(body)
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
for img in imgs:
try:
src = img.get('src', '')
if base64_pattern.match(src):
img['src'] = base64_pattern.sub('', src)
except:
pass
cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ')
cleaned_html = sanitize_html(cleaned_html)