This commit is contained in:
UncleCode
2024-11-07 19:30:22 +08:00
5 changed files with 10 additions and 7 deletions

View File

@@ -706,9 +706,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
body = flatten_nested_elements(body) body = flatten_nested_elements(body)
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
for img in imgs: for img in imgs:
src = img.get('src', '') try:
if base64_pattern.match(src): src = img.get('src', '')
img['src'] = base64_pattern.sub('', src) if base64_pattern.match(src):
img['src'] = base64_pattern.sub('', src)
except:
pass
cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ') cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
cleaned_html = sanitize_html(cleaned_html) cleaned_html = sanitize_html(cleaned_html)