fix: Respect <base> tag for relative link resolution in html2text

Fixes #1680

The HTML2Text class ignored the <base> tag, so relative links were
resolved against the page URL instead of the base URL given by the
href attribute of the <base> element.

Add <base> tag handling to both HTML2Text and CustomHTML2Text: when a
<base> start tag with a non-empty href is encountered, self.baseurl is
set to that href, so that subsequent relative links resolve against it
as the HTML standard specifies.

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Yurii Chukhlib
2026-01-17 11:17:28 +01:00
parent c85f56b085
commit 2016d669a9

View File

@@ -316,6 +316,12 @@ class HTML2Text(html.parser.HTMLParser):
if self.tag_callback(self, tag, attrs, start) is True: if self.tag_callback(self, tag, attrs, start) is True:
return return
# Handle <base> tag to update base URL for relative links
if tag == "base" and start:
href = attrs.get("href")
if href:
self.baseurl = href
# first thing inside the anchor tag is another tag # first thing inside the anchor tag is another tag
# that produces some output # that produces some output
if ( if (
@@ -1069,6 +1075,15 @@ class CustomHTML2Text(HTML2Text):
setattr(self, key, value) setattr(self, key, value)
def handle_tag(self, tag, attrs, start): def handle_tag(self, tag, attrs, start):
# Handle <base> tag to update base URL for relative links
# Must be handled before preserved tags since <base> is in <head>
if tag == "base" and start:
href = attrs.get("href") if attrs else None
if href:
self.baseurl = href
# Also let parent class handle it
return super().handle_tag(tag, attrs, start)
# Handle preserved tags # Handle preserved tags
if tag in self.preserve_tags: if tag in self.preserve_tags:
if start: if start: