fix: Respect <base> tag for relative link resolution in html2text
Fixes #1680. The HTML2Text class was ignoring the <base> tag, causing relative links to be resolved against the page URL instead of the base URL specified by the href attribute of the <base> tag. Added <base> tag handling to both HTML2Text and CustomHTML2Text so that self.baseurl is updated when the tag is encountered, ensuring that relative links are resolved according to the HTML standard. Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -316,6 +316,12 @@ class HTML2Text(html.parser.HTMLParser):
|
|||||||
if self.tag_callback(self, tag, attrs, start) is True:
|
if self.tag_callback(self, tag, attrs, start) is True:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Handle <base> tag to update base URL for relative links
|
||||||
|
if tag == "base" and start:
|
||||||
|
href = attrs.get("href")
|
||||||
|
if href:
|
||||||
|
self.baseurl = href
|
||||||
|
|
||||||
# first thing inside the anchor tag is another tag
|
# first thing inside the anchor tag is another tag
|
||||||
# that produces some output
|
# that produces some output
|
||||||
if (
|
if (
|
||||||
@@ -1069,6 +1075,15 @@ class CustomHTML2Text(HTML2Text):
|
|||||||
setattr(self, key, value)
|
setattr(self, key, value)
|
||||||
|
|
||||||
def handle_tag(self, tag, attrs, start):
|
def handle_tag(self, tag, attrs, start):
|
||||||
|
# Handle <base> tag to update base URL for relative links
|
||||||
|
# Must be handled before preserved tags since <base> is in <head>
|
||||||
|
if tag == "base" and start:
|
||||||
|
href = attrs.get("href") if attrs else None
|
||||||
|
if href:
|
||||||
|
self.baseurl = href
|
||||||
|
# Also let parent class handle it
|
||||||
|
return super().handle_tag(tag, attrs, start)
|
||||||
|
|
||||||
# Handle preserved tags
|
# Handle preserved tags
|
||||||
if tag in self.preserve_tags:
|
if tag in self.preserve_tags:
|
||||||
if start:
|
if start:
|
||||||
|
|||||||
Reference in New Issue
Block a user