diff --git a/crawl4ai/html2text/__init__.py b/crawl4ai/html2text/__init__.py index ca15b453..7f6e788b 100644 --- a/crawl4ai/html2text/__init__.py +++ b/crawl4ai/html2text/__init__.py @@ -316,6 +316,12 @@ class HTML2Text(html.parser.HTMLParser): if self.tag_callback(self, tag, attrs, start) is True: return + # Handle <base> tag to update base URL for relative links + if tag == "base" and start: + href = attrs.get("href") + if href: + self.baseurl = href + # first thing inside the anchor tag is another tag # that produces some output if ( @@ -1069,6 +1075,15 @@ class CustomHTML2Text(HTML2Text): setattr(self, key, value) def handle_tag(self, tag, attrs, start): + # Handle <base> tag to update base URL for relative links + # Must be handled before preserved tags since <base> is in <head> + if tag == "base" and start: + href = attrs.get("href") if attrs else None + if href: + self.baseurl = href + # Also let parent class handle it + return super().handle_tag(tag, attrs, start) + # Handle preserved tags if tag in self.preserve_tags: if start: