from abc import ABC, abstractmethod from typing import Optional, Dict, Any, Tuple from .models import MarkdownGenerationResult from .html2text import CustomHTML2Text from .content_filter_strategy import RelevantContentFilter import re from urllib.parse import urljoin # Pre-compile the regex pattern LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)') def fast_urljoin(base: str, url: str) -> str: """Fast URL joining for common cases.""" if url.startswith(("http://", "https://", "mailto:", "//")): return url if url.startswith("/"): # Handle absolute paths if base.endswith("/"): return base[:-1] + url return base + url return urljoin(base, url) class MarkdownGenerationStrategy(ABC): """Abstract base class for markdown generation strategies.""" def __init__( self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None, ): self.content_filter = content_filter self.options = options or {} @abstractmethod def generate_markdown( self, cleaned_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs, ) -> MarkdownGenerationResult: """Generate markdown from cleaned HTML.""" pass class DefaultMarkdownGenerator(MarkdownGenerationStrategy): """ Default implementation of markdown generation strategy. How it works: 1. Generate raw markdown from cleaned HTML. 2. Convert links to citations. 3. Generate fit markdown if content filter is provided. 4. Return MarkdownGenerationResult. Args: content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None. Returns: MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. """ def __init__( self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None, ): super().__init__(content_filter, options) def convert_links_to_citations( self, markdown: str, base_url: str = "" ) -> Tuple[str, str]: """ Convert links in markdown to citations. How it works: 1. Find all links in the markdown. 2. Convert links to citations. 3. Return converted markdown and references markdown. Note: This function uses a regex pattern to find links in markdown. Args: markdown (str): Markdown text. base_url (str): Base URL for URL joins. Returns: Tuple[str, str]: Converted markdown and references markdown. """ link_map = {} url_cache = {} # Cache for URL joins parts = [] last_end = 0 counter = 1 for match in LINK_PATTERN.finditer(markdown): parts.append(markdown[last_end : match.start()]) text, url, title = match.groups() # Use cached URL if available, otherwise compute and cache if base_url and not url.startswith(("http://", "https://", "mailto:")): if url not in url_cache: url_cache[url] = fast_urljoin(base_url, url) url = url_cache[url] if url not in link_map: desc = [] if title: desc.append(title) if text and text != title: desc.append(text) link_map[url] = (counter, ": " + " - ".join(desc) if desc else "") counter += 1 num = link_map[url][0] parts.append( f"{text}⟨{num}⟩" if not match.group(0).startswith("!") else f"![{text}⟨{num}⟩]" ) last_end = match.end() parts.append(markdown[last_end:]) converted_text = "".join(parts) # Pre-build reference strings references = ["\n\n## References\n\n"] references.extend( f"⟨{num}⟩ {url}{desc}\n" for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0]) ) return converted_text, "".join(references) def generate_markdown( self, cleaned_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs, ) -> MarkdownGenerationResult: """ Generate markdown with citations from cleaned HTML. How it works: 1. Generate raw markdown from cleaned HTML. 2. Convert links to citations. 3. Generate fit markdown if content filter is provided. 4. Return MarkdownGenerationResult. Args: cleaned_html (str): Cleaned HTML content. base_url (str): Base URL for URL joins. html2text_options (Optional[Dict[str, Any]]): HTML2Text options. options (Optional[Dict[str, Any]]): Additional options for markdown generation. content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown. citations (bool): Whether to generate citations. Returns: MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown. """ try: # Initialize HTML2Text with default options for better conversion h = CustomHTML2Text(baseurl=base_url) default_options = { "body_width": 0, # Disable text wrapping "ignore_emphasis": False, "ignore_links": False, "ignore_images": False, "protect_links": True, "single_line_break": True, "mark_code": True, "escape_snob": False, } # Update with custom options if provided if html2text_options: default_options.update(html2text_options) elif options: default_options.update(options) elif self.options: default_options.update(self.options) h.update_params(**default_options) # Ensure we have valid input if not cleaned_html: cleaned_html = "" elif not isinstance(cleaned_html, str): cleaned_html = str(cleaned_html) # Generate raw markdown try: raw_markdown = h.handle(cleaned_html) except Exception as e: raw_markdown = f"Error converting HTML to markdown: {str(e)}" raw_markdown = raw_markdown.replace(" ```", "```") # Convert links to citations markdown_with_citations: str = raw_markdown references_markdown: str = "" if citations: try: ( markdown_with_citations, references_markdown, ) = self.convert_links_to_citations(raw_markdown, base_url) except Exception as e: markdown_with_citations = raw_markdown references_markdown = f"Error generating citations: {str(e)}" # Generate fit markdown if content filter is provided fit_markdown: Optional[str] = "" filtered_html: Optional[str] = "" if content_filter or self.content_filter: try: content_filter = content_filter or self.content_filter filtered_html = content_filter.filter_content(cleaned_html) filtered_html = "\n".join( "