Rename LlmConfig to LLMConfig across the codebase to follow consistent naming conventions. Update all imports and usages to use the new name. Update documentation and examples to reflect the change. BREAKING CHANGE: LlmConfig has been renamed to LLMConfig. Users need to update their imports and usage.
257 lines
9.1 KiB
Python
257 lines
9.1 KiB
Python
from abc import ABC, abstractmethod
|
|
from typing import Optional, Dict, Any, Tuple
|
|
from .models import MarkdownGenerationResult
|
|
from .html2text import CustomHTML2Text
|
|
from .types import RelevantContentFilter
|
|
# from .content_filter_strategy import RelevantContentFilter
|
|
import re
|
|
from urllib.parse import urljoin
|
|
|
|
# Markdown link/image matcher, compiled once at import time so the citation
# pass does not have to look the pattern up on every call.
#   group 1 -> link text (or image alt text)
#   group 2 -> URL
#   group 3 -> optional double-quoted title (None when absent)
LINK_PATTERN = re.compile(
    r'!?\[([^\]]+)\]'  # optional leading "!" (image), then [text]
    r'\(([^)]+?)'  # opening paren and the URL, matched lazily
    r'(?:\s+"([^"]*)")?\)'  # optional "title", then the closing paren
)
|
|
|
|
|
|
def fast_urljoin(base: str, url: str) -> str:
    """Resolve ``url`` against ``base``, short-circuiting the common cases.

    Args:
        base (str): URL (or bare prefix) that relative references are
            resolved against.
        url (str): Link target as found in the document.

    Returns:
        str: ``url`` unchanged when it starts with ``http://``, ``https://``,
        ``mailto:`` or ``//``; otherwise the joined URL.
    """
    if url.startswith(("http://", "https://", "mailto:", "//")):
        return url

    if url.startswith("/") and "://" not in base:
        # Scheme-less base (e.g. "example.com/"): urljoin would discard the
        # base entirely, so keep plain concatenation and drop one duplicate
        # slash at the seam.
        prefix = base[:-1] if base.endswith("/") else base
        return prefix + url

    # urljoin resolves root-relative paths ("/x") against the origin of the
    # base instead of appending them to the base's full path, and handles
    # every other relative form ("x", "../x", "?q", "#frag").
    return urljoin(base, url)
|
|
|
|
|
|
class MarkdownGenerationStrategy(ABC):
    """Interface implemented by every markdown generation strategy."""

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
    ):
        """
        Store the settings shared by all strategies.

        Args:
            content_filter (Optional[RelevantContentFilter]): Filter kept on the instance for later use.
            options (Optional[Dict[str, Any]]): Generation options; a missing or empty value becomes a new empty dict.
            verbose (bool): Verbosity flag kept on the instance.
        """
        self.verbose = verbose
        self.options = options if options else {}
        self.content_filter = content_filter

    @abstractmethod
    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """Turn cleaned HTML into a MarkdownGenerationResult (implemented by subclasses)."""
        ...
|
|
|
|
|
|
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Standard markdown generation strategy.

    Pipeline:
        1. Convert the cleaned HTML into raw markdown.
        2. Rewrite inline links as numbered citations.
        3. Produce "fit" markdown when a content filter is available.
        4. Bundle everything into a MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Filter used to build the fit markdown.
        options (Optional[Dict[str, Any]]): Extra markdown generation options. Defaults to None.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(content_filter, options)

    def convert_links_to_citations(
        self, markdown: str, base_url: str = ""
    ) -> Tuple[str, str]:
        """
        Replace markdown links with numbered citation markers.

        Every match of LINK_PATTERN is replaced by its text followed by a
        ``⟨n⟩`` marker (images keep a ``![...]`` wrapper). Each distinct URL
        receives one number, assigned in order of first appearance, and one
        line in the references section.

        Args:
            markdown (str): Markdown text to process.
            base_url (str): Base used to resolve URLs that do not start with
                ``http://``, ``https://`` or ``mailto:``. When empty, URLs are
                left as written.

        Returns:
            Tuple[str, str]: The rewritten markdown and the references
            markdown. The references text always starts with the
            "## References" heading, even when no link was found.
        """
        absolute_prefixes = ("http://", "https://", "mailto:")
        resolved_urls = {}  # URL as written -> URL joined with base_url
        citation_numbers = {}  # final URL -> citation number
        reference_lines = ["\n\n## References\n\n"]

        def cite(found):
            label, target, title = found.groups()

            # Resolve relative targets, joining each distinct URL only once.
            if base_url and not target.startswith(absolute_prefixes):
                joined = resolved_urls.get(target)
                if joined is None:
                    joined = resolved_urls[target] = fast_urljoin(base_url, target)
                target = joined

            number = citation_numbers.get(target)
            if number is None:
                # First sighting: allocate the next number and record the
                # reference line right away, which keeps lines in numeric order.
                number = citation_numbers[target] = len(citation_numbers) + 1
                details = [title] if title else []
                if label and label != title:
                    details.append(label)
                suffix = ": " + " - ".join(details) if details else ""
                reference_lines.append(f"⟨{number}⟩ {target}{suffix}\n")

            if found.group(0).startswith("!"):
                return f"![{label}⟨{number}⟩]"
            return f"{label}⟨{number}⟩"

        rewritten = LINK_PATTERN.sub(cite, markdown)
        return rewritten, "".join(reference_lines)

    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """
        Build raw, cited and (optionally) fit markdown from cleaned HTML.

        Failures are reported inside the result instead of being raised: each
        stage that fails places an error message in its own field, and any
        other failure yields a result whose raw and cited markdown hold the
        error message.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values are treated
                as an empty string; other non-string values are passed through str().
            base_url (str): Base URL for the converter and for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
            content_filter (Optional[RelevantContentFilter]): Content filter for
                fit markdown; falls back to the filter stored on the instance.
            citations (bool): Whether to generate citations.
            **kwargs: Accepted but not used by this implementation.

        Note:
            Only the first non-empty of ``html2text_options``, ``options`` and
            ``self.options`` is applied on top of the built-in defaults; the
            remaining ones are ignored.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, markdown
            with citations, references markdown, fit markdown and fit HTML.
        """
        try:
            converter = CustomHTML2Text(baseurl=base_url)
            settings = {
                "body_width": 0,  # Disable text wrapping
                "ignore_emphasis": False,
                "ignore_links": False,
                "ignore_images": False,
                "protect_links": False,
                "single_line_break": True,
                "mark_code": True,
                "escape_snob": False,
            }

            # Pick the first non-empty override source, if there is one.
            overrides = html2text_options or options or self.options
            if overrides:
                settings.update(overrides)
            converter.update_params(**settings)

            # Normalise the input to a string.
            if not cleaned_html:
                cleaned_html = ""
            if not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Stage 1: raw markdown.
            try:
                raw_markdown = converter.handle(cleaned_html)
            except Exception as exc:
                raw_markdown = f"Error converting HTML to markdown: {str(exc)}"

            # NOTE(review): the whitespace inside the first literal may have
            # been collapsed together with the file's indentation - confirm
            # against the upstream source.
            raw_markdown = raw_markdown.replace(" ```", "```")

            # Stage 2: citations.
            cited_markdown: str = raw_markdown
            references: str = ""
            if citations:
                try:
                    cited_markdown, references = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as exc:
                    cited_markdown = raw_markdown
                    references = f"Error generating citations: {str(exc)}"

            # Stage 3: fit markdown, only when some filter is available.
            fit_markdown: Optional[str] = ""
            fit_html: Optional[str] = ""
            active_filter = content_filter or self.content_filter
            if active_filter:
                try:
                    fragments = active_filter.filter_content(cleaned_html)
                    fit_html = "\n".join(
                        f"<div>{fragment}</div>" for fragment in fragments
                    )
                    fit_markdown = converter.handle(fit_html)
                except Exception as exc:
                    fit_markdown = f"Error generating fit markdown: {str(exc)}"
                    fit_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=cited_markdown or "",
                references_markdown=references or "",
                fit_markdown=fit_markdown or "",
                fit_html=fit_html or "",
            )
        except Exception as exc:
            # Anything not handled above: report the error in the result.
            failure = f"Error in markdown generation: {str(exc)}"
            return MarkdownGenerationResult(
                raw_markdown=failure,
                markdown_with_citations=failure,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )
|