In this commit, we introduce the new concept of MakrdownGenerationStrategy, which allows us to expand our future strategies to generate better markdown. Right now, we generate raw markdown as we were doing before. We have a new algorithm for fitting markdown based on BM25, and now we add the ability to refine markdown into a citation form. Our links will be extracted and replaced by a citation reference number, and then we will have reference sections at the very end; we add all the links with the descriptions. This format is more suitable for large language models. In case we don't need to pass links, we can reduce the size of the markdown significantly and also attach the list of references as a separate file to a large language model. This commit contains changes for this direction.

This commit is contained in:
UncleCode
2024-11-21 18:21:43 +08:00
parent 7047422e48
commit dbb751c8f0
12 changed files with 506 additions and 762 deletions

View File

@@ -0,0 +1,115 @@
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .utils import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
import re
from urllib.parse import urljoin
# Pre-compile the regex pattern
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
class MarkdownGenerationStrategy(ABC):
"""Abstract base class for markdown generation strategies."""
@abstractmethod
def generate_markdown(self,
cleaned_html: str,
base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None,
content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True,
**kwargs) -> MarkdownGenerationResult:
"""Generate markdown from cleaned HTML."""
pass
class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
"""Default implementation of markdown generation strategy."""
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
link_map = {}
url_cache = {} # Cache for URL joins
parts = []
last_end = 0
counter = 1
for match in LINK_PATTERN.finditer(markdown):
parts.append(markdown[last_end:match.start()])
text, url, title = match.groups()
# Use cached URL if available, otherwise compute and cache
if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
if url not in url_cache:
url_cache[url] = fast_urljoin(base_url, url)
url = url_cache[url]
if url not in link_map:
desc = []
if title: desc.append(title)
if text and text != title: desc.append(text)
link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
counter += 1
num = link_map[url][0]
parts.append(f"{text}{num}" if not match.group(0).startswith('!') else f"![{text}{num}⟩]")
last_end = match.end()
parts.append(markdown[last_end:])
converted_text = ''.join(parts)
# Pre-build reference strings
references = ["\n\n## References\n\n"]
references.extend(
f"{num}{url}{desc}\n"
for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
)
return converted_text, ''.join(references)
def generate_markdown(self,
cleaned_html: str,
base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None,
content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True,
**kwargs) -> MarkdownGenerationResult:
"""Generate markdown with citations from cleaned HTML."""
# Initialize HTML2Text with options
h = CustomHTML2Text()
if html2text_options:
h.update_params(**html2text_options)
# Generate raw markdown
raw_markdown = h.handle(cleaned_html)
raw_markdown = raw_markdown.replace(' ```', '```')
# Convert links to citations
if citations:
markdown_with_citations, references_markdown = self.convert_links_to_citations(
raw_markdown, base_url
)
# Generate fit markdown if content filter is provided
fit_markdown: Optional[str] = None
if content_filter:
filtered_html = content_filter.filter_content(cleaned_html)
filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
fit_markdown = h.handle(filtered_html)
return MarkdownGenerationResult(
raw_markdown=raw_markdown,
markdown_with_citations=markdown_with_citations,
references_markdown=references_markdown,
fit_markdown=fit_markdown
)
def fast_urljoin(base: str, url: str) -> str:
"""Fast URL joining for common cases."""
if url.startswith(('http://', 'https://', 'mailto:', '//')):
return url
if url.startswith('/'):
# Handle absolute paths
if base.endswith('/'):
return base[:-1] + url
return base + url
return urljoin(base, url)