In this commit, we introduce the new concept of MarkdownGenerationStrategy, which lets us add future strategies for generating better markdown. For now, we still generate raw markdown as we did before. We also have a new BM25-based algorithm for producing fit markdown, and we add the ability to refine markdown into a citation form: links are extracted and replaced by citation reference numbers, and a references section at the very end lists every link with its description. This format is more suitable for large language models. When links do not need to be passed along, we can reduce the size of the markdown significantly and attach the list of references to a large language model as a separate file. This commit contains the changes in this direction.

2024-11-21 18:21:43 +08:00
parent 7047422e48
commit dbb751c8f0
12 changed files with 506 additions and 762 deletions
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -0,0 +1,115 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Dict, Any, Tuple
+from .models import MarkdownGenerationResult
+from .utils import CustomHTML2Text
+from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
+import re
+from urllib.parse import urljoin
+
+# Inline markdown link or image: optional "!", then [text](url "optional title"); groups are (text, url, title)
+LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
+
+class MarkdownGenerationStrategy(ABC):
+    """Contract that every markdown generation strategy must implement."""
+
+    @abstractmethod
+    def generate_markdown(
+        self,
+        cleaned_html: str,
+        base_url: str = "",
+        html2text_options: Optional[Dict[str, Any]] = None,
+        content_filter: Optional[RelevantContentFilter] = None,
+        citations: bool = True,
+        **kwargs) -> MarkdownGenerationResult:
+        """Convert ``cleaned_html`` to markdown; subclasses supply the logic."""
+
+class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
+    """Default strategy: html2text conversion with optional link citations."""
+
+    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
+        """Replace inline links and images with numbered citation markers.
+
+        Each distinct URL is numbered in order of first appearance; a link
+        becomes ``text⟨n⟩`` and an image ``![text⟨n⟩]``. When ``base_url`` is
+        set, URLs not starting with http://, https:// or mailto: are joined to it.
+
+        Returns:
+            ``(converted_text, references)``; ``references`` is a "## References"
+            section holding one ``⟨n⟩ url: description`` line per distinct URL.
+        """
+        link_map = {}  # resolved url -> (citation number, description suffix)
+        url_cache = {}  # raw url -> resolved url, so each join runs once
+        parts = []
+        last_end = 0
+
+        for match in LINK_PATTERN.finditer(markdown):
+            parts.append(markdown[last_end:match.start()])
+            text, url, title = match.groups()
+
+            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
+                if url not in url_cache:
+                    url_cache[url] = fast_urljoin(base_url, url)
+                url = url_cache[url]
+
+            if url not in link_map:
+                # Describe the link by its title and, if different, its text.
+                desc = [title] if title else []
+                if text and text != title:
+                    desc.append(text)
+                suffix = ": " + " - ".join(desc) if desc else ""
+                link_map[url] = (len(link_map) + 1, suffix)
+
+            num = link_map[url][0]
+            is_image = match.group(0).startswith('!')
+            parts.append(f"![{text}⟨{num}⟩]" if is_image else f"{text}⟨{num}⟩")
+            last_end = match.end()
+
+        parts.append(markdown[last_end:])
+        # Dicts keep insertion order, so entries are already in citation order.
+        references = ["\n\n## References\n\n"]
+        references.extend(f"⟨{num}⟩ {url}{desc}\n" for url, (num, desc) in link_map.items())
+        return ''.join(parts), ''.join(references)
+
+    def generate_markdown(self, cleaned_html: str, base_url: str = "",
+                          html2text_options: Optional[Dict[str, Any]] = None,
+                          content_filter: Optional[RelevantContentFilter] = None,
+                          citations: bool = True, **kwargs) -> MarkdownGenerationResult:
+        """Build raw, citation-style and optionally filtered markdown.
+
+        With ``citations`` false the citation text is the raw markdown and the
+        references are empty; ``fit_markdown`` is None without a content filter.
+        """
+        h = CustomHTML2Text()
+        if html2text_options:
+            h.update_params(**html2text_options)
+
+        # Strip the four-space indent in front of code fences.
+        raw_markdown = h.handle(cleaned_html).replace('    ```', '```')
+
+        # Defaults keep both names bound when citations are disabled.
+        markdown_with_citations, references_markdown = raw_markdown, ""
+        if citations:
+            converted = self.convert_links_to_citations(raw_markdown, base_url)
+            markdown_with_citations, references_markdown = converted
+
+        fit_markdown: Optional[str] = None
+        if content_filter:
+            filtered_html = content_filter.filter_content(cleaned_html)
+            filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
+            fit_markdown = h.handle(filtered_html)
+
+        return MarkdownGenerationResult(
+            raw_markdown=raw_markdown, markdown_with_citations=markdown_with_citations,
+            references_markdown=references_markdown, fit_markdown=fit_markdown,
+        )
+
+def fast_urljoin(base: str, url: str) -> str:
+    """Resolve ``url`` against ``base``, returning already-absolute URLs as is.
+
+    Root-relative paths ("/x") resolve against the site root of ``base``, not
+    its full path, so "https://a.com/docs/p.html" + "/x" gives "https://a.com/x".
+    """
+    if url.startswith(('http://', 'https://', 'mailto:', '//')):
+        return url
+    # urljoin replaces the base path for "/x" instead of appending to it.
+    return urljoin(base, url)