feat: enhance Markdown generation to populate a new fit_html attribute on MarkdownGenerationResult

This commit is contained in:
UncleCode
2024-11-22 18:47:17 +08:00
parent 571dda6549
commit 24ad2fe2dd
3 changed files with 6 additions and 11 deletions

View File

@@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if markdown_generator:
try:
markdown_result = markdown_generator.generate_markdown(
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
html2text_options=kwargs.get('html2text', {}),
content_filter=kwargs.get('content_filter', None)
)
markdown_v2 = MarkdownGenerationResult(
raw_markdown=markdown_result.raw_markdown,
markdown_with_citations=markdown_result.markdown_with_citations,
references_markdown=markdown_result.references_markdown,
fit_markdown=markdown_result.fit_markdown
)
return {
'markdown': markdown_result.raw_markdown,
'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'markdown_v2': markdown_v2
'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
'markdown_v2': markdown_result
}
except Exception as e:
self._log('error',

View File

@@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
raw_markdown=raw_markdown,
markdown_with_citations=markdown_with_citations,
references_markdown=references_markdown,
fit_markdown=fit_markdown
fit_markdown=fit_markdown,
fit_html=filtered_html
)
def fast_urljoin(base: str, url: str) -> str:

View File

@@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel):
markdown_with_citations: str
references_markdown: str
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
class CrawlResult(BaseModel):
url: str