feat: enhance Markdown generation to include fit_html attribute
This commit is contained in:
@@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
if markdown_generator:
|
||||
try:
|
||||
markdown_result = markdown_generator.generate_markdown(
|
||||
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
||||
cleaned_html=cleaned_html,
|
||||
base_url=url,
|
||||
html2text_options=kwargs.get('html2text', {}),
|
||||
content_filter=kwargs.get('content_filter', None)
|
||||
)
|
||||
|
||||
markdown_v2 = MarkdownGenerationResult(
|
||||
raw_markdown=markdown_result.raw_markdown,
|
||||
markdown_with_citations=markdown_result.markdown_with_citations,
|
||||
references_markdown=markdown_result.references_markdown,
|
||||
fit_markdown=markdown_result.fit_markdown
|
||||
)
|
||||
|
||||
return {
|
||||
'markdown': markdown_result.raw_markdown,
|
||||
'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'markdown_v2': markdown_v2
|
||||
'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||
'markdown_v2': markdown_result
|
||||
}
|
||||
except Exception as e:
|
||||
self._log('error',
|
||||
|
||||
@@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
|
||||
raw_markdown=raw_markdown,
|
||||
markdown_with_citations=markdown_with_citations,
|
||||
references_markdown=references_markdown,
|
||||
fit_markdown=fit_markdown
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html=filtered_html
|
||||
)
|
||||
|
||||
def fast_urljoin(base: str, url: str) -> str:
|
||||
|
||||
@@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel):
|
||||
markdown_with_citations: str
|
||||
references_markdown: str
|
||||
fit_markdown: Optional[str] = None
|
||||
fit_html: Optional[str] = None
|
||||
|
||||
class CrawlResult(BaseModel):
|
||||
url: str
|
||||
|
||||
Reference in New Issue
Block a user