feat: enhance Markdown generation to include fit_html attribute
This commit is contained in:
@@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
if markdown_generator:
|
if markdown_generator:
|
||||||
try:
|
try:
|
||||||
markdown_result = markdown_generator.generate_markdown(
|
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
||||||
cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
base_url=url,
|
base_url=url,
|
||||||
html2text_options=kwargs.get('html2text', {}),
|
html2text_options=kwargs.get('html2text', {}),
|
||||||
content_filter=kwargs.get('content_filter', None)
|
content_filter=kwargs.get('content_filter', None)
|
||||||
)
|
)
|
||||||
|
|
||||||
markdown_v2 = MarkdownGenerationResult(
|
|
||||||
raw_markdown=markdown_result.raw_markdown,
|
|
||||||
markdown_with_citations=markdown_result.markdown_with_citations,
|
|
||||||
references_markdown=markdown_result.references_markdown,
|
|
||||||
fit_markdown=markdown_result.fit_markdown
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'markdown': markdown_result.raw_markdown,
|
'markdown': markdown_result.raw_markdown,
|
||||||
'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||||
'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.",
|
||||||
'markdown_v2': markdown_v2
|
'markdown_v2': markdown_result
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._log('error',
|
self._log('error',
|
||||||
|
|||||||
@@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy):
|
|||||||
raw_markdown=raw_markdown,
|
raw_markdown=raw_markdown,
|
||||||
markdown_with_citations=markdown_with_citations,
|
markdown_with_citations=markdown_with_citations,
|
||||||
references_markdown=references_markdown,
|
references_markdown=references_markdown,
|
||||||
fit_markdown=fit_markdown
|
fit_markdown=fit_markdown,
|
||||||
|
fit_html=filtered_html
|
||||||
)
|
)
|
||||||
|
|
||||||
def fast_urljoin(base: str, url: str) -> str:
|
def fast_urljoin(base: str, url: str) -> str:
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel):
|
|||||||
markdown_with_citations: str
|
markdown_with_citations: str
|
||||||
references_markdown: str
|
references_markdown: str
|
||||||
fit_markdown: Optional[str] = None
|
fit_markdown: Optional[str] = None
|
||||||
|
fit_html: Optional[str] = None
|
||||||
|
|
||||||
class CrawlResult(BaseModel):
|
class CrawlResult(BaseModel):
|
||||||
url: str
|
url: str
|
||||||
|
|||||||
Reference in New Issue
Block a user