diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 3b41ec82..d4b901d2 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -109,25 +109,18 @@ class WebScrapingStrategy(ContentScrapingStrategy): if markdown_generator: try: - markdown_result = markdown_generator.generate_markdown( + markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, base_url=url, html2text_options=kwargs.get('html2text', {}), content_filter=kwargs.get('content_filter', None) ) - markdown_v2 = MarkdownGenerationResult( - raw_markdown=markdown_result.raw_markdown, - markdown_with_citations=markdown_result.markdown_with_citations, - references_markdown=markdown_result.references_markdown, - fit_markdown=markdown_result.fit_markdown - ) - return { 'markdown': markdown_result.raw_markdown, 'fit_markdown': markdown_result.fit_markdown or "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': kwargs.get('content_filter', None).filter_content(html) if kwargs.get('content_filter') else "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': markdown_v2 + 'fit_html': markdown_result.fit_html or "Set flag 'fit_markdown' to True to get cleaned HTML content.", + 'markdown_v2': markdown_result } except Exception as e: self._log('error', diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 1adb4c28..7922c413 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -100,7 +100,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown=raw_markdown, markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, - fit_markdown=fit_markdown + fit_markdown=fit_markdown, + fit_html=filtered_html ) def fast_urljoin(base: str, url: str) -> str: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 122434ad..3a1b8bd1 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -12,6 +12,7 @@ class MarkdownGenerationResult(BaseModel): markdown_with_citations: str references_markdown: str fit_markdown: Optional[str] = None + fit_html: Optional[str] = None class CrawlResult(BaseModel): url: str