From f998e9e94906302a4ee32cd5e581f4fa7bd22021 Mon Sep 17 00:00:00 2001 From: Hamza Farhan Date: Wed, 27 Nov 2024 16:20:54 +0500 Subject: [PATCH] Fix: handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined. (#293) Thanks, dear Farhan, for the changes you made in the code. I accepted and merged them into the main branch. Also, I will add your name to our contributor list. Thank you so much. --- crawl4ai/markdown_generation_strategy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..249bc1ce 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -84,6 +84,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url @@ -91,9 +93,9 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): # Generate fit markdown if content filter is provided fit_markdown: Optional[str] = None + filtered_html: Optional[str] = None if content_filter: - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html) + filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in content_filter.filter_content(cleaned_html)) fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( @@ -101,7 +103,7 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, fit_markdown=fit_markdown, - fit_html=filtered_html + fit_html=filtered_html, ) def fast_urljoin(base: str, url: str) -> str: