From a96e05d4ae9599191ea910b76c1ce58c8468d260 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:39:02 +0800 Subject: [PATCH 1/3] refactor(crawler): optimize response handling and default settings - Set wait_for_images default to false for better performance - Simplify response attribute copying in AsyncWebCrawler - Add error handling and default conversion options to markdown generation --- crawl4ai/async_configs.py | 6 +- crawl4ai/async_webcrawler.py | 54 ++++++----- crawl4ai/markdown_generation_strategy.py | 112 ++++++++++++++++------- 3 files changed, 106 insertions(+), 66 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index bd33d0b5..a4de071f 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -246,7 +246,7 @@ class CrawlerRunConfig: wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. Default: None. wait_for_images (bool): If True, wait for images to load before extracting content. - Default: True. + Default: False. delay_before_return_html (float): Delay in seconds before retrieving final HTML. Default: 0.1. mean_delay (float): Mean base delay between requests when calling arun_many. 
@@ -345,7 +345,7 @@ class CrawlerRunConfig: wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, wait_for: str = None, - wait_for_images: bool = True, + wait_for_images: bool = False, delay_before_return_html: float = 0.1, mean_delay: float = 0.1, max_range: float = 0.3, @@ -503,7 +503,7 @@ class CrawlerRunConfig: wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), wait_for=kwargs.get("wait_for"), - wait_for_images=kwargs.get("wait_for_images", True), + wait_for_images=kwargs.get("wait_for_images", False), delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), mean_delay=kwargs.get("mean_delay", 0.1), max_range=kwargs.get("max_range", 0.3), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index f99586a3..6ed8ec8f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -418,34 +418,30 @@ class AsyncWebCrawler: **kwargs ) - # crawl_result.status_code = async_response.status_code - # crawl_result.response_headers = async_response.response_headers - # crawl_result.downloaded_files = async_response.downloaded_files - # crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate - # else: - # crawl_result.status_code = 200 - # crawl_result.response_headers = cached_result.response_headers if cached_result else {} - # crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache + crawl_result.status_code = async_response.status_code + crawl_result.response_headers = async_response.response_headers + crawl_result.downloaded_files = async_response.downloaded_files + crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate # # Check and set values from async_response to crawl_result - try: - for key in vars(async_response): - if hasattr(crawl_result, key): - value = getattr(async_response, key, None) - current_value = 
getattr(crawl_result, key, None) - if value is not None and not current_value: - try: - setattr(crawl_result, key, value) - except Exception as e: - self.logger.warning( - message=f"Failed to set attribute {key}: {str(e)}", - tag="WARNING" - ) - except Exception as e: - self.logger.warning( - message=f"Error copying response attributes: {str(e)}", - tag="WARNING" - ) + # try: + # for key in vars(async_response): + # if hasattr(crawl_result, key): + # value = getattr(async_response, key, None) + # current_value = getattr(crawl_result, key, None) + # if value is not None and not current_value: + # try: + # setattr(crawl_result, key, value) + # except Exception as e: + # self.logger.warning( + # message=f"Failed to set attribute {key}: {str(e)}", + # tag="WARNING" + # ) + # except Exception as e: + # self.logger.warning( + # message=f"Error copying response attributes: {str(e)}", + # tag="WARNING" + # ) crawl_result.success = bool(html) crawl_result.session_id = getattr(config, 'session_id', None) @@ -585,8 +581,10 @@ class AsyncWebCrawler: # Markdown Generation markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() - if not config.content_filter and not markdown_generator.content_filter: - markdown_generator.content_filter = PruningContentFilter() + + # Uncomment if by default we want to use PruningContentFilter + # if not config.content_filter and not markdown_generator.content_filter: + # markdown_generator.content_filter = PruningContentFilter() markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( cleaned_html=cleaned_html, diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 474dc9e8..89e5e34e 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -143,41 +143,83 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy): Returns: MarkdownGenerationResult: Result containing 
raw markdown, fit markdown, fit HTML, and references markdown. """ - # Initialize HTML2Text with options - h = CustomHTML2Text() - if html2text_options: - h.update_params(**html2text_options) - elif options: - h.update_params(**options) - elif self.options: - h.update_params(**self.options) + try: + # Initialize HTML2Text with default options for better conversion + h = CustomHTML2Text(baseurl=base_url) + default_options = { + 'body_width': 0, # Disable text wrapping + 'ignore_emphasis': False, + 'ignore_links': False, + 'ignore_images': False, + 'protect_links': True, + 'single_line_break': True, + 'mark_code': True, + 'escape_snob': False + } + + # Update with custom options if provided + if html2text_options: + default_options.update(html2text_options) + elif options: + default_options.update(options) + elif self.options: + default_options.update(self.options) + + h.update_params(**default_options) - # Generate raw markdown - raw_markdown = h.handle(cleaned_html) - raw_markdown = raw_markdown.replace(' ```', '```') + # Ensure we have valid input + if not cleaned_html: + cleaned_html = "" + elif not isinstance(cleaned_html, str): + cleaned_html = str(cleaned_html) - # Convert links to citations - markdown_with_citations: str = "" - references_markdown: str = "" - if citations: - markdown_with_citations, references_markdown = self.convert_links_to_citations( - raw_markdown, base_url + # Generate raw markdown + try: + raw_markdown = h.handle(cleaned_html) + except Exception as e: + raw_markdown = f"Error converting HTML to markdown: {str(e)}" + + raw_markdown = raw_markdown.replace(' ```', '```') + + # Convert links to citations + markdown_with_citations: str = raw_markdown + references_markdown: str = "" + if citations: + try: + markdown_with_citations, references_markdown = self.convert_links_to_citations( + raw_markdown, base_url + ) + except Exception as e: + markdown_with_citations = raw_markdown + references_markdown = f"Error generating citations: {str(e)}" 
+ + # Generate fit markdown if content filter is provided + fit_markdown: Optional[str] = "" + filtered_html: Optional[str] = "" + if content_filter or self.content_filter: + try: + content_filter = content_filter or self.content_filter + filtered_html = content_filter.filter_content(cleaned_html) + filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) + fit_markdown = h.handle(filtered_html) + except Exception as e: + fit_markdown = f"Error generating fit markdown: {str(e)}" + filtered_html = "" + + return MarkdownGenerationResult( + raw_markdown=raw_markdown or "", + markdown_with_citations=markdown_with_citations or "", + references_markdown=references_markdown or "", + fit_markdown=fit_markdown or "", + fit_html=filtered_html or "", + ) + except Exception as e: + # If anything fails, return empty strings with error message + error_msg = f"Error in markdown generation: {str(e)}" + return MarkdownGenerationResult( + raw_markdown=error_msg, + markdown_with_citations=error_msg, + references_markdown="", + fit_markdown="", + fit_html="", ) - - # Generate fit markdown if content filter is provided - fit_markdown: Optional[str] = "" - filtered_html: Optional[str] = "" - if content_filter or self.content_filter: - content_filter = content_filter or self.content_filter - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('
{}
'.format(s) for s in filtered_html) - fit_markdown = h.handle(filtered_html) - - return MarkdownGenerationResult( - raw_markdown=raw_markdown, - markdown_with_citations=markdown_with_citations, - references_markdown=references_markdown, - fit_markdown=fit_markdown, - fit_html=filtered_html, - ) - From aa4f92f4587c4cc5757ef00ebf71fd3515633c8b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:39:42 +0800 Subject: [PATCH 2/3] refactor(crawler): - Update hello_world example with proper content filtering --- docs/examples/hello_world.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 docs/examples/hello_world.py diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py new file mode 100644 index 00000000..bcdb0d71 --- /dev/null +++ b/docs/examples/hello_world.py @@ -0,0 +1,25 @@ +import os, sys + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +) + +import asyncio +from crawl4ai import * + +async def main(): + async with AsyncWebCrawler() as crawler: + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0) + ) + ) + result = await crawler.arun( + url="https://www.nbcnews.com/business", + config=crawler_config + ) + print(result.markdown_v2.raw_markdown[:500]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From da1bc0f7bf952d85bdf35f76e2b8a8a13eb11d00 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 19:42:35 +0800 Subject: [PATCH 3/3] Update version file --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 7f7cf687..3f798c0c 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.245" 
+__version__ = "0.4.246"