From 24b3da717ae2e81345127b1431902b49b95d475e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 2 Jan 2025 17:53:30 +0800 Subject: [PATCH 1/6] refactor(): - Update hello world example --- docs/examples/hello_world.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index bcec9a9a..18534d0e 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -2,7 +2,8 @@ import asyncio from crawl4ai import * async def main(): - async with AsyncWebCrawler() as crawler: + browser_config = BrowserConfig(headless=True, verbose=True) + async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( @@ -10,7 +11,7 @@ async def main(): ) ) result = await crawler.arun( - url="https://crawl4ai.com", + url="https://www.helloworld.org", config=crawler_config ) print(result.markdown_v2.raw_markdown[:500]) From 196dc79ec7005a1cabf22af621f7b6b029288e47 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 3 Jan 2025 21:17:23 +0800 Subject: [PATCH 2/6] fix: prevent memory leaks by ensuring proper closure of Playwright pages - Fixes critical memory leak issue where browser pages remained open - Ensures proper cleanup of Playwright resources after page operations - Improves resource management in browser farm implementation This is an urgent fix to address resource leakage that could impact system stability. 
--- .gitignore | 1 + crawl4ai/async_crawler_strategy.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 6a3b65f0..7ce3ee0c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,3 +225,4 @@ tree.md .scripts .local .do +/plans \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 32bd14b8..82e445e1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1475,8 +1475,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise e + + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() - async def _handle_full_page_scan(self, page: Page, scroll_delay: float): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): """ Helper method to handle full page scanning. @@ -1500,7 +1505,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): current_position = viewport_height # await page.evaluate(f"window.scrollTo(0, {current_position})") - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await self.csp_scroll_to(page, 0, current_position) # await asyncio.sleep(scroll_delay) @@ -1510,7 +1515,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) @@ -2066,7 +2071,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def safe_scroll(self, page: Page, x: int, y: int): + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): """ 
Safely scroll the page with rendering time. @@ -2077,7 +2082,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ result = await self.csp_scroll_to(page, x, y) if result['success']: - await page.wait_for_timeout(100) # Allow for rendering + await page.wait_for_timeout(delay * 1000) return result async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: From 72fbdac467b8e0a3aba511e93353cb42d45b1842 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 5 Jan 2025 19:26:46 +0800 Subject: [PATCH 3/6] fix(extraction): JsonCss selector and crawler improvements - Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one - Add robust error handling to page_need_scroll with default fallback - Improve JSON extraction strategies documentation - Refactor content scraping strategy - Update version to 0.4.247 --- crawl4ai/__version__.py | 2 +- crawl4ai/async_crawler_strategy.py | 17 +++- crawl4ai/content_scraping_strategy.py | 93 ------------------- crawl4ai/extraction_strategy.py | 4 +- crawl4ai/utils.py | 21 +++++ .../tutorials/async-webcrawler-basics.md | 21 ++++- 6 files changed, 56 insertions(+), 102 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3f798c0c..8ec3d053 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.246" +__version__ = "0.4.247" diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 0cdaffd5..b879413c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def page_need_scroll(self, page: Page): + async def page_need_scroll(self, page: Page) -> bool: """ Determine whether the page need to scroll @@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page: Playwright page object Returns: - 
page should scroll or not + bool: True if page needs scrolling """ - return await page.evaluate(""" + try: + need_scroll = await page.evaluate(""" () => { const scrollHeight = document.documentElement.scrollHeight; const viewportHeight = window.innerHeight; return scrollHeight > viewportHeight; } - """) \ No newline at end of file + """) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)} + ) + return True # Default to scrolling if check fails \ No newline at end of file diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 985ff592..f3a96cf3 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -122,92 +122,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): """ return await asyncio.to_thread(self._scrap, url, html, **kwargs) - def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]: - """ - Generate markdown content from cleaned HTML. - - Args: - cleaned_html (str): The cleaned HTML content. - html (str): The original HTML content. - url (str): The URL of the page. - success (bool): Whether the content was successfully cleaned. - **kwargs: Additional keyword arguments. - - Returns: - Dict[str, Any]: A dictionary containing the generated markdown content. 
- """ - markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator()) - - if markdown_generator: - try: - if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter: - markdown_generator.content_filter = BM25ContentFilter( - user_query=kwargs.get('fit_markdown_user_query', None), - bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - ) - - markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( - cleaned_html=cleaned_html, - base_url=url, - html2text_options=kwargs.get('html2text', {}) - ) - - return { - 'markdown': markdown_result.raw_markdown, - 'fit_markdown': markdown_result.fit_markdown, - 'fit_html': markdown_result.fit_html, - 'markdown_v2': markdown_result - } - except Exception as e: - self._log('error', - message="Error using new markdown generation strategy: {error}", - tag="SCRAPE", - params={"error": str(e)} - ) - markdown_generator = None - return { - 'markdown': f"Error using new markdown generation strategy: {str(e)}", - 'fit_markdown': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'fit_html': "Set flag 'fit_markdown' to True to get cleaned HTML content.", - 'markdown_v2': None - } - - # Legacy method - """ - # h = CustomHTML2Text() - # h.update_params(**kwargs.get('html2text', {})) - # markdown = h.handle(cleaned_html) - # markdown = markdown.replace(' ```', '```') - - # fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content." - # fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content." 
- - # if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False): - # content_filter = kwargs.get('content_filter', None) - # if not content_filter: - # content_filter = BM25ContentFilter( - # user_query=kwargs.get('fit_markdown_user_query', None), - # bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0) - # ) - # fit_html = content_filter.filter_content(html) - # fit_html = '\n'.join('
{}
'.format(s) for s in fit_html) - # fit_markdown = h.handle(fit_html) - - # markdown_v2 = MarkdownGenerationResult( - # raw_markdown=markdown, - # markdown_with_citations=markdown, - # references_markdown=markdown, - # fit_markdown=fit_markdown - # ) - - # return { - # 'markdown': markdown, - # 'fit_markdown': fit_markdown, - # 'fit_html': fit_html, - # 'markdown_v2' : markdown_v2 - # } - """ - def flatten_nested_elements(self, node): """ Flatten nested elements in a HTML tree. @@ -798,13 +712,6 @@ class WebScrapingStrategy(ContentScrapingStrategy): cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') - # markdown_content = self._generate_markdown_content( - # cleaned_html=cleaned_html, - # html=html, - # url=url, - # success=success, - # **kwargs - # ) return { # **markdown_content, diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 7441e32d..1e9d9c79 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -974,8 +974,7 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return parsed_html.select(selector) def _get_elements(self, element, selector: str): - selected = element.select_one(selector) - return [selected] if selected else [] + return element.select(selector) def _get_element_text(self, element) -> str: return element.get_text(strip=True) @@ -1050,3 +1049,4 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) + diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 214ebbc6..6fd7429f 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -21,6 +21,8 @@ import textwrap import cProfile import pstats from functools import wraps +import asyncio + class InvalidCSSSelectorError(Exception): pass @@ -1579,6 +1581,25 @@ def ensure_content_dirs(base_path: str) -> Dict[str, str]: return content_paths +def configure_windows_event_loop(): + """ + Configure the Windows 
event loop to use ProactorEventLoop. + This resolves the NotImplementedError that occurs on Windows when using asyncio subprocesses. + + This function should only be called on Windows systems and before any async operations. + On non-Windows systems, this function does nothing. + + Example: + ```python + from crawl4ai.async_configs import configure_windows_event_loop + + # Call this before any async operations if you're on Windows + configure_windows_event_loop() + ``` + """ + if platform.system() == 'Windows': + asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy()) + def get_error_context(exc_info, context_lines: int = 5): """ Extract error context with more reliable line number tracking. diff --git a/docs/md_v3/tutorials/async-webcrawler-basics.md b/docs/md_v3/tutorials/async-webcrawler-basics.md index 46256eaa..6236d899 100644 --- a/docs/md_v3/tutorials/async-webcrawler-basics.md +++ b/docs/md_v3/tutorials/async-webcrawler-basics.md @@ -148,7 +148,24 @@ Below are a few `BrowserConfig` and `CrawlerRunConfig` parameters you might twea --- -## 5. Putting It All Together +## 5. Windows-Specific Configuration + +When using AsyncWebCrawler on Windows, you might encounter a `NotImplementedError` related to `asyncio.create_subprocess_exec`. This is a known Windows-specific issue that occurs because Windows' default event loop doesn't support subprocess operations. + +To resolve this, Crawl4AI provides a utility function to configure Windows to use the ProactorEventLoop. Call this function before running any async operations: + +```python +from crawl4ai.utils import configure_windows_event_loop + +# Call this before any async operations if you're on Windows +configure_windows_event_loop() + +# Your AsyncWebCrawler code here +``` + +--- + +## 6. Putting It All Together Here’s a slightly more in-depth example that shows off a few key config parameters at once: @@ -193,7 +210,7 @@ if __name__ == "__main__": --- -## 6. Next Steps +## 7. 
Next Steps - **Smart Crawling Techniques**: Learn to handle iframes, advanced caching, and selective extraction in the [next tutorial](./smart-crawling.md). - **Hooks & Custom Code**: See how to inject custom logic before and after navigation in a dedicated [Hooks Tutorial](./hooks-custom.md). From ae376f15fb8b92701ea1a0b167f9a0e9c2d6804c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 5 Jan 2025 19:39:15 +0800 Subject: [PATCH 4/6] docs(extraction): add clarifying comments for CSS selector behavior Add explanatory comments to JsonCssExtractionStrategy._get_elements() method to clarify that it returns all matching elements using select() instead of select_one(). This helps developers understand the method's behavior and its difference from single element selection. Removed trailing whitespace at end of file. --- crawl4ai/extraction_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 1e9d9c79..3e688f13 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -974,6 +974,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy): return parsed_html.select(selector) def _get_elements(self, element, selector: str): + # Return all matching elements using select() instead of select_one() + # This ensures that we get all elements that match the selector, not just the first one return element.select(selector) def _get_element_text(self, element) -> str: @@ -1048,5 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return element.get(attribute) - - From 3427ead8b8854f70aef2b8fd485648ba22623e21 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 6 Jan 2025 15:13:43 +0800 Subject: [PATCH 5/6] Update CHANGELOG --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
b654953f..afa841c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,43 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +--- + +## [0.4.247] - 2025-01-06 + +### Added +- **Windows Event Loop Configuration**: Introduced a utility function `configure_windows_event_loop` to resolve `NotImplementedError` for asyncio subprocesses on Windows. ([#utils.py](crawl4ai/utils.py), [#tutorials/async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) +- **`page_need_scroll` Method**: Added a method to determine if a page requires scrolling before taking actions in `AsyncPlaywrightCrawlerStrategy`. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Changed +- **Version Bump**: Updated the version from `0.4.246` to `0.4.247`. ([#__version__.py](crawl4ai/__version__.py)) +- **Improved Scrolling Logic**: Enhanced scrolling methods in `AsyncPlaywrightCrawlerStrategy` by adding a `scroll_delay` parameter for better control. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) +- **Markdown Generation Example**: Updated the `hello_world.py` example to reflect the latest API changes and better illustrate features. ([#examples/hello_world.py](docs/examples/hello_world.py)) +- **Documentation Update**: + - Added Windows-specific instructions for handling asyncio event loops. ([#async-webcrawler-basics.md](docs/md_v3/tutorials/async-webcrawler-basics.md)) + +### Removed +- **Legacy Markdown Generation Code**: Removed outdated and unused code for markdown generation in `content_scraping_strategy.py`. ([#content_scraping_strategy.py](crawl4ai/content_scraping_strategy.py)) + +### Fixed +- **Page Closing to Prevent Memory Leaks**: + - **Description**: Added a `finally` block to ensure pages are closed when no `session_id` is provided. 
+ - **Impact**: Prevents memory leaks caused by lingering pages after a crawl. + - **File**: [`async_crawler_strategy.py`](crawl4ai/async_crawler_strategy.py) + - **Code**: + ```python + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() + ``` +- **Multiple Element Selection**: Modified `_get_elements` in `JsonCssExtractionStrategy` to return all matching elements instead of just the first one, ensuring comprehensive extraction. ([#extraction_strategy.py](crawl4ai/extraction_strategy.py)) +- **Error Handling in Scrolling**: Added robust error handling to the `page_need_scroll` check, defaulting to scrolling if the check fails so crawling proceeds safely. ([#async_crawler_strategy.py](crawl4ai/async_crawler_strategy.py)) + +### Other +- **Git Ignore Update**: Added `/plans` to `.gitignore` for better development environment consistency. ([#.gitignore](.gitignore)) + + ## [0.4.24] - 2024-12-31 ### Added From 12880f1ffad9702aad6adbca3e0f16e391c081ba Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 6 Jan 2025 15:19:01 +0800 Subject: [PATCH 6/6] Update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7ce3ee0c..943c059c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,4 +225,5 @@ tree.md .scripts .local .do -/plans \ No newline at end of file +/plans +plans/ \ No newline at end of file