From 196dc79ec7005a1cabf22af621f7b6b029288e47 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 3 Jan 2025 21:17:23 +0800 Subject: [PATCH] fix: prevent memory leaks by ensuring proper closure of Playwright pages - Fixes critical memory leak issue where browser pages remained open - Ensures proper cleanup of Playwright resources after page operations - Improves resource management in browser farm implementation This is an urgent fix to address resource leakage that could impact system stability. --- .gitignore | 1 + crawl4ai/async_crawler_strategy.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 6a3b65f0..7ce3ee0c 100644 --- a/.gitignore +++ b/.gitignore @@ -225,3 +225,4 @@ tree.md .scripts .local .do +/plans \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 32bd14b8..82e445e1 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1475,8 +1475,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise e + + finally: + # If no session_id is given we should close the page + if not config.session_id: + await page.close() - async def _handle_full_page_scan(self, page: Page, scroll_delay: float): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): """ Helper method to handle full page scanning. @@ -1500,7 +1505,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): current_position = viewport_height # await page.evaluate(f"window.scrollTo(0, {current_position})") - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await self.csp_scroll_to(page, 0, current_position) # await asyncio.sleep(scroll_delay) @@ -1510,7 +1515,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): while current_position < total_height: current_position = min(current_position + viewport_height, total_height) - await self.safe_scroll(page, 0, current_position) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) @@ -2066,7 +2071,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): } """) - async def safe_scroll(self, page: Page, x: int, y: int): + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): """ Safely scroll the page with rendering time. @@ -2077,7 +2082,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ result = await self.csp_scroll_to(page, x, y) if result['success']: - await page.wait_for_timeout(100) # Allow for rendering + await page.wait_for_timeout(delay * 1000) return result async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: