From 07b4c1c0ed1cbd67ad0459a582e7f8263041256c Mon Sep 17 00:00:00 2001 From: Guilume <49784245+TheCutestCat@users.noreply.github.com> Date: Sun, 5 Jan 2025 17:04:34 +0800 Subject: [PATCH] fix: not working long page screenshot (#403) --- crawl4ai/async_crawler_strategy.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 32bd14b8..4723a836 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1639,11 +1639,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Returns: str: The base64-encoded screenshot data """ - dimensions = await self.get_page_dimensions(page) - page_height = dimensions['height'] - if page_height < kwargs.get( - "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD - ): + need_scroll = await self.page_need_scroll(page) + + if not need_scroll: # Page is short enough, just take a screenshot return await self.take_screenshot_naive(page) else: @@ -2158,4 +2156,22 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): const {scrollWidth, scrollHeight} = document.documentElement; return {width: scrollWidth, height: scrollHeight}; } + """) + + async def page_need_scroll(self, page: Page): + """ + Determine whether the page needs to be scrolled + + Args: + page: Playwright page object + + Returns: + bool: True if the document's scrollHeight exceeds the viewport height, False otherwise + """ + return await page.evaluate(""" + () => { + const scrollHeight = document.documentElement.scrollHeight; + const viewportHeight = window.innerHeight; + return scrollHeight > viewportHeight; + } """) \ No newline at end of file