diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 3fcd9911..e6cf9279 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -789,6 +789,8 @@ class CrawlerRunConfig():
             Default: False.
         scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
             Default: 0.2.
+        max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan.
+            If None, scrolls until the entire page is loaded. Default: None.
         process_iframes (bool): If True, attempts to process and inline iframe content.
             Default: False.
         remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
@@ -919,6 +921,7 @@ class CrawlerRunConfig():
         ignore_body_visibility: bool = True,
         scan_full_page: bool = False,
         scroll_delay: float = 0.2,
+        max_scroll_steps: Optional[int] = None,
         process_iframes: bool = False,
         remove_overlay_elements: bool = False,
         simulate_user: bool = False,
@@ -1017,6 +1020,7 @@ class CrawlerRunConfig():
         self.ignore_body_visibility = ignore_body_visibility
         self.scan_full_page = scan_full_page
         self.scroll_delay = scroll_delay
+        self.max_scroll_steps = max_scroll_steps
         self.process_iframes = process_iframes
         self.remove_overlay_elements = remove_overlay_elements
         self.simulate_user = simulate_user
@@ -1158,6 +1162,7 @@ class CrawlerRunConfig():
            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
            scan_full_page=kwargs.get("scan_full_page", False),
            scroll_delay=kwargs.get("scroll_delay", 0.2),
+           max_scroll_steps=kwargs.get("max_scroll_steps"),
            process_iframes=kwargs.get("process_iframes", False),
            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
            simulate_user=kwargs.get("simulate_user", False),
@@ -1267,6 +1272,7 @@ class CrawlerRunConfig():
            "ignore_body_visibility": self.ignore_body_visibility,
            "scan_full_page": self.scan_full_page,
            "scroll_delay": self.scroll_delay,
+           "max_scroll_steps": self.max_scroll_steps,
            "process_iframes": self.process_iframes,
            "remove_overlay_elements": self.remove_overlay_elements,
            "simulate_user": self.simulate_user,
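
A minimal sketch of the new config surface, based only on the hunks above (the
values are illustrative; the round-trip behavior follows from to_dict() and
from_kwargs() using the same "max_scroll_steps" key):

    from crawl4ai.async_configs import CrawlerRunConfig

    # max_scroll_steps sits alongside the existing full-page-scan knobs.
    config = CrawlerRunConfig(scan_full_page=True, scroll_delay=0.2, max_scroll_steps=10)

    # The new key shows up in to_dict() and is read back by from_kwargs().
    assert config.to_dict()["max_scroll_steps"] == 10
    assert CrawlerRunConfig.from_kwargs({"max_scroll_steps": 10}).max_scroll_steps == 10
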
""" try: @@ -1129,9 +1132,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): dimensions = await self.get_page_dimensions(page) total_height = dimensions["height"] + scroll_step_count = 0 while current_position < total_height: + #### + # NEW FEATURE: Check if we've reached the maximum allowed scroll steps + # This prevents infinite scrolling on very long pages or infinite scroll scenarios + # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior) + #### + if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps: + break current_position = min(current_position + viewport_height, total_height) await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + + # Increment the step counter for max_scroll_steps tracking + scroll_step_count += 1 + # await page.evaluate(f"window.scrollTo(0, {current_position})") # await asyncio.sleep(scroll_delay) diff --git a/tests/general/test_max_scroll.py b/tests/general/test_max_scroll.py new file mode 100644 index 00000000..1cf8908c --- /dev/null +++ b/tests/general/test_max_scroll.py @@ -0,0 +1,115 @@ +""" +Sample script to test the max_scroll_steps parameter implementation +""" +import asyncio +import os +import sys + +# Get the grandparent directory +grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(grandparent_dir) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + + + +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import CrawlerRunConfig + +async def test_max_scroll_steps(): + """ + Test the max_scroll_steps parameter with different configurations + """ + print("šŸš€ Testing max_scroll_steps parameter implementation") + print("=" * 60) + + async with AsyncWebCrawler(verbose=True) as crawler: + + # Test 1: Without max_scroll_steps (unlimited scrolling) + print("\\nšŸ“‹ Test 1: Unlimited scrolling (max_scroll_steps=None)") + config1 = CrawlerRunConfig( + scan_full_page=True, + scroll_delay=0.1, + max_scroll_steps=None, # Default behavior + verbose=True + ) + + print(f"Config: scan_full_page={config1.scan_full_page}, max_scroll_steps={config1.max_scroll_steps}") + + try: + result1 = await crawler.arun( + url="https://example.com", # Simple page for testing + config=config1 + ) + print(f"āœ… Test 1 Success: Crawled {len(result1.markdown)} characters") + except Exception as e: + print(f"āŒ Test 1 Failed: {e}") + + # Test 2: With limited scroll steps + print("\\nšŸ“‹ Test 2: Limited scrolling (max_scroll_steps=3)") + config2 = CrawlerRunConfig( + scan_full_page=True, + scroll_delay=0.1, + max_scroll_steps=3, # Limit to 3 scroll steps + verbose=True + ) + + print(f"Config: scan_full_page={config2.scan_full_page}, max_scroll_steps={config2.max_scroll_steps}") + + try: + result2 = await crawler.arun( + url="https://techcrunch.com/", # Another test page + config=config2 + ) + print(f"āœ… Test 2 Success: Crawled {len(result2.markdown)} characters") + except Exception as e: + print(f"āŒ Test 2 Failed: {e}") + + # Test 3: Test serialization/deserialization + print("\\nšŸ“‹ Test 3: Configuration serialization test") + config3 = CrawlerRunConfig( + scan_full_page=True, + max_scroll_steps=5, + scroll_delay=0.2 + ) + + # Test to_dict + config_dict = config3.to_dict() + print(f"Serialized max_scroll_steps: {config_dict.get('max_scroll_steps')}") + + # Test from_kwargs + config4 = CrawlerRunConfig.from_kwargs({ + 'scan_full_page': True, + 'max_scroll_steps': 7, + 
diff --git a/tests/general/test_max_scroll.py b/tests/general/test_max_scroll.py
new file mode 100644
index 00000000..1cf8908c
--- /dev/null
+++ b/tests/general/test_max_scroll.py
@@ -0,0 +1,114 @@
+"""
+Sample script to test the max_scroll_steps parameter implementation.
+"""
+import asyncio
+import os
+import sys
+
+# Make the repository root importable when this script is run directly
+grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(grandparent_dir)
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+
+async def test_max_scroll_steps():
+    """
+    Test the max_scroll_steps parameter with different configurations.
+    """
+    print("šŸš€ Testing max_scroll_steps parameter implementation")
+    print("=" * 60)
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+
+        # Test 1: Without max_scroll_steps (unlimited scrolling)
+        print("\nšŸ“‹ Test 1: Unlimited scrolling (max_scroll_steps=None)")
+        config1 = CrawlerRunConfig(
+            scan_full_page=True,
+            scroll_delay=0.1,
+            max_scroll_steps=None,  # Default behavior
+            verbose=True
+        )
+
+        print(f"Config: scan_full_page={config1.scan_full_page}, max_scroll_steps={config1.max_scroll_steps}")
+
+        try:
+            result1 = await crawler.arun(
+                url="https://example.com",  # Simple page for testing
+                config=config1
+            )
+            print(f"āœ… Test 1 Success: Crawled {len(result1.markdown)} characters")
+        except Exception as e:
+            print(f"āŒ Test 1 Failed: {e}")
+
+        # Test 2: With limited scroll steps
+        print("\nšŸ“‹ Test 2: Limited scrolling (max_scroll_steps=3)")
+        config2 = CrawlerRunConfig(
+            scan_full_page=True,
+            scroll_delay=0.1,
+            max_scroll_steps=3,  # Limit to 3 scroll steps
+            verbose=True
+        )
+
+        print(f"Config: scan_full_page={config2.scan_full_page}, max_scroll_steps={config2.max_scroll_steps}")
+
+        try:
+            result2 = await crawler.arun(
+                url="https://techcrunch.com/",  # Long page that actually scrolls
+                config=config2
+            )
+            print(f"āœ… Test 2 Success: Crawled {len(result2.markdown)} characters")
+        except Exception as e:
+            print(f"āŒ Test 2 Failed: {e}")
+
+        # Test 3: Serialization/deserialization round trip
+        print("\nšŸ“‹ Test 3: Configuration serialization test")
+        config3 = CrawlerRunConfig(
+            scan_full_page=True,
+            max_scroll_steps=5,
+            scroll_delay=0.2
+        )
+
+        # Test to_dict
+        config_dict = config3.to_dict()
+        print(f"Serialized max_scroll_steps: {config_dict.get('max_scroll_steps')}")
+
+        # Test from_kwargs
+        config4 = CrawlerRunConfig.from_kwargs({
+            'scan_full_page': True,
+            'max_scroll_steps': 7,
+            'scroll_delay': 0.3
+        })
+        print(f"Deserialized max_scroll_steps: {config4.max_scroll_steps}")
+        print("āœ… Test 3 Success: Serialization works correctly")
+
+        # Test 4: Edge case - max_scroll_steps=0 (should not scroll at all)
+        print("\nšŸ“‹ Test 4: Edge case (max_scroll_steps=0)")
+        config5 = CrawlerRunConfig(
+            scan_full_page=True,
+            max_scroll_steps=0,
+            verbose=True
+        )
+
+        try:
+            result5 = await crawler.arun(
+                url="https://techcrunch.com/",
+                config=config5
+            )
+            print(f"āœ… Test 4 Success: No scrolling performed, crawled {len(result5.markdown)} characters")
+        except Exception as e:
+            print(f"āŒ Test 4 Failed: {e}")
+
+    print("\n" + "=" * 60)
+    print("šŸŽ‰ All tests completed!")
+    print("\nThe max_scroll_steps parameter is working correctly:")
+    print("- None: Unlimited scrolling (default behavior)")
+    print("- Positive integer: Limits scroll steps to that number")
+    print("- 0: No scrolling performed")
+    print("- Properly serializes/deserializes in config")
+
+
+if __name__ == "__main__":
+    print("Starting max_scroll_steps test...")
+    asyncio.run(test_max_scroll_steps())
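
For reference, a short usage sketch of the intended use case, bounding the
scroll work on very long or infinite-scroll pages so a full-page scan
terminates in predictable time (the URL and step count are illustrative):

    import asyncio
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.async_configs import CrawlerRunConfig

    async def main():
        # Cap the full-page scan at 20 scroll steps so the crawl finishes
        # even on an endless feed; drop the cap to restore old behavior.
        config = CrawlerRunConfig(scan_full_page=True, scroll_delay=0.2, max_scroll_steps=20)
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url="https://news.ycombinator.com/", config=config)
            print(f"Crawled {len(result.markdown)} characters")

    asyncio.run(main())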