""" Test virtual scroll implementation according to the design: - Create a page with virtual scroll that replaces content - Verify all 1000 items are captured """ import asyncio import os from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig async def test_virtual_scroll(): """Test virtual scroll with content replacement (true virtual scroll)""" # Create test HTML with true virtual scroll that replaces content test_html = '''

Virtual Scroll Test - 1000 Items


    # Save the test HTML to a file
    import tempfile

    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        test_file_path = f.name

    httpd = None
    old_cwd = os.getcwd()
    try:
        # Serve the test file over a simple local HTTP server
        import http.server
        import random
        import socketserver
        import threading

        # Find an available port
        for _ in range(10):
            PORT = random.randint(8000, 9999)
            try:
                Handler = http.server.SimpleHTTPRequestHandler
                os.chdir(os.path.dirname(test_file_path))
                httpd = socketserver.TCPServer(("", PORT), Handler)
                break
            except OSError:
                continue
        if httpd is None:
            raise RuntimeError("Could not find an available port")

        server_thread = threading.Thread(target=httpd.serve_forever)
        server_thread.daemon = True
        server_thread.start()

        # Give the server time to start
        await asyncio.sleep(0.5)

        # Configure virtual scroll.
        # With 10 items per page and 1000 total, we need 100 pages;
        # 120 scrolls leaves headroom to ensure we capture everything.
        virtual_config = VirtualScrollConfig(
            container_selector="#container",
            scroll_count=120,
            scroll_by="container_height",  # scroll one container height per step
            wait_after_scroll=0.1,         # short wait keeps the test fast
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        browser_config = BrowserConfig(headless=False)

        async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{PORT}/{os.path.basename(test_file_path)}",
                config=config,
            )

            # Count all items in the merged result
            import re

            items = re.findall(r'data-index="(\d+)"', result.html)
            unique_indices = sorted(set(int(idx) for idx in items))

            print(f"\n{'=' * 60}")
            print("TEST RESULTS:")
            print(f"HTML length: {len(result.html)}")
            print(f"Total items found: {len(items)}")
            print(f"Unique items: {len(unique_indices)}")
            if unique_indices:
                print(f"Item indices: {min(unique_indices)} to {max(unique_indices)}")
                print("Expected: 0 to 999")

            # Check for gaps
            expected = set(range(1000))
            actual = set(unique_indices)
            missing = expected - actual

            if missing:
                print(f"\n❌ FAILED! Missing {len(missing)} items")
                print(f"Missing indices: {sorted(missing)[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print("\n✅ SUCCESS! All 1000 items captured!")

            # Show some sample items
            print("\nSample items from result:")
            sample_items = re.findall(
                r'<div class="item"[^>]*>([^<]+)</div>', result.html
            )[:5]
            for item in sample_items:
                print(f"  - {item}")

            print(f"{'=' * 60}\n")

    finally:
        # Clean up
        if httpd:
            httpd.shutdown()
        os.chdir(old_cwd)
        os.unlink(test_file_path)


if __name__ == "__main__":
    asyncio.run(test_virtual_scroll())