feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages
that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content
  appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to
fully capture with traditional scrolling techniques.
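
For quick orientation, a minimal usage sketch (a sketch only: the selector,
URL, and scroll numbers are placeholders; the parameter names mirror those
exercised in tests/test_virtual_scroll.py below):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig

    async def main():
        # Placeholder selector and URL; tune scroll_count to the feed's length
        virtual_config = VirtualScrollConfig(
            container_selector="#feed",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2,
        )
        config = CrawlerRunConfig(virtual_scroll_config=virtual_config)
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/feed", config=config)
            # result.html holds the merged, deduplicated content
            print(len(result.html))

    asyncio.run(main())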
tests/test_virtual_scroll.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""
Test virtual scroll implementation according to the design:
- Create a page with virtual scroll that replaces content
- Verify all 1000 items are captured
"""

import asyncio
import os

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig

async def test_virtual_scroll():
    """Test virtual scroll with content replacement (true virtual scroll)."""

    # Create test HTML with a true virtual scroll that replaces content
    test_html = '''
    <html>
    <head>
        <style>
            #container {
                height: 500px;
                overflow-y: auto;
                border: 1px solid #ccc;
            }
            .item {
                height: 50px;
                padding: 10px;
                border-bottom: 1px solid #eee;
            }
        </style>
    </head>
    <body>
        <h1>Virtual Scroll Test - 1000 Items</h1>
        <div id="container"></div>
        <script>
            // True virtual scroll that REPLACES content
            const container = document.getElementById('container');
            const totalItems = 1000;
            const itemsPerPage = 10;  // Only show 10 items at a time
            let currentStartIndex = 0;

            // All our data
            const allData = [];
            for (let i = 0; i < totalItems; i++) {
                allData.push({
                    id: i,
                    text: `Item ${i + 1} of ${totalItems} - Unique ID: ${i}`
                });
            }

            // Function to render the current page
            function renderPage(startIndex) {
                const items = [];
                const endIndex = Math.min(startIndex + itemsPerPage, totalItems);

                for (let i = startIndex; i < endIndex; i++) {
                    const item = allData[i];
                    items.push(`<div class="item" data-index="${item.id}">${item.text}</div>`);
                }

                // REPLACE container content (virtual scroll)
                container.innerHTML = items.join('');
                currentStartIndex = startIndex;
            }

            // Initial render
            renderPage(0);

            // Handle scroll
            container.addEventListener('scroll', () => {
                const scrollTop = container.scrollTop;
                const scrollHeight = container.scrollHeight;
                const clientHeight = container.clientHeight;

                // Calculate which page to show based on scroll position;
                // this creates the virtual scroll effect
                if (scrollTop + clientHeight >= scrollHeight - 50) {
                    // Load the next page
                    const nextIndex = currentStartIndex + itemsPerPage;
                    if (nextIndex < totalItems) {
                        renderPage(nextIndex);
                        // Reset scroll near the top so scrolling can continue
                        container.scrollTop = 10;
                    }
                }
            });
        </script>
    </body>
    </html>
    '''
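
    # The page above implements the "content replaced" scenario - one of the
    # three cases the virtual scroll handler distinguishes (no change, content
    # appended, content replaced). Each render wipes the container, so the
    # crawler has to capture and merge HTML chunks while scrolling to see all
    # 1000 items.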

    # Save test HTML to a file
    import tempfile

    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        test_file_path = f.name

    httpd = None
    old_cwd = os.getcwd()

    try:
        # Start a simple HTTP server serving the temp file's directory
        import http.server
        import socketserver
        import threading
        import random

        os.chdir(os.path.dirname(test_file_path))

        # Find an available port
        for _ in range(10):
            port = random.randint(8000, 9999)
            try:
                handler = http.server.SimpleHTTPRequestHandler
                httpd = socketserver.TCPServer(("", port), handler)
                break
            except OSError:
                continue

        if httpd is None:
            raise RuntimeError("Could not find an available port")

        server_thread = threading.Thread(target=httpd.serve_forever)
        server_thread.daemon = True
        server_thread.start()

        # Give the server time to start
        await asyncio.sleep(0.5)

        # Configure virtual scroll.
        # With 10 items per page and 1000 total we need 100 page swaps,
        # so 120 scrolls gives a comfortable margin.
        virtual_config = VirtualScrollConfig(
            container_selector="#container",
            scroll_count=120,
            scroll_by="container_height",  # scroll by one container height per step
            wait_after_scroll=0.1,  # quick wait is enough for this local test
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        browser_config = BrowserConfig(headless=False)

        async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/{os.path.basename(test_file_path)}",
                config=config,
            )
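
            # result.html is the merged output: HTML chunks captured during
            # scrolling are merged with text-based deduplication, so items
            # from replaced pages each appear once.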

            # Count all items in the result
            import re

            items = re.findall(r'data-index="(\d+)"', result.html)
            unique_indices = sorted(set(int(idx) for idx in items))

            print(f"\n{'=' * 60}")
            print("TEST RESULTS:")
            print(f"HTML Length: {len(result.html)}")
            print(f"Total items found: {len(items)}")
            print(f"Unique items: {len(unique_indices)}")

            if unique_indices:
                print(f"Item indices: {min(unique_indices)} to {max(unique_indices)}")
                print("Expected: 0 to 999")

            # Check for gaps
            expected = set(range(1000))
            actual = set(unique_indices)
            missing = expected - actual

            if missing:
                print(f"\n❌ FAILED! Missing {len(missing)} items")
                print(f"Missing indices: {sorted(missing)[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print("\n✅ SUCCESS! All 1000 items captured!")

            # Show some sample items
            print("\nSample items from result:")
            sample_items = re.findall(r'<div class="item"[^>]*>([^<]+)</div>', result.html)[:5]
            for item in sample_items:
                print(f"  - {item}")

            print(f"{'=' * 60}\n")

    finally:
        # Clean up
        if httpd:
            httpd.shutdown()
        os.chdir(old_cwd)
        os.unlink(test_file_path)


if __name__ == "__main__":
    asyncio.run(test_virtual_scroll())