feat: Add virtual scroll support for modern web scraping

Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc). Key features: - New VirtualScrollConfig class for configuring virtual scroll behavior - Automatic detection of three scrolling scenarios: no change, content appended, content replaced - Intelligent HTML chunk capture and merging with deduplication - 100% content capture from virtual scroll pages - Seamless integration with existing extraction strategies - JavaScript-based detection and capture for performance - Tree-based DOM merging with text-based deduplication Documentation: - Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md - API reference updates in parameters.md and page-interaction.md - Blog article explaining the solution and techniques - Complete examples with local test server Testing: - Full test suite achieving 100% capture of 1000 items - Examples for Twitter timeline, Instagram grid scenarios - Local test server with different scrolling behaviors This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
2025-06-29 20:41:37 +08:00
parent 539a324cf6
commit a353515271
18 changed files with 2194 additions and 6 deletions
--- a/docs/examples/assets/instagram_grid_result.png
+++ b/docs/examples/assets/instagram_grid_result.png
--- a/docs/examples/assets/virtual_scroll_append_only.html
+++ b/docs/examples/assets/virtual_scroll_append_only.html
@@ -0,0 +1,132 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Append-Only Scroll (Traditional Infinite Scroll)</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #f5f5f5;
+        }
+        
+        h1 {
+            color: #333;
+            text-align: center;
+        }
+        
+        .posts-container {
+            max-width: 800px;
+            margin: 0 auto;
+            background: white;
+            border: 1px solid #ddd;
+            border-radius: 5px;
+            padding: 20px;
+        }
+        
+        .post {
+            background: #f9f9f9;
+            padding: 15px;
+            margin-bottom: 15px;
+            border-radius: 5px;
+            border: 1px solid #eee;
+        }
+        
+        .post-title {
+            font-size: 18px;
+            font-weight: bold;
+            color: #2c3e50;
+            margin-bottom: 10px;
+        }
+        
+        .post-content {
+            color: #555;
+            line-height: 1.6;
+        }
+        
+        .loading {
+            text-align: center;
+            padding: 20px;
+            color: #888;
+        }
+    </style>
+</head>
+<body>
+    <h1>Traditional Infinite Scroll Demo</h1>
+    <p style="text-align: center; color: #666;">This appends new content without removing old content</p>
+    <div class="posts-container"></div>
+    
+    <script>
+        // Traditional infinite scroll - APPENDS content
+        // Element that receives every page of posts; its existing children are kept.
+        const container = document.querySelector('.posts-container');
+        // Upper bound on how many posts the demo produces.
+        const totalPosts = 200;
+        // Number of posts added by one load.
+        const postsPerPage = 20;
+        // Count of posts appended so far; also the index of the next post to create.
+        let loadedPosts = 0;
+        // True while a simulated load is pending, so overlapping loads are ignored.
+        let isLoading = false;
+        
+        // Generate fake post data
+        function generatePost(index) {
+            return {
+                id: index,
+                title: `Post Title #${index + 1}`,
+                content: `This is the content of post ${index + 1}. In traditional infinite scroll, new content is appended to existing content. The DOM keeps growing. Post ID: ${index}`
+            };
+        }
+        
+        // Load more posts - APPENDS to existing content
+        function loadMorePosts() {
+            if (isLoading || loadedPosts >= totalPosts) return;
+            
+            isLoading = true;
+            
+            // Show loading indicator
+            const loadingDiv = document.createElement('div');
+            loadingDiv.className = 'loading';
+            loadingDiv.textContent = 'Loading more posts...';
+            container.appendChild(loadingDiv);
+            
+            // Simulate network delay
+            setTimeout(() => {
+                // Remove loading indicator
+                container.removeChild(loadingDiv);
+                
+                // Add new posts
+                const fragment = document.createDocumentFragment();
+                const endIndex = Math.min(loadedPosts + postsPerPage, totalPosts);
+                
+                for (let i = loadedPosts; i < endIndex; i++) {
+                    const post = generatePost(i);
+                    const postElement = document.createElement('div');
+                    postElement.className = 'post';
+                    postElement.setAttribute('data-post-id', post.id);
+                    postElement.innerHTML = `
+                        <div class="post-title">${post.title}</div>
+                        <div class="post-content">${post.content}</div>
+                    `;
+                    fragment.appendChild(postElement);
+                }
+                
+                // APPEND new posts to existing ones
+                container.appendChild(fragment);
+                loadedPosts = endIndex;
+                isLoading = false;
+                
+                console.log(`Loaded ${loadedPosts} of ${totalPosts} posts`);
+            }, 300);
+        }
+        
+        // Initial load
+        loadMorePosts();
+        
+        // Load more on scroll
+        window.addEventListener('scroll', () => {
+            const scrollBottom = window.innerHeight + window.scrollY;
+            const threshold = document.body.offsetHeight - 500;
+            
+            if (scrollBottom >= threshold) {
+                loadMorePosts();
+            }
+        });
+    </script>
+</body>
+</html>
--- a/docs/examples/assets/virtual_scroll_instagram_grid.html
+++ b/docs/examples/assets/virtual_scroll_instagram_grid.html
@@ -0,0 +1,158 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Instagram-like Grid Virtual Scroll</title>
+    <style>
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #fafafa;
+        }
+        
+        h1 {
+            text-align: center;
+            color: #262626;
+            font-weight: 300;
+        }
+        
+        .feed-container {
+            max-width: 935px;
+            margin: 0 auto;
+            height: 800px;
+            overflow-y: auto;
+            background: white;
+            border: 1px solid #dbdbdb;
+            border-radius: 3px;
+        }
+        
+        .grid {
+            display: grid;
+            grid-template-columns: repeat(3, 1fr);
+            gap: 28px;
+            padding: 28px;
+        }
+        
+        .post {
+            aspect-ratio: 1;
+            background: #f0f0f0;
+            border-radius: 3px;
+            position: relative;
+            overflow: hidden;
+            cursor: pointer;
+        }
+        
+        .post:hover .overlay {
+            opacity: 1;
+        }
+        
+        .post img {
+            width: 100%;
+            height: 100%;
+            object-fit: cover;
+        }
+        
+        .overlay {
+            position: absolute;
+            top: 0;
+            left: 0;
+            right: 0;
+            bottom: 0;
+            background: rgba(0, 0, 0, 0.3);
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            color: white;
+            font-size: 14px;
+            opacity: 0;
+            transition: opacity 0.2s;
+        }
+        
+        .stats {
+            display: flex;
+            gap: 20px;
+        }
+    </style>
+</head>
+<body>
+    <h1>Instagram Grid Virtual Scroll</h1>
+    <p style="text-align: center; color: #8e8e8e;">Grid layout with virtual scrolling - only visible rows are rendered</p>
+    <div class="feed-container">
+        <div class="grid" id="grid"></div>
+    </div>
+    
+    <script>
+        // Instagram-like grid virtual scroll
+        const grid = document.getElementById('grid');
+        const container = document.querySelector('.feed-container');
+        const totalPosts = 999; // Instagram style count
+        const postsPerRow = 3;
+        const rowsPerPage = 4; // 12 posts per page
+        const postsPerPage = postsPerRow * rowsPerPage;
+        let currentStartIndex = 0;
+        
+        // Generate fake Instagram post data
+        const allPosts = [];
+        for (let i = 0; i < totalPosts; i++) {
+            allPosts.push({
+                id: i,
+                likes: Math.floor(Math.random() * 10000),
+                comments: Math.floor(Math.random() * 500),
+                imageNumber: (i % 10) + 1 // Cycle through 10 placeholder images
+            });
+        }
+        
+        // Render grid - REPLACES content for performance
+        function renderGrid(startIndex) {
+            const posts = [];
+            const endIndex = Math.min(startIndex + postsPerPage, totalPosts);
+            
+            for (let i = startIndex; i < endIndex; i++) {
+                const post = allPosts[i];
+                posts.push(`
+                    <div class="post" data-post-id="${post.id}">
+                        <img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='400' height='400'%3E%3Crect width='400' height='400' fill='%23${Math.floor(Math.random()*16777215).toString(16)}'/%3E%3Ctext x='50%25' y='50%25' text-anchor='middle' dy='.3em' font-family='Arial' font-size='48' fill='white'%3E${post.id + 1}%3C/text%3E%3C/svg%3E" alt="Post ${post.id + 1}">
+                        <div class="overlay">
+                            <div class="stats">
+                                <span>❤️ ${post.likes.toLocaleString()}</span>
+                                <span>💬 ${post.comments}</span>
+                            </div>
+                        </div>
+                    </div>
+                `);
+            }
+            
+            // REPLACE grid content (virtual scroll)
+            grid.innerHTML = posts.join('');
+            currentStartIndex = startIndex;
+        }
+        
+        // Initial render
+        renderGrid(0);
+        
+        // Handle scroll
+        let scrollTimeout;
+        container.addEventListener('scroll', () => {
+            clearTimeout(scrollTimeout);
+            scrollTimeout = setTimeout(() => {
+                const scrollTop = container.scrollTop;
+                const scrollHeight = container.scrollHeight;
+                const clientHeight = container.clientHeight;
+                
+                // Calculate which "page" we should show
+                const scrollPercentage = scrollTop / (scrollHeight - clientHeight);
+                const targetIndex = Math.floor(scrollPercentage * (totalPosts - postsPerPage) / postsPerPage) * postsPerPage;
+                
+                // When scrolled to bottom, show next page
+                if (scrollTop + clientHeight >= scrollHeight - 100) {
+                    const nextIndex = currentStartIndex + postsPerPage;
+                    if (nextIndex < totalPosts) {
+                        renderGrid(nextIndex);
+                        container.scrollTop = 100; // Reset scroll for continuous experience
+                    }
+                }
+            }, 50);
+        });
+    </script>
+</body>
+</html>
--- a/docs/examples/assets/virtual_scroll_news_feed.html
+++ b/docs/examples/assets/virtual_scroll_news_feed.html
@@ -0,0 +1,210 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>News Feed with Mixed Scroll Behavior</title>
+    <style>
+        body {
+            font-family: Georgia, serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #f8f8f8;
+        }
+        
+        h1 {
+            text-align: center;
+            color: #1a1a1a;
+            font-size: 32px;
+            margin-bottom: 10px;
+        }
+        
+        .description {
+            text-align: center;
+            color: #666;
+            margin-bottom: 20px;
+        }
+        
+        #newsContainer {
+            max-width: 900px;
+            margin: 0 auto;
+            height: 700px;
+            overflow-y: auto;
+            background: white;
+            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+            padding: 20px;
+        }
+        
+        .article {
+            margin-bottom: 30px;
+            padding-bottom: 30px;
+            border-bottom: 1px solid #e0e0e0;
+        }
+        
+        .article:last-child {
+            border-bottom: none;
+        }
+        
+        .article-header {
+            margin-bottom: 15px;
+        }
+        
+        .category {
+            display: inline-block;
+            background: #ff6b6b;
+            color: white;
+            padding: 4px 12px;
+            font-size: 12px;
+            text-transform: uppercase;
+            border-radius: 3px;
+            margin-bottom: 10px;
+        }
+        
+        .headline {
+            font-size: 24px;
+            font-weight: bold;
+            color: #1a1a1a;
+            margin: 10px 0;
+            line-height: 1.3;
+        }
+        
+        .meta {
+            color: #888;
+            font-size: 14px;
+            margin-bottom: 15px;
+        }
+        
+        .content {
+            font-size: 16px;
+            line-height: 1.8;
+            color: #333;
+        }
+        
+        .featured {
+            background: #fff9e6;
+            padding: 20px;
+            border-radius: 5px;
+            margin-bottom: 30px;
+        }
+        
+        .featured .category {
+            background: #ffa500;
+        }
+    </style>
+</head>
+<body>
+    <h1>📰 Dynamic News Feed</h1>
+    <p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
+    <div id="newsContainer"></div>
+    
+    <script>
+        const container = document.getElementById('newsContainer');
+        const totalArticles = 100;
+        const articlesPerPage = 5;
+        let currentRegularIndex = 0;
+        
+        // Categories for variety
+        const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];
+        
+        // Generate article data
+        const featuredArticles = [];
+        const regularArticles = [];
+        
+        // 3 featured articles that always stay
+        for (let i = 0; i < 3; i++) {
+            featuredArticles.push({
+                id: `featured-${i}`,
+                category: 'Featured',
+                headline: `Breaking: Major Story ${i + 1} That Stays Visible`,
+                date: new Date().toLocaleDateString(),
+                content: `This is featured article ${i + 1}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`
+            });
+        }
+        
+        // Regular articles that get virtualized
+        for (let i = 0; i < totalArticles; i++) {
+            regularArticles.push({
+                id: `article-${i}`,
+                category: categories[i % categories.length],
+                headline: `${categories[i % categories.length]} News: Article ${i + 1} of ${totalArticles}`,
+                date: new Date(Date.now() - i * 86400000).toLocaleDateString(),
+                content: `This is regular article ${i + 1}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${i}`
+            });
+        }
+        
+        // Render articles - Featured stay, regular ones are replaced
+        function renderArticles(regularStartIndex) {
+            const html = [];
+            
+            // Always show featured articles
+            featuredArticles.forEach(article => {
+                html.push(`
+                    <div class="article featured" data-article-id="${article.id}">
+                        <div class="article-header">
+                            <span class="category">${article.category}</span>
+                            <h2 class="headline">${article.headline}</h2>
+                            <div class="meta">📅 ${article.date}</div>
+                        </div>
+                        <div class="content">${article.content}</div>
+                    </div>
+                `);
+            });
+            
+            // Add divider
+            html.push('<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>');
+            
+            // Show current page of regular articles (virtual scroll)
+            const endIndex = Math.min(regularStartIndex + articlesPerPage, totalArticles);
+            for (let i = regularStartIndex; i < endIndex; i++) {
+                const article = regularArticles[i];
+                html.push(`
+                    <div class="article" data-article-id="${article.id}">
+                        <div class="article-header">
+                            <span class="category" style="background: ${getCategoryColor(article.category)}">${article.category}</span>
+                            <h2 class="headline">${article.headline}</h2>
+                            <div class="meta">📅 ${article.date}</div>
+                        </div>
+                        <div class="content">${article.content}</div>
+                    </div>
+                `);
+            }
+            
+            container.innerHTML = html.join('');
+            currentRegularIndex = regularStartIndex;
+        }
+        
+        function getCategoryColor(category) {
+            const colors = {
+                'Politics': '#e74c3c',
+                'Technology': '#3498db',
+                'Business': '#2ecc71',
+                'Science': '#9b59b6',
+                'Sports': '#f39c12',
+                'Entertainment': '#e91e63'
+            };
+            return colors[category] || '#95a5a6';
+        }
+        
+        // Initial render
+        renderArticles(0);
+        
+        // Handle scroll
+        container.addEventListener('scroll', () => {
+            const scrollTop = container.scrollTop;
+            const scrollHeight = container.scrollHeight;
+            const clientHeight = container.clientHeight;
+            
+            // When near bottom, load next page of regular articles
+            if (scrollTop + clientHeight >= scrollHeight - 200) {
+                const nextIndex = currentRegularIndex + articlesPerPage;
+                if (nextIndex < totalArticles) {
+                    renderArticles(nextIndex);
+                    // Scroll to where regular articles start
+                    const regularStart = document.querySelector('.article:not(.featured)');
+                    if (regularStart) {
+                        container.scrollTop = regularStart.offsetTop - 100;
+                    }
+                }
+            }
+        });
+    </script>
+</body>
+</html>
--- a/docs/examples/assets/virtual_scroll_twitter_like.html
+++ b/docs/examples/assets/virtual_scroll_twitter_like.html
@@ -0,0 +1,122 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Twitter-like Virtual Scroll</title>
+    <style>
+        body {
+            font-family: Arial, sans-serif;
+            margin: 0;
+            padding: 20px;
+            background-color: #f0f2f5;
+        }
+        
+        h1 {
+            color: #1da1f2;
+            text-align: center;
+        }
+        
+        #timeline {
+            max-width: 600px;
+            margin: 0 auto;
+            height: 600px;
+            overflow-y: auto;
+            background: white;
+            border: 1px solid #e1e8ed;
+            border-radius: 10px;
+        }
+        
+        .tweet {
+            padding: 15px;
+            border-bottom: 1px solid #e1e8ed;
+            min-height: 80px;
+        }
+        
+        .tweet:hover {
+            background-color: #f7f9fa;
+        }
+        
+        .author {
+            font-weight: bold;
+            color: #14171a;
+            margin-bottom: 5px;
+        }
+        
+        .content {
+            color: #14171a;
+            line-height: 1.5;
+        }
+        
+        .stats {
+            color: #657786;
+            font-size: 14px;
+            margin-top: 10px;
+        }
+    </style>
+</head>
+<body>
+    <h1>Virtual Scroll Demo - Twitter Style</h1>
+    <p style="text-align: center; color: #666;">This simulates Twitter's timeline where content is replaced as you scroll</p>
+    <div id="timeline"></div>
+    
+    <script>
+        // Simulate Twitter-like virtual scrolling where DOM elements are replaced
+        const timeline = document.getElementById('timeline');
+        const totalTweets = 500;
+        const tweetsPerPage = 10;
+        let currentIndex = 0;
+        
+        // Generate fake tweet data
+        const allTweets = [];
+        for (let i = 0; i < totalTweets; i++) {
+            allTweets.push({
+                id: i,
+                author: `User_${i + 1}`,
+                content: `This is tweet #${i + 1} of ${totalTweets}. Virtual scrolling replaces DOM elements to maintain performance. Unique content ID: ${i}`,
+                likes: Math.floor(Math.random() * 1000),
+                retweets: Math.floor(Math.random() * 500)
+            });
+        }
+        
+        // Render tweets - REPLACES content
+        function renderTweets(startIndex) {
+            const tweets = [];
+            const endIndex = Math.min(startIndex + tweetsPerPage, totalTweets);
+            
+            for (let i = startIndex; i < endIndex; i++) {
+                const tweet = allTweets[i];
+                tweets.push(`
+                    <div class="tweet" data-tweet-id="${tweet.id}">
+                        <div class="author">@${tweet.author}</div>
+                        <div class="content">${tweet.content}</div>
+                        <div class="stats">❤️ ${tweet.likes} | 🔁 ${tweet.retweets}</div>
+                    </div>
+                `);
+            }
+            
+            // REPLACE entire content (virtual scroll behavior)
+            timeline.innerHTML = tweets.join('');
+            currentIndex = startIndex;
+        }
+        
+        // Initial render
+        renderTweets(0);
+        
+        // Handle scroll
+        timeline.addEventListener('scroll', () => {
+            const scrollTop = timeline.scrollTop;
+            const scrollHeight = timeline.scrollHeight;
+            const clientHeight = timeline.clientHeight;
+            
+            // When near bottom, load next page
+            if (scrollTop + clientHeight >= scrollHeight - 100) {
+                const nextIndex = currentIndex + tweetsPerPage;
+                if (nextIndex < totalTweets) {
+                    renderTweets(nextIndex);
+                    // Small scroll adjustment for continuous scrolling
+                    timeline.scrollTop = 50;
+                }
+            }
+        });
+    </script>
+</body>
+</html>
--- a/docs/examples/virtual_scroll_example.py
+++ b/docs/examples/virtual_scroll_example.py
@@ -0,0 +1,367 @@
+"""
+Example of using the virtual scroll feature to capture content from pages
+with virtualized scrolling (like Twitter, Instagram, or other infinite scroll feeds).
+
+This example demonstrates virtual scroll with a local test server serving
+different types of scrolling behaviors from HTML files in the assets directory.
+"""
+
+import asyncio
+import os
+import http.server
+import socketserver
+import threading
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig
+
+# Get the assets directory path
+ASSETS_DIR = Path(__file__).parent / "assets"
+
+class TestServer:
+    """Simple HTTP server to serve our test HTML files"""
+    
+    def __init__(self, port=8080):
+        self.port = port
+        self.httpd = None
+        self.server_thread = None
+        
+    async def start(self):
+        """Start the test server"""
+        Handler = http.server.SimpleHTTPRequestHandler
+        
+        # Save current directory and change to assets directory
+        self.original_cwd = os.getcwd()
+        os.chdir(ASSETS_DIR)
+        
+        # Try to find an available port
+        for _ in range(10):
+            try:
+                self.httpd = socketserver.TCPServer(("", self.port), Handler)
+                break
+            except OSError:
+                self.port += 1
+                
+        if self.httpd is None:
+            raise RuntimeError("Could not find available port")
+            
+        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
+        self.server_thread.daemon = True
+        self.server_thread.start()
+        
+        # Give server time to start
+        await asyncio.sleep(0.5)
+        
+        print(f"Test server started on http://localhost:{self.port}")
+        return self.port
+        
+    def stop(self):
+        """Stop the test server"""
+        if self.httpd:
+            self.httpd.shutdown()
+        # Restore original directory
+        if hasattr(self, 'original_cwd'):
+            os.chdir(self.original_cwd)
+            
+
+async def example_twitter_like_virtual_scroll():
+    """
+    Example 1: Twitter-like virtual scroll where content is REPLACED.
+    This is the classic virtual scroll use case - only visible items exist in DOM.
+    """
+    print("\n" + "="*60)
+    print("EXAMPLE 1: Twitter-like Virtual Scroll")
+    print("="*60)
+    
+    server = TestServer()
+    port = await server.start()
+    
+    try:
+        # Configure virtual scroll for Twitter-like timeline
+        virtual_config = VirtualScrollConfig(
+            container_selector="#timeline",  # The scrollable container
+            scroll_count=50,  # Scroll up to 50 times to get all content
+            scroll_by="container_height",  # Scroll by container's height
+            wait_after_scroll=0.3  # Wait 300ms after each scroll
+        )
+        
+        config = CrawlerRunConfig(
+            virtual_scroll_config=virtual_config,
+            cache_mode=CacheMode.BYPASS
+        )
+        
+        # TIP: Set headless=False to watch the scrolling happen!
+        browser_config = BrowserConfig(
+            headless=False,
+            viewport={"width": 1280, "height": 800}
+        )
+        
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                url=f"http://localhost:{port}/virtual_scroll_twitter_like.html",
+                config=config
+            )
+            
+            # Count tweets captured
+            import re
+            tweets = re.findall(r'data-tweet-id="(\d+)"', result.html)
+            unique_tweets = sorted(set(int(id) for id in tweets))
+            
+            print(f"\n📊 Results:")
+            print(f"   Total HTML length: {len(result.html):,} characters")
+            print(f"   Tweets captured: {len(unique_tweets)} unique tweets")
+            if unique_tweets:
+                print(f"   Tweet IDs range: {min(unique_tweets)} to {max(unique_tweets)}")
+                print(f"   Expected range: 0 to 499 (500 tweets total)")
+                
+                if len(unique_tweets) == 500:
+                    print(f"   ✅ SUCCESS! All tweets captured!")
+                else:
+                    print(f"   ⚠️  Captured {len(unique_tweets)}/500 tweets")
+                    
+    finally:
+        server.stop()
+
+
+async def example_traditional_append_scroll():
+    """
+    Example 2: Traditional infinite scroll where content is APPENDED.
+    No virtual scroll needed - all content stays in DOM.
+    """
+    print("\n" + "="*60)
+    print("EXAMPLE 2: Traditional Append-Only Scroll")
+    print("="*60)
+    
+    server = TestServer()
+    port = await server.start()
+    
+    try:
+        # Configure virtual scroll
+        virtual_config = VirtualScrollConfig(
+            container_selector=".posts-container",
+            scroll_count=15,  # Less scrolls needed since content accumulates
+            scroll_by=500,  # Scroll by 500 pixels
+            wait_after_scroll=0.4
+        )
+        
+        config = CrawlerRunConfig(
+            virtual_scroll_config=virtual_config,
+            cache_mode=CacheMode.BYPASS
+        )
+        
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(
+                url=f"http://localhost:{port}/virtual_scroll_append_only.html",
+                config=config
+            )
+            
+            # Count posts
+            import re
+            posts = re.findall(r'data-post-id="(\d+)"', result.html)
+            unique_posts = sorted(set(int(id) for id in posts))
+            
+            print(f"\n📊 Results:")
+            print(f"   Total HTML length: {len(result.html):,} characters")
+            print(f"   Posts captured: {len(unique_posts)} unique posts")
+            
+            if unique_posts:
+                print(f"   Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
+                print(f"   ℹ️  Note: This page appends content, so virtual scroll")
+                print(f"       just helps trigger more loads. All content stays in DOM.")
+                
+    finally:
+        server.stop()
+
+
+async def example_instagram_grid():
+    """
+    Example 3: Instagram-like grid with virtual scroll.
+    Grid layout where only visible rows are rendered.
+    """
+    print("\n" + "="*60)
+    print("EXAMPLE 3: Instagram Grid Virtual Scroll")
+    print("="*60)
+    
+    server = TestServer()
+    port = await server.start()
+    
+    try:
+        # Configure for grid layout
+        virtual_config = VirtualScrollConfig(
+            container_selector=".feed-container",  # Container with the grid
+            scroll_count=100,  # Many scrolls for 999 posts
+            scroll_by="container_height",
+            wait_after_scroll=0.2  # Faster scrolling for grid
+        )
+        
+        config = CrawlerRunConfig(
+            virtual_scroll_config=virtual_config,
+            cache_mode=CacheMode.BYPASS,
+            screenshot=True  # Take a screenshot of the final grid
+        )
+        
+        # Show browser for this visual example
+        browser_config = BrowserConfig(
+            headless=False,
+            viewport={"width": 1200, "height": 900}
+        )
+        
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            result = await crawler.arun(
+                url=f"http://localhost:{port}/virtual_scroll_instagram_grid.html",
+                config=config
+            )
+            
+            # Count posts in grid
+            import re
+            posts = re.findall(r'data-post-id="(\d+)"', result.html)
+            unique_posts = sorted(set(int(id) for id in posts))
+            
+            print(f"\n📊 Results:")
+            print(f"   Posts in grid: {len(unique_posts)} unique posts")
+            if unique_posts:
+                print(f"   Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
+                print(f"   Expected: 0 to 998 (999 posts total)")
+                
+            # Save screenshot
+            if result.screenshot:
+                import base64
+                with open("instagram_grid_result.png", "wb") as f:
+                    f.write(base64.b64decode(result.screenshot))
+                print(f"   📸 Screenshot saved as instagram_grid_result.png")
+                
+    finally:
+        server.stop()
+
+
+async def example_mixed_content():
+    """
+    Example 4: News feed with mixed behavior.
+    Featured articles stay (no virtual scroll), regular articles are virtualized.
+    """
+    print("\n" + "="*60)
+    print("EXAMPLE 4: News Feed with Mixed Behavior")
+    print("="*60)
+    
+    server = TestServer()
+    port = await server.start()
+    
+    try:
+        # Configure virtual scroll
+        virtual_config = VirtualScrollConfig(
+            container_selector="#newsContainer",
+            scroll_count=25,
+            scroll_by="container_height",
+            wait_after_scroll=0.3
+        )
+        
+        config = CrawlerRunConfig(
+            virtual_scroll_config=virtual_config,
+            cache_mode=CacheMode.BYPASS
+        )
+        
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(
+                url=f"http://localhost:{port}/virtual_scroll_news_feed.html",
+                config=config
+            )
+            
+            # Count different types of articles
+            import re
+            featured = re.findall(r'data-article-id="featured-\d+"', result.html)
+            regular = re.findall(r'data-article-id="article-(\d+)"', result.html)
+            
+            print(f"\n📊 Results:")
+            print(f"   Featured articles: {len(set(featured))} (always visible)")
+            print(f"   Regular articles: {len(set(regular))} unique articles")
+            
+            if regular:
+                regular_ids = sorted(set(int(id) for id in regular))
+                print(f"   Regular article IDs: {min(regular_ids)} to {max(regular_ids)}")
+                print(f"   ℹ️  Note: Featured articles stay in DOM, only regular")
+                print(f"       articles are replaced during virtual scroll")
+                
+    finally:
+        server.stop()
+
+
+async def compare_with_without_virtual_scroll():
+    """
+    Comparison: Show the difference between crawling with and without virtual scroll.
+    """
+    print("\n" + "="*60)
+    print("COMPARISON: With vs Without Virtual Scroll")
+    print("="*60)
+    
+    server = TestServer()
+    port = await server.start()
+    
+    try:
+        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"
+        
+        # First, crawl WITHOUT virtual scroll
+        print("\n1️⃣  Crawling WITHOUT virtual scroll...")
+        async with AsyncWebCrawler() as crawler:
+            config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            result_normal = await crawler.arun(url=url, config=config_normal)
+            
+            # Count items
+            import re
+            tweets_normal = len(set(re.findall(r'data-tweet-id="(\d+)"', result_normal.html)))
+            
+        # Then, crawl WITH virtual scroll  
+        print("2️⃣  Crawling WITH virtual scroll...")
+        virtual_config = VirtualScrollConfig(
+            container_selector="#timeline",
+            scroll_count=50,
+            scroll_by="container_height",
+            wait_after_scroll=0.2
+        )
+        
+        config_virtual = CrawlerRunConfig(
+            virtual_scroll_config=virtual_config,
+            cache_mode=CacheMode.BYPASS
+        )
+        
+        async with AsyncWebCrawler() as crawler:
+            result_virtual = await crawler.arun(url=url, config=config_virtual)
+            
+            # Count items
+            tweets_virtual = len(set(re.findall(r'data-tweet-id="(\d+)"', result_virtual.html)))
+            
+        # Compare results
+        print(f"\n📊 Comparison Results:")
+        print(f"   Without virtual scroll: {tweets_normal} tweets (only initial visible)")
+        print(f"   With virtual scroll: {tweets_virtual} tweets (all content captured)")
+        print(f"   Improvement: {tweets_virtual / tweets_normal if tweets_normal > 0 else 'N/A':.1f}x more content!")
+        
+        print(f"\n   HTML size without: {len(result_normal.html):,} characters")
+        print(f"   HTML size with: {len(result_virtual.html):,} characters")
+        
+    finally:
+        server.stop()
+
+
+if __name__ == "__main__":
+    print("""
+╔════════════════════════════════════════════════════════════╗
+║           Virtual Scroll Examples for Crawl4AI             ║
+╚════════════════════════════════════════════════════════════╝
+
+These examples demonstrate different virtual scroll scenarios:
+1. Twitter-like (content replaced) - Classic virtual scroll
+2. Traditional append - Content accumulates 
+3. Instagram grid - Visual grid layout
+4. Mixed behavior - Some content stays, some virtualizes
+
+Starting examples...
+""")
+    
+    # Run all examples
+    asyncio.run(example_twitter_like_virtual_scroll())
+    asyncio.run(example_traditional_append_scroll())
+    asyncio.run(example_instagram_grid())
+    asyncio.run(example_mixed_content())
+    asyncio.run(compare_with_without_virtual_scroll())
+    
+    print("\n✅ All examples completed!")
+    print("\nTIP: Set headless=False in BrowserConfig to watch the scrolling in action!")