feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
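Minimal usage, distilled from the bundled example below (the URL, selector, and counts here are illustrative, not library defaults):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode

    async def main():
        # Describe which container is virtualized and how to scroll it
        scroll_cfg = VirtualScrollConfig(
            container_selector="#feed",      # the scrollable container
            scroll_count=30,                 # maximum number of scroll steps
            scroll_by="container_height",    # or a pixel amount, e.g. 500
            wait_after_scroll=0.3,           # seconds to wait after each scroll
        )
        run_cfg = CrawlerRunConfig(
            virtual_scroll_config=scroll_cfg,
            cache_mode=CacheMode.BYPASS,
        )
        async with AsyncWebCrawler() as crawler:
            # result.html holds the merged, deduplicated content
            result = await crawler.arun(url="https://example.com/feed", config=run_cfg)
            print(len(result.html))

    asyncio.run(main())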
docs/examples/virtual_scroll_example.py (new file, 367 lines)
@@ -0,0 +1,367 @@
"""
|
||||
Example of using the virtual scroll feature to capture content from pages
|
||||
with virtualized scrolling (like Twitter, Instagram, or other infinite scroll feeds).
|
||||
|
||||
This example demonstrates virtual scroll with a local test server serving
|
||||
different types of scrolling behaviors from HTML files in the assets directory.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import http.server
|
||||
import socketserver
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig
|
||||
|
||||
# Get the assets directory path
|
||||
ASSETS_DIR = Path(__file__).parent / "assets"
|
||||
|
||||
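# The virtual scroll handler (per the commit message) distinguishes three
# scenarios after each scroll: content unchanged, content appended (classic
# infinite scroll), and content replaced (true DOM recycling). The examples
# below exercise each behavior against local test pages.
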
class TestServer:
    """Simple HTTP server to serve our test HTML files"""

    def __init__(self, port=8080):
        self.port = port
        self.httpd = None
        self.server_thread = None

    async def start(self):
        """Start the test server"""
        Handler = http.server.SimpleHTTPRequestHandler

        # Save current directory and change to assets directory
        self.original_cwd = os.getcwd()
        os.chdir(ASSETS_DIR)

        # Try to find an available port
        for _ in range(10):
            try:
                self.httpd = socketserver.TCPServer(("", self.port), Handler)
                break
            except OSError:
                self.port += 1

        if self.httpd is None:
            raise RuntimeError("Could not find an available port")

        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()

        # Give server time to start
        await asyncio.sleep(0.5)

        print(f"Test server started on http://localhost:{self.port}")
        return self.port

    def stop(self):
        """Stop the test server"""
        if self.httpd:
            self.httpd.shutdown()
        # Restore original directory
        if hasattr(self, 'original_cwd'):
            os.chdir(self.original_cwd)

async def example_twitter_like_virtual_scroll():
    """
    Example 1: Twitter-like virtual scroll where content is REPLACED.
    This is the classic virtual scroll use case - only visible items exist in DOM.
    """
    print("\n" + "="*60)
    print("EXAMPLE 1: Twitter-like Virtual Scroll")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        # Configure virtual scroll for Twitter-like timeline
        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",  # The scrollable container
            scroll_count=50,                 # Scroll up to 50 times to get all content
            scroll_by="container_height",    # Scroll by container's height
            wait_after_scroll=0.3            # Wait 300ms after each scroll
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        # TIP: Set headless=False to watch the scrolling happen!
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1280, "height": 800}
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_twitter_like.html",
                config=config
            )

            # Count tweets captured
            tweets = re.findall(r'data-tweet-id="(\d+)"', result.html)
            unique_tweets = sorted(set(int(t) for t in tweets))

            print(f"\n📊 Results:")
            print(f"   Total HTML length: {len(result.html):,} characters")
            print(f"   Tweets captured: {len(unique_tweets)} unique tweets")
            if unique_tweets:
                print(f"   Tweet IDs range: {min(unique_tweets)} to {max(unique_tweets)}")
                print(f"   Expected range: 0 to 499 (500 tweets total)")

            if len(unique_tweets) == 500:
                print(f"   ✅ SUCCESS! All tweets captured!")
            else:
                print(f"   ⚠️ Captured {len(unique_tweets)}/500 tweets")

    finally:
        server.stop()

async def example_traditional_append_scroll():
    """
    Example 2: Traditional infinite scroll where content is APPENDED.
    No virtual scroll handling is needed - all content stays in the DOM.
    """
    print("\n" + "="*60)
    print("EXAMPLE 2: Traditional Append-Only Scroll")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        # Configure virtual scroll
        virtual_config = VirtualScrollConfig(
            container_selector=".posts-container",
            scroll_count=15,       # Fewer scrolls needed since content accumulates
            scroll_by=500,         # Scroll by 500 pixels
            wait_after_scroll=0.4
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_append_only.html",
                config=config
            )

            # Count posts
            posts = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted(set(int(p) for p in posts))

            print(f"\n📊 Results:")
            print(f"   Total HTML length: {len(result.html):,} characters")
            print(f"   Posts captured: {len(unique_posts)} unique posts")

            if unique_posts:
                print(f"   Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
                print(f"   ℹ️ Note: This page appends content, so virtual scroll")
                print(f"      just helps trigger more loads. All content stays in the DOM.")

    finally:
        server.stop()

async def example_instagram_grid():
    """
    Example 3: Instagram-like grid with virtual scroll.
    Grid layout where only visible rows are rendered.
    """
    print("\n" + "="*60)
    print("EXAMPLE 3: Instagram Grid Virtual Scroll")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        # Configure for grid layout
        virtual_config = VirtualScrollConfig(
            container_selector=".feed-container",  # Container with the grid
            scroll_count=100,                      # Many scrolls for 999 posts
            scroll_by="container_height",
            wait_after_scroll=0.2                  # Faster scrolling for grid
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS,
            screenshot=True  # Take a screenshot of the final grid
        )

        # Show browser for this visual example
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1200, "height": 900}
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_instagram_grid.html",
                config=config
            )

            # Count posts in grid
            posts = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted(set(int(p) for p in posts))

            print(f"\n📊 Results:")
            print(f"   Posts in grid: {len(unique_posts)} unique posts")
            if unique_posts:
                print(f"   Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
                print(f"   Expected: 0 to 998 (999 posts total)")

            # Save screenshot
            if result.screenshot:
                with open("instagram_grid_result.png", "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"   📸 Screenshot saved as instagram_grid_result.png")

    finally:
        server.stop()

async def example_mixed_content():
    """
    Example 4: News feed with mixed behavior.
    Featured articles stay (no virtual scroll), regular articles are virtualized.
    """
    print("\n" + "="*60)
    print("EXAMPLE 4: News Feed with Mixed Behavior")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        # Configure virtual scroll
        virtual_config = VirtualScrollConfig(
            container_selector="#newsContainer",
            scroll_count=25,
            scroll_by="container_height",
            wait_after_scroll=0.3
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_news_feed.html",
                config=config
            )

            # Count different types of articles
            featured = re.findall(r'data-article-id="featured-\d+"', result.html)
            regular = re.findall(r'data-article-id="article-(\d+)"', result.html)

            print(f"\n📊 Results:")
            print(f"   Featured articles: {len(set(featured))} (always visible)")
            print(f"   Regular articles: {len(set(regular))} unique articles")

            if regular:
                regular_ids = sorted(set(int(r) for r in regular))
                print(f"   Regular article IDs: {min(regular_ids)} to {max(regular_ids)}")
                print(f"   ℹ️ Note: Featured articles stay in the DOM; only regular")
                print(f"      articles are replaced during virtual scroll")

    finally:
        server.stop()

async def compare_with_without_virtual_scroll():
    """
    Comparison: Show the difference between crawling with and without virtual scroll.
    """
    print("\n" + "="*60)
    print("COMPARISON: With vs Without Virtual Scroll")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"

        # First, crawl WITHOUT virtual scroll
        print("\n1️⃣ Crawling WITHOUT virtual scroll...")
        async with AsyncWebCrawler() as crawler:
            config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            result_normal = await crawler.arun(url=url, config=config_normal)

        # Count items
        tweets_normal = len(set(re.findall(r'data-tweet-id="(\d+)"', result_normal.html)))

        # Then, crawl WITH virtual scroll
        print("2️⃣ Crawling WITH virtual scroll...")
        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2
        )

        config_virtual = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler() as crawler:
            result_virtual = await crawler.arun(url=url, config=config_virtual)

        # Count items
        tweets_virtual = len(set(re.findall(r'data-tweet-id="(\d+)"', result_virtual.html)))

        # Compare results
        print(f"\n📊 Comparison Results:")
        print(f"   Without virtual scroll: {tweets_normal} tweets (only the initially visible ones)")
        print(f"   With virtual scroll: {tweets_virtual} tweets (all content captured)")
        if tweets_normal > 0:
            print(f"   Improvement: {tweets_virtual / tweets_normal:.1f}x more content!")
        else:
            print(f"   Improvement: N/A (no tweets captured without virtual scroll)")

        print(f"\n   HTML size without: {len(result_normal.html):,} characters")
        print(f"   HTML size with: {len(result_virtual.html):,} characters")

    finally:
        server.stop()

if __name__ == "__main__":
    print("""
    ╔════════════════════════════════════════════════════════════╗
    ║           Virtual Scroll Examples for Crawl4AI             ║
    ╚════════════════════════════════════════════════════════════╝

    These examples demonstrate different virtual scroll scenarios:
    1. Twitter-like (content replaced) - Classic virtual scroll
    2. Traditional append - Content accumulates
    3. Instagram grid - Visual grid layout
    4. Mixed behavior - Some content stays, some is virtualized

    Starting examples...
    """)

    # Run all examples
    asyncio.run(example_twitter_like_virtual_scroll())
    asyncio.run(example_traditional_append_scroll())
    asyncio.run(example_instagram_grid())
    asyncio.run(example_mixed_content())
    asyncio.run(compare_with_without_virtual_scroll())

    print("\n✅ All examples completed!")
    print("\nTIP: Set headless=False in BrowserConfig to watch the scrolling in action!")