feat: Add virtual scroll support for modern web scraping

Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
2025-06-29 20:41:37 +08:00
parent 539a324cf6
commit a353515271
18 changed files with 2194 additions and 6 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -2,8 +2,8 @@
 import warnings

 from .async_webcrawler import AsyncWebCrawler, CacheMode
-# MODIFIED: Add SeedingConfig here
-from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig
+# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig

 from .content_scraping_strategy import (
    ContentScrapingStrategy,
@@ -92,8 +92,9 @@ __all__ = [
    "BrowserProfiler",
    "LLMConfig",
    "GeolocationConfig",
-    # NEW: Add SeedingConfig
+    # NEW: Add SeedingConfig and VirtualScrollConfig
    "SeedingConfig",
+    "VirtualScrollConfig",
    # NEW: Add AsyncUrlSeeder
    "AsyncUrlSeeder",
    "DeepCrawlStrategy",
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,4 +1,5 @@
 import os
+from typing import Union
 from .config import (
    DEFAULT_PROVIDER,
    DEFAULT_PROVIDER_API_KEY,
@@ -594,6 +595,51 @@ class BrowserConfig:
            return config
        return BrowserConfig.from_kwargs(config)

+class VirtualScrollConfig:
+    """Settings for capturing content from virtualized scroll containers.
+
+    Pages such as Twitter or Instagram feeds recycle DOM elements as the user
+    scrolls. An instance names the container to scroll, the size of each step,
+    the maximum number of steps and the pause after each step.
+    """
+
+    def __init__(
+        self,
+        container_selector: str,
+        scroll_count: int = 10,
+        scroll_by: Union[str, int] = "container_height",
+        wait_after_scroll: float = 0.5,
+    ):
+        """
+        Record the virtual scroll settings on the instance.
+
+        Args:
+            container_selector: CSS selector for the scrollable container
+            scroll_count: Maximum number of scrolls to perform
+            scroll_by: Distance of one scroll step, either:
+                - "container_height": the container's own height
+                - "page_height": the viewport height
+                - int: a fixed pixel amount
+            wait_after_scroll: Seconds to pause after each scroll so content can load
+        """
+        self.container_selector = container_selector
+        self.scroll_count = scroll_count
+        self.scroll_by = scroll_by
+        self.wait_after_scroll = wait_after_scroll
+
+    def to_dict(self) -> dict:
+        """Return the settings as a plain dict keyed by constructor argument name."""
+        exported = {}
+        # Key order mirrors the __init__ signature.
+        for key in ("container_selector", "scroll_count", "scroll_by", "wait_after_scroll"):
+            exported[key] = getattr(self, key)
+        return exported
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "VirtualScrollConfig":
+        """Build an instance, passing each entry of ``data`` as a keyword argument."""
+        return cls(**data)
+
 class LinkPreviewConfig:
    """Configuration for link head extraction and scoring."""
    
@@ -911,6 +957,12 @@ class CrawlerRunConfig():
        table_score_threshold (int): Minimum score threshold for processing a table.
                                     Default: 7.

+        # Virtual Scroll Parameters
+        virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
+                                                                     Used for capturing content from pages with virtualized 
+                                                                     scrolling (e.g., Twitter, Instagram feeds).
+                                                                     Default: None.
+
        # Link and Domain Handling Parameters
        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
                                                    Default: SOCIAL_MEDIA_DOMAINS (from config).
@@ -1056,6 +1108,8 @@ class CrawlerRunConfig():
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
        # Link Extraction Parameters
        link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
+        # Virtual Scroll Parameters
+        virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
        # Experimental Parameters
        experimental: Dict[str, Any] = None,
    ):
@@ -1197,6 +1251,17 @@ class CrawlerRunConfig():
        else:
            raise ValueError("link_preview_config must be LinkPreviewConfig object or dict")
        
+        # Virtual Scroll Parameters
+        if virtual_scroll_config is None:
+            self.virtual_scroll_config = None
+        elif isinstance(virtual_scroll_config, VirtualScrollConfig):
+            self.virtual_scroll_config = virtual_scroll_config
+        elif isinstance(virtual_scroll_config, dict):
+            # Convert dict to config object for backward compatibility
+            self.virtual_scroll_config = VirtualScrollConfig.from_dict(virtual_scroll_config)
+        else:
+            raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
+        
        # Experimental Parameters
        self.experimental = experimental or {}
        
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -898,6 +898,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if config.scan_full_page:
                await self._handle_full_page_scan(page, config.scroll_delay)

+            # Handle virtual scroll if configured
+            if config.virtual_scroll_config:
+                await self._handle_virtual_scroll(page, config.virtual_scroll_config)
+
            # Execute JavaScript if provided
            # if config.js_code:
            #     if isinstance(config.js_code, str):
@@ -1149,6 +1153,177 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await self.safe_scroll(page, 0, total_height)

+    async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"):
+        """
+        Capture every item of a virtual scroll container (e.g., Twitter-like feeds)
+        by scrolling it inside the page and merging the HTML seen at each position.
+
+        Steps performed by the injected JavaScript:
+        1. Snapshot the container's innerHTML
+        2. Scroll the container by the configured amount and wait
+        3. Compare the new innerHTML with the previous snapshot:
+           - No change: keep scrolling
+           - New items appended: keep scrolling (items are already in the page)
+           - Items replaced: store the previous snapshot as a chunk
+        4. Stop after ``scroll_count`` scrolls or at the bottom of the container
+        5. If any chunk was stored, also store the final snapshot, then replace
+           the container content with the de-duplicated children of all chunks
+
+        Failures are logged and swallowed so the crawl continues without the merge.
+
+        Args:
+            page: The Playwright page object
+            config: Virtual scroll configuration (a plain dict is also accepted)
+        """
+        try:
+            # Imported here rather than at module level to avoid a circular import
+            from .async_configs import VirtualScrollConfig
+
+            # Ensure config is a VirtualScrollConfig instance
+            if isinstance(config, dict):
+                config = VirtualScrollConfig.from_dict(config)
+
+            self.logger.info(
+                message="Starting virtual scroll capture for container: {selector}",
+                tag="VSCROLL",
+                params={"selector": config.container_selector}
+            )
+
+            # JavaScript function to handle virtual scroll capture. Backslashes are
+            # doubled because this is a regular (non-raw) Python string.
+            virtual_scroll_js = """
+            async (config) => {
+                const container = document.querySelector(config.container_selector);
+                if (!container) {
+                    throw new Error(`Container not found: ${config.container_selector}`);
+                }
+
+                // List to store HTML chunks when content is replaced
+                const htmlChunks = [];
+                let previousHTML = container.innerHTML;
+                let scrollCount = 0;
+
+                // Determine scroll amount
+                let scrollAmount;
+                if (typeof config.scroll_by === 'number') {
+                    scrollAmount = config.scroll_by;
+                } else if (config.scroll_by === 'page_height') {
+                    scrollAmount = window.innerHeight;
+                } else { // container_height
+                    scrollAmount = container.offsetHeight;
+                }
+
+                // Perform scrolling
+                while (scrollCount < config.scroll_count) {
+                    // Scroll the container
+                    container.scrollTop += scrollAmount;
+
+                    // Wait for content to potentially load
+                    await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));
+
+                    // Get current HTML
+                    const currentHTML = container.innerHTML;
+
+                    // Determine what changed
+                    if (currentHTML === previousHTML) {
+                        // Case 0: No change - continue scrolling
+                        console.log(`Scroll ${scrollCount + 1}: No change in content`);
+                    } else if (currentHTML.startsWith(previousHTML)) {
+                        // Case 1: New items appended - content already in page
+                        console.log(`Scroll ${scrollCount + 1}: New items appended`);
+                    } else {
+                        // Case 2: Items replaced - capture the previous HTML
+                        console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
+                        htmlChunks.push(previousHTML);
+                    }
+
+                    // Update previous HTML for next iteration
+                    previousHTML = currentHTML;
+                    scrollCount++;
+
+                    // Check if we've reached the end
+                    if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
+                        console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
+                        break;
+                    }
+                }
+
+                // The snapshot still on screen is in no chunk yet; store it whether the
+                // loop ended at the bottom or ran out of scrolls, or the merge drops it.
+                if (htmlChunks.length > 0) {
+                    htmlChunks.push(previousHTML);
+                }
+
+                if (htmlChunks.length === 0) {
+                    console.log('No content replacement detected, all content remains in page');
+                    return { success: true, chunksCount: 0, uniqueCount: 0, replaced: false };
+                }
+
+                // Case 2 occurred: merge the chunks, keeping each distinct child once
+                console.log(`Merging ${htmlChunks.length} HTML chunks`);
+                const tempDiv = document.createElement('div');
+                const seenKeys = new Set();
+                const uniqueElements = [];
+
+                for (const chunk of htmlChunks) {
+                    tempDiv.innerHTML = chunk;
+                    for (const element of tempDiv.children) {
+                        // textContent exists on every Element (innerText is HTML-only);
+                        // the Unicode classes keep letters, marks and digits of any script
+                        const normalizedText = (element.textContent || '')
+                            .toLowerCase()
+                            .replace(/[^\\p{L}\\p{M}\\p{N}_]/gu, '');
+                        // Text-less items (images, icons) are keyed by their markup so
+                        // they do not all collapse into one empty key
+                        const key = normalizedText || element.outerHTML;
+
+                        if (!seenKeys.has(key)) {
+                            seenKeys.add(key);
+                            uniqueElements.push(element.outerHTML);
+                        }
+                    }
+                }
+
+                // Replace container content with merged unique elements
+                container.innerHTML = uniqueElements.join('\\n');
+                console.log(`Merged ${uniqueElements.length} unique elements from ${htmlChunks.length} chunks`);
+
+                return {
+                    success: true,
+                    chunksCount: htmlChunks.length,
+                    uniqueCount: uniqueElements.length,
+                    replaced: true
+                };
+            }
+            """
+
+            # Execute virtual scroll capture
+            result = await page.evaluate(virtual_scroll_js, config.to_dict())
+
+            # "replaced" is true only when the script merged chunks into the container
+            if result.get("replaced", False):
+                self.logger.success(
+                    message="Virtual scroll completed. Merged {unique} unique elements from {chunks} chunks",
+                    tag="VSCROLL",
+                    params={
+                        "unique": result.get("uniqueCount", 0),
+                        "chunks": result.get("chunksCount", 0)
+                    }
+                )
+            else:
+                self.logger.info(
+                    message="Virtual scroll completed. Content was appended, no merging needed",
+                    tag="VSCROLL"
+                )
+
+        except Exception as e:
+            self.logger.error(
+                message="Virtual scroll capture failed: {error}",
+                tag="VSCROLL",
+                params={"error": str(e)}
+            )
+            # Continue with normal flow even if virtual scroll fails
+
    async def _handle_download(self, download):
        """
        Handle file downloads.