feat: Add virtual scroll support for modern web scraping

Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
This commit is contained in:
UncleCode
2025-06-29 20:41:37 +08:00
parent 539a324cf6
commit a353515271
18 changed files with 2194 additions and 6 deletions

View File

@@ -2,8 +2,8 @@
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
# MODIFIED: Add SeedingConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
from .content_scraping_strategy import (
ContentScrapingStrategy,
@@ -92,8 +92,9 @@ __all__ = [
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
# NEW: Add SeedingConfig
# NEW: Add SeedingConfig and VirtualScrollConfig
"SeedingConfig",
"VirtualScrollConfig",
# NEW: Add AsyncUrlSeeder
"AsyncUrlSeeder",
"DeepCrawlStrategy",

View File

@@ -1,4 +1,5 @@
import os
from typing import Union
from .config import (
DEFAULT_PROVIDER,
DEFAULT_PROVIDER_API_KEY,
@@ -594,6 +595,51 @@ class BrowserConfig:
return config
return BrowserConfig.from_kwargs(config)
class VirtualScrollConfig:
"""Configuration for virtual scroll handling.
This config enables capturing content from pages with virtualized scrolling
(like Twitter, Instagram feeds) where DOM elements are recycled as user scrolls.
"""
def __init__(
self,
container_selector: str,
scroll_count: int = 10,
scroll_by: Union[str, int] = "container_height",
wait_after_scroll: float = 0.5,
):
"""
Initialize virtual scroll configuration.
Args:
container_selector: CSS selector for the scrollable container
scroll_count: Maximum number of scrolls to perform
scroll_by: Amount to scroll - can be:
- "container_height": scroll by container's height
- "page_height": scroll by viewport height
- int: fixed pixel amount
wait_after_scroll: Seconds to wait after each scroll for content to load
"""
self.container_selector = container_selector
self.scroll_count = scroll_count
self.scroll_by = scroll_by
self.wait_after_scroll = wait_after_scroll
def to_dict(self) -> dict:
"""Convert to dictionary for serialization."""
return {
"container_selector": self.container_selector,
"scroll_count": self.scroll_count,
"scroll_by": self.scroll_by,
"wait_after_scroll": self.wait_after_scroll,
}
@classmethod
def from_dict(cls, data: dict) -> "VirtualScrollConfig":
"""Create instance from dictionary."""
return cls(**data)
class LinkPreviewConfig:
"""Configuration for link head extraction and scoring."""
@@ -911,6 +957,12 @@ class CrawlerRunConfig():
table_score_threshold (int): Minimum score threshold for processing a table.
Default: 7.
# Virtual Scroll Parameters
virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
Used for capturing content from pages with virtualized
scrolling (e.g., Twitter, Instagram feeds).
Default: None.
# Link and Domain Handling Parameters
exclude_social_media_domains (list of str): List of domains to exclude for social media links.
Default: SOCIAL_MEDIA_DOMAINS (from config).
@@ -1056,6 +1108,8 @@ class CrawlerRunConfig():
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
# Link Extraction Parameters
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
# Virtual Scroll Parameters
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
# Experimental Parameters
experimental: Dict[str, Any] = None,
):
@@ -1197,6 +1251,17 @@ class CrawlerRunConfig():
else:
raise ValueError("link_preview_config must be LinkPreviewConfig object or dict")
# Virtual Scroll Parameters
if virtual_scroll_config is None:
self.virtual_scroll_config = None
elif isinstance(virtual_scroll_config, VirtualScrollConfig):
self.virtual_scroll_config = virtual_scroll_config
elif isinstance(virtual_scroll_config, dict):
# Convert dict to config object for backward compatibility
self.virtual_scroll_config = VirtualScrollConfig.from_dict(virtual_scroll_config)
else:
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
# Experimental Parameters
self.experimental = experimental or {}

View File

@@ -898,6 +898,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.scan_full_page:
await self._handle_full_page_scan(page, config.scroll_delay)
# Handle virtual scroll if configured
if config.virtual_scroll_config:
await self._handle_virtual_scroll(page, config.virtual_scroll_config)
# Execute JavaScript if provided
# if config.js_code:
# if isinstance(config.js_code, str):
@@ -1149,6 +1153,177 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await self.safe_scroll(page, 0, total_height)
async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"):
"""
Handle virtual scroll containers (e.g., Twitter-like feeds) by capturing
content at different scroll positions and merging unique elements.
Following the design:
1. Get container HTML
2. Scroll by container height
3. Wait and check if container HTML changed
4. Three cases:
- No change: continue scrolling
- New items added (appended): continue (items already in page)
- Items replaced: capture HTML chunk and add to list
5. After N scrolls, merge chunks if any were captured
Args:
page: The Playwright page object
config: Virtual scroll configuration
"""
try:
# Import VirtualScrollConfig to avoid circular import
from .async_configs import VirtualScrollConfig
# Ensure config is a VirtualScrollConfig instance
if isinstance(config, dict):
config = VirtualScrollConfig.from_dict(config)
self.logger.info(
message="Starting virtual scroll capture for container: {selector}",
tag="VSCROLL",
params={"selector": config.container_selector}
)
# JavaScript function to handle virtual scroll capture
virtual_scroll_js = """
async (config) => {
const container = document.querySelector(config.container_selector);
if (!container) {
throw new Error(`Container not found: ${config.container_selector}`);
}
// List to store HTML chunks when content is replaced
const htmlChunks = [];
let previousHTML = container.innerHTML;
let scrollCount = 0;
// Determine scroll amount
let scrollAmount;
if (typeof config.scroll_by === 'number') {
scrollAmount = config.scroll_by;
} else if (config.scroll_by === 'page_height') {
scrollAmount = window.innerHeight;
} else { // container_height
scrollAmount = container.offsetHeight;
}
// Perform scrolling
while (scrollCount < config.scroll_count) {
// Scroll the container
container.scrollTop += scrollAmount;
// Wait for content to potentially load
await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));
// Get current HTML
const currentHTML = container.innerHTML;
// Determine what changed
if (currentHTML === previousHTML) {
// Case 0: No change - continue scrolling
console.log(`Scroll ${scrollCount + 1}: No change in content`);
} else if (currentHTML.startsWith(previousHTML)) {
// Case 1: New items appended - content already in page
console.log(`Scroll ${scrollCount + 1}: New items appended`);
} else {
// Case 2: Items replaced - capture the previous HTML
console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
htmlChunks.push(previousHTML);
}
// Update previous HTML for next iteration
previousHTML = currentHTML;
scrollCount++;
// Check if we've reached the end
if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
// Capture final chunk if content was replaced
if (htmlChunks.length > 0) {
htmlChunks.push(currentHTML);
}
break;
}
}
// If we have chunks (case 2 occurred), merge them
if (htmlChunks.length > 0) {
console.log(`Merging ${htmlChunks.length} HTML chunks`);
// Parse all chunks to extract unique elements
const tempDiv = document.createElement('div');
const seenTexts = new Set();
const uniqueElements = [];
// Process each chunk
for (const chunk of htmlChunks) {
tempDiv.innerHTML = chunk;
const elements = tempDiv.children;
for (let i = 0; i < elements.length; i++) {
const element = elements[i];
// Normalize text for deduplication
const normalizedText = element.innerText
.toLowerCase()
.replace(/[\\s\\W]/g, ''); // Remove spaces and symbols
if (!seenTexts.has(normalizedText)) {
seenTexts.add(normalizedText);
uniqueElements.push(element.outerHTML);
}
}
}
// Replace container content with merged unique elements
container.innerHTML = uniqueElements.join('\\n');
console.log(`Merged ${uniqueElements.length} unique elements from ${htmlChunks.length} chunks`);
return {
success: true,
chunksCount: htmlChunks.length,
uniqueCount: uniqueElements.length,
replaced: true
};
} else {
console.log('No content replacement detected, all content remains in page');
return {
success: true,
chunksCount: 0,
uniqueCount: 0,
replaced: false
};
}
}
"""
# Execute virtual scroll capture
result = await page.evaluate(virtual_scroll_js, config.to_dict())
if result.get("replaced", False):
self.logger.success(
message="Virtual scroll completed. Merged {unique} unique elements from {chunks} chunks",
tag="VSCROLL",
params={
"unique": result.get("uniqueCount", 0),
"chunks": result.get("chunksCount", 0)
}
)
else:
self.logger.info(
message="Virtual scroll completed. Content was appended, no merging needed",
tag="VSCROLL"
)
except Exception as e:
self.logger.error(
message="Virtual scroll capture failed: {error}",
tag="VSCROLL",
params={"error": str(e)}
)
# Continue with normal flow even if virtual scroll fails
async def _handle_download(self, download):
"""
Handle file downloads.