feat: Add virtual scroll support for modern web scraping

Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (e.g., Twitter, Instagram).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
This commit is contained in:
UncleCode
2025-06-29 20:41:37 +08:00
parent 539a324cf6
commit a353515271
18 changed files with 2194 additions and 6 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.6 MiB

View File

@@ -0,0 +1,132 @@
<!DOCTYPE html>
<html>
<head>
<title>Append-Only Scroll (Traditional Infinite Scroll)</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
}
.posts-container {
max-width: 800px;
margin: 0 auto;
background: white;
border: 1px solid #ddd;
border-radius: 5px;
padding: 20px;
}
.post {
background: #f9f9f9;
padding: 15px;
margin-bottom: 15px;
border-radius: 5px;
border: 1px solid #eee;
}
.post-title {
font-size: 18px;
font-weight: bold;
color: #2c3e50;
margin-bottom: 10px;
}
.post-content {
color: #555;
line-height: 1.6;
}
.loading {
text-align: center;
padding: 20px;
color: #888;
}
</style>
</head>
<body>
<h1>Traditional Infinite Scroll Demo</h1>
<p style="text-align: center; color: #666;">This appends new content without removing old content</p>
<div class="posts-container"></div>
<script>
// Traditional infinite scroll - APPENDS content
// Element that receives every batch of rendered posts.
const container = document.querySelector('.posts-container');
// Upper bound on how many posts this page will ever render.
const totalPosts = 200;
// Maximum number of posts added by one loadMorePosts() call.
const postsPerPage = 20;
// Count of posts appended so far; doubles as the index of the next post to create.
let loadedPosts = 0;
// True while a simulated fetch is pending, so that loads never overlap.
let isLoading = false;
// Build the placeholder record for one post.
// `index` is the zero-based position of the post; it is stored unchanged as
// `id`, while the human-readable title and body count from 1.
function generatePost(index) {
  const ordinal = index + 1;
  const title = `Post Title #${ordinal}`;
  const content = `This is the content of post ${ordinal}. In traditional infinite scroll, new content is appended to existing content. The DOM keeps growing. Post ID: ${index}`;
  return { id: index, title, content };
}
// Append the next page of posts below the ones already in the container.
// Does nothing while a previous load is still pending or once every post has
// been rendered. The new posts are added after a 300ms timer that stands in
// for a network round trip; a "loading" placeholder is shown in the meantime.
function loadMorePosts() {
  const everythingLoaded = loadedPosts >= totalPosts;
  if (isLoading || everythingLoaded) {
    return;
  }
  isLoading = true;

  // Placeholder row displayed until the timer fires.
  const spinner = document.createElement('div');
  spinner.className = 'loading';
  spinner.textContent = 'Loading more posts...';
  container.appendChild(spinner);

  const appendNextBatch = () => {
    container.removeChild(spinner);

    // Collect the page in a fragment so the container is touched only once.
    const batch = document.createDocumentFragment();
    const stopAt = Math.min(loadedPosts + postsPerPage, totalPosts);
    let cursor = loadedPosts;
    while (cursor < stopAt) {
      const { id, title, content } = generatePost(cursor);
      const card = document.createElement('div');
      card.className = 'post';
      card.setAttribute('data-post-id', id);
      card.innerHTML = [
        '',
        `<div class="post-title">${title}</div>`,
        `<div class="post-content">${content}</div>`,
        '',
      ].join('\n');
      batch.appendChild(card);
      cursor += 1;
    }

    // APPEND: existing posts stay in the DOM.
    container.appendChild(batch);
    loadedPosts = stopAt;
    isLoading = false;
    console.log(`Loaded ${loadedPosts} of ${totalPosts} posts`);
  };

  // Simulate network delay
  setTimeout(appendNextBatch, 300);
}
// Render the first page right away so the document has content to scroll.
loadMorePosts();

// Ask for another page whenever the bottom edge of the viewport comes within
// 500px of the end of the document body.
window.addEventListener('scroll', () => {
  const { innerHeight, scrollY } = window;
  const pageHeight = document.body.offsetHeight;
  const nearBottom = innerHeight + scrollY >= pageHeight - 500;
  if (nearBottom) {
    loadMorePosts();
  }
});
</script>
</body>
</html>

View File

@@ -0,0 +1,158 @@
<!DOCTYPE html>
<html>
<head>
<title>Instagram-like Grid Virtual Scroll</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #fafafa;
}
h1 {
text-align: center;
color: #262626;
font-weight: 300;
}
.feed-container {
max-width: 935px;
margin: 0 auto;
height: 800px;
overflow-y: auto;
background: white;
border: 1px solid #dbdbdb;
border-radius: 3px;
}
.grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 28px;
padding: 28px;
}
.post {
aspect-ratio: 1;
background: #f0f0f0;
border-radius: 3px;
position: relative;
overflow: hidden;
cursor: pointer;
}
.post:hover .overlay {
opacity: 1;
}
.post img {
width: 100%;
height: 100%;
object-fit: cover;
}
.overlay {
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: rgba(0, 0, 0, 0.3);
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 14px;
opacity: 0;
transition: opacity 0.2s;
}
.stats {
display: flex;
gap: 20px;
}
</style>
</head>
<body>
<h1>Instagram Grid Virtual Scroll</h1>
<p style="text-align: center; color: #8e8e8e;">Grid layout with virtual scrolling - only visible rows are rendered</p>
<div class="feed-container">
<div class="grid" id="grid"></div>
</div>
<script>
// Instagram-like grid virtual scroll: page state and generated post data.
// `grid` holds the rendered tiles; `container` is the element whose scroll
// events are listened to.
const grid = document.getElementById('grid');
const container = document.querySelector('.feed-container');
const totalPosts = 999; // Instagram style count
const postsPerRow = 3;
const rowsPerPage = 4; // 12 posts per page
const postsPerPage = postsPerRow * rowsPerPage;
// Index into allPosts of the first tile currently rendered.
let currentStartIndex = 0;

// Fake Instagram post data, one record per post, with random like and
// comment counts.
const allPosts = Array.from({ length: totalPosts }, (_, id) => ({
  id,
  likes: Math.floor(Math.random() * 10000),
  comments: Math.floor(Math.random() * 500),
  imageNumber: (id % 10) + 1, // Cycle through 10 placeholder images
}));
// Render one page of the grid - REPLACES the current tiles for performance.
// `startIndex` is the index into allPosts of the first post to show; at most
// postsPerPage posts are rendered, never past totalPosts. Every tile is an
// inline SVG placeholder with a random background colour and the 1-based post
// number. On completion currentStartIndex is set to `startIndex`.
function renderGrid(startIndex) {
  const endIndex = Math.min(startIndex + postsPerPage, totalPosts);
  const posts = [];
  for (let i = startIndex; i < endIndex; i++) {
    const post = allPosts[i];
    const label = post.id + 1;
    // toString(16) drops leading zeros, so small random values used to yield
    // colours such as "#a7" or "#1f3c9" that are not valid hex colours.
    // Padding to six digits keeps every generated fill valid.
    const fill = Math.floor(Math.random() * 16777215).toString(16).padStart(6, '0');
    const svg = `%3Csvg xmlns='http://www.w3.org/2000/svg' width='400' height='400'%3E%3Crect width='400' height='400' fill='%23${fill}'/%3E%3Ctext x='50%25' y='50%25' text-anchor='middle' dy='.3em' font-family='Arial' font-size='48' fill='white'%3E${label}%3C/text%3E%3C/svg%3E`;
    posts.push([
      '',
      `<div class="post" data-post-id="${post.id}">`,
      `<img src="data:image/svg+xml,${svg}" alt="Post ${label}">`,
      '<div class="overlay">',
      '<div class="stats">',
      `<span>❤️ ${post.likes.toLocaleString()}</span>`,
      `<span>💬 ${post.comments}</span>`,
      '</div>',
      '</div>',
      '</div>',
      '',
    ].join('\n'));
  }
  // REPLACE grid content (virtual scroll)
  grid.innerHTML = posts.join('');
  currentStartIndex = startIndex;
}
// Initial render
renderGrid(0);

// Handle scroll. The work is debounced: each scroll event cancels the pending
// timer, so the check only runs once scrolling has been idle for 50ms.
let scrollTimeout;
container.addEventListener('scroll', () => {
  clearTimeout(scrollTimeout);
  scrollTimeout = setTimeout(() => {
    const { scrollTop, scrollHeight, clientHeight } = container;
    // When scrolled to within 100px of the bottom, show the next page.
    // (The scroll-percentage / target-index values that used to be computed
    // here were never read, and divided by zero when the content did not
    // overflow the container, so they have been dropped.)
    if (scrollTop + clientHeight >= scrollHeight - 100) {
      const nextIndex = currentStartIndex + postsPerPage;
      if (nextIndex < totalPosts) {
        renderGrid(nextIndex);
        container.scrollTop = 100; // Reset scroll for continuous experience
      }
    }
  }, 50);
});
</script>
</body>
</html>

View File

@@ -0,0 +1,210 @@
<!DOCTYPE html>
<html>
<head>
<title>News Feed with Mixed Scroll Behavior</title>
<style>
body {
font-family: Georgia, serif;
margin: 0;
padding: 20px;
background-color: #f8f8f8;
}
h1 {
text-align: center;
color: #1a1a1a;
font-size: 32px;
margin-bottom: 10px;
}
.description {
text-align: center;
color: #666;
margin-bottom: 20px;
}
#newsContainer {
max-width: 900px;
margin: 0 auto;
height: 700px;
overflow-y: auto;
background: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
padding: 20px;
}
.article {
margin-bottom: 30px;
padding-bottom: 30px;
border-bottom: 1px solid #e0e0e0;
}
.article:last-child {
border-bottom: none;
}
.article-header {
margin-bottom: 15px;
}
.category {
display: inline-block;
background: #ff6b6b;
color: white;
padding: 4px 12px;
font-size: 12px;
text-transform: uppercase;
border-radius: 3px;
margin-bottom: 10px;
}
.headline {
font-size: 24px;
font-weight: bold;
color: #1a1a1a;
margin: 10px 0;
line-height: 1.3;
}
.meta {
color: #888;
font-size: 14px;
margin-bottom: 15px;
}
.content {
font-size: 16px;
line-height: 1.8;
color: #333;
}
.featured {
background: #fff9e6;
padding: 20px;
border-radius: 5px;
margin-bottom: 30px;
}
.featured .category {
background: #ffa500;
}
</style>
</head>
<body>
<h1>📰 Dynamic News Feed</h1>
<p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
<div id="newsContainer"></div>
<script>
// Page state and generated article data for the mixed-behaviour news feed.
const container = document.getElementById('newsContainer');
const totalArticles = 100;
const articlesPerPage = 5;
// Index into regularArticles of the first regular article currently rendered.
let currentRegularIndex = 0;

// Categories for variety; regular articles cycle through them in order.
const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];

// 3 featured articles that always stay
const featuredArticles = Array.from({ length: 3 }, (_, slot) => ({
  id: `featured-${slot}`,
  category: 'Featured',
  headline: `Breaking: Major Story ${slot + 1} That Stays Visible`,
  date: new Date().toLocaleDateString(),
  content: `This is featured article ${slot + 1}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`,
}));

// Regular articles that get virtualized; each one is dated a day earlier than
// the one before it.
const regularArticles = Array.from({ length: totalArticles }, (_, slot) => {
  const category = categories[slot % categories.length];
  return {
    id: `article-${slot}`,
    category,
    headline: `${category} News: Article ${slot + 1} of ${totalArticles}`,
    date: new Date(Date.now() - slot * 86400000).toLocaleDateString(),
    content: `This is regular article ${slot + 1}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${slot}`,
  };
});
// Rebuild the feed: every featured article, a divider, then one page of
// regular articles beginning at `regularStartIndex` (at most articlesPerPage,
// never past totalArticles). The container's markup is replaced wholesale, so
// featured articles are re-emitted on each call while the regular ones change.
// On completion currentRegularIndex is set to `regularStartIndex`.
function renderArticles(regularStartIndex) {
  // Serialise one article. `wrapperClass` and `badgeAttrs` carry the only
  // differences between featured and regular markup.
  const toMarkup = (entry, wrapperClass, badgeAttrs) => [
    '',
    `<div class="${wrapperClass}" data-article-id="${entry.id}">`,
    '<div class="article-header">',
    `<span class="category"${badgeAttrs}>${entry.category}</span>`,
    `<h2 class="headline">${entry.headline}</h2>`,
    `<div class="meta">📅 ${entry.date}</div>`,
    '</div>',
    `<div class="content">${entry.content}</div>`,
    '</div>',
    '',
  ].join('\n');

  // Always show featured articles
  const pieces = featuredArticles.map((entry) => toMarkup(entry, 'article featured', ''));

  // Add divider
  pieces.push('<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>');

  // Show current page of regular articles (virtual scroll)
  const stopIndex = Math.min(regularStartIndex + articlesPerPage, totalArticles);
  for (let position = regularStartIndex; position < stopIndex; position += 1) {
    const entry = regularArticles[position];
    const badgeAttrs = ` style="background: ${getCategoryColor(entry.category)}"`;
    pieces.push(toMarkup(entry, 'article', badgeAttrs));
  }

  container.innerHTML = pieces.join('');
  currentRegularIndex = regularStartIndex;
}
// Return the badge background colour for `category`, or the neutral grey
// '#95a5a6' when the category has no entry in the table.
function getCategoryColor(category) {
  const colors = {
    'Politics': '#e74c3c',
    'Technology': '#3498db',
    'Business': '#2ecc71',
    'Science': '#9b59b6',
    'Sports': '#f39c12',
    'Entertainment': '#e91e63'
  };
  const fallback = '#95a5a6';
  // Only consult the table's own keys: a bare colors[category] would also
  // find inherited members such as "constructor" or "__proto__" and return
  // them in place of a colour string.
  if (!Object.prototype.hasOwnProperty.call(colors, category)) {
    return fallback;
  }
  return colors[category] || fallback;
}
// Initial render
renderArticles(0);

// Handle scroll: when the container is scrolled to within 200px of its
// bottom, swap in the next page of regular articles and move the scroll
// position back to just above the first of them.
container.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = container;
  // When near bottom, load next page of regular articles
  if (scrollTop + clientHeight >= scrollHeight - 200) {
    const nextIndex = currentRegularIndex + articlesPerPage;
    if (nextIndex < totalArticles) {
      renderArticles(nextIndex);
      // Scroll to where regular articles start
      const regularStart = container.querySelector('.article:not(.featured)');
      if (regularStart) {
        // scrollTop is measured inside the container, whereas offsetTop is
        // measured from the element's offsetParent. The container is not a
        // positioned element, so offsetTop would include the container's own
        // distance from the top of the page and overshoot. Derive the
        // in-container offset from the two bounding rects instead.
        const offsetInContainer =
          regularStart.getBoundingClientRect().top
          - container.getBoundingClientRect().top
          - container.clientTop
          + container.scrollTop;
        container.scrollTop = offsetInContainer - 100;
      }
    }
  }
});
</script>
</body>
</html>

View File

@@ -0,0 +1,122 @@
<!DOCTYPE html>
<html>
<head>
<title>Twitter-like Virtual Scroll</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f0f2f5;
}
h1 {
color: #1da1f2;
text-align: center;
}
#timeline {
max-width: 600px;
margin: 0 auto;
height: 600px;
overflow-y: auto;
background: white;
border: 1px solid #e1e8ed;
border-radius: 10px;
}
.tweet {
padding: 15px;
border-bottom: 1px solid #e1e8ed;
min-height: 80px;
}
.tweet:hover {
background-color: #f7f9fa;
}
.author {
font-weight: bold;
color: #14171a;
margin-bottom: 5px;
}
.content {
color: #14171a;
line-height: 1.5;
}
.stats {
color: #657786;
font-size: 14px;
margin-top: 10px;
}
</style>
</head>
<body>
<h1>Virtual Scroll Demo - Twitter Style</h1>
<p style="text-align: center; color: #666;">This simulates Twitter's timeline where content is replaced as you scroll</p>
<div id="timeline"></div>
<script>
// Simulate Twitter-like virtual scrolling where DOM elements are replaced:
// page state and generated tweet data.
const timeline = document.getElementById('timeline');
const totalTweets = 500;
const tweetsPerPage = 10;
// Index into allTweets of the first tweet currently rendered.
let currentIndex = 0;

// Fake tweet data, one record per tweet, with random like and retweet counts.
const allTweets = Array.from({ length: totalTweets }, (_, id) => ({
  id,
  author: `User_${id + 1}`,
  content: `This is tweet #${id + 1} of ${totalTweets}. Virtual scrolling replaces DOM elements to maintain performance. Unique content ID: ${id}`,
  likes: Math.floor(Math.random() * 1000),
  retweets: Math.floor(Math.random() * 500),
}));
// Show one page of tweets - REPLACES whatever the timeline contained.
// `startIndex` is the index into allTweets of the first tweet to show; at
// most tweetsPerPage tweets are rendered, never past totalTweets. On
// completion currentIndex is set to `startIndex`.
function renderTweets(startIndex) {
  const stopIndex = Math.min(startIndex + tweetsPerPage, totalTweets);
  let markup = '';
  for (let position = startIndex; position < stopIndex; position += 1) {
    const { id, author, content, likes, retweets } = allTweets[position];
    markup += [
      '',
      `<div class="tweet" data-tweet-id="${id}">`,
      `<div class="author">@${author}</div>`,
      `<div class="content">${content}</div>`,
      `<div class="stats">❤️ ${likes} | 🔁 ${retweets}</div>`,
      '</div>',
      '',
    ].join('\n');
  }
  // REPLACE entire content (virtual scroll behavior)
  timeline.innerHTML = markup;
  currentIndex = startIndex;
}
// Initial render
renderTweets(0);

// Handle scroll: once the timeline is scrolled to within 100px of its bottom
// and further tweets remain, replace the page and nudge the scroll position
// back near the top so scrolling can continue.
timeline.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = timeline;
  const nearBottom = scrollTop + clientHeight >= scrollHeight - 100;
  if (!nearBottom) {
    return;
  }
  const nextIndex = currentIndex + tweetsPerPage;
  if (nextIndex < totalTweets) {
    renderTweets(nextIndex);
    // Small scroll adjustment for continuous scrolling
    timeline.scrollTop = 50;
  }
});
</script>
</body>
</html>