feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
This commit is contained in:
BIN
docs/examples/assets/instagram_grid_result.png
Normal file
BIN
docs/examples/assets/instagram_grid_result.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.6 MiB |
132
docs/examples/assets/virtual_scroll_append_only.html
Normal file
132
docs/examples/assets/virtual_scroll_append_only.html
Normal file
@@ -0,0 +1,132 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Append-Only Scroll (Traditional Infinite Scroll)</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.posts-container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 5px;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.post {
|
||||
background: #f9f9f9;
|
||||
padding: 15px;
|
||||
margin-bottom: 15px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #eee;
|
||||
}
|
||||
|
||||
.post-title {
|
||||
font-size: 18px;
|
||||
font-weight: bold;
|
||||
color: #2c3e50;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.post-content {
|
||||
color: #555;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 20px;
|
||||
color: #888;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Traditional Infinite Scroll Demo</h1>
|
||||
<p style="text-align: center; color: #666;">This appends new content without removing old content</p>
|
||||
<div class="posts-container"></div>
|
||||
|
||||
<script>
|
||||
// Traditional infinite scroll: each page of posts is APPENDED to the
// container and nothing already rendered is removed.
const container = document.querySelector('.posts-container');
const totalPosts = 200; // size of the simulated data set
const postsPerPage = 20; // posts added by each load
let loadedPosts = 0; // how many posts have been appended so far
let isLoading = false; // true while a simulated request is pending
|
||||
|
||||
/**
 * Build the fake data for a single post.
 *
 * @param {number} index - Zero-based position of the post in the feed.
 * @returns {{id: number, title: string, content: string}} The post record;
 *   `id` is the raw index, while the title and content show the 1-based
 *   post number.
 */
function generatePost(index) {
  const displayNumber = index + 1;
  return {
    id: index,
    title: `Post Title #${displayNumber}`,
    content: `This is the content of post ${displayNumber}. In traditional infinite scroll, new content is appended to existing content. The DOM keeps growing. Post ID: ${index}`,
  };
}
|
||||
|
||||
/**
 * Append the next page of posts to the container.
 *
 * Does nothing while a load is already pending or once every post has been
 * loaded. Otherwise it shows a temporary "loading" row, waits 300 ms to
 * simulate network latency, then replaces that row with up to `postsPerPage`
 * new `.post` elements. Existing posts are never removed.
 */
function loadMorePosts() {
  if (isLoading || loadedPosts >= totalPosts) return;

  isLoading = true;

  // Show loading indicator
  const loadingDiv = document.createElement('div');
  loadingDiv.className = 'loading';
  loadingDiv.textContent = 'Loading more posts...';
  container.appendChild(loadingDiv);

  // Simulate network delay
  setTimeout(() => {
    // Remove loading indicator
    container.removeChild(loadingDiv);

    // Build the whole page off-DOM so it is inserted in a single operation.
    const fragment = document.createDocumentFragment();
    const endIndex = Math.min(loadedPosts + postsPerPage, totalPosts);

    for (let i = loadedPosts; i < endIndex; i++) {
      const post = generatePost(i);
      const postElement = document.createElement('div');
      postElement.className = 'post';
      postElement.setAttribute('data-post-id', post.id);
      postElement.innerHTML = `
        <div class="post-title">${post.title}</div>
        <div class="post-content">${post.content}</div>
      `;
      fragment.appendChild(postElement);
    }

    // APPEND new posts to existing ones
    container.appendChild(fragment);
    loadedPosts = endIndex;
    isLoading = false;

    console.log(`Loaded ${loadedPosts} of ${totalPosts} posts`);
  }, 300);
}
|
||||
|
||||
// Initial load
loadMorePosts();

// Load more on scroll: request the next page once the bottom edge of the
// viewport comes within 500px of the end of the document body.
window.addEventListener('scroll', () => {
  const scrollBottom = window.innerHeight + window.scrollY;
  const threshold = document.body.offsetHeight - 500;

  if (scrollBottom >= threshold) {
    loadMorePosts();
  }
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
158
docs/examples/assets/virtual_scroll_instagram_grid.html
Normal file
158
docs/examples/assets/virtual_scroll_instagram_grid.html
Normal file
@@ -0,0 +1,158 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Instagram-like Grid Virtual Scroll</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #fafafa;
|
||||
}
|
||||
|
||||
h1 {
|
||||
text-align: center;
|
||||
color: #262626;
|
||||
font-weight: 300;
|
||||
}
|
||||
|
||||
.feed-container {
|
||||
max-width: 935px;
|
||||
margin: 0 auto;
|
||||
height: 800px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
border: 1px solid #dbdbdb;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 28px;
|
||||
padding: 28px;
|
||||
}
|
||||
|
||||
.post {
|
||||
aspect-ratio: 1;
|
||||
background: #f0f0f0;
|
||||
border-radius: 3px;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.post:hover .overlay {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
.post img {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
}
|
||||
|
||||
.overlay {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
background: rgba(0, 0, 0, 0.3);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
color: white;
|
||||
font-size: 14px;
|
||||
opacity: 0;
|
||||
transition: opacity 0.2s;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Instagram Grid Virtual Scroll</h1>
|
||||
<p style="text-align: center; color: #8e8e8e;">Grid layout with virtual scrolling - only visible rows are rendered</p>
|
||||
<div class="feed-container">
|
||||
<div class="grid" id="grid"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Instagram-like grid virtual scroll
const grid = document.getElementById('grid');
const container = document.querySelector('.feed-container');
const totalPosts = 999; // Instagram style count
const postsPerRow = 3;
const rowsPerPage = 4; // 12 posts per page
const postsPerPage = postsPerRow * rowsPerPage;
let currentStartIndex = 0; // index of the first post currently rendered

// Generate fake Instagram post data up front; only one page of it is ever
// present in the DOM at a time.
const allPosts = [];
for (let i = 0; i < totalPosts; i++) {
  allPosts.push({
    id: i,
    likes: Math.floor(Math.random() * 10000),
    comments: Math.floor(Math.random() * 500),
    imageNumber: (i % 10) + 1, // Cycle through 10 placeholder images
  });
}
|
||||
|
||||
/**
 * Produce a random colour as exactly six lowercase hex digits (no leading
 * '#'). The value is zero-padded because a shorter string would be an invalid
 * or differently interpreted CSS colour.
 *
 * @returns {string} Six hex digits, e.g. "00a3f1".
 */
function randomHexColor() {
  return Math.floor(Math.random() * 0x1000000).toString(16).padStart(6, '0');
}

/**
 * Render one page of the grid, REPLACING whatever the grid currently shows.
 *
 * Writes the markup for posts `startIndex` up to (but excluding)
 * `startIndex + postsPerPage`, clamped to `totalPosts`, into `grid.innerHTML`
 * and records `startIndex` in `currentStartIndex`.
 *
 * @param {number} startIndex - Index into `allPosts` of the first post to show.
 */
function renderGrid(startIndex) {
  const posts = [];
  const endIndex = Math.min(startIndex + postsPerPage, totalPosts);

  for (let i = startIndex; i < endIndex; i++) {
    const post = allPosts[i];
    const postNumber = post.id + 1;
    // Each tile is an inline SVG placeholder: a randomly coloured square with
    // the 1-based post number drawn in the centre.
    posts.push(`
      <div class="post" data-post-id="${post.id}">
        <img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='400' height='400'%3E%3Crect width='400' height='400' fill='%23${randomHexColor()}'/%3E%3Ctext x='50%25' y='50%25' text-anchor='middle' dy='.3em' font-family='Arial' font-size='48' fill='white'%3E${postNumber}%3C/text%3E%3C/svg%3E" alt="Post ${postNumber}">
        <div class="overlay">
          <div class="stats">
            <span>❤️ ${post.likes.toLocaleString()}</span>
            <span>💬 ${post.comments}</span>
          </div>
        </div>
      </div>
    `);
  }

  // REPLACE grid content (virtual scroll)
  grid.innerHTML = posts.join('');
  currentStartIndex = startIndex;
}
|
||||
|
||||
// Initial render
renderGrid(0);

// Handle scroll. The work is debounced: it runs only after 50 ms have passed
// without another scroll event.
let scrollTimeout;
container.addEventListener('scroll', () => {
  clearTimeout(scrollTimeout);
  scrollTimeout = setTimeout(() => {
    const scrollTop = container.scrollTop;
    const scrollHeight = container.scrollHeight;
    const clientHeight = container.clientHeight;

    // When scrolled to within 100px of the bottom, show the next page
    if (scrollTop + clientHeight >= scrollHeight - 100) {
      const nextIndex = currentStartIndex + postsPerPage;
      if (nextIndex < totalPosts) {
        renderGrid(nextIndex);
        container.scrollTop = 100; // Reset scroll for continuous experience
      }
    }
  }, 50);
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
210
docs/examples/assets/virtual_scroll_news_feed.html
Normal file
210
docs/examples/assets/virtual_scroll_news_feed.html
Normal file
@@ -0,0 +1,210 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>News Feed with Mixed Scroll Behavior</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Georgia, serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f8f8f8;
|
||||
}
|
||||
|
||||
h1 {
|
||||
text-align: center;
|
||||
color: #1a1a1a;
|
||||
font-size: 32px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.description {
|
||||
text-align: center;
|
||||
color: #666;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
#newsContainer {
|
||||
max-width: 900px;
|
||||
margin: 0 auto;
|
||||
height: 700px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.article {
|
||||
margin-bottom: 30px;
|
||||
padding-bottom: 30px;
|
||||
border-bottom: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.article:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.article-header {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.category {
|
||||
display: inline-block;
|
||||
background: #ff6b6b;
|
||||
color: white;
|
||||
padding: 4px 12px;
|
||||
font-size: 12px;
|
||||
text-transform: uppercase;
|
||||
border-radius: 3px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.headline {
|
||||
font-size: 24px;
|
||||
font-weight: bold;
|
||||
color: #1a1a1a;
|
||||
margin: 10px 0;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #888;
|
||||
font-size: 14px;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.content {
|
||||
font-size: 16px;
|
||||
line-height: 1.8;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.featured {
|
||||
background: #fff9e6;
|
||||
padding: 20px;
|
||||
border-radius: 5px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.featured .category {
|
||||
background: #ffa500;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>📰 Dynamic News Feed</h1>
|
||||
<p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
|
||||
<div id="newsContainer"></div>
|
||||
|
||||
<script>
|
||||
const container = document.getElementById('newsContainer');
const totalArticles = 100;
const articlesPerPage = 5;
let currentRegularIndex = 0; // index of the first regular article currently rendered

// Categories for variety
const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];

// Generate article data
const featuredArticles = [];
const regularArticles = [];

// 3 featured articles that always stay
for (let i = 0; i < 3; i++) {
  featuredArticles.push({
    id: `featured-${i}`,
    category: 'Featured',
    headline: `Breaking: Major Story ${i + 1} That Stays Visible`,
    date: new Date().toLocaleDateString(),
    content: `This is featured article ${i + 1}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`,
  });
}

// Regular articles that get virtualized. Categories cycle through the list
// above, and each article is dated one day earlier than the previous one.
for (let i = 0; i < totalArticles; i++) {
  regularArticles.push({
    id: `article-${i}`,
    category: categories[i % categories.length],
    headline: `${categories[i % categories.length]} News: Article ${i + 1} of ${totalArticles}`,
    date: new Date(Date.now() - i * 86400000).toLocaleDateString(),
    content: `This is regular article ${i + 1}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${i}`,
  });
}
|
||||
|
||||
/**
 * Build the markup for one article card.
 *
 * @param {{id: string, category: string, headline: string, date: string, content: string}} article
 *   The article to render.
 * @param {boolean} isFeatured - Featured cards get the extra `featured` class
 *   and keep the stylesheet's badge colour; regular cards get an inline badge
 *   colour from `getCategoryColor`.
 * @returns {string} HTML for the card.
 */
function buildArticleMarkup(article, isFeatured) {
  const cardClass = isFeatured ? 'article featured' : 'article';
  const badgeStyle = isFeatured
    ? ''
    : ` style="background: ${getCategoryColor(article.category)}"`;

  return `
    <div class="${cardClass}" data-article-id="${article.id}">
      <div class="article-header">
        <span class="category"${badgeStyle}>${article.category}</span>
        <h2 class="headline">${article.headline}</h2>
        <div class="meta">📅 ${article.date}</div>
      </div>
      <div class="content">${article.content}</div>
    </div>
  `;
}

/**
 * Re-render the feed: every featured article, a divider, then one page of
 * regular articles starting at `regularStartIndex`.
 *
 * The container's content is replaced wholesale, so featured articles "stay"
 * by being re-emitted on every render, while the regular page is swapped.
 * Records `regularStartIndex` in `currentRegularIndex`.
 *
 * @param {number} regularStartIndex - Index into `regularArticles` of the
 *   first regular article to show.
 */
function renderArticles(regularStartIndex) {
  const html = [];

  // Always show featured articles
  featuredArticles.forEach((article) => {
    html.push(buildArticleMarkup(article, true));
  });

  // Add divider
  html.push('<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>');

  // Show current page of regular articles (virtual scroll)
  const endIndex = Math.min(regularStartIndex + articlesPerPage, totalArticles);
  for (let i = regularStartIndex; i < endIndex; i++) {
    html.push(buildArticleMarkup(regularArticles[i], false));
  }

  container.innerHTML = html.join('');
  currentRegularIndex = regularStartIndex;
}
|
||||
|
||||
/**
 * Look up the badge colour for a news category.
 *
 * A Map is used rather than a plain object so that names which happen to match
 * inherited object members (e.g. "constructor", "toString") fall back to the
 * default colour instead of returning a function.
 *
 * @param {string} category - Category name, e.g. "Politics".
 * @returns {string} CSS hex colour; '#95a5a6' for any unknown category.
 */
function getCategoryColor(category) {
  const colors = new Map([
    ['Politics', '#e74c3c'],
    ['Technology', '#3498db'],
    ['Business', '#2ecc71'],
    ['Science', '#9b59b6'],
    ['Sports', '#f39c12'],
    ['Entertainment', '#e91e63'],
  ]);
  return colors.get(category) || '#95a5a6';
}
|
||||
|
||||
// Initial render
renderArticles(0);

// Handle scroll
container.addEventListener('scroll', () => {
  const scrollTop = container.scrollTop;
  const scrollHeight = container.scrollHeight;
  const clientHeight = container.clientHeight;

  // When within 200px of the bottom, load next page of regular articles
  if (scrollTop + clientHeight >= scrollHeight - 200) {
    const nextIndex = currentRegularIndex + articlesPerPage;
    if (nextIndex < totalArticles) {
      renderArticles(nextIndex);

      // Scroll to where regular articles start (leaving 100px of context).
      const regularStart = container.querySelector('.article:not(.featured)');
      if (regularStart) {
        // offsetTop is measured from the offsetParent. With this page's CSS the
        // container is not positioned, so the article and the container share
        // the same offsetParent; subtracting the container's own offsetTop
        // converts the value into a position inside the scrollable area.
        container.scrollTop = regularStart.offsetTop - container.offsetTop - 100;
      }
    }
  }
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
122
docs/examples/assets/virtual_scroll_twitter_like.html
Normal file
122
docs/examples/assets/virtual_scroll_twitter_like.html
Normal file
@@ -0,0 +1,122 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Twitter-like Virtual Scroll</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f0f2f5;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #1da1f2;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
#timeline {
|
||||
max-width: 600px;
|
||||
margin: 0 auto;
|
||||
height: 600px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
border: 1px solid #e1e8ed;
|
||||
border-radius: 10px;
|
||||
}
|
||||
|
||||
.tweet {
|
||||
padding: 15px;
|
||||
border-bottom: 1px solid #e1e8ed;
|
||||
min-height: 80px;
|
||||
}
|
||||
|
||||
.tweet:hover {
|
||||
background-color: #f7f9fa;
|
||||
}
|
||||
|
||||
.author {
|
||||
font-weight: bold;
|
||||
color: #14171a;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.content {
|
||||
color: #14171a;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.stats {
|
||||
color: #657786;
|
||||
font-size: 14px;
|
||||
margin-top: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Virtual Scroll Demo - Twitter Style</h1>
|
||||
<p style="text-align: center; color: #666;">This simulates Twitter's timeline where content is replaced as you scroll</p>
|
||||
<div id="timeline"></div>
|
||||
|
||||
<script>
|
||||
// Simulate Twitter-like virtual scrolling where DOM elements are replaced
const timeline = document.getElementById('timeline');
const totalTweets = 500;
const tweetsPerPage = 10;
let currentIndex = 0; // index of the first tweet currently rendered

// Generate fake tweet data up front; only one page of it is ever present in
// the DOM at a time.
const allTweets = [];
for (let i = 0; i < totalTweets; i++) {
  allTweets.push({
    id: i,
    author: `User_${i + 1}`,
    content: `This is tweet #${i + 1} of ${totalTweets}. Virtual scrolling replaces DOM elements to maintain performance. Unique content ID: ${i}`,
    likes: Math.floor(Math.random() * 1000),
    retweets: Math.floor(Math.random() * 500),
  });
}
|
||||
|
||||
/**
 * Render one page of tweets, REPLACING whatever the timeline currently shows.
 *
 * Writes the markup for tweets `startIndex` up to (but excluding)
 * `startIndex + tweetsPerPage`, clamped to `totalTweets`, into
 * `timeline.innerHTML` and records `startIndex` in `currentIndex`.
 *
 * @param {number} startIndex - Index into `allTweets` of the first tweet to show.
 */
function renderTweets(startIndex) {
  const tweets = [];
  const endIndex = Math.min(startIndex + tweetsPerPage, totalTweets);

  for (let i = startIndex; i < endIndex; i++) {
    const tweet = allTweets[i];
    tweets.push(`
      <div class="tweet" data-tweet-id="${tweet.id}">
        <div class="author">@${tweet.author}</div>
        <div class="content">${tweet.content}</div>
        <div class="stats">❤️ ${tweet.likes} | 🔁 ${tweet.retweets}</div>
      </div>
    `);
  }

  // REPLACE entire content (virtual scroll behavior)
  timeline.innerHTML = tweets.join('');
  currentIndex = startIndex;
}
|
||||
|
||||
// Initial render
renderTweets(0);

// Handle scroll
timeline.addEventListener('scroll', () => {
  const scrollTop = timeline.scrollTop;
  const scrollHeight = timeline.scrollHeight;
  const clientHeight = timeline.clientHeight;

  // When within 100px of the bottom, load next page
  if (scrollTop + clientHeight >= scrollHeight - 100) {
    const nextIndex = currentIndex + tweetsPerPage;
    if (nextIndex < totalTweets) {
      renderTweets(nextIndex);
      // Small scroll adjustment for continuous scrolling
      timeline.scrollTop = 50;
    }
  }
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user