Files
crawl4ai/docs/examples/assets/virtual_scroll_news_feed.html
UncleCode a353515271 feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
2025-06-29 20:41:37 +08:00

210 lines
7.1 KiB
HTML

<!DOCTYPE html>
<html>
<head>
<!-- Declare the encoding explicitly: the page contains non-ASCII characters
(emoji and an em dash) that are garbled if the browser has to guess. -->
<meta charset="UTF-8">
<title>News Feed with Mixed Scroll Behavior</title>
<style>
/* Page shell: serif type on a light gray backdrop. */
body {
font-family: Georgia, serif;
margin: 0;
padding: 20px;
background-color: #f8f8f8;
}
/* Centered page title. */
h1 {
text-align: center;
color: #1a1a1a;
font-size: 32px;
margin-bottom: 10px;
}
/* Centered subtitle line. */
.description {
text-align: center;
color: #666;
margin-bottom: 20px;
}
/* Feed viewport: fixed 700px height with vertical scrolling. The inline
   script attaches its scroll listener to this element. */
#newsContainer {
max-width: 900px;
margin: 0 auto;
height: 700px;
overflow-y: auto;
background: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
padding: 20px;
}
/* One article card; a bottom rule separates consecutive cards. */
.article {
margin-bottom: 30px;
padding-bottom: 30px;
border-bottom: 1px solid #e0e0e0;
}
/* No rule under an article that is the last child of its parent. */
.article:last-child {
border-bottom: none;
}
.article-header {
margin-bottom: 15px;
}
/* Category badge. This background is the stylesheet default; the script
   sets an inline background on the badges of regular articles. */
.category {
display: inline-block;
background: #ff6b6b;
color: white;
padding: 4px 12px;
font-size: 12px;
text-transform: uppercase;
border-radius: 3px;
margin-bottom: 10px;
}
.headline {
font-size: 24px;
font-weight: bold;
color: #1a1a1a;
margin: 10px 0;
line-height: 1.3;
}
/* Date line under the headline. */
.meta {
color: #888;
font-size: 14px;
margin-bottom: 15px;
}
/* Article body text. */
.content {
font-size: 16px;
line-height: 1.8;
color: #333;
}
/* Featured cards get a tinted, padded, rounded panel. */
.featured {
background: #fff9e6;
padding: 20px;
border-radius: 5px;
margin-bottom: 30px;
}
/* Badge color for featured cards. */
.featured .category {
background: #ffa500;
}
</style>
</head>
<body>
<h1>📰 Dynamic News Feed</h1>
<p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
<div id="newsContainer"></div>
<script>
// Scrollable element that receives all rendered article markup.
const container = document.getElementById('newsContainer');

// Size of the regular-article pool and of the window rendered at one time.
const totalArticles = 100;
const articlesPerPage = 5;

// Start index of the regular-article window currently rendered;
// renderArticles() writes it after each render.
let currentRegularIndex = 0;

// Section labels, handed out to regular articles in rotation.
const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];

// Three featured stories, all dated today; renderArticles() emits them on
// every render.
const featuredArticles = Array.from({ length: 3 }, (_, slot) => {
  const storyNumber = slot + 1;
  return {
    id: `featured-${slot}`,
    category: 'Featured',
    headline: `Breaking: Major Story ${storyNumber} That Stays Visible`,
    date: new Date().toLocaleDateString(),
    content: `This is featured article ${storyNumber}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`
  };
});

// Pool of regular articles shown a page at a time. Each entry is dated one
// day (86400000 ms) earlier than the previous one.
const regularArticles = Array.from({ length: totalArticles }, (_, position) => {
  const section = categories[position % categories.length];
  const ordinal = position + 1;
  return {
    id: `article-${position}`,
    category: section,
    headline: `${section} News: Article ${ordinal} of ${totalArticles}`,
    date: new Date(Date.now() - position * 86400000).toLocaleDateString(),
    content: `This is regular article ${ordinal}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${position}`
  };
});
// Rebuild the feed markup: every featured article, a divider, then the window
// of regular articles that starts at regularStartIndex.
//
// regularStartIndex - index into regularArticles of the first article to show.
//                     At most articlesPerPage articles are emitted, and never
//                     any at or beyond index totalArticles.
//
// Side effects: overwrites container.innerHTML and stores regularStartIndex
// in currentRegularIndex.
function renderArticles(regularStartIndex) {
  // Single card template. The featured and regular variants differ only in
  // the wrapper class and in the extra attribute text on the category badge.
  const buildCard = (entry, wrapperClass, badgeAttributes) => [
    '',
    `<div class="${wrapperClass}" data-article-id="${entry.id}">`,
    '<div class="article-header">',
    `<span class="category"${badgeAttributes}>${entry.category}</span>`,
    `<h2 class="headline">${entry.headline}</h2>`,
    `<div class="meta">📅 ${entry.date}</div>`,
    '</div>',
    `<div class="content">${entry.content}</div>`,
    '</div>',
    ''
  ].join('\n');

  // Featured cards are emitted on every render, ahead of everything else.
  const featuredCards = featuredArticles.map(
    (entry) => buildCard(entry, 'article featured', '')
  );

  const divider = '<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>';

  // Current window of regular articles, clipped at the end of the pool.
  const windowEnd = Math.min(regularStartIndex + articlesPerPage, totalArticles);
  const regularCards = [];
  for (let position = regularStartIndex; position < windowEnd; position += 1) {
    const entry = regularArticles[position];
    const badgeAttributes = ` style="background: ${getCategoryColor(entry.category)}"`;
    regularCards.push(buildCard(entry, 'article', badgeAttributes));
  }

  container.innerHTML = [...featuredCards, divider, ...regularCards].join('');
  currentRegularIndex = regularStartIndex;
}
// Badge background colors keyed by category name. A Map is used instead of an
// object literal so that only the entries listed here can ever match, and so
// the table is built once rather than on every call.
const CATEGORY_BADGE_COLORS = new Map([
  ['Politics', '#e74c3c'],
  ['Technology', '#3498db'],
  ['Business', '#2ecc71'],
  ['Science', '#9b59b6'],
  ['Sports', '#f39c12'],
  ['Entertainment', '#e91e63']
]);

// Gray used when a category has no dedicated color.
const DEFAULT_BADGE_COLOR = '#95a5a6';

// Return the CSS color for a category badge.
//
// category - category name, e.g. 'Politics'.
//
// Returns the mapped hex color string, or the default gray for any other
// value. That includes names such as 'constructor' or 'toString', which a
// plain-object lookup would resolve through Object.prototype and return as a
// non-color value.
function getCategoryColor(category) {
  return CATEGORY_BADGE_COLORS.get(category) || DEFAULT_BADGE_COLOR;
}
// Paint the first page before any scrolling happens.
renderArticles(0);

// Scroll listener: once the container is scrolled to within 200px of its
// bottom, swap in the next page of regular articles (if any remain) and move
// the scroll position back to where the regular articles begin.
const onFeedScroll = () => {
  const { scrollTop, scrollHeight, clientHeight } = container;

  const isNearBottom = scrollTop + clientHeight >= scrollHeight - 200;
  if (!isNearBottom) {
    return;
  }

  const upcomingStart = currentRegularIndex + articlesPerPage;
  const hasMorePages = upcomingStart < totalArticles;
  if (!hasMorePages) {
    return;
  }

  renderArticles(upcomingStart);

  // Reposition 100px above the first non-featured article.
  // NOTE(review): offsetTop is measured from the element's offsetParent,
  // which may not be the container; confirm the resulting position.
  const firstRegular = document.querySelector('.article:not(.featured)');
  if (firstRegular) {
    container.scrollTop = firstRegular.offsetTop - 100;
  }
};

container.addEventListener('scroll', onFeedScroll);
</script>
</body>
</html>