feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that previously could not be fully captured with traditional scrolling techniques.
This commit is contained in:
BIN
docs/examples/assets/instagram_grid_result.png
Normal file
BIN
docs/examples/assets/instagram_grid_result.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 6.6 MiB |
132
docs/examples/assets/virtual_scroll_append_only.html
Normal file
132
docs/examples/assets/virtual_scroll_append_only.html
Normal file
@@ -0,0 +1,132 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Append-Only Scroll (Traditional Infinite Scroll)</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f5f5f5;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #333;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.posts-container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 5px;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.post {
|
||||
background: #f9f9f9;
|
||||
padding: 15px;
|
||||
margin-bottom: 15px;
|
||||
border-radius: 5px;
|
||||
border: 1px solid #eee;
|
||||
}
|
||||
|
||||
.post-title {
|
||||
font-size: 18px;
|
||||
font-weight: bold;
|
||||
color: #2c3e50;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.post-content {
|
||||
color: #555;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 20px;
|
||||
color: #888;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Traditional Infinite Scroll Demo</h1>
|
||||
<p style="text-align: center; color: #666;">This appends new content without removing old content</p>
|
||||
<div class="posts-container"></div>
|
||||
|
||||
<script>
|
||||
// Traditional infinite scroll - APPENDS content
|
||||
const container = document.querySelector('.posts-container');
|
||||
const totalPosts = 200;
|
||||
const postsPerPage = 20;
|
||||
let loadedPosts = 0;
|
||||
let isLoading = false;
|
||||
|
||||
// Generate fake post data
|
||||
/**
 * Build the placeholder record for a single demo post.
 *
 * @param {number} index - Zero-based position of the post; stored as the post id.
 * @returns {{id: number, title: string, content: string}} Post data whose title
 *   and content text use the 1-based post number (`index + 1`).
 */
function generatePost(index) {
  const displayNumber = index + 1;
  const title = `Post Title #${displayNumber}`;
  const content = `This is the content of post ${displayNumber}. In traditional infinite scroll, new content is appended to existing content. The DOM keeps growing. Post ID: ${index}`;

  return { id: index, title, content };
}
|
||||
|
||||
// Load more posts - APPENDS to existing content
|
||||
/**
 * Append the next page of posts to `container` after a simulated 300 ms delay.
 *
 * Does nothing while a previous load is still pending (`isLoading`) or once
 * `loadedPosts` has reached `totalPosts`. While the delay runs, a temporary
 * "Loading more posts..." element sits at the end of the container; it is
 * removed right before the new posts are appended. Existing posts are never
 * removed, so the DOM keeps growing.
 */
function loadMorePosts() {
  const everythingLoaded = loadedPosts >= totalPosts;
  if (isLoading || everythingLoaded) {
    return;
  }

  isLoading = true;

  // Temporary placeholder shown while the simulated request is in flight.
  const spinner = document.createElement('div');
  spinner.className = 'loading';
  spinner.textContent = 'Loading more posts...';
  container.appendChild(spinner);

  const appendNextPage = () => {
    container.removeChild(spinner);

    // Build the whole page off-DOM, then attach it in one operation.
    const lastIndex = Math.min(loadedPosts + postsPerPage, totalPosts);
    const batch = document.createDocumentFragment();

    for (let postIndex = loadedPosts; postIndex < lastIndex; postIndex += 1) {
      const post = generatePost(postIndex);
      const node = document.createElement('div');
      node.className = 'post';
      node.setAttribute('data-post-id', post.id);
      node.innerHTML = `
        <div class="post-title">${post.title}</div>
        <div class="post-content">${post.content}</div>
      `;
      batch.appendChild(node);
    }

    // APPEND new posts to the existing ones
    container.appendChild(batch);
    loadedPosts = lastIndex;
    isLoading = false;

    console.log(`Loaded ${loadedPosts} of ${totalPosts} posts`);
  };

  // Simulate network delay
  setTimeout(appendNextPage, 300);
}
|
||||
|
||||
// Initial load
|
||||
// Load the first page as soon as the script runs.
loadMorePosts();

// Request another page whenever the bottom edge of the viewport comes within
// 500px of the end of the document body.
window.addEventListener('scroll', () => {
  const nearEnd = window.innerHeight + window.scrollY >= document.body.offsetHeight - 500;
  if (!nearEnd) {
    return;
  }

  loadMorePosts();
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
158
docs/examples/assets/virtual_scroll_instagram_grid.html
Normal file
158
docs/examples/assets/virtual_scroll_instagram_grid.html
Normal file
@@ -0,0 +1,158 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Instagram-like Grid Virtual Scroll</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #fafafa;
|
||||
}
|
||||
|
||||
h1 {
|
||||
text-align: center;
|
||||
color: #262626;
|
||||
font-weight: 300;
|
||||
}
|
||||
|
||||
.feed-container {
|
||||
max-width: 935px;
|
||||
margin: 0 auto;
|
||||
height: 800px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
border: 1px solid #dbdbdb;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(3, 1fr);
|
||||
gap: 28px;
|
||||
padding: 28px;
|
||||
}
|
||||
|
||||
.post {
|
||||
aspect-ratio: 1;
|
||||
background: #f0f0f0;
|
||||
border-radius: 3px;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.post:hover .overlay {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
.post img {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
}
|
||||
|
||||
.overlay {
|
||||
position: absolute;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
background: rgba(0, 0, 0, 0.3);
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
color: white;
|
||||
font-size: 14px;
|
||||
opacity: 0;
|
||||
transition: opacity 0.2s;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Instagram Grid Virtual Scroll</h1>
|
||||
<p style="text-align: center; color: #8e8e8e;">Grid layout with virtual scrolling - only visible rows are rendered</p>
|
||||
<div class="feed-container">
|
||||
<div class="grid" id="grid"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Instagram-like grid virtual scroll
|
||||
const grid = document.getElementById('grid');
|
||||
const container = document.querySelector('.feed-container');
|
||||
const totalPosts = 999; // Instagram style count
|
||||
const postsPerRow = 3;
|
||||
const rowsPerPage = 4; // 12 posts per page
|
||||
const postsPerPage = postsPerRow * rowsPerPage;
|
||||
let currentStartIndex = 0;
|
||||
|
||||
// Generate fake Instagram post data
|
||||
// Pre-generate the fake Instagram post data: one record per post, with random
// like and comment counts.
const allPosts = Array.from({ length: totalPosts }, (_, id) => ({
  id,
  likes: Math.floor(Math.random() * 10000),
  comments: Math.floor(Math.random() * 500),
  imageNumber: (id % 10) + 1, // Cycle through 10 placeholder images
}));
|
||||
|
||||
// Render grid - REPLACES content for performance
|
||||
/**
 * Render one page of the grid, REPLACING whatever `grid` currently contains
 * (virtual scroll behaviour).
 *
 * Posts `startIndex` up to, but not including,
 * `min(startIndex + postsPerPage, totalPosts)` are read from `allPosts` and
 * written into `grid.innerHTML`; `currentStartIndex` is then set to
 * `startIndex`. Each tile gets an inline SVG placeholder image with a random
 * background colour and the 1-based post number.
 *
 * @param {number} startIndex - Index in `allPosts` of the first post to show.
 */
function renderGrid(startIndex) {
  const endIndex = Math.min(startIndex + postsPerPage, totalPosts);
  const tiles = [];

  for (let i = startIndex; i < endIndex; i++) {
    const post = allPosts[i];

    // Always emit exactly six hex digits: an unpadded value such as "a7" is
    // not a valid colour and would leave the placeholder without a fill.
    const fillColor = Math.floor(Math.random() * 0x1000000)
      .toString(16)
      .padStart(6, '0');

    tiles.push(`
      <div class="post" data-post-id="${post.id}">
        <img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='400' height='400'%3E%3Crect width='400' height='400' fill='%23${fillColor}'/%3E%3Ctext x='50%25' y='50%25' text-anchor='middle' dy='.3em' font-family='Arial' font-size='48' fill='white'%3E${post.id + 1}%3C/text%3E%3C/svg%3E" alt="Post ${post.id + 1}">
        <div class="overlay">
          <div class="stats">
            <span>❤️ ${post.likes.toLocaleString()}</span>
            <span>💬 ${post.comments}</span>
          </div>
        </div>
      </div>
    `);
  }

  // REPLACE grid content (virtual scroll)
  grid.innerHTML = tiles.join('');
  currentStartIndex = startIndex;
}
|
||||
|
||||
// Initial render
|
||||
renderGrid(0);
|
||||
|
||||
// Handle scroll
|
||||
// Handle scroll, debounced: the check runs 50 ms after the most recent scroll
// event on the feed container.
let scrollTimeout;
container.addEventListener('scroll', () => {
  clearTimeout(scrollTimeout);
  scrollTimeout = setTimeout(() => {
    const { scrollTop, scrollHeight, clientHeight } = container;

    // Only advance when the user is within 100px of the container's bottom.
    const nearBottom = scrollTop + clientHeight >= scrollHeight - 100;
    if (!nearBottom) {
      return;
    }

    // When scrolled to bottom, show next page
    const nextIndex = currentStartIndex + postsPerPage;
    if (nextIndex < totalPosts) {
      renderGrid(nextIndex);
      container.scrollTop = 100; // Reset scroll for continuous experience
    }
  }, 50);
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
210
docs/examples/assets/virtual_scroll_news_feed.html
Normal file
210
docs/examples/assets/virtual_scroll_news_feed.html
Normal file
@@ -0,0 +1,210 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>News Feed with Mixed Scroll Behavior</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Georgia, serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f8f8f8;
|
||||
}
|
||||
|
||||
h1 {
|
||||
text-align: center;
|
||||
color: #1a1a1a;
|
||||
font-size: 32px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.description {
|
||||
text-align: center;
|
||||
color: #666;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
#newsContainer {
|
||||
max-width: 900px;
|
||||
margin: 0 auto;
|
||||
height: 700px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.article {
|
||||
margin-bottom: 30px;
|
||||
padding-bottom: 30px;
|
||||
border-bottom: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.article:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.article-header {
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.category {
|
||||
display: inline-block;
|
||||
background: #ff6b6b;
|
||||
color: white;
|
||||
padding: 4px 12px;
|
||||
font-size: 12px;
|
||||
text-transform: uppercase;
|
||||
border-radius: 3px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.headline {
|
||||
font-size: 24px;
|
||||
font-weight: bold;
|
||||
color: #1a1a1a;
|
||||
margin: 10px 0;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
.meta {
|
||||
color: #888;
|
||||
font-size: 14px;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
|
||||
.content {
|
||||
font-size: 16px;
|
||||
line-height: 1.8;
|
||||
color: #333;
|
||||
}
|
||||
|
||||
.featured {
|
||||
background: #fff9e6;
|
||||
padding: 20px;
|
||||
border-radius: 5px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
|
||||
.featured .category {
|
||||
background: #ffa500;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>📰 Dynamic News Feed</h1>
|
||||
<p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
|
||||
<div id="newsContainer"></div>
|
||||
|
||||
<script>
|
||||
const container = document.getElementById('newsContainer');
|
||||
const totalArticles = 100;
|
||||
const articlesPerPage = 5;
|
||||
let currentRegularIndex = 0;
|
||||
|
||||
// Categories for variety
|
||||
const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];
|
||||
|
||||
// Generate article data
|
||||
// Generate article data.

// 3 featured articles that always stay in the DOM.
const featuredArticles = Array.from({ length: 3 }, (_, i) => ({
  id: `featured-${i}`,
  category: 'Featured',
  headline: `Breaking: Major Story ${i + 1} That Stays Visible`,
  date: new Date().toLocaleDateString(),
  content: `This is featured article ${i + 1}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`,
}));

// Regular articles that get virtualized. Categories repeat in order, and each
// article is dated one day (86400000 ms) earlier than the previous one.
const regularArticles = Array.from({ length: totalArticles }, (_, i) => ({
  id: `article-${i}`,
  category: categories[i % categories.length],
  headline: `${categories[i % categories.length]} News: Article ${i + 1} of ${totalArticles}`,
  date: new Date(Date.now() - i * 86400000).toLocaleDateString(),
  content: `This is regular article ${i + 1}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${i}`,
}));
|
||||
|
||||
// Render articles - Featured stay, regular ones are replaced
|
||||
/**
 * Rebuild the news container: every featured article, a "Latest News"
 * divider, then one page of regular articles.
 *
 * The container's HTML is replaced on every call, so the featured articles
 * are re-rendered each time while the regular articles shown are those from
 * `regularStartIndex` up to, but not including,
 * `min(regularStartIndex + articlesPerPage, totalArticles)`.
 * `currentRegularIndex` is updated to `regularStartIndex`.
 *
 * @param {number} regularStartIndex - Index in `regularArticles` of the first
 *   regular article to show.
 */
function renderArticles(regularStartIndex) {
  let markup = '';

  // Always show featured articles
  for (const featured of featuredArticles) {
    markup += `
      <div class="article featured" data-article-id="${featured.id}">
        <div class="article-header">
          <span class="category">${featured.category}</span>
          <h2 class="headline">${featured.headline}</h2>
          <div class="meta">📅 ${featured.date}</div>
        </div>
        <div class="content">${featured.content}</div>
      </div>
    `;
  }

  // Divider between the persistent and the virtualized section
  markup += '<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>';

  // Current page of regular articles (virtual scroll)
  const stopIndex = Math.min(regularStartIndex + articlesPerPage, totalArticles);
  for (let position = regularStartIndex; position < stopIndex; position += 1) {
    const entry = regularArticles[position];
    markup += `
      <div class="article" data-article-id="${entry.id}">
        <div class="article-header">
          <span class="category" style="background: ${getCategoryColor(entry.category)}">${entry.category}</span>
          <h2 class="headline">${entry.headline}</h2>
          <div class="meta">📅 ${entry.date}</div>
        </div>
        <div class="content">${entry.content}</div>
      </div>
    `;
  }

  container.innerHTML = markup;
  currentRegularIndex = regularStartIndex;
}
|
||||
|
||||
/**
 * Look up the badge background colour for a news category.
 *
 * A Map is used instead of a plain object so that only the categories listed
 * here can match; with an object literal, names such as "constructor" or
 * "toString" would resolve to inherited Object.prototype members instead of
 * falling back to the default colour.
 *
 * @param {string} category - Category name, e.g. 'Politics'.
 * @returns {string} Hex colour for the category, or '#95a5a6' when the
 *   category is not listed.
 */
function getCategoryColor(category) {
  const palette = new Map([
    ['Politics', '#e74c3c'],
    ['Technology', '#3498db'],
    ['Business', '#2ecc71'],
    ['Science', '#9b59b6'],
    ['Sports', '#f39c12'],
    ['Entertainment', '#e91e63'],
  ]);

  return palette.has(category) ? palette.get(category) : '#95a5a6';
}
|
||||
|
||||
// Initial render
|
||||
renderArticles(0);
|
||||
|
||||
// Handle scroll
|
||||
// Handle scroll: once the user is within 200px of the container's bottom,
// swap in the next page of regular articles.
container.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = container;

  const nearBottom = scrollTop + clientHeight >= scrollHeight - 200;
  if (!nearBottom) {
    return;
  }

  const nextIndex = currentRegularIndex + articlesPerPage;
  if (nextIndex < totalArticles) {
    renderArticles(nextIndex);

    // Jump back to just above the first regular article so the featured
    // section is skipped and scrolling can continue.
    const firstRegular = document.querySelector('.article:not(.featured)');
    if (firstRegular) {
      container.scrollTop = firstRegular.offsetTop - 100;
    }
  }
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
122
docs/examples/assets/virtual_scroll_twitter_like.html
Normal file
122
docs/examples/assets/virtual_scroll_twitter_like.html
Normal file
@@ -0,0 +1,122 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Twitter-like Virtual Scroll</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background-color: #f0f2f5;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #1da1f2;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
#timeline {
|
||||
max-width: 600px;
|
||||
margin: 0 auto;
|
||||
height: 600px;
|
||||
overflow-y: auto;
|
||||
background: white;
|
||||
border: 1px solid #e1e8ed;
|
||||
border-radius: 10px;
|
||||
}
|
||||
|
||||
.tweet {
|
||||
padding: 15px;
|
||||
border-bottom: 1px solid #e1e8ed;
|
||||
min-height: 80px;
|
||||
}
|
||||
|
||||
.tweet:hover {
|
||||
background-color: #f7f9fa;
|
||||
}
|
||||
|
||||
.author {
|
||||
font-weight: bold;
|
||||
color: #14171a;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
|
||||
.content {
|
||||
color: #14171a;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
.stats {
|
||||
color: #657786;
|
||||
font-size: 14px;
|
||||
margin-top: 10px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Virtual Scroll Demo - Twitter Style</h1>
|
||||
<p style="text-align: center; color: #666;">This simulates Twitter's timeline where content is replaced as you scroll</p>
|
||||
<div id="timeline"></div>
|
||||
|
||||
<script>
|
||||
// Simulate Twitter-like virtual scrolling where DOM elements are replaced
|
||||
const timeline = document.getElementById('timeline');
|
||||
const totalTweets = 500;
|
||||
const tweetsPerPage = 10;
|
||||
let currentIndex = 0;
|
||||
|
||||
// Generate fake tweet data
|
||||
// Pre-generate the fake tweet data: one record per tweet, with random like and
// retweet counts.
const allTweets = Array.from({ length: totalTweets }, (_, id) => ({
  id,
  author: `User_${id + 1}`,
  content: `This is tweet #${id + 1} of ${totalTweets}. Virtual scrolling replaces DOM elements to maintain performance. Unique content ID: ${id}`,
  likes: Math.floor(Math.random() * 1000),
  retweets: Math.floor(Math.random() * 500),
}));
|
||||
|
||||
// Render tweets - REPLACES content
|
||||
/**
 * Render one page of tweets, REPLACING the timeline's entire content
 * (virtual scroll behaviour).
 *
 * Tweets `startIndex` up to, but not including,
 * `min(startIndex + tweetsPerPage, totalTweets)` are read from `allTweets`
 * and written to `timeline.innerHTML`; `currentIndex` is then set to
 * `startIndex`.
 *
 * @param {number} startIndex - Index in `allTweets` of the first tweet to show.
 */
function renderTweets(startIndex) {
  const stopIndex = Math.min(startIndex + tweetsPerPage, totalTweets);
  let markup = '';

  for (let position = startIndex; position < stopIndex; position += 1) {
    const entry = allTweets[position];
    markup += `
      <div class="tweet" data-tweet-id="${entry.id}">
        <div class="author">@${entry.author}</div>
        <div class="content">${entry.content}</div>
        <div class="stats">❤️ ${entry.likes} | 🔁 ${entry.retweets}</div>
      </div>
    `;
  }

  // REPLACE entire content (virtual scroll behavior)
  timeline.innerHTML = markup;
  currentIndex = startIndex;
}
|
||||
|
||||
// Initial render
|
||||
renderTweets(0);
|
||||
|
||||
// Handle scroll
|
||||
// Handle scroll: once the user is within 100px of the timeline's bottom,
// replace the visible tweets with the next page.
timeline.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = timeline;

  const nearBottom = scrollTop + clientHeight >= scrollHeight - 100;
  if (!nearBottom) {
    return;
  }

  const nextIndex = currentIndex + tweetsPerPage;
  if (nextIndex < totalTweets) {
    renderTweets(nextIndex);
    // Small scroll adjustment for continuous scrolling
    timeline.scrollTop = 50;
  }
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
367
docs/examples/virtual_scroll_example.py
Normal file
367
docs/examples/virtual_scroll_example.py
Normal file
@@ -0,0 +1,367 @@
|
||||
"""
|
||||
Example of using the virtual scroll feature to capture content from pages
|
||||
with virtualized scrolling (like Twitter, Instagram, or other infinite scroll feeds).
|
||||
|
||||
This example demonstrates virtual scroll with a local test server serving
|
||||
different types of scrolling behaviors from HTML files in the assets directory.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import http.server
|
||||
import socketserver
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig
|
||||
|
||||
# Get the assets directory path
|
||||
ASSETS_DIR = Path(__file__).parent / "assets"
|
||||
|
||||
class TestServer:
    """Simple HTTP server that serves the test HTML files in ``ASSETS_DIR``.

    The server runs in a daemon thread. Files are served from ``ASSETS_DIR``
    through the request handler's ``directory`` argument, so the process-wide
    working directory is never changed.

    Attributes set by ``__init__``:
        port: First port to try; updated to the port actually bound.
        httpd: The ``socketserver.TCPServer`` instance once started, else None.
        server_thread: Thread running ``serve_forever`` once started, else None.
    """

    def __init__(self, port=8080):
        self.port = port
        self.httpd = None
        self.server_thread = None

    async def start(self):
        """Start the test server on the first free port at or above ``self.port``.

        Up to 10 consecutive ports are tried.

        Returns:
            int: The port the server is listening on.

        Raises:
            RuntimeError: If none of the 10 candidate ports could be bound.
        """
        assets_dir = str(ASSETS_DIR)

        class Handler(http.server.SimpleHTTPRequestHandler):
            """Request handler pinned to the assets directory."""

            def __init__(self, *args, **kwargs):
                # Serving via ``directory`` avoids os.chdir(), which would
                # affect the whole process and stay in effect if startup failed.
                super().__init__(*args, directory=assets_dir, **kwargs)

        # Try to find an available port
        for _ in range(10):
            try:
                self.httpd = socketserver.TCPServer(("", self.port), Handler)
                break
            except OSError:
                self.port += 1

        if self.httpd is None:
            raise RuntimeError("Could not find available port")

        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()

        # Give server time to start
        await asyncio.sleep(0.5)

        print(f"Test server started on http://localhost:{self.port}")
        return self.port

    def stop(self):
        """Stop the test server and release its listening socket.

        Safe to call when the server was never started.
        """
        if self.httpd:
            self.httpd.shutdown()
            # shutdown() only ends serve_forever(); the socket stays open until
            # server_close() is called.
            self.httpd.server_close()
        if self.server_thread:
            self.server_thread.join(timeout=1)
|
||||
|
||||
|
||||
async def example_twitter_like_virtual_scroll():
    """
    Example 1: Twitter-like virtual scroll where content is REPLACED.

    Starts the local test server, crawls ``virtual_scroll_twitter_like.html``
    with a ``VirtualScrollConfig`` targeting ``#timeline``, then reports how
    many distinct ``data-tweet-id`` values appear in the returned HTML.
    The server is stopped even if the crawl fails.
    """
    import re

    separator = "=" * 60
    print("\n" + separator)
    print("EXAMPLE 1: Twitter-like Virtual Scroll")
    print(separator)

    server = TestServer()
    port = await server.start()

    try:
        # Virtual scroll settings for the Twitter-like timeline
        scroll_settings = VirtualScrollConfig(
            container_selector="#timeline",  # The scrollable container
            scroll_count=50,  # Scroll up to 50 times to get all content
            scroll_by="container_height",  # Scroll by container's height
            wait_after_scroll=0.3,  # Wait 300ms after each scroll
        )
        run_config = CrawlerRunConfig(
            virtual_scroll_config=scroll_settings,
            cache_mode=CacheMode.BYPASS,
        )

        # headless=False opens a visible browser window so the scrolling can be watched
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1280, "height": 800},
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_twitter_like.html",
                config=run_config,
            )

            # Count tweets captured
            found_ids = re.findall(r'data-tweet-id="(\d+)"', result.html)
            unique_tweets = sorted({int(raw_id) for raw_id in found_ids})
            captured = len(unique_tweets)

            print("\n📊 Results:")
            print(f" Total HTML length: {len(result.html):,} characters")
            print(f" Tweets captured: {captured} unique tweets")
            if unique_tweets:
                # The list is sorted, so its ends are the min and max ids.
                print(f" Tweet IDs range: {unique_tweets[0]} to {unique_tweets[-1]}")
                print(" Expected range: 0 to 499 (500 tweets total)")

                if captured == 500:
                    print(" ✅ SUCCESS! All tweets captured!")
                else:
                    print(f" ⚠️ Captured {captured}/500 tweets")

    finally:
        server.stop()
|
||||
|
||||
|
||||
async def example_traditional_append_scroll():
    """
    Example 2: Traditional infinite scroll where content is APPENDED.

    Crawls ``virtual_scroll_append_only.html`` from the local test server with
    a ``VirtualScrollConfig`` targeting ``.posts-container`` and reports how
    many distinct ``data-post-id`` values appear in the returned HTML.
    The server is stopped even if the crawl fails.
    """
    import re

    separator = "=" * 60
    print("\n" + separator)
    print("EXAMPLE 2: Traditional Append-Only Scroll")
    print(separator)

    server = TestServer()
    port = await server.start()

    try:
        scroll_settings = VirtualScrollConfig(
            container_selector=".posts-container",
            scroll_count=15,  # Fewer scrolls needed since content accumulates
            scroll_by=500,  # Scroll by 500 pixels
            wait_after_scroll=0.4,
        )
        run_config = CrawlerRunConfig(
            virtual_scroll_config=scroll_settings,
            cache_mode=CacheMode.BYPASS,
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_append_only.html",
                config=run_config,
            )

            # Count posts
            found_ids = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted({int(raw_id) for raw_id in found_ids})

            print("\n📊 Results:")
            print(f" Total HTML length: {len(result.html):,} characters")
            print(f" Posts captured: {len(unique_posts)} unique posts")

            if unique_posts:
                # The list is sorted, so its ends are the min and max ids.
                print(f" Post IDs range: {unique_posts[0]} to {unique_posts[-1]}")
                print(" ℹ️ Note: This page appends content, so virtual scroll")
                print(" just helps trigger more loads. All content stays in DOM.")

    finally:
        server.stop()
|
||||
|
||||
|
||||
async def example_instagram_grid():
    """
    Example 3: Instagram-like grid with virtual scroll.

    Crawls ``virtual_scroll_instagram_grid.html`` from the local test server
    with a ``VirtualScrollConfig`` targeting ``.feed-container``, reports how
    many distinct ``data-post-id`` values appear in the returned HTML, and
    writes the screenshot (when one is returned) to
    ``instagram_grid_result.png`` in the current working directory.
    The server is stopped even if the crawl fails.
    """
    import base64
    import re

    separator = "=" * 60
    print("\n" + separator)
    print("EXAMPLE 3: Instagram Grid Virtual Scroll")
    print(separator)

    server = TestServer()
    port = await server.start()

    try:
        # Virtual scroll settings for the grid layout
        scroll_settings = VirtualScrollConfig(
            container_selector=".feed-container",  # Container with the grid
            scroll_count=100,  # Many scrolls for 999 posts
            scroll_by="container_height",
            wait_after_scroll=0.2,  # Faster scrolling for grid
        )
        run_config = CrawlerRunConfig(
            virtual_scroll_config=scroll_settings,
            cache_mode=CacheMode.BYPASS,
            screenshot=True,  # Take a screenshot of the final grid
        )

        # Show browser for this visual example
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1200, "height": 900},
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_instagram_grid.html",
                config=run_config,
            )

            # Count posts in grid
            found_ids = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted({int(raw_id) for raw_id in found_ids})

            print("\n📊 Results:")
            print(f" Posts in grid: {len(unique_posts)} unique posts")
            if unique_posts:
                # The list is sorted, so its ends are the min and max ids.
                print(f" Post IDs range: {unique_posts[0]} to {unique_posts[-1]}")
                print(" Expected: 0 to 998 (999 posts total)")

            # Save screenshot; the code decodes result.screenshot as base64
            if result.screenshot:
                image_bytes = base64.b64decode(result.screenshot)
                with open("instagram_grid_result.png", "wb") as f:
                    f.write(image_bytes)
                print(" 📸 Screenshot saved as instagram_grid_result.png")

    finally:
        server.stop()
|
||||
|
||||
|
||||
async def example_mixed_content():
    """
    Example 4: News feed with mixed behavior.

    Crawls ``virtual_scroll_news_feed.html`` from the local test server with a
    ``VirtualScrollConfig`` targeting ``#newsContainer`` and reports how many
    distinct featured and regular ``data-article-id`` values appear in the
    returned HTML. The server is stopped even if the crawl fails.
    """
    import re

    separator = "=" * 60
    print("\n" + separator)
    print("EXAMPLE 4: News Feed with Mixed Behavior")
    print(separator)

    server = TestServer()
    port = await server.start()

    try:
        scroll_settings = VirtualScrollConfig(
            container_selector="#newsContainer",
            scroll_count=25,
            scroll_by="container_height",
            wait_after_scroll=0.3,
        )
        run_config = CrawlerRunConfig(
            virtual_scroll_config=scroll_settings,
            cache_mode=CacheMode.BYPASS,
        )

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/virtual_scroll_news_feed.html",
                config=run_config,
            )

            # Count different types of articles
            featured = re.findall(r'data-article-id="featured-\d+"', result.html)
            regular = re.findall(r'data-article-id="article-(\d+)"', result.html)

            print("\n📊 Results:")
            print(f" Featured articles: {len(set(featured))} (always visible)")
            print(f" Regular articles: {len(set(regular))} unique articles")

            if regular:
                regular_ids = sorted({int(raw_id) for raw_id in regular})
                # The list is sorted, so its ends are the min and max ids.
                print(f" Regular article IDs: {regular_ids[0]} to {regular_ids[-1]}")
                print(" ℹ️ Note: Featured articles stay in DOM, only regular")
                print(" articles are replaced during virtual scroll")

    finally:
        server.stop()
|
||||
|
||||
|
||||
async def compare_with_without_virtual_scroll():
    """
    Comparison: Show the difference between crawling with and without virtual scroll.

    Crawls ``virtual_scroll_twitter_like.html`` from the local test server
    twice: first with a plain ``CrawlerRunConfig``, then with a
    ``VirtualScrollConfig`` targeting ``#timeline``. It prints the number of
    distinct ``data-tweet-id`` values and the HTML size for both runs. The
    improvement ratio is reported as "N/A" when the baseline crawl captured no
    tweets. The server is stopped even if a crawl fails.
    """
    import re

    print("\n" + "="*60)
    print("COMPARISON: With vs Without Virtual Scroll")
    print("="*60)

    server = TestServer()
    port = await server.start()

    try:
        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"
        tweet_id_pattern = r'data-tweet-id="(\d+)"'

        # First, crawl WITHOUT virtual scroll
        print("\n1️⃣ Crawling WITHOUT virtual scroll...")
        async with AsyncWebCrawler() as crawler:
            config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            result_normal = await crawler.arun(url=url, config=config_normal)

            # Count items
            tweets_normal = len(set(re.findall(tweet_id_pattern, result_normal.html)))

        # Then, crawl WITH virtual scroll
        print("2️⃣ Crawling WITH virtual scroll...")
        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2
        )

        config_virtual = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )

        async with AsyncWebCrawler() as crawler:
            result_virtual = await crawler.arun(url=url, config=config_virtual)

            # Count items
            tweets_virtual = len(set(re.findall(tweet_id_pattern, result_virtual.html)))

        # Format the ratio separately for each case: applying the ".1f" format
        # spec to the string "N/A" raises ValueError.
        if tweets_normal > 0:
            improvement = f"{tweets_virtual / tweets_normal:.1f}x more content!"
        else:
            improvement = "N/A (baseline crawl captured no tweets)"

        # Compare results
        print(f"\n📊 Comparison Results:")
        print(f" Without virtual scroll: {tweets_normal} tweets (only initial visible)")
        print(f" With virtual scroll: {tweets_virtual} tweets (all content captured)")
        print(f" Improvement: {improvement}")

        print(f"\n HTML size without: {len(result_normal.html):,} characters")
        print(f" HTML size with: {len(result_virtual.html):,} characters")

    finally:
        server.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    print("""
╔════════════════════════════════════════════════════════════╗
║ Virtual Scroll Examples for Crawl4AI ║
╚════════════════════════════════════════════════════════════╝

These examples demonstrate different virtual scroll scenarios:
1. Twitter-like (content replaced) - Classic virtual scroll
2. Traditional append - Content accumulates
3. Instagram grid - Visual grid layout
4. Mixed behavior - Some content stays, some virtualizes

Starting examples...
""")

    # Run every example in order, each one in its own event loop.
    examples = (
        example_twitter_like_virtual_scroll,
        example_traditional_append_scroll,
        example_instagram_grid,
        example_mixed_content,
        compare_with_without_virtual_scroll,
    )
    for run_example in examples:
        asyncio.run(run_example())

    print("\n✅ All examples completed!")
    print("\nTIP: Set headless=False in BrowserConfig to watch the scrolling in action!")
|
||||
Reference in New Issue
Block a user