feat: Add virtual scroll support for modern web scraping

Add comprehensive virtual scroll handling to capture all content from pages that recycle DOM nodes while scrolling (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
This commit is contained in:
UncleCode
2025-06-29 20:41:37 +08:00
parent 539a324cf6
commit a353515271
18 changed files with 2194 additions and 6 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.6 MiB

View File

@@ -0,0 +1,132 @@
<!DOCTYPE html>
<html>
<head>
<title>Append-Only Scroll (Traditional Infinite Scroll)</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
h1 {
color: #333;
text-align: center;
}
.posts-container {
max-width: 800px;
margin: 0 auto;
background: white;
border: 1px solid #ddd;
border-radius: 5px;
padding: 20px;
}
.post {
background: #f9f9f9;
padding: 15px;
margin-bottom: 15px;
border-radius: 5px;
border: 1px solid #eee;
}
.post-title {
font-size: 18px;
font-weight: bold;
color: #2c3e50;
margin-bottom: 10px;
}
.post-content {
color: #555;
line-height: 1.6;
}
.loading {
text-align: center;
padding: 20px;
color: #888;
}
</style>
</head>
<body>
<h1>Traditional Infinite Scroll Demo</h1>
<p style="text-align: center; color: #666;">This appends new content without removing old content</p>
<div class="posts-container"></div>
<script>
// Traditional infinite scroll - APPENDS content
// Element that receives every post; posts are appended to it and never removed.
const container = document.querySelector('.posts-container');
// Upper bound on how many posts the demo will ever create.
const totalPosts = 200;
// Maximum number of posts appended by a single load.
const postsPerPage = 20;
// Number of posts appended so far; also the index of the next post to create.
let loadedPosts = 0;
// Guard flag: true while a load is pending, so overlapping requests are ignored.
let isLoading = false;
/**
 * Build the fake post record for a zero-based index.
 *
 * @param {number} index - Zero-based position of the post.
 * @returns {{id: number, title: string, content: string}} The record; the
 *   title and content show the one-based number, `id` keeps the raw index.
 */
function generatePost(index) {
  const displayNumber = index + 1;
  const title = `Post Title #${displayNumber}`;
  const content = `This is the content of post ${displayNumber}. In traditional infinite scroll, new content is appended to existing content. The DOM keeps growing. Post ID: ${index}`;
  return { id: index, title, content };
}
/**
 * Append the next batch of posts to the container (nothing is ever removed).
 *
 * Does nothing when a load is already pending or every post has been loaded.
 * Otherwise it shows a temporary "loading" element, and after a simulated
 * 300 ms network delay replaces it with up to `postsPerPage` new posts.
 */
function loadMorePosts() {
  const allLoaded = loadedPosts >= totalPosts;
  if (isLoading || allLoaded) {
    return;
  }
  isLoading = true;

  // Temporary placeholder shown while the fake request is "in flight".
  const spinner = document.createElement('div');
  spinner.className = 'loading';
  spinner.textContent = 'Loading more posts...';
  container.appendChild(spinner);

  const appendNextBatch = () => {
    container.removeChild(spinner);

    // Build the whole batch off-DOM, then attach it in one operation.
    const batch = document.createDocumentFragment();
    const stopAt = Math.min(loadedPosts + postsPerPage, totalPosts);
    let cursor = loadedPosts;
    while (cursor < stopAt) {
      const { id, title, content } = generatePost(cursor);
      const card = document.createElement('div');
      card.className = 'post';
      card.setAttribute('data-post-id', id);
      card.innerHTML = `
<div class="post-title">${title}</div>
<div class="post-content">${content}</div>
`;
      batch.appendChild(card);
      cursor += 1;
    }

    container.appendChild(batch);
    loadedPosts = stopAt;
    isLoading = false;
    console.log(`Loaded ${loadedPosts} of ${totalPosts} posts`);
  };

  // Simulated network delay.
  setTimeout(appendNextBatch, 300);
}
// Load the first batch so the page is not empty.
loadMorePosts();
// Ask for another batch once the bottom of the viewport comes within 500px
// of the end of the document body.
window.addEventListener('scroll', () => {
  const viewportBottom = window.innerHeight + window.scrollY;
  const nearEnd = viewportBottom >= document.body.offsetHeight - 500;
  if (!nearEnd) {
    return;
  }
  loadMorePosts();
});
</script>
</body>
</html>

View File

@@ -0,0 +1,158 @@
<!DOCTYPE html>
<html>
<head>
<title>Instagram-like Grid Virtual Scroll</title>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #fafafa;
}
h1 {
text-align: center;
color: #262626;
font-weight: 300;
}
.feed-container {
max-width: 935px;
margin: 0 auto;
height: 800px;
overflow-y: auto;
background: white;
border: 1px solid #dbdbdb;
border-radius: 3px;
}
.grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 28px;
padding: 28px;
}
.post {
aspect-ratio: 1;
background: #f0f0f0;
border-radius: 3px;
position: relative;
overflow: hidden;
cursor: pointer;
}
.post:hover .overlay {
opacity: 1;
}
.post img {
width: 100%;
height: 100%;
object-fit: cover;
}
.overlay {
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: rgba(0, 0, 0, 0.3);
display: flex;
align-items: center;
justify-content: center;
color: white;
font-size: 14px;
opacity: 0;
transition: opacity 0.2s;
}
.stats {
display: flex;
gap: 20px;
}
</style>
</head>
<body>
<h1>Instagram Grid Virtual Scroll</h1>
<p style="text-align: center; color: #8e8e8e;">Grid layout with virtual scrolling - only visible rows are rendered</p>
<div class="feed-container">
<div class="grid" id="grid"></div>
</div>
<script>
// Instagram-like grid virtual scroll: state and fake data.
const grid = document.getElementById('grid');
const container = document.querySelector('.feed-container');
const totalPosts = 999; // Instagram style count
const postsPerRow = 3;
const rowsPerPage = 4;
// 3 posts x 4 rows = 12 posts rendered per page.
const postsPerPage = postsPerRow * rowsPerPage;
// Index of the first post currently rendered in the grid.
let currentStartIndex = 0;
// One record per post; imageNumber cycles through 1..10.
const allPosts = Array.from({ length: totalPosts }, (_, id) => ({
  id,
  likes: Math.floor(Math.random() * 10000),
  comments: Math.floor(Math.random() * 500),
  imageNumber: (id % 10) + 1,
}));
/**
 * Render one page of the grid, REPLACING whatever the grid showed before
 * (virtual-scroll behaviour: only the current page exists in the DOM).
 *
 * @param {number} startIndex - Index into `allPosts` of the first post to show.
 *   Up to `postsPerPage` posts are rendered, clamped to `totalPosts`.
 */
function renderGrid(startIndex) {
  const posts = [];
  const endIndex = Math.min(startIndex + postsPerPage, totalPosts);
  for (let i = startIndex; i < endIndex; i++) {
    const post = allPosts[i];
    // Always emit exactly six hex digits. Without padding, small random
    // values produce colours such as "#a7", which is not a valid colour.
    const fillColor = Math.floor(Math.random() * 0x1000000).toString(16).padStart(6, '0');
    posts.push(`
<div class="post" data-post-id="${post.id}">
<img src="data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='400' height='400'%3E%3Crect width='400' height='400' fill='%23${fillColor}'/%3E%3Ctext x='50%25' y='50%25' text-anchor='middle' dy='.3em' font-family='Arial' font-size='48' fill='white'%3E${post.id + 1}%3C/text%3E%3C/svg%3E" alt="Post ${post.id + 1}">
<div class="overlay">
<div class="stats">
<span>❤️ ${post.likes.toLocaleString()}</span>
<span>💬 ${post.comments}</span>
</div>
</div>
</div>
`);
  }
  // REPLACE grid content (virtual scroll)
  grid.innerHTML = posts.join('');
  currentStartIndex = startIndex;
}
// Initial render
renderGrid(0);
// Handle scroll. The work is debounced: each scroll event cancels the pending
// timer, so the check only runs once scrolling has been quiet for 50 ms.
let scrollTimeout;
container.addEventListener('scroll', () => {
  clearTimeout(scrollTimeout);
  scrollTimeout = setTimeout(() => {
    const { scrollTop, scrollHeight, clientHeight } = container;
    // Only act when the viewport is within 100px of the bottom.
    if (scrollTop + clientHeight < scrollHeight - 100) {
      return;
    }
    const nextIndex = currentStartIndex + postsPerPage;
    if (nextIndex < totalPosts) {
      renderGrid(nextIndex);
      container.scrollTop = 100; // Reset scroll for continuous experience
    }
  }, 50);
});
</script>
</body>
</html>

View File

@@ -0,0 +1,210 @@
<!DOCTYPE html>
<html>
<head>
<title>News Feed with Mixed Scroll Behavior</title>
<style>
body {
font-family: Georgia, serif;
margin: 0;
padding: 20px;
background-color: #f8f8f8;
}
h1 {
text-align: center;
color: #1a1a1a;
font-size: 32px;
margin-bottom: 10px;
}
.description {
text-align: center;
color: #666;
margin-bottom: 20px;
}
#newsContainer {
max-width: 900px;
margin: 0 auto;
height: 700px;
overflow-y: auto;
background: white;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
padding: 20px;
}
.article {
margin-bottom: 30px;
padding-bottom: 30px;
border-bottom: 1px solid #e0e0e0;
}
.article:last-child {
border-bottom: none;
}
.article-header {
margin-bottom: 15px;
}
.category {
display: inline-block;
background: #ff6b6b;
color: white;
padding: 4px 12px;
font-size: 12px;
text-transform: uppercase;
border-radius: 3px;
margin-bottom: 10px;
}
.headline {
font-size: 24px;
font-weight: bold;
color: #1a1a1a;
margin: 10px 0;
line-height: 1.3;
}
.meta {
color: #888;
font-size: 14px;
margin-bottom: 15px;
}
.content {
font-size: 16px;
line-height: 1.8;
color: #333;
}
.featured {
background: #fff9e6;
padding: 20px;
border-radius: 5px;
margin-bottom: 30px;
}
.featured .category {
background: #ffa500;
}
</style>
</head>
<body>
<h1>📰 Dynamic News Feed</h1>
<p class="description">Mixed behavior: Featured articles stay, regular articles use virtual scroll</p>
<div id="newsContainer"></div>
<script>
// Scroll host whose content is rebuilt on every render.
const container = document.getElementById('newsContainer');
const totalArticles = 100;
const articlesPerPage = 5;
// Index of the first regular article currently rendered.
let currentRegularIndex = 0;
// Categories for variety; regular articles cycle through them in order.
const categories = ['Politics', 'Technology', 'Business', 'Science', 'Sports', 'Entertainment'];
// 3 featured articles that are included in every render.
const featuredArticles = Array.from({ length: 3 }, (_, i) => ({
  id: `featured-${i}`,
  category: 'Featured',
  headline: `Breaking: Major Story ${i + 1} That Stays Visible`,
  date: new Date().toLocaleDateString(),
  content: `This is featured article ${i + 1}. Featured articles remain in the DOM and are not replaced during scrolling. They provide important persistent content.`,
}));
// Regular articles that get virtualized; each is dated one day earlier than the previous.
const regularArticles = Array.from({ length: totalArticles }, (_, i) => {
  const category = categories[i % categories.length];
  return {
    id: `article-${i}`,
    category,
    headline: `${category} News: Article ${i + 1} of ${totalArticles}`,
    date: new Date(Date.now() - i * 86400000).toLocaleDateString(),
    content: `This is regular article ${i + 1}. These articles are replaced as you scroll to maintain performance. Only a subset is shown at any time. Article ID: ${i}`,
  };
});
/**
 * Rebuild the feed: every featured article, a divider, then one page of
 * regular articles starting at `regularStartIndex`.
 *
 * The container's markup is replaced wholesale, so the previous page of
 * regular articles disappears while the featured ones are re-emitted.
 *
 * @param {number} regularStartIndex - Index of the first regular article to show.
 */
function renderArticles(regularStartIndex) {
  const stopAt = Math.min(regularStartIndex + articlesPerPage, totalArticles);

  // Featured articles are part of every render.
  const featuredMarkup = featuredArticles.map((item) => `
<div class="article featured" data-article-id="${item.id}">
<div class="article-header">
<span class="category">${item.category}</span>
<h2 class="headline">${item.headline}</h2>
<div class="meta">📅 ${item.date}</div>
</div>
<div class="content">${item.content}</div>
</div>
`);

  const dividerMarkup = '<div style="text-align: center; color: #999; margin: 20px 0;">— Latest News —</div>';

  // Current page of regular articles (the virtualized part).
  const regularMarkup = regularArticles.slice(regularStartIndex, stopAt).map((item) => `
<div class="article" data-article-id="${item.id}">
<div class="article-header">
<span class="category" style="background: ${getCategoryColor(item.category)}">${item.category}</span>
<h2 class="headline">${item.headline}</h2>
<div class="meta">📅 ${item.date}</div>
</div>
<div class="content">${item.content}</div>
</div>
`);

  container.innerHTML = [...featuredMarkup, dividerMarkup, ...regularMarkup].join('');
  currentRegularIndex = regularStartIndex;
}
/**
 * Map a news category to its badge colour.
 *
 * A Map is used instead of a plain-object lookup so that names inherited from
 * Object.prototype (for example "constructor" or "toString") fall through to
 * the default colour instead of returning a function.
 *
 * @param {string} category - Category name, e.g. 'Politics'.
 * @returns {string} Hex colour for the category, or '#95a5a6' when unknown.
 */
function getCategoryColor(category) {
  const colors = new Map([
    ['Politics', '#e74c3c'],
    ['Technology', '#3498db'],
    ['Business', '#2ecc71'],
    ['Science', '#9b59b6'],
    ['Sports', '#f39c12'],
    ['Entertainment', '#e91e63'],
  ]);
  return colors.get(category) || '#95a5a6';
}
// Initial render
renderArticles(0);
// Handle scroll: when the viewport is within 200px of the bottom, swap in the
// next page of regular articles and move the viewport back to where they start.
container.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = container;
  if (scrollTop + clientHeight < scrollHeight - 200) {
    return;
  }
  const nextIndex = currentRegularIndex + articlesPerPage;
  if (nextIndex >= totalArticles) {
    return;
  }
  renderArticles(nextIndex);
  // Look inside the container only, so unrelated page content can never match.
  const regularStart = container.querySelector('.article:not(.featured)');
  if (regularStart) {
    // offsetTop is measured from the element's offsetParent, which need not be
    // the scroll container. Derive the position inside the scrollable content
    // from viewport rectangles instead, then stop 100px above it.
    const offsetInContent =
      regularStart.getBoundingClientRect().top -
      container.getBoundingClientRect().top +
      container.scrollTop;
    container.scrollTop = offsetInContent - 100;
  }
});
</script>
</body>
</html>

View File

@@ -0,0 +1,122 @@
<!DOCTYPE html>
<html>
<head>
<title>Twitter-like Virtual Scroll</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 0;
padding: 20px;
background-color: #f0f2f5;
}
h1 {
color: #1da1f2;
text-align: center;
}
#timeline {
max-width: 600px;
margin: 0 auto;
height: 600px;
overflow-y: auto;
background: white;
border: 1px solid #e1e8ed;
border-radius: 10px;
}
.tweet {
padding: 15px;
border-bottom: 1px solid #e1e8ed;
min-height: 80px;
}
.tweet:hover {
background-color: #f7f9fa;
}
.author {
font-weight: bold;
color: #14171a;
margin-bottom: 5px;
}
.content {
color: #14171a;
line-height: 1.5;
}
.stats {
color: #657786;
font-size: 14px;
margin-top: 10px;
}
</style>
</head>
<body>
<h1>Virtual Scroll Demo - Twitter Style</h1>
<p style="text-align: center; color: #666;">This simulates Twitter's timeline where content is replaced as you scroll</p>
<div id="timeline"></div>
<script>
// Simulate Twitter-like virtual scrolling where DOM elements are replaced.
const timeline = document.getElementById('timeline');
const totalTweets = 500;
const tweetsPerPage = 10;
// Index of the first tweet currently rendered.
let currentIndex = 0;
// Fake tweet data: one record per tweet, with random like/retweet counts.
const allTweets = Array.from({ length: totalTweets }, (_, id) => ({
  id,
  author: `User_${id + 1}`,
  content: `This is tweet #${id + 1} of ${totalTweets}. Virtual scrolling replaces DOM elements to maintain performance. Unique content ID: ${id}`,
  likes: Math.floor(Math.random() * 1000),
  retweets: Math.floor(Math.random() * 500),
}));
/**
 * Render one page of tweets, REPLACING the timeline's previous content
 * (virtual-scroll behaviour).
 *
 * @param {number} startIndex - Index into `allTweets` of the first tweet to
 *   show; up to `tweetsPerPage` tweets are rendered, clamped to `totalTweets`.
 */
function renderTweets(startIndex) {
  const stopAt = Math.min(startIndex + tweetsPerPage, totalTweets);
  const pageMarkup = allTweets.slice(startIndex, stopAt).map((item) => `
<div class="tweet" data-tweet-id="${item.id}">
<div class="author">@${item.author}</div>
<div class="content">${item.content}</div>
<div class="stats">❤️ ${item.likes} | 🔁 ${item.retweets}</div>
</div>
`);
  // Replace the entire content in one assignment.
  timeline.innerHTML = pageMarkup.join('');
  currentIndex = startIndex;
}
// Show the first page.
renderTweets(0);
// When the viewport comes within 100px of the bottom, swap in the next page
// and nudge the scroll position back up so scrolling can continue.
timeline.addEventListener('scroll', () => {
  const { scrollTop, scrollHeight, clientHeight } = timeline;
  const nearBottom = scrollTop + clientHeight >= scrollHeight - 100;
  if (!nearBottom) {
    return;
  }
  const upcomingIndex = currentIndex + tweetsPerPage;
  if (upcomingIndex >= totalTweets) {
    return;
  }
  renderTweets(upcomingIndex);
  // Small scroll adjustment for continuous scrolling
  timeline.scrollTop = 50;
});
</script>
</body>
</html>

View File

@@ -0,0 +1,367 @@
"""
Example of using the virtual scroll feature to capture content from pages
with virtualized scrolling (like Twitter, Instagram, or other infinite scroll feeds).
This example demonstrates virtual scroll with a local test server serving
different types of scrolling behaviors from HTML files in the assets directory.
"""
import asyncio
import os
import http.server
import socketserver
import threading
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig
# Get the assets directory path
ASSETS_DIR = Path(__file__).parent / "assets"
class TestServer:
    """Simple HTTP server that serves the test HTML files from ASSETS_DIR.

    The server runs `serve_forever` on a daemon thread. Because
    SimpleHTTPRequestHandler serves the current working directory, `start()`
    changes the process cwd to ASSETS_DIR and `stop()` restores it.
    """

    def __init__(self, port=8080):
        """Remember the preferred port; nothing is bound until `start()`."""
        self.port = port
        self.httpd = None
        self.server_thread = None
        # cwd to restore in stop(); None while no directory change is active.
        self.original_cwd = None

    async def start(self):
        """Start the test server.

        Tries up to 10 consecutive ports beginning at `self.port`.

        Returns:
            The port the server is actually listening on.

        Raises:
            RuntimeError: if none of the 10 candidate ports could be bound.
        """
        Handler = http.server.SimpleHTTPRequestHandler
        # Save current directory and change to assets directory
        self.original_cwd = os.getcwd()
        os.chdir(ASSETS_DIR)
        # Try to find an available port
        for _ in range(10):
            try:
                self.httpd = socketserver.TCPServer(("", self.port), Handler)
                break
            except OSError:
                self.port += 1
        if self.httpd is None:
            # Callers only reach stop() after a successful start(), so undo
            # the directory change here instead of leaking it.
            os.chdir(self.original_cwd)
            self.original_cwd = None
            raise RuntimeError("Could not find available port")
        self.server_thread = threading.Thread(target=self.httpd.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()
        # Give server time to start
        await asyncio.sleep(0.5)
        print(f"Test server started on http://localhost:{self.port}")
        return self.port

    def stop(self):
        """Stop the test server, release its socket and restore the cwd.

        Safe to call more than once, and safe to call without `start()`.
        """
        if self.httpd:
            self.httpd.shutdown()
            # shutdown() only ends serve_forever(); the listening socket stays
            # bound until server_close(), which would block reuse of the port.
            self.httpd.server_close()
            self.httpd = None
        if self.server_thread is not None:
            self.server_thread.join(timeout=5)
            self.server_thread = None
        # Restore original directory
        if self.original_cwd is not None:
            os.chdir(self.original_cwd)
            self.original_cwd = None
async def example_twitter_like_virtual_scroll():
    """
    Example 1: Twitter-like virtual scroll where content is REPLACED.

    Crawls the local Twitter-style page with virtual scroll enabled on
    "#timeline" and reports how many distinct tweet IDs appear in the
    resulting HTML (the page defines 500 tweets, IDs 0 to 499).
    """
    import re

    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 1: Twitter-like Virtual Scroll")
    print(rule)
    server = TestServer()
    port = await server.start()
    try:
        run_config = CrawlerRunConfig(
            virtual_scroll_config=VirtualScrollConfig(
                container_selector="#timeline",  # The scrollable container
                scroll_count=50,  # Scroll up to 50 times to get all content
                scroll_by="container_height",  # Scroll by container's height
                wait_after_scroll=0.3,  # Wait 300ms after each scroll
            ),
            cache_mode=CacheMode.BYPASS,
        )
        # headless=False opens a visible browser so the scrolling can be watched.
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1280, "height": 800},
        )
        target_url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=target_url, config=run_config)
            # Count tweets captured
            matches = re.findall(r'data-tweet-id="(\d+)"', result.html)
            unique_tweets = sorted({int(tweet_id) for tweet_id in matches})
            captured = len(unique_tweets)
            print(f"\n📊 Results:")
            print(f" Total HTML length: {len(result.html):,} characters")
            print(f" Tweets captured: {captured} unique tweets")
            if unique_tweets:
                print(f" Tweet IDs range: {min(unique_tweets)} to {max(unique_tweets)}")
            print(f" Expected range: 0 to 499 (500 tweets total)")
            if captured == 500:
                print(f" ✅ SUCCESS! All tweets captured!")
            else:
                print(f" ⚠️ Captured {captured}/500 tweets")
    finally:
        server.stop()
async def example_traditional_append_scroll():
    """
    Example 2: Traditional infinite scroll where content is APPENDED.

    Crawls the local append-only page with virtual scroll configured on
    ".posts-container" and reports how many distinct post IDs appear in the
    resulting HTML.
    """
    import re

    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 2: Traditional Append-Only Scroll")
    print(rule)
    server = TestServer()
    port = await server.start()
    try:
        run_config = CrawlerRunConfig(
            virtual_scroll_config=VirtualScrollConfig(
                container_selector=".posts-container",
                scroll_count=15,  # Less scrolls needed since content accumulates
                scroll_by=500,  # Scroll by 500 pixels
                wait_after_scroll=0.4,
            ),
            cache_mode=CacheMode.BYPASS,
        )
        target_url = f"http://localhost:{port}/virtual_scroll_append_only.html"
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=target_url, config=run_config)
            # Count posts
            matches = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted({int(post_id) for post_id in matches})
            print(f"\n📊 Results:")
            print(f" Total HTML length: {len(result.html):,} characters")
            print(f" Posts captured: {len(unique_posts)} unique posts")
            if unique_posts:
                print(f" Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
            print(f" Note: This page appends content, so virtual scroll")
            print(f" just helps trigger more loads. All content stays in DOM.")
    finally:
        server.stop()
async def example_instagram_grid():
    """
    Example 3: Instagram-like grid with virtual scroll.

    Crawls the local grid page with virtual scroll on ".feed-container",
    reports the distinct post IDs found in the HTML, and writes the returned
    screenshot (if any) to instagram_grid_result.png in the current directory.
    """
    import base64
    import re

    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 3: Instagram Grid Virtual Scroll")
    print(rule)
    server = TestServer()
    port = await server.start()
    try:
        run_config = CrawlerRunConfig(
            virtual_scroll_config=VirtualScrollConfig(
                container_selector=".feed-container",  # Container with the grid
                scroll_count=100,  # Many scrolls for 999 posts
                scroll_by="container_height",
                wait_after_scroll=0.2,  # Faster scrolling for grid
            ),
            cache_mode=CacheMode.BYPASS,
            screenshot=True,  # Take a screenshot of the final grid
        )
        # Show browser for this visual example
        browser_config = BrowserConfig(
            headless=False,
            viewport={"width": 1200, "height": 900},
        )
        target_url = f"http://localhost:{port}/virtual_scroll_instagram_grid.html"
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(url=target_url, config=run_config)
            # Count posts in grid
            matches = re.findall(r'data-post-id="(\d+)"', result.html)
            unique_posts = sorted({int(post_id) for post_id in matches})
            print(f"\n📊 Results:")
            print(f" Posts in grid: {len(unique_posts)} unique posts")
            if unique_posts:
                print(f" Post IDs range: {min(unique_posts)} to {max(unique_posts)}")
            print(f" Expected: 0 to 998 (999 posts total)")
            # Save screenshot (the result carries it base64-encoded)
            if result.screenshot:
                image_bytes = base64.b64decode(result.screenshot)
                with open("instagram_grid_result.png", "wb") as f:
                    f.write(image_bytes)
                print(f" 📸 Screenshot saved as instagram_grid_result.png")
    finally:
        server.stop()
async def example_mixed_content():
    """
    Example 4: News feed with mixed behavior.

    Crawls the local news page with virtual scroll on "#newsContainer" and
    reports how many distinct featured and regular article IDs appear in the
    resulting HTML.
    """
    import re

    rule = "=" * 60
    print("\n" + rule)
    print("EXAMPLE 4: News Feed with Mixed Behavior")
    print(rule)
    server = TestServer()
    port = await server.start()
    try:
        run_config = CrawlerRunConfig(
            virtual_scroll_config=VirtualScrollConfig(
                container_selector="#newsContainer",
                scroll_count=25,
                scroll_by="container_height",
                wait_after_scroll=0.3,
            ),
            cache_mode=CacheMode.BYPASS,
        )
        target_url = f"http://localhost:{port}/virtual_scroll_news_feed.html"
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=target_url, config=run_config)
            # Count different types of articles
            featured = set(re.findall(r'data-article-id="featured-\d+"', result.html))
            regular = re.findall(r'data-article-id="article-(\d+)"', result.html)
            print(f"\n📊 Results:")
            print(f" Featured articles: {len(featured)} (always visible)")
            print(f" Regular articles: {len(set(regular))} unique articles")
            if regular:
                regular_ids = sorted({int(article_id) for article_id in regular})
                print(f" Regular article IDs: {min(regular_ids)} to {max(regular_ids)}")
            print(f" Note: Featured articles stay in DOM, only regular")
            print(f" articles are replaced during virtual scroll")
    finally:
        server.stop()
async def compare_with_without_virtual_scroll():
    """
    Comparison: Show the difference between crawling with and without virtual scroll.

    Crawls the local Twitter-style page twice, first with a plain run config
    and then with virtual scroll on "#timeline", and prints the number of
    distinct tweet IDs and the HTML size from each run.
    """
    import re

    tweet_id_pattern = r'data-tweet-id="(\d+)"'
    print("\n" + "="*60)
    print("COMPARISON: With vs Without Virtual Scroll")
    print("="*60)
    server = TestServer()
    port = await server.start()
    try:
        url = f"http://localhost:{port}/virtual_scroll_twitter_like.html"
        # First, crawl WITHOUT virtual scroll
        print("\n1⃣ Crawling WITHOUT virtual scroll...")
        async with AsyncWebCrawler() as crawler:
            config_normal = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            result_normal = await crawler.arun(url=url, config=config_normal)
        # Count items
        tweets_normal = len(set(re.findall(tweet_id_pattern, result_normal.html)))
        # Then, crawl WITH virtual scroll
        print("2⃣ Crawling WITH virtual scroll...")
        virtual_config = VirtualScrollConfig(
            container_selector="#timeline",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2
        )
        config_virtual = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS
        )
        async with AsyncWebCrawler() as crawler:
            result_virtual = await crawler.arun(url=url, config=config_virtual)
        # Count items
        tweets_virtual = len(set(re.findall(tweet_id_pattern, result_virtual.html)))
        # Compare results
        print(f"\n📊 Comparison Results:")
        print(f" Without virtual scroll: {tweets_normal} tweets (only initial visible)")
        print(f" With virtual scroll: {tweets_virtual} tweets (all content captured)")
        # Format the ratio only when it is a number: applying ':.1f' to the
        # 'N/A' fallback string raises ValueError when the baseline is 0.
        if tweets_normal > 0:
            print(f" Improvement: {tweets_virtual / tweets_normal:.1f}x more content!")
        else:
            print(f" Improvement: N/A (no tweets captured without virtual scroll)")
        print(f"\n HTML size without: {len(result_normal.html):,} characters")
        print(f" HTML size with: {len(result_virtual.html):,} characters")
    finally:
        server.stop()
if __name__ == "__main__":
    print("""
╔════════════════════════════════════════════════════════════╗
║ Virtual Scroll Examples for Crawl4AI ║
╚════════════════════════════════════════════════════════════╝
These examples demonstrate different virtual scroll scenarios:
1. Twitter-like (content replaced) - Classic virtual scroll
2. Traditional append - Content accumulates
3. Instagram grid - Visual grid layout
4. Mixed behavior - Some content stays, some virtualizes
Starting examples...
""")
    # Run every example in order; each gets its own event loop via asyncio.run.
    examples = (
        example_twitter_like_virtual_scroll,
        example_traditional_append_scroll,
        example_instagram_grid,
        example_mixed_content,
        compare_with_without_virtual_scroll,
    )
    for example in examples:
        asyncio.run(example())
    print("\n✅ All examples completed!")
    print("\nTIP: Set headless=False in BrowserConfig to watch the scrolling in action!")