crawl4ai/tests/test_virtual_scroll.py
UncleCode a353515271 feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior (see the usage sketch after this list)
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced (illustrated in the second sketch below)
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication
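
For orientation, here is a minimal usage sketch distilled from the test file below. The `VirtualScrollConfig`, `CrawlerRunConfig`, and `AsyncWebCrawler` calls mirror that test; the selector, URL, and tuning values are placeholders to adapt per site:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode

async def capture_feed():
    # Describe the virtualized container and how far to scroll it
    scroll_cfg = VirtualScrollConfig(
        container_selector="#feed",    # placeholder: CSS selector of the scrolling container
        scroll_count=50,               # how many scroll steps to perform
        scroll_by="container_height",  # advance one container height per step
        wait_after_scroll=0.3,         # seconds to let newly rendered items settle
    )
    run_cfg = CrawlerRunConfig(
        virtual_scroll_config=scroll_cfg,
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        # result.html holds the merged, deduplicated content from all captured chunks
        result = await crawler.arun(url="https://example.com/feed", config=run_cfg)
        return result.html
```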
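The three scrolling scenarios are easiest to see in a deliberately simplified sketch. The actual detection runs as JavaScript inside the page (per the notes above), so this Python version is conceptual only, not the shipped logic:

```python
def classify_scroll_effect(prev_html: str, new_html: str) -> str:
    """Conceptual only: classify what one scroll step did to the container."""
    if new_html == prev_html:
        return "no change"          # nothing new rendered yet
    if new_html.startswith(prev_html):
        return "content appended"   # old items kept, new ones added below
    return "content replaced"       # DOM was recycled; capture this chunk for merging
```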

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
2025-06-29 20:41:37 +08:00

"""
Test virtual scroll implementation according to the design:
- Create a page with virtual scroll that replaces content
- Verify all 1000 items are captured
"""
import asyncio
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig


async def test_virtual_scroll():
    """Test virtual scroll with content replacement (true virtual scroll)."""
    # Create test HTML with true virtual scroll that replaces content
    test_html = '''
    <html>
    <head>
        <style>
            #container {
                height: 500px;
                overflow-y: auto;
                border: 1px solid #ccc;
            }
            .item {
                height: 50px;
                padding: 10px;
                border-bottom: 1px solid #eee;
            }
        </style>
    </head>
    <body>
        <h1>Virtual Scroll Test - 1000 Items</h1>
        <div id="container"></div>
        <script>
            // True virtual scroll that REPLACES content
            const container = document.getElementById('container');
            const totalItems = 1000;
            const itemsPerPage = 10; // Only show 10 items at a time
            let currentStartIndex = 0;

            // All our data
            const allData = [];
            for (let i = 0; i < totalItems; i++) {
                allData.push({
                    id: i,
                    text: `Item ${i + 1} of ${totalItems} - Unique ID: ${i}`
                });
            }

            // Render the page of items starting at startIndex
            function renderPage(startIndex) {
                const items = [];
                const endIndex = Math.min(startIndex + itemsPerPage, totalItems);
                for (let i = startIndex; i < endIndex; i++) {
                    const item = allData[i];
                    items.push(`<div class="item" data-index="${item.id}">${item.text}</div>`);
                }
                // REPLACE container content (virtual scroll)
                container.innerHTML = items.join('');
                currentStartIndex = startIndex;
            }

            // Initial render
            renderPage(0);

            // Handle scroll
            container.addEventListener('scroll', () => {
                const scrollTop = container.scrollTop;
                const scrollHeight = container.scrollHeight;
                const clientHeight = container.clientHeight;
                // Deciding which page to show from the scroll position
                // creates the virtual scroll effect
                if (scrollTop + clientHeight >= scrollHeight - 50) {
                    // Load the next page
                    const nextIndex = currentStartIndex + itemsPerPage;
                    if (nextIndex < totalItems) {
                        renderPage(nextIndex);
                        // Reset scroll near the top so scrolling can continue
                        container.scrollTop = 10;
                    }
                }
            });
        </script>
    </body>
    </html>
    '''
    # Save test HTML to a file
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        test_file_path = f.name

    httpd = None
    old_cwd = os.getcwd()
    try:
        # Start a simple HTTP server serving the temp file's directory
        import http.server
        import socketserver
        import threading
        import random

        os.chdir(os.path.dirname(test_file_path))

        # Find an available port
        for _ in range(10):
            PORT = random.randint(8000, 9999)
            try:
                Handler = http.server.SimpleHTTPRequestHandler
                httpd = socketserver.TCPServer(("", PORT), Handler)
                break
            except OSError:
                continue
        if httpd is None:
            raise RuntimeError("Could not find an available port")

        server_thread = threading.Thread(target=httpd.serve_forever)
        server_thread.daemon = True
        server_thread.start()

        # Give the server time to start
        await asyncio.sleep(0.5)

        # Configure virtual scroll.
        # With 10 items per page and 1000 total we need 100 pages,
        # so do 120 scrolls to ensure we get everything.
        virtual_config = VirtualScrollConfig(
            container_selector="#container",
            scroll_count=120,
            scroll_by="container_height",  # Scroll by container height
            wait_after_scroll=0.1,         # Quick wait for test
        )
        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )
        browser_config = BrowserConfig(
            headless=False,
        )
        async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{PORT}/{os.path.basename(test_file_path)}",
                config=config,
            )

            # Count all items in the result
            import re
            items = re.findall(r'data-index="(\d+)"', result.html)
            unique_indices = sorted(set(int(idx) for idx in items))

            print(f"\n{'='*60}")
            print("TEST RESULTS:")
            print(f"HTML Length: {len(result.html)}")
            print(f"Total items found: {len(items)}")
            print(f"Unique items: {len(unique_indices)}")
            if unique_indices:
                print(f"Item indices: {min(unique_indices)} to {max(unique_indices)}")
                print("Expected: 0 to 999")

            # Check for gaps
            expected = set(range(1000))
            actual = set(unique_indices)
            missing = expected - actual
            if missing:
                print(f"\n❌ FAILED! Missing {len(missing)} items")
                print(f"Missing indices: {sorted(missing)[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print("\n✅ SUCCESS! All 1000 items captured!")

            # Show some sample items
            print("\nSample items from result:")
            sample_items = re.findall(r'<div class="item"[^>]*>([^<]+)</div>', result.html)[:5]
            for item in sample_items:
                print(f"  - {item}")
            print(f"{'='*60}\n")
    finally:
        # Clean up: stop the server, restore the working directory, remove the temp file
        if httpd:
            httpd.shutdown()
            httpd.server_close()  # release the listening socket
        os.chdir(old_cwd)
        os.unlink(test_file_path)


if __name__ == "__main__":
    asyncio.run(test_virtual_scroll())