feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages
that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content
  appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to
fully capture with traditional scrolling techniques.
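
For quick orientation, a minimal usage sketch (a sketch only: the selector,
URL, and scroll numbers are placeholders; the parameter names mirror those
exercised in tests/test_virtual_scroll.py below):

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig

    async def main():
        # Placeholder selector and URL; tune scroll_count to the feed's length
        virtual_config = VirtualScrollConfig(
            container_selector="#feed",
            scroll_count=50,
            scroll_by="container_height",
            wait_after_scroll=0.2,
        )
        config = CrawlerRunConfig(virtual_scroll_config=virtual_config)
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/feed", config=config)
            # result.html holds the merged, deduplicated content
            print(len(result.html))

    asyncio.run(main())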
tests/test_virtual_scroll.py (new file, 197 lines)
@@ -0,0 +1,197 @@
"""
Test virtual scroll implementation according to the design:
- Create a page with virtual scroll that replaces content
- Verify all 1000 items are captured
"""

import asyncio
import os

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig

async def test_virtual_scroll():
    """Test virtual scroll with content replacement (true virtual scroll)."""

    # Create test HTML with a true virtual scroll that replaces content
    test_html = '''
    <html>
    <head>
        <style>
            #container {
                height: 500px;
                overflow-y: auto;
                border: 1px solid #ccc;
            }
            .item {
                height: 50px;
                padding: 10px;
                border-bottom: 1px solid #eee;
            }
        </style>
    </head>
    <body>
        <h1>Virtual Scroll Test - 1000 Items</h1>
        <div id="container"></div>
        <script>
            // True virtual scroll that REPLACES content
            const container = document.getElementById('container');
            const totalItems = 1000;
            const itemsPerPage = 10;  // Only show 10 items at a time
            let currentStartIndex = 0;

            // All our data
            const allData = [];
            for (let i = 0; i < totalItems; i++) {
                allData.push({
                    id: i,
                    text: `Item ${i + 1} of ${totalItems} - Unique ID: ${i}`
                });
            }

            // Function to render the current page
            function renderPage(startIndex) {
                const items = [];
                const endIndex = Math.min(startIndex + itemsPerPage, totalItems);

                for (let i = startIndex; i < endIndex; i++) {
                    const item = allData[i];
                    items.push(`<div class="item" data-index="${item.id}">${item.text}</div>`);
                }

                // REPLACE container content (virtual scroll)
                container.innerHTML = items.join('');
                currentStartIndex = startIndex;
            }

            // Initial render
            renderPage(0);

            // Handle scroll
            container.addEventListener('scroll', () => {
                const scrollTop = container.scrollTop;
                const scrollHeight = container.scrollHeight;
                const clientHeight = container.clientHeight;

                // Calculate which page to show based on scroll position;
                // this creates the virtual scroll effect
                if (scrollTop + clientHeight >= scrollHeight - 50) {
                    // Load the next page
                    const nextIndex = currentStartIndex + itemsPerPage;
                    if (nextIndex < totalItems) {
                        renderPage(nextIndex);
                        // Reset scroll near the top so scrolling can continue
                        container.scrollTop = 10;
                    }
                }
            });
        </script>
    </body>
    </html>
    '''
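
    # The page above implements the "content replaced" scenario - one of the
    # three cases the virtual scroll handler distinguishes (no change, content
    # appended, content replaced). Each render wipes the container, so the
    # crawler has to capture and merge HTML chunks while scrolling to see all
    # 1000 items.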

    # Save test HTML to a file
    import tempfile

    with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
        f.write(test_html)
        test_file_path = f.name

    httpd = None
    old_cwd = os.getcwd()

    try:
        # Start a simple HTTP server serving the temp file's directory
        import http.server
        import socketserver
        import threading
        import random

        os.chdir(os.path.dirname(test_file_path))

        # Find an available port
        for _ in range(10):
            port = random.randint(8000, 9999)
            try:
                handler = http.server.SimpleHTTPRequestHandler
                httpd = socketserver.TCPServer(("", port), handler)
                break
            except OSError:
                continue

        if httpd is None:
            raise RuntimeError("Could not find an available port")

        server_thread = threading.Thread(target=httpd.serve_forever)
        server_thread.daemon = True
        server_thread.start()

        # Give the server time to start
        await asyncio.sleep(0.5)

        # Configure virtual scroll.
        # With 10 items per page and 1000 total we need 100 page swaps,
        # so 120 scrolls gives a comfortable margin.
        virtual_config = VirtualScrollConfig(
            container_selector="#container",
            scroll_count=120,
            scroll_by="container_height",  # scroll by one container height per step
            wait_after_scroll=0.1,  # quick wait is enough for this local test
        )

        config = CrawlerRunConfig(
            virtual_scroll_config=virtual_config,
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        browser_config = BrowserConfig(headless=False)

        async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
            result = await crawler.arun(
                url=f"http://localhost:{port}/{os.path.basename(test_file_path)}",
                config=config,
            )
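
            # result.html is the merged output: HTML chunks captured during
            # scrolling are merged with text-based deduplication, so items
            # from replaced pages each appear once.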

            # Count all items in the result
            import re

            items = re.findall(r'data-index="(\d+)"', result.html)
            unique_indices = sorted(set(int(idx) for idx in items))

            print(f"\n{'=' * 60}")
            print("TEST RESULTS:")
            print(f"HTML Length: {len(result.html)}")
            print(f"Total items found: {len(items)}")
            print(f"Unique items: {len(unique_indices)}")

            if unique_indices:
                print(f"Item indices: {min(unique_indices)} to {max(unique_indices)}")
                print("Expected: 0 to 999")

            # Check for gaps
            expected = set(range(1000))
            actual = set(unique_indices)
            missing = expected - actual

            if missing:
                print(f"\n❌ FAILED! Missing {len(missing)} items")
                print(f"Missing indices: {sorted(missing)[:10]}{'...' if len(missing) > 10 else ''}")
            else:
                print("\n✅ SUCCESS! All 1000 items captured!")

            # Show some sample items
            print("\nSample items from result:")
            sample_items = re.findall(r'<div class="item"[^>]*>([^<]+)</div>', result.html)[:5]
            for item in sample_items:
                print(f"  - {item}")

            print(f"{'=' * 60}\n")

    finally:
        # Clean up
        if httpd:
            httpd.shutdown()
        os.chdir(old_cwd)
        os.unlink(test_file_path)


if __name__ == "__main__":
    asyncio.run(test_virtual_scroll())