Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc). Key features: - New VirtualScrollConfig class for configuring virtual scroll behavior - Automatic detection of three scrolling scenarios: no change, content appended, content replaced - Intelligent HTML chunk capture and merging with deduplication - 100% content capture from virtual scroll pages - Seamless integration with existing extraction strategies - JavaScript-based detection and capture for performance - Tree-based DOM merging with text-based deduplication Documentation: - Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md - API reference updates in parameters.md and page-interaction.md - Blog article explaining the solution and techniques - Complete examples with local test server Testing: - Full test suite achieving 100% capture of 1000 items - Examples for Twitter timeline, Instagram grid scenarios - Local test server with different scrolling behaviors This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
197 lines
6.8 KiB
Python
197 lines
6.8 KiB
Python
"""
Test virtual scroll implementation according to the design:
- Create a page with virtual scroll that replaces content
- Verify all 1000 items are captured
"""
|
|
|
|
import asyncio
import functools
import http.server
import os
import re
import tempfile
import threading

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, VirtualScrollConfig, CacheMode, BrowserConfig
|
|
|
|
async def test_virtual_scroll():
    """Test virtual scroll with content replacement (true virtual scroll).

    Writes a page whose scroll handler REPLACES the container's children
    (10 of 1000 items at a time) into a private temporary directory, serves
    that directory from a loopback-only HTTP server on an OS-assigned port,
    crawls it with a ``VirtualScrollConfig`` and checks that every
    ``data-index`` from 0 to 999 appears in the captured HTML.

    Raises:
        AssertionError: If one or more of the 1000 items is missing from the
            captured HTML (including the case where nothing was captured).
    """
    total_items = 1000  # must stay in sync with ``totalItems`` in the page script
    page_name = "virtual_scroll.html"

    # Create test HTML with true virtual scroll that replaces content
    test_html = '''
    <html>
    <head>
        <style>
            #container {
                height: 500px;
                overflow-y: auto;
                border: 1px solid #ccc;
            }
            .item {
                height: 50px;
                padding: 10px;
                border-bottom: 1px solid #eee;
            }
        </style>
    </head>
    <body>
        <h1>Virtual Scroll Test - 1000 Items</h1>
        <div id="container"></div>
        <script>
            // True virtual scroll that REPLACES content
            const container = document.getElementById('container');
            const totalItems = 1000;
            const itemsPerPage = 10; // Only show 10 items at a time
            let currentStartIndex = 0;

            // All our data
            const allData = [];
            for (let i = 0; i < totalItems; i++) {
                allData.push({
                    id: i,
                    text: `Item ${i + 1} of ${totalItems} - Unique ID: ${i}`
                });
            }

            // Function to render current page
            function renderPage(startIndex) {
                const items = [];
                const endIndex = Math.min(startIndex + itemsPerPage, totalItems);

                for (let i = startIndex; i < endIndex; i++) {
                    const item = allData[i];
                    items.push(`<div class="item" data-index="${item.id}">${item.text}</div>`);
                }

                // REPLACE container content (virtual scroll)
                container.innerHTML = items.join('');
                currentStartIndex = startIndex;
            }

            // Initial render
            renderPage(0);

            // Handle scroll
            container.addEventListener('scroll', () => {
                const scrollTop = container.scrollTop;
                const scrollHeight = container.scrollHeight;
                const clientHeight = container.clientHeight;

                // Calculate which page we should show based on scroll position
                // This creates a virtual scroll effect
                if (scrollTop + clientHeight >= scrollHeight - 50) {
                    // Load next page
                    const nextIndex = currentStartIndex + itemsPerPage;
                    if (nextIndex < totalItems) {
                        renderPage(nextIndex);
                        // Reset scroll to top to continue scrolling
                        container.scrollTop = 10;
                    }
                }
            });
        </script>
    </body>
    </html>
    '''

    # A dedicated directory means the server exposes only the test page
    # (not the whole system temp dir) and is removed automatically on exit.
    with tempfile.TemporaryDirectory() as serve_dir:
        with open(os.path.join(serve_dir, page_name), "w", encoding="utf-8") as f:
            f.write(test_html)

        # ``directory=`` points the handler at serve_dir without a
        # process-wide os.chdir(); port 0 lets the OS pick a free port, so no
        # random-port retry loop is needed. Bind to loopback only.
        handler = functools.partial(
            http.server.SimpleHTTPRequestHandler, directory=serve_dir
        )
        httpd = http.server.ThreadingHTTPServer(("127.0.0.1", 0), handler)
        port = httpd.server_address[1]

        # The socket is already listening once the server object exists, so
        # requests queue even before serve_forever() starts; no sleep needed.
        server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
        server_thread.start()

        try:
            # Configure virtual scroll
            # With 10 items per page and 1000 total, we need 100 pages
            # Let's do 120 scrolls to ensure we get everything
            virtual_config = VirtualScrollConfig(
                container_selector="#container",
                scroll_count=120,
                scroll_by="container_height",  # Scroll by container height
                wait_after_scroll=0.1,  # Quick wait for test
            )

            run_config = CrawlerRunConfig(
                virtual_scroll_config=virtual_config,
                cache_mode=CacheMode.BYPASS,
                verbose=True,
            )

            # NOTE(review): headed mode needs a display; confirm whether this
            # test is expected to run on headless CI machines.
            browser_config = BrowserConfig(headless=False)

            async with AsyncWebCrawler(verbose=True, config=browser_config) as crawler:
                result = await crawler.arun(
                    url=f"http://127.0.0.1:{port}/{page_name}",
                    config=run_config,
                )
        finally:
            # shutdown() stops serve_forever(); server_close() releases the
            # listening socket, which shutdown() alone does not do.
            httpd.shutdown()
            httpd.server_close()
            server_thread.join()

    # Count all items in the result (treat a missing HTML payload as empty
    # so the report below still prints and the assertion explains the failure)
    html = result.html or ""
    items = re.findall(r'data-index="(\d+)"', html)
    unique_indices = sorted({int(idx) for idx in items})

    # Check for gaps -- computed unconditionally so that capturing nothing
    # at all is reported as a failure rather than silently skipped.
    missing = sorted(set(range(total_items)) - set(unique_indices))

    print(f"\n{'='*60}")
    print("TEST RESULTS:")
    print(f"HTML Length: {len(html)}")
    print(f"Total items found: {len(items)}")
    print(f"Unique items: {len(unique_indices)}")

    if unique_indices:
        print(f"Item indices: {unique_indices[0]} to {unique_indices[-1]}")
        print("Expected: 0 to 999")

    if missing:
        print(f"\n❌ FAILED! Missing {len(missing)} items")
        print(f"Missing indices: {missing[:10]}{'...' if len(missing) > 10 else ''}")
    else:
        print("\n✅ SUCCESS! All 1000 items captured!")

    # Show some sample items
    print("\nSample items from result:")
    sample_items = re.findall(r'<div class="item"[^>]*>([^<]+)</div>', html)[:5]
    for item in sample_items:
        print(f" - {item}")

    print(f"{'='*60}\n")

    # Make the test actually fail instead of only printing the verdict.
    assert not missing, (
        f"Virtual scroll capture incomplete: missing {len(missing)} of "
        f"{total_items} items (first missing indices: {missing[:10]})"
    )
|
|
|
|
if __name__ == "__main__":
    # Allow running this test directly as a script, outside a test runner.
    asyncio.run(test_virtual_scroll())