feat: Add virtual scroll support for modern web scraping
Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc.).

Key features:
- New VirtualScrollConfig class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline, Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
This commit is contained in:
@@ -2,8 +2,8 @@
|
||||
import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -92,8 +92,9 @@ __all__ = [
|
||||
"BrowserProfiler",
|
||||
"LLMConfig",
|
||||
"GeolocationConfig",
|
||||
# NEW: Add SeedingConfig
|
||||
# NEW: Add SeedingConfig and VirtualScrollConfig
|
||||
"SeedingConfig",
|
||||
"VirtualScrollConfig",
|
||||
# NEW: Add AsyncUrlSeeder
|
||||
"AsyncUrlSeeder",
|
||||
"DeepCrawlStrategy",
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import os
|
||||
from typing import Union
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER,
|
||||
DEFAULT_PROVIDER_API_KEY,
|
||||
@@ -594,6 +595,51 @@ class BrowserConfig:
|
||||
return config
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
|
||||
class VirtualScrollConfig:
    """Settings that control virtual-scroll capture.

    Pages built on DOM virtualization (Twitter/X timelines, Instagram
    grids, ...) recycle elements as the user scrolls, so a single HTML
    snapshot misses most of the content. This config tells the crawler
    how to scroll such a container so every rendered item can be
    collected.
    """

    # Field names in serialization order; shared by to_dict/from_dict.
    _FIELDS = ("container_selector", "scroll_count", "scroll_by", "wait_after_scroll")

    def __init__(
        self,
        container_selector: str,
        scroll_count: int = 10,
        scroll_by: Union[str, int] = "container_height",
        wait_after_scroll: float = 0.5,
    ):
        """
        Args:
            container_selector: CSS selector locating the scrollable container.
            scroll_count: Upper bound on the number of scroll steps.
            scroll_by: Distance per step — "container_height" (the
                container's own height), "page_height" (the viewport
                height), or a fixed pixel amount as an int.
            wait_after_scroll: Delay in seconds after each step so newly
                loaded content can render.
        """
        self.container_selector = container_selector
        self.scroll_count = scroll_count
        self.scroll_by = scroll_by
        self.wait_after_scroll = wait_after_scroll

    def to_dict(self) -> dict:
        """Serialize this configuration to a plain dictionary."""
        return {name: getattr(self, name) for name in self._FIELDS}

    @classmethod
    def from_dict(cls, data: dict) -> "VirtualScrollConfig":
        """Build a config from a dictionary produced by :meth:`to_dict`."""
        return cls(**data)
|
||||
|
||||
class LinkPreviewConfig:
|
||||
"""Configuration for link head extraction and scoring."""
|
||||
|
||||
@@ -911,6 +957,12 @@ class CrawlerRunConfig():
|
||||
table_score_threshold (int): Minimum score threshold for processing a table.
|
||||
Default: 7.
|
||||
|
||||
# Virtual Scroll Parameters
|
||||
virtual_scroll_config (VirtualScrollConfig or dict or None): Configuration for handling virtual scroll containers.
|
||||
Used for capturing content from pages with virtualized
|
||||
scrolling (e.g., Twitter, Instagram feeds).
|
||||
Default: None.
|
||||
|
||||
# Link and Domain Handling Parameters
|
||||
exclude_social_media_domains (list of str): List of domains to exclude for social media links.
|
||||
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
||||
@@ -1056,6 +1108,8 @@ class CrawlerRunConfig():
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
# Link Extraction Parameters
|
||||
link_preview_config: Union[LinkPreviewConfig, Dict[str, Any]] = None,
|
||||
# Virtual Scroll Parameters
|
||||
virtual_scroll_config: Union[VirtualScrollConfig, Dict[str, Any]] = None,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
@@ -1197,6 +1251,17 @@ class CrawlerRunConfig():
|
||||
else:
|
||||
raise ValueError("link_preview_config must be LinkPreviewConfig object or dict")
|
||||
|
||||
# Virtual Scroll Parameters
|
||||
if virtual_scroll_config is None:
|
||||
self.virtual_scroll_config = None
|
||||
elif isinstance(virtual_scroll_config, VirtualScrollConfig):
|
||||
self.virtual_scroll_config = virtual_scroll_config
|
||||
elif isinstance(virtual_scroll_config, dict):
|
||||
# Convert dict to config object for backward compatibility
|
||||
self.virtual_scroll_config = VirtualScrollConfig.from_dict(virtual_scroll_config)
|
||||
else:
|
||||
raise ValueError("virtual_scroll_config must be VirtualScrollConfig object or dict")
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
|
||||
@@ -898,6 +898,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.scan_full_page:
|
||||
await self._handle_full_page_scan(page, config.scroll_delay)
|
||||
|
||||
# Handle virtual scroll if configured
|
||||
if config.virtual_scroll_config:
|
||||
await self._handle_virtual_scroll(page, config.virtual_scroll_config)
|
||||
|
||||
# Execute JavaScript if provided
|
||||
# if config.js_code:
|
||||
# if isinstance(config.js_code, str):
|
||||
@@ -1149,6 +1153,177 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
||||
await self.safe_scroll(page, 0, total_height)
|
||||
|
||||
async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"):
    """
    Handle virtual scroll containers (e.g., Twitter-like feeds) by capturing
    content at different scroll positions and merging unique elements.

    Following the design:
    1. Get container HTML
    2. Scroll by container height
    3. Wait and check if container HTML changed
    4. Three cases:
       - No change: continue scrolling
       - New items added (appended): continue (items already in page)
       - Items replaced: capture HTML chunk and add to list
    5. After N scrolls, merge chunks if any were captured

    Args:
        page: The Playwright page object
        config: Virtual scroll configuration (a VirtualScrollConfig
            instance or an equivalent dict)

    Note:
        Failures are logged but never propagated — the crawl continues
        with whatever content is currently in the page.
    """
    try:
        # Import VirtualScrollConfig to avoid circular import
        from .async_configs import VirtualScrollConfig

        # Ensure config is a VirtualScrollConfig instance
        if isinstance(config, dict):
            config = VirtualScrollConfig.from_dict(config)

        self.logger.info(
            message="Starting virtual scroll capture for container: {selector}",
            tag="VSCROLL",
            params={"selector": config.container_selector}
        )

        # JavaScript function to handle virtual scroll capture.
        # Runs entirely in the page for performance; returns a summary dict.
        virtual_scroll_js = """
        async (config) => {
            const container = document.querySelector(config.container_selector);
            if (!container) {
                throw new Error(`Container not found: ${config.container_selector}`);
            }

            // List to store HTML chunks when content is replaced
            const htmlChunks = [];
            let previousHTML = container.innerHTML;
            let scrollCount = 0;

            // Determine scroll amount
            let scrollAmount;
            if (typeof config.scroll_by === 'number') {
                scrollAmount = config.scroll_by;
            } else if (config.scroll_by === 'page_height') {
                scrollAmount = window.innerHeight;
            } else { // container_height
                scrollAmount = container.offsetHeight;
            }

            // Perform scrolling
            while (scrollCount < config.scroll_count) {
                // Scroll the container
                container.scrollTop += scrollAmount;

                // Wait for content to potentially load
                await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000));

                // Get current HTML
                const currentHTML = container.innerHTML;

                // Determine what changed
                if (currentHTML === previousHTML) {
                    // Case 0: No change - continue scrolling
                    console.log(`Scroll ${scrollCount + 1}: No change in content`);
                } else if (currentHTML.startsWith(previousHTML)) {
                    // Case 1: New items appended - content already in page
                    console.log(`Scroll ${scrollCount + 1}: New items appended`);
                } else {
                    // Case 2: Items replaced - capture the previous HTML
                    console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`);
                    htmlChunks.push(previousHTML);
                }

                // Update previous HTML for next iteration
                previousHTML = currentHTML;
                scrollCount++;

                // Check if we've reached the end
                if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) {
                    console.log(`Reached end of scrollable content at scroll ${scrollCount}`);
                    // Capture final chunk if content was replaced
                    if (htmlChunks.length > 0) {
                        htmlChunks.push(currentHTML);
                    }
                    break;
                }
            }

            // BUGFIX: if the loop ended by exhausting scroll_count (no
            // "reached end" break), the content currently in the container
            // (tracked in previousHTML) was never captured, silently
            // dropping the final screen of items. Push it now; the guard
            // avoids double-pushing after the break path above.
            if (htmlChunks.length > 0 && htmlChunks[htmlChunks.length - 1] !== previousHTML) {
                htmlChunks.push(previousHTML);
            }

            // If we have chunks (case 2 occurred), merge them
            if (htmlChunks.length > 0) {
                console.log(`Merging ${htmlChunks.length} HTML chunks`);

                // Parse all chunks to extract unique elements
                const tempDiv = document.createElement('div');
                const seenTexts = new Set();
                const uniqueElements = [];

                // Process each chunk
                for (const chunk of htmlChunks) {
                    tempDiv.innerHTML = chunk;
                    const elements = tempDiv.children;

                    for (let i = 0; i < elements.length; i++) {
                        const element = elements[i];
                        // Normalize text for deduplication
                        const normalizedText = element.innerText
                            .toLowerCase()
                            .replace(/[\\s\\W]/g, ''); // Remove spaces and symbols

                        if (!seenTexts.has(normalizedText)) {
                            seenTexts.add(normalizedText);
                            uniqueElements.push(element.outerHTML);
                        }
                    }
                }

                // Replace container content with merged unique elements
                container.innerHTML = uniqueElements.join('\\n');
                console.log(`Merged ${uniqueElements.length} unique elements from ${htmlChunks.length} chunks`);

                return {
                    success: true,
                    chunksCount: htmlChunks.length,
                    uniqueCount: uniqueElements.length,
                    replaced: true
                };
            } else {
                console.log('No content replacement detected, all content remains in page');
                return {
                    success: true,
                    chunksCount: 0,
                    uniqueCount: 0,
                    replaced: false
                };
            }
        }
        """

        # Execute virtual scroll capture
        result = await page.evaluate(virtual_scroll_js, config.to_dict())

        if result.get("replaced", False):
            self.logger.success(
                message="Virtual scroll completed. Merged {unique} unique elements from {chunks} chunks",
                tag="VSCROLL",
                params={
                    "unique": result.get("uniqueCount", 0),
                    "chunks": result.get("chunksCount", 0)
                }
            )
        else:
            self.logger.info(
                message="Virtual scroll completed. Content was appended, no merging needed",
                tag="VSCROLL"
            )

    except Exception as e:
        self.logger.error(
            message="Virtual scroll capture failed: {error}",
            tag="VSCROLL",
            params={"error": str(e)}
        )
        # Continue with normal flow even if virtual scroll fails
|
||||
|
||||
async def _handle_download(self, download):
|
||||
"""
|
||||
Handle file downloads.
|
||||
|
||||
Reference in New Issue
Block a user