Add comprehensive virtual scroll handling to capture all content from pages that use DOM recycling techniques (Twitter, Instagram, etc).

Key features:
- New `VirtualScrollConfig` class for configuring virtual scroll behavior
- Automatic detection of three scrolling scenarios: no change, content appended, content replaced
- Intelligent HTML chunk capture and merging with deduplication
- 100% content capture from virtual scroll pages
- Seamless integration with existing extraction strategies
- JavaScript-based detection and capture for performance
- Tree-based DOM merging with text-based deduplication

Documentation:
- Comprehensive guide at docs/md_v2/advanced/virtual-scroll.md
- API reference updates in parameters.md and page-interaction.md
- Blog article explaining the solution and techniques
- Complete examples with local test server

Testing:
- Full test suite achieving 100% capture of 1000 items
- Examples for Twitter timeline and Instagram grid scenarios
- Local test server with different scrolling behaviors

This enables scraping of modern websites that were previously impossible to fully capture with traditional scrolling techniques.
190 lines · 4.9 KiB · Python
# __init__.py
"""Top-level public API for the crawl4ai package.

Everything re-exported here (and listed in ``__all__``) is the supported
import surface: the async crawler, configuration objects, scraping /
extraction / chunking strategies, content filters, deep-crawl tooling,
dispatchers, the Docker client, and the C4A script helpers.
"""

import warnings

from .async_webcrawler import AsyncWebCrawler, CacheMode

# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import (
    BrowserConfig,
    CrawlerRunConfig,
    HTTPCrawlerConfig,
    LLMConfig,
    ProxyConfig,
    GeolocationConfig,
    SeedingConfig,
    VirtualScrollConfig,
)

from .content_scraping_strategy import (
    ContentScrapingStrategy,
    WebScrapingStrategy,
    LXMLWebScrapingStrategy,
)
from .async_logger import (
    AsyncLoggerBase,
    AsyncLogger,
)
from .proxy_strategy import (
    ProxyRotationStrategy,
    RoundRobinProxyStrategy,
)
from .extraction_strategy import (
    ExtractionStrategy,
    LLMExtractionStrategy,
    CosineStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    JsonLxmlExtractionStrategy,
    RegexExtractionStrategy,
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import (
    PruningContentFilter,
    BM25ContentFilter,
    LLMContentFilter,
    RelevantContentFilter,
)
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
from .components.crawler_monitor import CrawlerMonitor
from .link_preview import LinkPreview
from .async_dispatcher import (
    MemoryAdaptiveDispatcher,
    SemaphoreDispatcher,
    RateLimiter,
    BaseDispatcher,
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .browser_profiler import BrowserProfiler
from .deep_crawling import (
    DeepCrawlStrategy,
    BFSDeepCrawlStrategy,
    FilterChain,
    URLPatternFilter,
    DomainFilter,
    ContentTypeFilter,
    URLFilter,
    FilterStats,
    SEOFilter,
    KeywordRelevanceScorer,
    URLScorer,
    CompositeScorer,
    DomainAuthorityScorer,
    FreshnessScorer,
    PathDepthScorer,
    BestFirstCrawlingStrategy,
    DFSDeepCrawlStrategy,
    DeepCrawlDecorator,
)

# NEW: Import AsyncUrlSeeder
from .async_url_seeder import AsyncUrlSeeder

# C4A Script Language Support.
# `compile`/`validate` are aliased so they do not shadow the builtins.
from .script import (
    compile as c4a_compile,
    validate as c4a_validate,
    compile_file as c4a_compile_file,
    CompilationResult,
    ValidationResult,
    ErrorDetail,
)

from .utils import (
    start_colab_display_server,
    setup_colab_environment,
)

# Explicit public API; `from crawl4ai import *` exposes exactly these names.
__all__ = [
    "AsyncLoggerBase",
    "AsyncLogger",
    "AsyncWebCrawler",
    "BrowserProfiler",
    "LLMConfig",
    "GeolocationConfig",
    # NEW: Add SeedingConfig and VirtualScrollConfig
    "SeedingConfig",
    "VirtualScrollConfig",
    # NEW: Add AsyncUrlSeeder
    "AsyncUrlSeeder",
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
    "BestFirstCrawlingStrategy",
    "DFSDeepCrawlStrategy",
    "FilterChain",
    "URLPatternFilter",
    "ContentTypeFilter",
    "DomainFilter",
    "FilterStats",
    "URLFilter",
    "SEOFilter",
    "KeywordRelevanceScorer",
    "URLScorer",
    "CompositeScorer",
    "DomainAuthorityScorer",
    "FreshnessScorer",
    "PathDepthScorer",
    "DeepCrawlDecorator",
    "CrawlResult",
    "CrawlerHub",
    "CacheMode",
    "ContentScrapingStrategy",
    "WebScrapingStrategy",
    "LXMLWebScrapingStrategy",
    "BrowserConfig",
    "CrawlerRunConfig",
    "HTTPCrawlerConfig",
    "ExtractionStrategy",
    "LLMExtractionStrategy",
    "CosineStrategy",
    "JsonCssExtractionStrategy",
    "JsonXPathExtractionStrategy",
    "JsonLxmlExtractionStrategy",
    "RegexExtractionStrategy",
    "ChunkingStrategy",
    "RegexChunking",
    "DefaultMarkdownGenerator",
    "RelevantContentFilter",
    "PruningContentFilter",
    "BM25ContentFilter",
    "LLMContentFilter",
    "BaseDispatcher",
    "MemoryAdaptiveDispatcher",
    "SemaphoreDispatcher",
    "RateLimiter",
    "CrawlerMonitor",
    "LinkPreview",
    "DisplayMode",
    "MarkdownGenerationResult",
    "Crawl4aiDockerClient",
    "ProxyRotationStrategy",
    "RoundRobinProxyStrategy",
    "ProxyConfig",
    "start_colab_display_server",
    "setup_colab_environment",
    # C4A Script additions
    "c4a_compile",
    "c4a_validate",
    "c4a_compile_file",
    "CompilationResult",
    "ValidationResult",
    "ErrorDetail",
]


# Legacy optional synchronous (selenium-based) WebCrawler, kept disabled:
# def is_sync_version_installed():
#     try:
#         import selenium  # noqa

#         return True
#     except ImportError:
#         return False


# if is_sync_version_installed():
#     try:
#         from .web_crawler import WebCrawler

#         __all__.append("WebCrawler")
#     except ImportError:
#         print(
#             "Warning: Failed to import WebCrawler even though selenium is installed. This might be due to other missing dependencies."
#         )
# else:
#     WebCrawler = None
#     # import warnings
#     # print("Warning: Synchronous WebCrawler is not available. Install crawl4ai[sync] for synchronous support. However, please note that the synchronous version will be deprecated soon.")


# Disable all Pydantic warnings: they are noisy for end users of this
# package and not actionable from here.
warnings.filterwarnings("ignore", module="pydantic")
# pydantic_warnings.filter_warnings()