- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves <slot> projections and strips only shadow-scoped <style> tags. Also injects an init script to force-open closed shadow roots via attachShadow patching. - Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting. - Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
78 lines · 2.6 KiB · Python
"""
|
|
Shadow DOM Crawling Example
|
|
============================
|
|
|
|
Demonstrates how to use `flatten_shadow_dom=True` to extract content
|
|
hidden inside Shadow DOM trees on sites built with Web Components
|
|
(Stencil, Lit, Shoelace, Angular Elements, etc.).
|
|
|
|
Shadow DOM creates encapsulated sub-trees that are invisible to the
|
|
normal page serialization (page.content() / outerHTML). The
|
|
`flatten_shadow_dom` option walks these trees and produces a single
|
|
flat HTML document that includes all shadow content.
|
|
|
|
This example crawls a Bosch Rexroth product page where the product
|
|
description, technical specs, and downloads are rendered entirely
|
|
inside Shadow DOM by Stencil.js web components.
|
|
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
|
|
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
|
|
|
|
|
def print_banner(title: str) -> None:
    """Print *title* framed by 60-char separator rules."""
    print("=" * 60)
    print(title)
    print("=" * 60)


def print_report(md: str) -> None:
    """Print length and content-presence checks for a markdown capture.

    The three marker strings ('mill type design', 'CDH1', 'Downloads')
    only appear in the page's shadow-DOM content, so they distinguish a
    flattened capture from a baseline one.
    """
    print(f"Markdown length: {len(md)} chars")
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()


async def main():
    """Crawl the same page twice — without and with `flatten_shadow_dom` —
    and compare what ends up in the extracted markdown."""
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    print_banner("Without flatten_shadow_dom (baseline)")

    config = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)

    md = result.markdown.raw_markdown if result.markdown else ""
    print_report(md)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    print_banner("With flatten_shadow_dom=True")

    config = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
        flatten_shadow_dom=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)

    md = result.markdown.raw_markdown if result.markdown else ""
    print_report(md)

    # Show the product content section from the flattened capture.
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])
# Script entry point: run the async example under asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())