Files
crawl4ai/docs/examples/shadow_dom_crawling.py
unclecode 8576331d4e Add Shadow DOM flattening and reorder js_code execution pipeline
- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes
  shadow DOM content into the light DOM before HTML capture. Uses a
  recursive serializer that resolves <slot> projections and strips
  only shadow-scoped <style> tags. Also injects an init script to
  force-open closed shadow roots via attachShadow patching.

- Move `js_code` execution to after `wait_for` + `delay_before_return_html`
  so user scripts run on the fully-hydrated page. Add `js_code_before_wait`
  for the less common case of triggering loading before waiting.

- Add JS snippet (flatten_shadow_dom.js), integration test, example,
  and documentation across all relevant doc files.
2026-02-18 06:43:00 +00:00

78 lines
2.6 KiB
Python

"""
Shadow DOM Crawling Example
============================
Demonstrates how to use `flatten_shadow_dom=True` to extract content
hidden inside Shadow DOM trees on sites built with Web Components
(Stencil, Lit, Shoelace, Angular Elements, etc.).
Shadow DOM creates encapsulated sub-trees that are invisible to the
normal page serialization (page.content() / outerHTML). The
`flatten_shadow_dom` option walks these trees and produces a single
flat HTML document that includes all shadow content.
This example crawls a Bosch Rexroth product page where the product
description, technical specs, and downloads are rendered entirely
inside Shadow DOM by Stencil.js web components.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
async def main():
    """Crawl the same Web Components page twice — without and with
    `flatten_shadow_dom` — and print presence checks for content that
    lives inside Shadow DOM, so the difference is visible side by side.

    The crawl/report steps were previously duplicated verbatim for the
    two runs; they are factored into local helpers so the runs differ
    only in their CrawlerRunConfig.
    """
    browser_config = BrowserConfig(headless=True)

    def banner(title: str) -> None:
        # Section header for each run's output.
        print("=" * 60)
        print(title)
        print("=" * 60)

    async def crawl(config: CrawlerRunConfig) -> str:
        # One crawl of URL; returns the raw markdown ("" if none produced).
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(URL, config=config)
            return result.markdown.raw_markdown if result.markdown else ""

    def report(md: str) -> None:
        # Presence checks for content that is only reachable when the
        # shadow trees are serialized into the captured HTML.
        print(f"Markdown length: {len(md)} chars")
        print(f"Has product description: {'mill type design' in md.lower()}")
        print(f"Has technical specs: {'CDH1' in md}")
        print(f"Has downloads section: {'Downloads' in md}")
        print()

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    banner("Without flatten_shadow_dom (baseline)")
    md = await crawl(
        CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=3.0,
        )
    )
    report(md)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    banner("With flatten_shadow_dom=True")
    md = await crawl(
        CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=3.0,
            flatten_shadow_dom=True,
        )
    )
    report(md)

    # Show the product content section extracted from the shadow trees.
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])
if __name__ == "__main__":
asyncio.run(main())