Add Shadow DOM flattening and reorder js_code execution pipeline
- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves <slot> projections and strips only shadow-scoped <style> tags. Also injects an init script to force-open closed shadow roots via attachShadow patching.
- Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting.
- Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
This commit is contained in:
77
docs/examples/shadow_dom_crawling.py
Normal file
77
docs/examples/shadow_dom_crawling.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Shadow DOM Crawling Example
|
||||
============================
|
||||
|
||||
Demonstrates how to use `flatten_shadow_dom=True` to extract content
|
||||
hidden inside Shadow DOM trees on sites built with Web Components
|
||||
(Stencil, Lit, Shoelace, Angular Elements, etc.).
|
||||
|
||||
Shadow DOM creates encapsulated sub-trees that are invisible to the
|
||||
normal page serialization (page.content() / outerHTML). The
|
||||
`flatten_shadow_dom` option walks these trees and produces a single
|
||||
flat HTML document that includes all shadow content.
|
||||
|
||||
This example crawls a Bosch Rexroth product page where the product
|
||||
description, technical specs, and downloads are rendered entirely
|
||||
inside Shadow DOM by Stencil.js web components.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
||||
|
||||
|
||||
async def main():
    """Crawl the same product page twice — without and with shadow DOM
    flattening — and print a side-by-side comparison of what each run
    managed to extract.

    The baseline run demonstrates that the shadow-DOM-hosted content is
    invisible to normal serialization; the second run shows it appearing
    once `flatten_shadow_dom=True` is set.
    """
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    baseline = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
    )
    md = await _crawl(browser_config, baseline,
                      "Without flatten_shadow_dom (baseline)")
    _report(md)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    flattened = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
        flatten_shadow_dom=True,
    )
    md = await _crawl(browser_config, flattened,
                      "With flatten_shadow_dom=True")
    _report(md)

    # Show the product content section (only findable after flattening).
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])


async def _crawl(browser_config, config, banner):
    """Print *banner*, crawl URL once with *config*, and return the raw
    markdown (empty string when no markdown was produced)."""
    print("=" * 60)
    print(banner)
    print("=" * 60)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)
    return result.markdown.raw_markdown if result.markdown else ""


def _report(md):
    """Print quick presence checks for the content that normally hides
    inside the page's Shadow DOM."""
    print(f"Markdown length: {len(md)} chars")
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user