Add Shadow DOM flattening and reorder js_code execution pipeline
- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves <slot> projections and strips only shadow-scoped <style> tags. Also injects an init script to force-open closed shadow roots via attachShadow patching.
- Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting.
- Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
This commit is contained in:
77
docs/examples/shadow_dom_crawling.py
Normal file
77
docs/examples/shadow_dom_crawling.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Shadow DOM Crawling Example
|
||||
============================
|
||||
|
||||
Demonstrates how to use `flatten_shadow_dom=True` to extract content
|
||||
hidden inside Shadow DOM trees on sites built with Web Components
|
||||
(Stencil, Lit, Shoelace, Angular Elements, etc.).
|
||||
|
||||
Shadow DOM creates encapsulated sub-trees that are invisible to the
|
||||
normal page serialization (page.content() / outerHTML). The
|
||||
`flatten_shadow_dom` option walks these trees and produces a single
|
||||
flat HTML document that includes all shadow content.
|
||||
|
||||
This example crawls a Bosch Rexroth product page where the product
|
||||
description, technical specs, and downloads are rendered entirely
|
||||
inside Shadow DOM by Stencil.js web components.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
||||
|
||||
|
||||
async def main():
    """Crawl the same product page twice — without and with shadow DOM
    flattening — and print a side-by-side comparison of what each run
    managed to extract.

    The baseline run demonstrates that the shadow-DOM-hosted content is
    invisible to normal serialization; the second run shows it appearing
    once `flatten_shadow_dom=True` is set.
    """
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    baseline = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
    )
    md = await _crawl(browser_config, baseline,
                      "Without flatten_shadow_dom (baseline)")
    _report(md)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    flattened = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
        flatten_shadow_dom=True,
    )
    md = await _crawl(browser_config, flattened,
                      "With flatten_shadow_dom=True")
    _report(md)

    # Show the product content section (only findable after flattening).
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])


async def _crawl(browser_config, config, banner):
    """Print *banner*, crawl URL once with *config*, and return the raw
    markdown (empty string when no markdown was produced)."""
    print("=" * 60)
    print(banner)
    print("=" * 60)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)
    return result.markdown.raw_markdown if result.markdown else ""


def _report(md):
    """Print quick presence checks for the content that normally hides
    inside the page's Shadow DOM."""
    print(f"Markdown length: {len(md)} chars")
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user