Add Shadow DOM flattening and reorder js_code execution pipeline

- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes
  shadow DOM content into the light DOM before HTML capture. Uses a
  recursive serializer that resolves <slot> projections and strips
  only shadow-scoped <style> tags. Also injects an init script to
  force-open closed shadow roots via attachShadow patching.

- Move `js_code` execution to after `wait_for` + `delay_before_return_html`
  so user scripts run on the fully-hydrated page. Add `js_code_before_wait`
  for the less common case of triggering loading before waiting.

- Add JS snippet (flatten_shadow_dom.js), integration test, example,
  and documentation across all relevant doc files.
This commit is contained in:
unclecode
2026-02-18 06:43:00 +00:00
parent 4fb02f8b50
commit 8576331d4e
11 changed files with 522 additions and 66 deletions

View File

@@ -1235,7 +1235,11 @@ class CrawlerRunConfig():
Default: 5.
# Page Interaction Parameters
js_code (str or list of str or None): JavaScript code/snippets to run on the page.
js_code (str or list of str or None): JavaScript code/snippets to run on the page
after wait_for and delay_before_return_html.
Default: None.
js_code_before_wait (str or list of str or None): JavaScript to run BEFORE wait_for.
Use for triggering loading that wait_for then checks.
Default: None.
js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
Default: False.
@@ -1249,6 +1253,10 @@ class CrawlerRunConfig():
If None, scrolls until the entire page is loaded. Default: None.
process_iframes (bool): If True, attempts to process and inline iframe content.
Default: False.
flatten_shadow_dom (bool): If True, flatten shadow DOM content into the light DOM
before HTML capture so page.content() includes it.
Also injects an init script to force-open closed shadow roots.
Default: False.
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
Default: False.
remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP)
@@ -1410,6 +1418,7 @@ class CrawlerRunConfig():
semaphore_count: int = 5,
# Page Interaction Parameters
js_code: Union[str, List[str]] = None,
js_code_before_wait: Union[str, List[str]] = None,
c4a_script: Union[str, List[str]] = None,
js_only: bool = False,
ignore_body_visibility: bool = True,
@@ -1417,6 +1426,7 @@ class CrawlerRunConfig():
scroll_delay: float = 0.2,
max_scroll_steps: Optional[int] = None,
process_iframes: bool = False,
flatten_shadow_dom: bool = False,
remove_overlay_elements: bool = False,
remove_consent_popups: bool = False,
simulate_user: bool = False,
@@ -1538,6 +1548,7 @@ class CrawlerRunConfig():
# Page Interaction Parameters
self.js_code = js_code
self.js_code_before_wait = js_code_before_wait
self.c4a_script = c4a_script
self.js_only = js_only
self.ignore_body_visibility = ignore_body_visibility
@@ -1545,6 +1556,7 @@ class CrawlerRunConfig():
self.scroll_delay = scroll_delay
self.max_scroll_steps = max_scroll_steps
self.process_iframes = process_iframes
self.flatten_shadow_dom = flatten_shadow_dom
self.remove_overlay_elements = remove_overlay_elements
self.remove_consent_popups = remove_consent_popups
self.simulate_user = simulate_user
@@ -1887,12 +1899,14 @@ class CrawlerRunConfig():
"max_range": self.max_range,
"semaphore_count": self.semaphore_count,
"js_code": self.js_code,
"js_code_before_wait": self.js_code_before_wait,
"js_only": self.js_only,
"ignore_body_visibility": self.ignore_body_visibility,
"scan_full_page": self.scan_full_page,
"scroll_delay": self.scroll_delay,
"max_scroll_steps": self.max_scroll_steps,
"process_iframes": self.process_iframes,
"flatten_shadow_dom": self.flatten_shadow_dom,
"remove_overlay_elements": self.remove_overlay_elements,
"remove_consent_popups": self.remove_consent_popups,
"simulate_user": self.simulate_user,

View File

@@ -581,6 +581,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.override_navigator or config.simulate_user or config.magic:
await context.add_init_script(load_js_script("navigator_overrider"))
# Force-open closed shadow roots when flatten_shadow_dom is enabled
if config.flatten_shadow_dom:
await context.add_init_script("""
const _origAttachShadow = Element.prototype.attachShadow;
Element.prototype.attachShadow = function(init) {
return _origAttachShadow.call(this, {...init, mode: 'open'});
};
""")
# Call hook after page creation
await self.execute_hook("on_page_context_created", page, context=context, config=config)
@@ -925,16 +934,46 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.virtual_scroll_config:
await self._handle_virtual_scroll(page, config.virtual_scroll_config)
# Execute JavaScript if provided
# if config.js_code:
# if isinstance(config.js_code, str):
# await page.evaluate(config.js_code)
# elif isinstance(config.js_code, list):
# for js in config.js_code:
# await page.evaluate(js)
# --- Phase 1: Pre-wait JS and interaction ---
# Execute js_code_before_wait (for triggering loading that wait_for checks)
if config.js_code_before_wait:
bw_result = await self.robust_execute_user_script(
page, config.js_code_before_wait
)
if not bw_result["success"]:
self.logger.warning(
message="js_code_before_wait had issues: {error}",
tag="JS_EXEC",
params={"error": bw_result.get("error")},
)
# Handle user simulation
if config.simulate_user or config.magic:
await page.mouse.move(100, 100)
await page.mouse.down()
await page.mouse.up()
await page.keyboard.press("ArrowDown")
# --- Phase 2: Wait for page readiness ---
if config.wait_for:
try:
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
await self.smart_wait(
page, config.wait_for, timeout=timeout
)
except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}")
# Pre-content retrieval hooks and delay
await self.execute_hook("before_retrieve_html", page, context=context, config=config)
if config.delay_before_return_html:
await asyncio.sleep(config.delay_before_return_html)
# --- Phase 3: Post-wait JS (runs on fully-loaded page) ---
if config.js_code:
# execution_result = await self.execute_user_script(page, config.js_code)
execution_result = await self.robust_execute_user_script(
page, config.js_code
)
@@ -949,28 +988,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
await self.execute_hook("on_execution_started", page, context=context, config=config)
await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)
# Handle user simulation
if config.simulate_user or config.magic:
await page.mouse.move(100, 100)
await page.mouse.down()
await page.mouse.up()
await page.keyboard.press("ArrowDown")
# Handle wait_for condition
# Todo: Decide how to handle this
if not config.wait_for and config.css_selector and False:
# if not config.wait_for and config.css_selector:
config.wait_for = f"css:{config.css_selector}"
if config.wait_for:
try:
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
await self.smart_wait(
page, config.wait_for, timeout=timeout
)
except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}")
# --- Phase 4: DOM processing before HTML capture ---
# Update image dimensions if needed
if not self.browser_config.text_mode:
@@ -992,11 +1010,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.process_iframes:
page = await self.process_iframes(page)
# Pre-content retrieval hooks and delay
await self.execute_hook("before_retrieve_html", page, context=context, config=config)
if config.delay_before_return_html:
await asyncio.sleep(config.delay_before_return_html)
# Handle CMP/consent popup removal (before generic overlay removal)
if config.remove_consent_popups:
await self.remove_consent_popups(page)
@@ -1005,12 +1018,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.remove_overlay_elements:
await self.remove_overlay_elements(page)
if config.css_selector:
# --- Phase 5: HTML capture ---
if config.flatten_shadow_dom:
# Use JS to serialize the full DOM including shadow roots
flatten_js = load_js_script("flatten_shadow_dom")
html = await self.adapter.evaluate(page, flatten_js)
if not html or not isinstance(html, str):
# Fallback to normal capture if JS returned nothing
self.logger.warning(
message="Shadow DOM flattening returned no content, falling back to page.content()",
tag="SCRAPE",
)
html = await page.content()
elif config.css_selector:
try:
# Handle comma-separated selectors by splitting them
selectors = [s.strip() for s in config.css_selector.split(',')]
html_parts = []
for selector in selectors:
try:
content = await self.adapter.evaluate(page,
@@ -1021,16 +1046,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
html_parts.append(content)
except Error as e:
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
# Wrap in a div to create a valid HTML structure
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
except Error as e:
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
else:
html = await page.content()
# # Get final HTML content
# html = await page.content()
await self.execute_hook(
"before_return_html", page=page, html=html, context=context, config=config
)

View File

@@ -1220,6 +1220,15 @@ class BrowserManager:
):
await context.add_init_script(load_js_script("navigator_overrider"))
# Force-open closed shadow roots when flatten_shadow_dom is enabled
if crawlerRunConfig and crawlerRunConfig.flatten_shadow_dom:
await context.add_init_script("""
const _origAttachShadow = Element.prototype.attachShadow;
Element.prototype.attachShadow = function(init) {
return _origAttachShadow.call(this, {...init, mode: 'open'});
};
""")
# Apply custom init_scripts from BrowserConfig (for stealth evasions, etc.)
if self.config.init_scripts:
for script in self.config.init_scripts:

View File

@@ -0,0 +1,104 @@
/**
 * Flatten all open shadow DOM trees into the light DOM so that
 * page.content() / outerHTML can serialize the full composed view.
 *
 * Uses manual recursive serialization with proper slot resolution.
 * Resolves slots via the live DOM API (assignedNodes), skips only
 * shadow-scoped <style> tags, and produces clean HTML with no regex hacks.
 *
 * Text nodes are entity-escaped (&, <, >) so that text containing markup
 * characters survives re-parsing — the live DOM stores decoded text, so
 * emitting it raw would produce corrupted or injectable HTML. Content of
 * raw-text elements (<script>, <style>) is emitted verbatim, since HTML
 * does not entity-decode inside them.
 *
 * Returns the full HTML string including shadow content.
 */
(() => {
    // Void elements: serialized as a lone start tag, no closing tag.
    const VOID = new Set([
        'area','base','br','col','embed','hr','img','input',
        'link','meta','param','source','track','wbr'
    ]);

    // Raw-text elements: their character data must NOT be escaped,
    // otherwise inline JS/CSS would be corrupted on re-parse.
    const RAW_TEXT = new Set(['script', 'style']);

    // Escape the characters that would change meaning when re-parsed.
    const escapeText = (s) =>
        s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');

    // Serialize a text node, honoring the raw-text parent exception.
    const serializeTextNode = (node) => {
        const parent = node.parentElement;
        if (parent && RAW_TEXT.has(parent.tagName.toLowerCase())) {
            return node.textContent;
        }
        return escapeText(node.textContent);
    };

    // Serialize an element's attributes. Quotes and ampersands are
    // escaped so attribute values round-trip correctly.
    const serializeAttrs = (node) => {
        let s = '';
        for (const a of node.attributes || []) {
            s += ` ${a.name}="${a.value.replace(/&/g, '&amp;').replace(/"/g, '&quot;')}"`;
        }
        return s;
    };

    // Serialize a DOM node. When it has a shadow root, switch to
    // shadow-aware serialization that resolves <slot> elements.
    const serialize = (node) => {
        if (node.nodeType === Node.TEXT_NODE) return serializeTextNode(node);
        if (node.nodeType === Node.COMMENT_NODE) return '';
        if (node.nodeType !== Node.ELEMENT_NODE) return '';
        const tag = node.tagName.toLowerCase();
        const attrs = serializeAttrs(node);
        let inner = '';
        if (node.shadowRoot) {
            inner = serializeShadowRoot(node);
        } else {
            for (const child of node.childNodes) {
                inner += serialize(child);
            }
        }
        if (VOID.has(tag)) return `<${tag}${attrs}>`;
        return `<${tag}${attrs}>${inner}</${tag}>`;
    };

    // Serialize a shadow root's children, resolving slots against
    // the host's light DOM children.
    const serializeShadowRoot = (host) => {
        let result = '';
        for (const child of host.shadowRoot.childNodes) {
            result += serializeShadowChild(child, host);
        }
        return result;
    };

    // Serialize a node that lives inside a shadow root.
    // <style> tags are skipped (scoped CSS, useless outside shadow).
    // <slot> tags are replaced with their assigned (projected) nodes.
    const serializeShadowChild = (node, host) => {
        if (node.nodeType === Node.TEXT_NODE) return serializeTextNode(node);
        if (node.nodeType === Node.COMMENT_NODE) return '';
        if (node.nodeType !== Node.ELEMENT_NODE) return '';
        const tag = node.tagName.toLowerCase();
        // Skip shadow-scoped styles only
        if (tag === 'style') return '';
        // Resolve <slot>: replace with projected light DOM content
        if (tag === 'slot') {
            const assigned = node.assignedNodes({ flatten: true });
            if (assigned.length > 0) {
                let out = '';
                for (const a of assigned) out += serialize(a);
                return out;
            }
            // No assigned nodes — use the slot's fallback content
            let fallback = '';
            for (const child of node.childNodes) {
                fallback += serializeShadowChild(child, host);
            }
            return fallback;
        }
        const attrs = serializeAttrs(node);
        let inner = '';
        if (node.shadowRoot) {
            // Nested shadow root — recurse
            inner = serializeShadowRoot(node);
        } else {
            for (const child of node.childNodes) {
                inner += serializeShadowChild(child, host);
            }
        }
        if (VOID.has(tag)) return `<${tag}${attrs}>`;
        return `<${tag}${attrs}>${inner}</${tag}>`;
    };

    return serialize(document.documentElement);
})()

View File

@@ -0,0 +1,77 @@
"""
Shadow DOM Crawling Example
============================
Demonstrates how to use `flatten_shadow_dom=True` to extract content
hidden inside Shadow DOM trees on sites built with Web Components
(Stencil, Lit, Shoelace, Angular Elements, etc.).
Shadow DOM creates encapsulated sub-trees that are invisible to the
normal page serialization (page.content() / outerHTML). The
`flatten_shadow_dom` option walks these trees and produces a single
flat HTML document that includes all shadow content.
This example crawls a Bosch Rexroth product page where the product
description, technical specs, and downloads are rendered entirely
inside Shadow DOM by Stencil.js web components.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
async def _crawl_and_report(label, browser_config, config):
    """Crawl URL with *config*, print content-coverage stats, return the markdown.

    Args:
        label: Section heading printed before the results.
        browser_config: BrowserConfig used to launch the crawler.
        config: CrawlerRunConfig for this run.

    Returns:
        The raw markdown string produced by the crawl ("" if none).
    """
    print("=" * 60)
    print(label)
    print("=" * 60)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)
    md = result.markdown.raw_markdown if result.markdown else ""
    print(f"Markdown length: {len(md)} chars")
    # These markers live inside shadow DOM on the target page, so they
    # only appear when flattening is enabled.
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()
    return md


async def main():
    """Compare a baseline crawl against one with flatten_shadow_dom=True."""
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    await _crawl_and_report(
        "Without flatten_shadow_dom (baseline)",
        browser_config,
        CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=3.0,
        ),
    )

    # ── 2. With shadow DOM flattening ───────────────────────────────
    md = await _crawl_and_report(
        "With flatten_shadow_dom=True",
        browser_config,
        CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=3.0,
            flatten_shadow_dom=True,
        ),
    )

    # Show the product content section from the flattened crawl
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -152,7 +152,8 @@ Use these for controlling whether you read or write from a local content cache.
| **Parameter** | **Type / Default** | **What It Does** |
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_code`** | `str or list[str]` (None) | JavaScript to run **after** `wait_for` and `delay_before_return_html`, on the fully-loaded page. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_code_before_wait`** | `str or list[str]` (None) | JavaScript to run **before** `wait_for`. Use for triggering loading that `wait_for` then checks (e.g. clicking a tab, then waiting for its content). |
| **`c4a_script`** | `str or list[str]` (None) | C4A script that compiles to JavaScript. Alternative to writing raw JS. |
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
@@ -160,6 +161,7 @@ Use these for controlling whether you read or write from a local content cache.
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
| **`flatten_shadow_dom`** | `bool` (False) | Flattens Shadow DOM content into the light DOM before HTML capture. Resolves slots, strips shadow-scoped styles, and force-opens closed shadow roots. Essential for sites built with Web Components (Stencil, Lit, Shoelace, etc.). |
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |

View File

@@ -1781,12 +1781,14 @@ run_cfg = CrawlerRunConfig(
### D) **Page Interaction**
| **Parameter** | **Type / Default** | **What It Does** |
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_only`** | `bool` (False) | If `True`, indicates were reusing an existing session and only applying JS. No full reload. |
| **`js_code`** | `str or list[str]` (None) | JavaScript to run **after** `wait_for` and `delay_before_return_html`, on the fully-loaded page. E.g. `"document.querySelector('button')?.click();"`. |
| **`js_code_before_wait`** | `str or list[str]` (None) | JavaScript to run **before** `wait_for`. Use for triggering loading that `wait_for` then checks. |
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
| **`flatten_shadow_dom`** | `bool` (False) | Flattens Shadow DOM content into the light DOM before HTML capture. Resolves slots, strips shadow-scoped styles, and force-opens closed shadow roots. Essential for sites built with Web Components. |
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
@@ -2813,6 +2815,46 @@ async def main():
if __name__ == "__main__":
asyncio.run(main())
```
## 3.1 Flattening Shadow DOM
Sites built with **Web Components** (Stencil, Lit, Shoelace, Angular Elements, etc.) render content inside Shadow DOM — an encapsulated sub-tree invisible to `page.content()`. Set `flatten_shadow_dom=True` to extract it:
```python
config = CrawlerRunConfig(
flatten_shadow_dom=True,
wait_until="load",
delay_before_return_html=3.0, # give components time to hydrate
)
```
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
config = CrawlerRunConfig(
flatten_shadow_dom=True,
wait_until="load",
delay_before_return_html=3.0,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011",
config=config,
)
# Without flatten_shadow_dom: ~1 KB markdown (breadcrumbs only)
# With flatten_shadow_dom: ~33 KB (product description, specs, downloads)
print(len(result.markdown.raw_markdown))
if __name__ == "__main__":
asyncio.run(main())
```
When enabled, Crawl4AI also injects an init script that force-opens closed shadow roots. The flattener resolves `<slot>` projections and strips shadow-scoped `<style>` tags, producing clean HTML for the downstream scraping/markdown pipeline.
**Execution order**: `flatten_shadow_dom` runs right before HTML capture, after all waits and JS execution:
```
js_code_before_wait → wait_for → delay → js_code → flatten_shadow_dom → page capture
```
For a full runnable example, see [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
## 4. Structured Extraction Examples
### 4.1 Pattern-Based with `JsonCssExtractionStrategy`
```python

View File

@@ -255,16 +255,22 @@ class CrawlerRunConfig:
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
- Defaults to `CacheMode.BYPASS`.
6.**`js_code`** & **`c4a_script`**:
- `js_code`: A string or list of JavaScript strings to execute.
6.**`js_code`**, **`js_code_before_wait`**, & **`c4a_script`**:
- `js_code`: JavaScript to run **after** `wait_for` completes — on the fully-loaded page.
- `js_code_before_wait`: JavaScript to run **before** `wait_for` — for triggering loading that `wait_for` then checks.
- `c4a_script`: C4A script that compiles to JavaScript.
- Great for "Load More" buttons or user interactions.
- Great for "Load More" buttons or user interactions.
7.**`wait_for`**:
- A CSS or JS expression to wait for before extracting content.
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
8.**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
8.**`flatten_shadow_dom`**:
- If `True`, flattens Shadow DOM content into the light DOM before HTML capture.
- Essential for sites built with Web Components (Stencil, Lit, Shoelace, etc.).
- Also force-opens closed shadow roots. See [Flattening Shadow DOM](content-selection.md#31-flattening-shadow-dom).
9.**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
- Use `force_viewport_screenshot=True` to capture only the visible viewport instead of the full page. This is faster and produces smaller images when you don't need a full-page screenshot.

View File

@@ -183,6 +183,55 @@ if __name__ == "__main__":
---
## 3.1 Flattening Shadow DOM
Sites built with **Web Components** (Stencil, Lit, Shoelace, Angular Elements, etc.) render content inside [Shadow DOM](https://developer.mozilla.org/en-US/docs/Web/API/Web_components/Using_shadow_DOM) — an encapsulated sub-tree that is invisible to normal page serialization. The browser renders it on screen, but `page.content()` never includes it.
Set `flatten_shadow_dom=True` to walk all shadow trees, resolve `<slot>` projections, and produce a single flat HTML document:
```python
config = CrawlerRunConfig(
# Flatten shadow DOM into the main document
flatten_shadow_dom=True,
# Give web components time to hydrate
wait_until="load",
delay_before_return_html=3.0,
)
```
**Full example** — crawling a product page where specs live inside shadow roots:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
config = CrawlerRunConfig(
flatten_shadow_dom=True,
wait_until="load",
delay_before_return_html=3.0,
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011",
config=config,
)
# Without flatten_shadow_dom: ~1 KB of markdown (breadcrumbs only)
# With flatten_shadow_dom: ~33 KB (full product specs, downloads, etc.)
print(len(result.markdown.raw_markdown))
if __name__ == "__main__":
asyncio.run(main())
```
When `flatten_shadow_dom=True` is set, Crawl4AI also injects an init script that force-opens **closed** shadow roots (by patching `Element.prototype.attachShadow`), so even components that use `mode: 'closed'` become accessible.
> **Tip**: Web components need JavaScript to run before they render content (a process called *hydration*). Use `wait_until="load"` and a `delay_before_return_html` of 2–5 seconds to ensure components are fully hydrated before flattening.
For a complete runnable example, see [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
---
## 4. Structured Extraction Examples
You can combine content selection with a more advanced extraction strategy. For instance, a **CSS-based** or **LLM-based** extraction strategy can run on the filtered HTML.

View File

@@ -15,8 +15,9 @@ Below is a quick overview of how to do it.
### Basic Execution
**`js_code`** in **`CrawlerRunConfig`** accepts either a single JS string or a list of JS snippets.
**Example**: Well scroll to the bottom of the page, then optionally click a “Load More” button.
**`js_code`** in **`CrawlerRunConfig`** accepts either a single JS string or a list of JS snippets. It runs **after** `wait_for` and `delay_before_return_html` — so the page is fully loaded when your code executes.
**Example**: We'll scroll to the bottom of the page, then optionally click a "Load More" button.
```python
import asyncio
@@ -55,10 +56,36 @@ if __name__ == "__main__":
```
**Relevant `CrawlerRunConfig` params**:
- **`js_code`**: A string or list of strings with JavaScript to run after the page loads.
- **`js_only`**: If set to `True` on subsequent calls, indicates were continuing an existing session without a new full navigation.
- **`js_code`**: JavaScript to run **after** `wait_for` and `delay_before_return_html` complete. Runs on the fully-loaded page.
- **`js_code_before_wait`**: JavaScript to run **before** `wait_for`. Use when you need to trigger loading that `wait_for` then checks.
- **`js_only`**: If set to `True` on subsequent calls, indicates we're continuing an existing session without a new full navigation.
- **`session_id`**: If you want to keep the same page across multiple calls, specify an ID.
### Execution Order
Understanding when your JavaScript runs relative to other pipeline steps:
```
1. Page navigation (page.goto)
2. js_code_before_wait ← triggers loading / clicks tabs
3. wait_for ← waits for content to appear
4. delay_before_return_html ← extra safety margin
5. js_code ← runs on the fully-loaded page
6. flatten_shadow_dom ← if enabled
7. page.content() ← HTML capture
```
If you need JS to trigger something and then wait for the result, use `js_code_before_wait` + `wait_for`:
```python
config = CrawlerRunConfig(
# Click a tab first
js_code_before_wait="document.querySelector('#specs-tab')?.click();",
# Then wait for the tab content to appear
wait_for="css:#specs-panel .content",
)
```
---
## 2. Wait Conditions
@@ -317,35 +344,55 @@ When done, check `result.extracted_content` for the JSON.
---
## 7. Relevant `CrawlerRunConfig` Parameters
## 7. Shadow DOM Flattening
Sites built with **Web Components** (Stencil, Lit, Shoelace, etc.) render content inside Shadow DOM — an encapsulated sub-tree that is invisible to normal page serialization. Set `flatten_shadow_dom=True` to extract it:
```python
config = CrawlerRunConfig(
flatten_shadow_dom=True,
wait_until="load",
delay_before_return_html=3.0, # give components time to hydrate
)
```
This walks all shadow trees, resolves `<slot>` projections, and produces flat HTML. It also force-opens closed shadow roots via an init script. For details and a full example, see [Flattening Shadow DOM](content-selection.md#31-flattening-shadow-dom) and [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
---
## 8. Relevant `CrawlerRunConfig` Parameters
Below are the key interaction-related parameters in `CrawlerRunConfig`. For a full list, see [Configuration Parameters](../api/parameters.md).
- **`js_code`**: JavaScript to run after initial load.
- **`js_only`**: If `True`, no new page navigation—only JS in the existing session.
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
- **`session_id`**: Reuse the same page across calls.
- **`cache_mode`**: Whether to read/write from the cache or bypass.
- **`js_code`**: JavaScript to run after `wait_for` + `delay_before_return_html`, on the fully-loaded page.
- **`js_code_before_wait`**: JavaScript to run before `wait_for`. For triggering loading that `wait_for` then checks.
- **`js_only`**: If `True`, no new page navigation—only JS in the existing session.
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
- **`session_id`**: Reuse the same page across calls.
- **`cache_mode`**: Whether to read/write from the cache or bypass.
- **`flatten_shadow_dom`**: Flatten Shadow DOM content into the light DOM before capture.
- **`process_iframes`**: Inline iframe content into the main document.
- **`remove_overlay_elements`**: Remove certain popups automatically.
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
---
## 8. Conclusion
## 9. Conclusion
Crawl4AIs **page interaction** features let you:
Crawl4AI's **page interaction** features let you:
1. **Execute JavaScript** for scrolling, clicks, or form filling.
2. **Wait** for CSS or custom JS conditions before capturing data.
3. **Handle** multi-step flows (like “Load More”) with partial reloads or persistent sessions.
4. Combine with **structured extraction** for dynamic sites.
4. **Flatten Shadow DOM** on Web Component sites to extract hidden content.
5. Combine with **structured extraction** for dynamic sites.
With these tools, you can scrape modern, interactive webpages confidently. For advanced hooking, user simulation, or in-depth config, check the [API reference](../api/parameters.md) or related advanced docs. Happy scripting!
---
## 9. Virtual Scrolling
## 10. Virtual Scrolling
For sites that use **virtual scrolling** (where content is replaced rather than appended as you scroll, like Twitter or Instagram), Crawl4AI provides a dedicated `VirtualScrollConfig`:

View File

@@ -0,0 +1,84 @@
"""Test flatten_shadow_dom feature — full comparison."""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
async def run_test(label, bc, rc):
    """Crawl URL with the given browser/run configs and report content checks.

    Prints size stats for raw HTML, cleaned HTML, and markdown, then a
    pass/fail line per expected content marker, and returns the CrawlResult.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"TEST: {label}")
    print(f"{banner}")

    async with AsyncWebCrawler(config=bc) as crawler:
        result = await crawler.arun(URL, config=rc)

    html = result.html or ""
    cleaned = result.cleaned_html or ""
    md = ""
    if result.markdown and hasattr(result.markdown, "raw_markdown"):
        md = result.markdown.raw_markdown or ""

    print(f" Success: {result.success}")
    print(f" Raw HTML: {len(html):>8} chars")
    print(f" Cleaned HTML: {len(cleaned):>8} chars")
    print(f" Markdown: {len(md):>8} chars")

    # Markers that only show up when shadow DOM content is captured.
    checks = {
        "Product title": "HYDRAULIC CYLINDER" in md,
        "Part number (R900999011)": "R900999011" in md,
        "Product description": "mill type design" in md.lower(),
        "Feature: 6 types of mounting": "6 types of mounting" in md,
        "Feature: safety vent": "safety vent" in md.lower(),
        "Product Description heading": "Product Description" in md,
        "Technical Specs heading": "Technical Specs" in md,
        "Downloads heading": "Downloads" in md,
        "Specs table: CDH1": "CDH1" in md,
        "Specs table: 250 bar": "250" in md,
    }

    print(f"\n Content checks:")
    for name, ok in checks.items():
        print(f" {'PASS' if ok else 'FAIL'} {name}")
    passes = sum(checks.values())
    print(f"\n Result: {passes}/{len(checks)} checks passed")

    # Show product content section, if present.
    idx = md.find("Product Description")
    if idx >= 0:
        print(f"\n --- Product content section ---")
        print(md[idx:idx + 1500])

    return result
async def main():
    """Run baseline and flattened crawls, then print a size comparison."""
    bc = BrowserConfig(headless=True)

    baseline_cfg = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
    )
    flatten_cfg = CrawlerRunConfig(
        wait_until="load",
        delay_before_return_html=3.0,
        flatten_shadow_dom=True,
    )

    r1 = await run_test("BASELINE (no shadow flattening)", bc, baseline_cfg)
    r2 = await run_test("WITH flatten_shadow_dom=True", bc, flatten_cfg)

    # Summary: compare markdown volume between the two runs.
    md1 = r1.markdown.raw_markdown if r1.markdown else ""
    md2 = r2.markdown.raw_markdown if r2.markdown else ""
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"SUMMARY")
    print(f"{banner}")
    print(f" Baseline markdown: {len(md1):>6} chars")
    print(f" Flattened markdown: {len(md2):>6} chars")
    # max(..., 1) guards against division by zero on an empty baseline.
    print(f" Improvement: {len(md2) / max(len(md1), 1):.1f}x more content")


if __name__ == "__main__":
    asyncio.run(main())