Add Shadow DOM flattening and reorder js_code execution pipeline
- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves `<slot>` projections and strips only shadow-scoped `<style>` tags. Also injects an init script to force-open closed shadow roots via `attachShadow` patching.
- Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting.
- Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
This commit is contained in:
@@ -1235,7 +1235,11 @@ class CrawlerRunConfig():
|
||||
Default: 5.
|
||||
|
||||
# Page Interaction Parameters
|
||||
js_code (str or list of str or None): JavaScript code/snippets to run on the page.
|
||||
js_code (str or list of str or None): JavaScript code/snippets to run on the page
|
||||
after wait_for and delay_before_return_html.
|
||||
Default: None.
|
||||
js_code_before_wait (str or list of str or None): JavaScript to run BEFORE wait_for.
|
||||
Use for triggering loading that wait_for then checks.
|
||||
Default: None.
|
||||
js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
|
||||
Default: False.
|
||||
@@ -1249,6 +1253,10 @@ class CrawlerRunConfig():
|
||||
If None, scrolls until the entire page is loaded. Default: None.
|
||||
process_iframes (bool): If True, attempts to process and inline iframe content.
|
||||
Default: False.
|
||||
flatten_shadow_dom (bool): If True, flatten shadow DOM content into the light DOM
|
||||
before HTML capture so page.content() includes it.
|
||||
Also injects an init script to force-open closed shadow roots.
|
||||
Default: False.
|
||||
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
|
||||
Default: False.
|
||||
remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP)
|
||||
@@ -1410,6 +1418,7 @@ class CrawlerRunConfig():
|
||||
semaphore_count: int = 5,
|
||||
# Page Interaction Parameters
|
||||
js_code: Union[str, List[str]] = None,
|
||||
js_code_before_wait: Union[str, List[str]] = None,
|
||||
c4a_script: Union[str, List[str]] = None,
|
||||
js_only: bool = False,
|
||||
ignore_body_visibility: bool = True,
|
||||
@@ -1417,6 +1426,7 @@ class CrawlerRunConfig():
|
||||
scroll_delay: float = 0.2,
|
||||
max_scroll_steps: Optional[int] = None,
|
||||
process_iframes: bool = False,
|
||||
flatten_shadow_dom: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
remove_consent_popups: bool = False,
|
||||
simulate_user: bool = False,
|
||||
@@ -1538,6 +1548,7 @@ class CrawlerRunConfig():
|
||||
|
||||
# Page Interaction Parameters
|
||||
self.js_code = js_code
|
||||
self.js_code_before_wait = js_code_before_wait
|
||||
self.c4a_script = c4a_script
|
||||
self.js_only = js_only
|
||||
self.ignore_body_visibility = ignore_body_visibility
|
||||
@@ -1545,6 +1556,7 @@ class CrawlerRunConfig():
|
||||
self.scroll_delay = scroll_delay
|
||||
self.max_scroll_steps = max_scroll_steps
|
||||
self.process_iframes = process_iframes
|
||||
self.flatten_shadow_dom = flatten_shadow_dom
|
||||
self.remove_overlay_elements = remove_overlay_elements
|
||||
self.remove_consent_popups = remove_consent_popups
|
||||
self.simulate_user = simulate_user
|
||||
@@ -1887,12 +1899,14 @@ class CrawlerRunConfig():
|
||||
"max_range": self.max_range,
|
||||
"semaphore_count": self.semaphore_count,
|
||||
"js_code": self.js_code,
|
||||
"js_code_before_wait": self.js_code_before_wait,
|
||||
"js_only": self.js_only,
|
||||
"ignore_body_visibility": self.ignore_body_visibility,
|
||||
"scan_full_page": self.scan_full_page,
|
||||
"scroll_delay": self.scroll_delay,
|
||||
"max_scroll_steps": self.max_scroll_steps,
|
||||
"process_iframes": self.process_iframes,
|
||||
"flatten_shadow_dom": self.flatten_shadow_dom,
|
||||
"remove_overlay_elements": self.remove_overlay_elements,
|
||||
"remove_consent_popups": self.remove_consent_popups,
|
||||
"simulate_user": self.simulate_user,
|
||||
|
||||
@@ -581,6 +581,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.override_navigator or config.simulate_user or config.magic:
|
||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||
|
||||
# Force-open closed shadow roots when flatten_shadow_dom is enabled
|
||||
if config.flatten_shadow_dom:
|
||||
await context.add_init_script("""
|
||||
const _origAttachShadow = Element.prototype.attachShadow;
|
||||
Element.prototype.attachShadow = function(init) {
|
||||
return _origAttachShadow.call(this, {...init, mode: 'open'});
|
||||
};
|
||||
""")
|
||||
|
||||
# Call hook after page creation
|
||||
await self.execute_hook("on_page_context_created", page, context=context, config=config)
|
||||
|
||||
@@ -925,16 +934,46 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.virtual_scroll_config:
|
||||
await self._handle_virtual_scroll(page, config.virtual_scroll_config)
|
||||
|
||||
# Execute JavaScript if provided
|
||||
# if config.js_code:
|
||||
# if isinstance(config.js_code, str):
|
||||
# await page.evaluate(config.js_code)
|
||||
# elif isinstance(config.js_code, list):
|
||||
# for js in config.js_code:
|
||||
# await page.evaluate(js)
|
||||
# --- Phase 1: Pre-wait JS and interaction ---
|
||||
|
||||
# Execute js_code_before_wait (for triggering loading that wait_for checks)
|
||||
if config.js_code_before_wait:
|
||||
bw_result = await self.robust_execute_user_script(
|
||||
page, config.js_code_before_wait
|
||||
)
|
||||
if not bw_result["success"]:
|
||||
self.logger.warning(
|
||||
message="js_code_before_wait had issues: {error}",
|
||||
tag="JS_EXEC",
|
||||
params={"error": bw_result.get("error")},
|
||||
)
|
||||
|
||||
# Handle user simulation
|
||||
if config.simulate_user or config.magic:
|
||||
await page.mouse.move(100, 100)
|
||||
await page.mouse.down()
|
||||
await page.mouse.up()
|
||||
await page.keyboard.press("ArrowDown")
|
||||
|
||||
# --- Phase 2: Wait for page readiness ---
|
||||
|
||||
if config.wait_for:
|
||||
try:
|
||||
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
|
||||
await self.smart_wait(
|
||||
page, config.wait_for, timeout=timeout
|
||||
)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
|
||||
# Pre-content retrieval hooks and delay
|
||||
await self.execute_hook("before_retrieve_html", page, context=context, config=config)
|
||||
if config.delay_before_return_html:
|
||||
await asyncio.sleep(config.delay_before_return_html)
|
||||
|
||||
# --- Phase 3: Post-wait JS (runs on fully-loaded page) ---
|
||||
|
||||
if config.js_code:
|
||||
# execution_result = await self.execute_user_script(page, config.js_code)
|
||||
execution_result = await self.robust_execute_user_script(
|
||||
page, config.js_code
|
||||
)
|
||||
@@ -949,28 +988,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await self.execute_hook("on_execution_started", page, context=context, config=config)
|
||||
await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result)
|
||||
|
||||
# Handle user simulation
|
||||
if config.simulate_user or config.magic:
|
||||
await page.mouse.move(100, 100)
|
||||
await page.mouse.down()
|
||||
await page.mouse.up()
|
||||
await page.keyboard.press("ArrowDown")
|
||||
|
||||
# Handle wait_for condition
|
||||
# Todo: Decide how to handle this
|
||||
if not config.wait_for and config.css_selector and False:
|
||||
# if not config.wait_for and config.css_selector:
|
||||
config.wait_for = f"css:{config.css_selector}"
|
||||
|
||||
if config.wait_for:
|
||||
try:
|
||||
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
|
||||
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
|
||||
await self.smart_wait(
|
||||
page, config.wait_for, timeout=timeout
|
||||
)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
# --- Phase 4: DOM processing before HTML capture ---
|
||||
|
||||
# Update image dimensions if needed
|
||||
if not self.browser_config.text_mode:
|
||||
@@ -992,11 +1010,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.process_iframes:
|
||||
page = await self.process_iframes(page)
|
||||
|
||||
# Pre-content retrieval hooks and delay
|
||||
await self.execute_hook("before_retrieve_html", page, context=context, config=config)
|
||||
if config.delay_before_return_html:
|
||||
await asyncio.sleep(config.delay_before_return_html)
|
||||
|
||||
# Handle CMP/consent popup removal (before generic overlay removal)
|
||||
if config.remove_consent_popups:
|
||||
await self.remove_consent_popups(page)
|
||||
@@ -1005,12 +1018,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.remove_overlay_elements:
|
||||
await self.remove_overlay_elements(page)
|
||||
|
||||
if config.css_selector:
|
||||
# --- Phase 5: HTML capture ---
|
||||
|
||||
if config.flatten_shadow_dom:
|
||||
# Use JS to serialize the full DOM including shadow roots
|
||||
flatten_js = load_js_script("flatten_shadow_dom")
|
||||
html = await self.adapter.evaluate(page, flatten_js)
|
||||
if not html or not isinstance(html, str):
|
||||
# Fallback to normal capture if JS returned nothing
|
||||
self.logger.warning(
|
||||
message="Shadow DOM flattening returned no content, falling back to page.content()",
|
||||
tag="SCRAPE",
|
||||
)
|
||||
html = await page.content()
|
||||
elif config.css_selector:
|
||||
try:
|
||||
# Handle comma-separated selectors by splitting them
|
||||
selectors = [s.strip() for s in config.css_selector.split(',')]
|
||||
html_parts = []
|
||||
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
content = await self.adapter.evaluate(page,
|
||||
@@ -1021,16 +1046,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
html_parts.append(content)
|
||||
except Error as e:
|
||||
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
|
||||
|
||||
# Wrap in a div to create a valid HTML structure
|
||||
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
|
||||
|
||||
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
|
||||
else:
|
||||
html = await page.content()
|
||||
|
||||
# # Get final HTML content
|
||||
# html = await page.content()
|
||||
|
||||
await self.execute_hook(
|
||||
"before_return_html", page=page, html=html, context=context, config=config
|
||||
)
|
||||
|
||||
@@ -1220,6 +1220,15 @@ class BrowserManager:
|
||||
):
|
||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||
|
||||
# Force-open closed shadow roots when flatten_shadow_dom is enabled
|
||||
if crawlerRunConfig and crawlerRunConfig.flatten_shadow_dom:
|
||||
await context.add_init_script("""
|
||||
const _origAttachShadow = Element.prototype.attachShadow;
|
||||
Element.prototype.attachShadow = function(init) {
|
||||
return _origAttachShadow.call(this, {...init, mode: 'open'});
|
||||
};
|
||||
""")
|
||||
|
||||
# Apply custom init_scripts from BrowserConfig (for stealth evasions, etc.)
|
||||
if self.config.init_scripts:
|
||||
for script in self.config.init_scripts:
|
||||
|
||||
104
crawl4ai/js_snippet/flatten_shadow_dom.js
Normal file
104
crawl4ai/js_snippet/flatten_shadow_dom.js
Normal file
@@ -0,0 +1,104 @@
|
||||
/**
|
||||
* Flatten all open shadow DOM trees into the light DOM so that
|
||||
* page.content() / outerHTML can serialize the full composed view.
|
||||
*
|
||||
* Uses manual recursive serialization with proper slot resolution.
|
||||
* Resolves slots via the live DOM API (assignedNodes), skips only
|
||||
* shadow-scoped styles, and produces clean HTML with no regex hacks.
|
||||
*
|
||||
* Returns the full HTML string including shadow content.
|
||||
*/
|
||||
(() => {
  // Elements that never take a closing tag in HTML serialization.
  const VOID = new Set([
    'area','base','br','col','embed','hr','img','input',
    'link','meta','param','source','track','wbr'
  ]);

  // Raw-text elements: their text content must be emitted verbatim,
  // since entity-escaping would corrupt CSS/JS source.
  const RAW_TEXT = new Set(['script', 'style']);

  // Escape '&' and '<' so text survives re-parsing of the output HTML.
  const escapeText = (s) => s.replace(/&/g, '&amp;').replace(/</g, '&lt;');

  // Serialize a text node, honoring raw-text parents.
  const serializeText = (node) => {
    const parent = node.parentNode;
    if (
      parent &&
      parent.nodeType === Node.ELEMENT_NODE &&
      RAW_TEXT.has(parent.tagName.toLowerCase())
    ) {
      return node.textContent;
    }
    return escapeText(node.textContent);
  };

  // Serialize an element's attributes, escaping '&' and '"' so the
  // values stay valid inside double-quoted attribute syntax.
  const serializeAttrs = (node) => {
    let s = '';
    for (const a of node.attributes || []) {
      const value = a.value.replace(/&/g, '&amp;').replace(/"/g, '&quot;');
      s += ` ${a.name}="${value}"`;
    }
    return s;
  };

  // Serialize a DOM node. When it has a shadow root, switch to
  // shadow-aware serialization that resolves <slot> elements.
  const serialize = (node) => {
    if (node.nodeType === Node.TEXT_NODE) return serializeText(node);
    if (node.nodeType === Node.COMMENT_NODE) return '';
    if (node.nodeType !== Node.ELEMENT_NODE) return '';

    const tag = node.tagName.toLowerCase();
    const attrs = serializeAttrs(node);
    let inner = '';

    if (node.shadowRoot) {
      inner = serializeShadowRoot(node);
    } else {
      for (const child of node.childNodes) {
        inner += serialize(child);
      }
    }

    if (VOID.has(tag)) return `<${tag}${attrs}>`;
    return `<${tag}${attrs}>${inner}</${tag}>`;
  };

  // Serialize a shadow root's children, resolving slots against
  // the host's light DOM children.
  const serializeShadowRoot = (host) => {
    let result = '';
    for (const child of host.shadowRoot.childNodes) {
      result += serializeShadowChild(child, host);
    }
    return result;
  };

  // Serialize a node that lives inside a shadow root.
  // <style> tags are skipped (scoped CSS, useless outside shadow).
  // <slot> tags are replaced with their assigned (projected) nodes.
  const serializeShadowChild = (node, host) => {
    if (node.nodeType === Node.TEXT_NODE) return serializeText(node);
    if (node.nodeType === Node.COMMENT_NODE) return '';
    if (node.nodeType !== Node.ELEMENT_NODE) return '';

    const tag = node.tagName.toLowerCase();

    // Skip shadow-scoped styles only
    if (tag === 'style') return '';

    // Resolve <slot>: replace with projected light DOM content
    if (tag === 'slot') {
      const assigned = node.assignedNodes({ flatten: true });
      if (assigned.length > 0) {
        let out = '';
        for (const a of assigned) out += serialize(a);
        return out;
      }
      // No assigned nodes — use the slot's fallback content
      let fallback = '';
      for (const child of node.childNodes) {
        fallback += serializeShadowChild(child, host);
      }
      return fallback;
    }

    const attrs = serializeAttrs(node);
    let inner = '';

    if (node.shadowRoot) {
      // Nested shadow root — recurse
      inner = serializeShadowRoot(node);
    } else {
      for (const child of node.childNodes) {
        inner += serializeShadowChild(child, host);
      }
    }

    if (VOID.has(tag)) return `<${tag}${attrs}>`;
    return `<${tag}${attrs}>${inner}</${tag}>`;
  };

  return serialize(document.documentElement);
})()
|
||||
77
docs/examples/shadow_dom_crawling.py
Normal file
77
docs/examples/shadow_dom_crawling.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Shadow DOM Crawling Example
|
||||
============================
|
||||
|
||||
Demonstrates how to use `flatten_shadow_dom=True` to extract content
|
||||
hidden inside Shadow DOM trees on sites built with Web Components
|
||||
(Stencil, Lit, Shoelace, Angular Elements, etc.).
|
||||
|
||||
Shadow DOM creates encapsulated sub-trees that are invisible to the
|
||||
normal page serialization (page.content() / outerHTML). The
|
||||
`flatten_shadow_dom` option walks these trees and produces a single
|
||||
flat HTML document that includes all shadow content.
|
||||
|
||||
This example crawls a Bosch Rexroth product page where the product
|
||||
description, technical specs, and downloads are rendered entirely
|
||||
inside Shadow DOM by Stencil.js web components.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
||||
|
||||
|
||||
async def _crawl_and_report(browser_config: BrowserConfig, *, flatten: bool) -> str:
    """Crawl URL once and print shadow-DOM content indicators.

    Args:
        browser_config: Shared browser configuration for the crawler.
        flatten: Whether to enable ``flatten_shadow_dom`` for this run.

    Returns:
        The raw markdown extracted from the page ("" when none).
    """
    config = CrawlerRunConfig(
        wait_until="load",
        # Give web components time to hydrate before capture.
        delay_before_return_html=3.0,
        flatten_shadow_dom=flatten,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(URL, config=config)

    md = result.markdown.raw_markdown if result.markdown else ""
    print(f"Markdown length: {len(md)} chars")
    print(f"Has product description: {'mill type design' in md.lower()}")
    print(f"Has technical specs: {'CDH1' in md}")
    print(f"Has downloads section: {'Downloads' in md}")
    print()
    return md


def _banner(title: str) -> None:
    """Print a section banner for the comparison output."""
    print("=" * 60)
    print(title)
    print("=" * 60)


async def main():
    """Compare crawling the same page with and without shadow DOM flattening."""
    browser_config = BrowserConfig(headless=True)

    # ── 1. Baseline: without shadow DOM flattening ──────────────────
    _banner("Without flatten_shadow_dom (baseline)")
    await _crawl_and_report(browser_config, flatten=False)

    # ── 2. With shadow DOM flattening ───────────────────────────────
    _banner("With flatten_shadow_dom=True")
    md = await _crawl_and_report(browser_config, flatten=True)

    # Show the product content section
    idx = md.find("Product Description")
    if idx >= 0:
        print("── Extracted product content ──")
        print(md[idx:idx + 1200])


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -152,7 +152,8 @@ Use these for controlling whether you read or write from a local content cache.
|
||||
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run **after** `wait_for` and `delay_before_return_html`, on the fully-loaded page. E.g. `"document.querySelector('button')?.click();"`. |
|
||||
| **`js_code_before_wait`** | `str or list[str]` (None) | JavaScript to run **before** `wait_for`. Use for triggering loading that `wait_for` then checks (e.g. clicking a tab, then waiting for its content). |
|
||||
| **`c4a_script`** | `str or list[str]` (None) | C4A script that compiles to JavaScript. Alternative to writing raw JS. |
|
||||
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
|
||||
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
||||
@@ -160,6 +161,7 @@ Use these for controlling whether you read or write from a local content cache.
|
||||
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
||||
| **`max_scroll_steps`** | `int or None` (None) | Maximum number of scroll steps during full page scan. If None, scrolls until entire page is loaded. |
|
||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||
| **`flatten_shadow_dom`** | `bool` (False) | Flattens Shadow DOM content into the light DOM before HTML capture. Resolves slots, strips shadow-scoped styles, and force-opens closed shadow roots. Essential for sites built with Web Components (Stencil, Lit, Shoelace, etc.). |
|
||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
|
||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||
|
||||
@@ -1781,12 +1781,14 @@ run_cfg = CrawlerRunConfig(
|
||||
### D) **Page Interaction**
|
||||
| **Parameter** | **Type / Default** | **What It Does** |
|
||||
|----------------------------|--------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run after load. E.g. `"document.querySelector('button')?.click();"`. |
|
||||
| **`js_only`** | `bool` (False) | If `True`, indicates we’re reusing an existing session and only applying JS. No full reload. |
|
||||
| **`js_code`** | `str or list[str]` (None) | JavaScript to run **after** `wait_for` and `delay_before_return_html`, on the fully-loaded page. E.g. `"document.querySelector('button')?.click();"`. |
|
||||
| **`js_code_before_wait`** | `str or list[str]` (None) | JavaScript to run **before** `wait_for`. Use for triggering loading that `wait_for` then checks. |
|
||||
| **`js_only`** | `bool` (False) | If `True`, indicates we're reusing an existing session and only applying JS. No full reload. |
|
||||
| **`ignore_body_visibility`** | `bool` (True) | Skip checking if `<body>` is visible. Usually best to keep `True`. |
|
||||
| **`scan_full_page`** | `bool` (False) | If `True`, auto-scroll the page to load dynamic content (infinite scroll). |
|
||||
| **`scroll_delay`** | `float` (0.2) | Delay between scroll steps if `scan_full_page=True`. |
|
||||
| **`process_iframes`** | `bool` (False) | Inlines iframe content for single-page extraction. |
|
||||
| **`flatten_shadow_dom`** | `bool` (False) | Flattens Shadow DOM content into the light DOM before HTML capture. Resolves slots, strips shadow-scoped styles, and force-opens closed shadow roots. Essential for sites built with Web Components. |
|
||||
| **`remove_overlay_elements`** | `bool` (False) | Removes potential modals/popups blocking the main content. |
|
||||
| **`remove_consent_popups`** | `bool` (False) | Removes GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, TrustArc, Quantcast, Didomi, Sourcepoint, FundingChoices, etc.). Tries clicking "Accept All" first, then falls back to DOM removal. |
|
||||
| **`simulate_user`** | `bool` (False) | Simulate user interactions (mouse movements) to avoid bot detection. |
|
||||
@@ -2813,6 +2815,46 @@ async def main():
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
## 3.1 Flattening Shadow DOM
|
||||
Sites built with **Web Components** (Stencil, Lit, Shoelace, Angular Elements, etc.) render content inside Shadow DOM — an encapsulated sub-tree invisible to `page.content()`. Set `flatten_shadow_dom=True` to extract it:
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
flatten_shadow_dom=True,
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0, # give components time to hydrate
|
||||
)
|
||||
```
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
flatten_shadow_dom=True,
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0,
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011",
|
||||
config=config,
|
||||
)
|
||||
# Without flatten_shadow_dom: ~1 KB markdown (breadcrumbs only)
|
||||
# With flatten_shadow_dom: ~33 KB (product description, specs, downloads)
|
||||
print(len(result.markdown.raw_markdown))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
When enabled, Crawl4AI also injects an init script that force-opens closed shadow roots. The flattener resolves `<slot>` projections and strips shadow-scoped `<style>` tags, producing clean HTML for the downstream scraping/markdown pipeline.
|
||||
|
||||
**Execution order**: `flatten_shadow_dom` runs right before HTML capture, after all waits and JS execution:
|
||||
```
|
||||
js_code_before_wait → wait_for → delay → js_code → flatten_shadow_dom → page capture
|
||||
```
|
||||
|
||||
For a full runnable example, see [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
|
||||
|
||||
## 4. Structured Extraction Examples
|
||||
### 4.1 Pattern-Based with `JsonCssExtractionStrategy`
|
||||
```python
|
||||
|
||||
@@ -255,16 +255,22 @@ class CrawlerRunConfig:
|
||||
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
|
||||
- Defaults to `CacheMode.BYPASS`.
|
||||
|
||||
6.⠀**`js_code`** & **`c4a_script`**:
|
||||
- `js_code`: A string or list of JavaScript strings to execute.
|
||||
6.⠀**`js_code`**, **`js_code_before_wait`**, & **`c4a_script`**:
|
||||
- `js_code`: JavaScript to run **after** `wait_for` completes — on the fully-loaded page.
|
||||
- `js_code_before_wait`: JavaScript to run **before** `wait_for` — for triggering loading that `wait_for` then checks.
|
||||
- `c4a_script`: C4A script that compiles to JavaScript.
|
||||
- Great for "Load More" buttons or user interactions.
|
||||
- Great for "Load More" buttons or user interactions.
|
||||
|
||||
7.⠀**`wait_for`**:
|
||||
- A CSS or JS expression to wait for before extracting content.
|
||||
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
|
||||
|
||||
8.⠀**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
||||
8.⠀**`flatten_shadow_dom`**:
|
||||
- If `True`, flattens Shadow DOM content into the light DOM before HTML capture.
|
||||
- Essential for sites built with Web Components (Stencil, Lit, Shoelace, etc.).
|
||||
- Also force-opens closed shadow roots. See [Flattening Shadow DOM](content-selection.md#31-flattening-shadow-dom).
|
||||
|
||||
9.⠀**`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
|
||||
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
|
||||
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
|
||||
- Use `force_viewport_screenshot=True` to capture only the visible viewport instead of the full page. This is faster and produces smaller images when you don't need a full-page screenshot.
|
||||
|
||||
@@ -183,6 +183,55 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
## 3.1 Flattening Shadow DOM
|
||||
|
||||
Sites built with **Web Components** (Stencil, Lit, Shoelace, Angular Elements, etc.) render content inside [Shadow DOM](https://developer.mozilla.org/en-US/docs/Web/API/Web_components/Using_shadow_DOM) — an encapsulated sub-tree that is invisible to normal page serialization. The browser renders it on screen, but `page.content()` never includes it.
|
||||
|
||||
Set `flatten_shadow_dom=True` to walk all shadow trees, resolve `<slot>` projections, and produce a single flat HTML document:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
# Flatten shadow DOM into the main document
|
||||
flatten_shadow_dom=True,
|
||||
# Give web components time to hydrate
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0,
|
||||
)
|
||||
```
|
||||
|
||||
**Full example** — crawling a product page where specs live inside shadow roots:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
flatten_shadow_dom=True,
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0,
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011",
|
||||
config=config,
|
||||
)
|
||||
# Without flatten_shadow_dom: ~1 KB of markdown (breadcrumbs only)
|
||||
# With flatten_shadow_dom: ~33 KB (full product specs, downloads, etc.)
|
||||
print(len(result.markdown.raw_markdown))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
When `flatten_shadow_dom=True` is set, Crawl4AI also injects an init script that force-opens **closed** shadow roots (by patching `Element.prototype.attachShadow`), so even components that use `mode: 'closed'` become accessible.
|
||||
|
||||
> **Tip**: Web components need JavaScript to run before they render content (a process called *hydration*). Use `wait_until="load"` and a `delay_before_return_html` of 2–5 seconds to ensure components are fully hydrated before flattening.
|
||||
|
||||
For a complete runnable example, see [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
|
||||
|
||||
---
|
||||
|
||||
## 4. Structured Extraction Examples
|
||||
|
||||
You can combine content selection with a more advanced extraction strategy. For instance, a **CSS-based** or **LLM-based** extraction strategy can run on the filtered HTML.
|
||||
|
||||
@@ -15,8 +15,9 @@ Below is a quick overview of how to do it.
|
||||
|
||||
### Basic Execution
|
||||
|
||||
**`js_code`** in **`CrawlerRunConfig`** accepts either a single JS string or a list of JS snippets.
|
||||
**Example**: We’ll scroll to the bottom of the page, then optionally click a “Load More” button.
|
||||
**`js_code`** in **`CrawlerRunConfig`** accepts either a single JS string or a list of JS snippets. It runs **after** `wait_for` and `delay_before_return_html` — so the page is fully loaded when your code executes.
|
||||
|
||||
**Example**: We'll scroll to the bottom of the page, then optionally click a "Load More" button.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
@@ -55,10 +56,36 @@ if __name__ == "__main__":
|
||||
```
|
||||
|
||||
**Relevant `CrawlerRunConfig` params**:
|
||||
- **`js_code`**: A string or list of strings with JavaScript to run after the page loads.
|
||||
- **`js_only`**: If set to `True` on subsequent calls, indicates we’re continuing an existing session without a new full navigation.
|
||||
- **`js_code`**: JavaScript to run **after** `wait_for` and `delay_before_return_html` complete. Runs on the fully-loaded page.
|
||||
- **`js_code_before_wait`**: JavaScript to run **before** `wait_for`. Use when you need to trigger loading that `wait_for` then checks.
|
||||
- **`js_only`**: If set to `True` on subsequent calls, indicates we're continuing an existing session without a new full navigation.
|
||||
- **`session_id`**: If you want to keep the same page across multiple calls, specify an ID.
|
||||
|
||||
### Execution Order
|
||||
|
||||
Understanding when your JavaScript runs relative to other pipeline steps:
|
||||
|
||||
```
|
||||
1. Page navigation (page.goto)
|
||||
2. js_code_before_wait ← triggers loading / clicks tabs
|
||||
3. wait_for ← waits for content to appear
|
||||
4. delay_before_return_html ← extra safety margin
|
||||
5. js_code ← runs on the fully-loaded page
|
||||
6. flatten_shadow_dom ← if enabled
|
||||
7. page.content() ← HTML capture
|
||||
```
|
||||
|
||||
If you need JS to trigger something and then wait for the result, use `js_code_before_wait` + `wait_for`:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
# Click a tab first
|
||||
js_code_before_wait="document.querySelector('#specs-tab')?.click();",
|
||||
# Then wait for the tab content to appear
|
||||
wait_for="css:#specs-panel .content",
|
||||
)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Wait Conditions
|
||||
@@ -317,35 +344,55 @@ When done, check `result.extracted_content` for the JSON.
|
||||
|
||||
---
|
||||
|
||||
## 7. Relevant `CrawlerRunConfig` Parameters
|
||||
## 7. Shadow DOM Flattening
|
||||
|
||||
Sites built with **Web Components** (Stencil, Lit, Shoelace, etc.) render content inside Shadow DOM — an encapsulated sub-tree that is invisible to normal page serialization. Set `flatten_shadow_dom=True` to extract it:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
flatten_shadow_dom=True,
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0, # give components time to hydrate
|
||||
)
|
||||
```
|
||||
|
||||
This walks all shadow trees, resolves `<slot>` projections, and produces flat HTML. It also force-opens closed shadow roots via an init script. For details and a full example, see [Flattening Shadow DOM](content-selection.md#31-flattening-shadow-dom) and [`shadow_dom_crawling.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/shadow_dom_crawling.py).
|
||||
|
||||
---
|
||||
|
||||
## 8. Relevant `CrawlerRunConfig` Parameters
|
||||
|
||||
Below are the key interaction-related parameters in `CrawlerRunConfig`. For a full list, see [Configuration Parameters](../api/parameters.md).
|
||||
|
||||
- **`js_code`**: JavaScript to run after initial load.
|
||||
- **`js_only`**: If `True`, no new page navigation—only JS in the existing session.
|
||||
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
|
||||
- **`session_id`**: Reuse the same page across calls.
|
||||
- **`cache_mode`**: Whether to read/write from the cache or bypass.
|
||||
- **`js_code`**: JavaScript to run after `wait_for` + `delay_before_return_html`, on the fully-loaded page.
|
||||
- **`js_code_before_wait`**: JavaScript to run before `wait_for`. For triggering loading that `wait_for` then checks.
|
||||
- **`js_only`**: If `True`, no new page navigation—only JS in the existing session.
|
||||
- **`wait_for`**: CSS (`"css:..."`) or JS (`"js:..."`) expression to wait for.
|
||||
- **`session_id`**: Reuse the same page across calls.
|
||||
- **`cache_mode`**: Whether to read/write from the cache or bypass.
|
||||
- **`flatten_shadow_dom`**: Flatten Shadow DOM content into the light DOM before capture.
|
||||
- **`process_iframes`**: Inline iframe content into the main document.
|
||||
- **`remove_overlay_elements`**: Remove certain popups automatically.
|
||||
- **`remove_consent_popups`**: Remove GDPR/cookie consent popups from known CMP providers (OneTrust, Cookiebot, Didomi, etc.).
|
||||
- **`simulate_user`, `override_navigator`, `magic`**: Anti-bot or "human-like" interactions.
|
||||
|
||||
---
|
||||
|
||||
## 8. Conclusion
|
||||
## 9. Conclusion
|
||||
|
||||
Crawl4AI’s **page interaction** features let you:
|
||||
Crawl4AI's **page interaction** features let you:
|
||||
|
||||
1. **Execute JavaScript** for scrolling, clicks, or form filling.
|
||||
2. **Wait** for CSS or custom JS conditions before capturing data.
|
||||
3. **Handle** multi-step flows (like “Load More”) with partial reloads or persistent sessions.
|
||||
4. Combine with **structured extraction** for dynamic sites.
|
||||
4. **Flatten Shadow DOM** on Web Component sites to extract hidden content.
|
||||
5. Combine with **structured extraction** for dynamic sites.
|
||||
|
||||
With these tools, you can scrape modern, interactive webpages confidently. For advanced hooking, user simulation, or in-depth config, check the [API reference](../api/parameters.md) or related advanced docs. Happy scripting!
|
||||
|
||||
---
|
||||
|
||||
## 9. Virtual Scrolling
|
||||
## 10. Virtual Scrolling
|
||||
|
||||
For sites that use **virtual scrolling** (where content is replaced rather than appended as you scroll, like Twitter or Instagram), Crawl4AI provides a dedicated `VirtualScrollConfig`:
|
||||
|
||||
|
||||
**New file**: `tests/general/test_flatten_shadow_dom.py` (84 lines)
@@ -0,0 +1,84 @@
|
||||
"""Test flatten_shadow_dom feature — full comparison."""
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
||||
|
||||
|
||||
async def run_test(label, bc, rc):
|
||||
print(f"\n{'='*70}")
|
||||
print(f"TEST: {label}")
|
||||
print(f"{'='*70}")
|
||||
async with AsyncWebCrawler(config=bc) as crawler:
|
||||
result = await crawler.arun(URL, config=rc)
|
||||
|
||||
html = result.html or ""
|
||||
cleaned = result.cleaned_html or ""
|
||||
md = ""
|
||||
if result.markdown and hasattr(result.markdown, "raw_markdown"):
|
||||
md = result.markdown.raw_markdown or ""
|
||||
|
||||
print(f" Success: {result.success}")
|
||||
print(f" Raw HTML: {len(html):>8} chars")
|
||||
print(f" Cleaned HTML: {len(cleaned):>8} chars")
|
||||
print(f" Markdown: {len(md):>8} chars")
|
||||
|
||||
checks = {
|
||||
"Product title": "HYDRAULIC CYLINDER" in md,
|
||||
"Part number (R900999011)": "R900999011" in md,
|
||||
"Product description": "mill type design" in md.lower(),
|
||||
"Feature: 6 types of mounting":"6 types of mounting" in md,
|
||||
"Feature: safety vent": "safety vent" in md.lower(),
|
||||
"Product Description heading": "Product Description" in md,
|
||||
"Technical Specs heading": "Technical Specs" in md,
|
||||
"Downloads heading": "Downloads" in md,
|
||||
"Specs table: CDH1": "CDH1" in md,
|
||||
"Specs table: 250 bar": "250" in md,
|
||||
}
|
||||
print(f"\n Content checks:")
|
||||
passes = sum(1 for v in checks.values() if v)
|
||||
for k, v in checks.items():
|
||||
print(f" {'PASS' if v else 'FAIL'} {k}")
|
||||
print(f"\n Result: {passes}/{len(checks)} checks passed")
|
||||
|
||||
# Show product content section
|
||||
for term in ["Product Description"]:
|
||||
idx = md.find(term)
|
||||
if idx >= 0:
|
||||
print(f"\n --- Product content section ---")
|
||||
print(md[idx:idx+1500])
|
||||
return result
|
||||
|
||||
|
||||
async def main():
|
||||
bc = BrowserConfig(headless=True)
|
||||
|
||||
r1 = await run_test(
|
||||
"BASELINE (no shadow flattening)",
|
||||
bc,
|
||||
CrawlerRunConfig(wait_until="load", delay_before_return_html=3.0),
|
||||
)
|
||||
|
||||
r2 = await run_test(
|
||||
"WITH flatten_shadow_dom=True",
|
||||
bc,
|
||||
CrawlerRunConfig(
|
||||
wait_until="load",
|
||||
delay_before_return_html=3.0,
|
||||
flatten_shadow_dom=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Summary
|
||||
md1 = r1.markdown.raw_markdown if r1.markdown else ""
|
||||
md2 = r2.markdown.raw_markdown if r2.markdown else ""
|
||||
print(f"\n{'='*70}")
|
||||
print(f"SUMMARY")
|
||||
print(f"{'='*70}")
|
||||
print(f" Baseline markdown: {len(md1):>6} chars")
|
||||
print(f" Flattened markdown: {len(md2):>6} chars")
|
||||
print(f" Improvement: {len(md2)/max(len(md1),1):.1f}x more content")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user