diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 35106f43..7d663414 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1235,7 +1235,11 @@ class CrawlerRunConfig(): Default: 5. # Page Interaction Parameters - js_code (str or list of str or None): JavaScript code/snippets to run on the page. + js_code (str or list of str or None): JavaScript code/snippets to run on the page + after wait_for and delay_before_return_html. + Default: None. + js_code_before_wait (str or list of str or None): JavaScript to run BEFORE wait_for. + Use for triggering loading that wait_for then checks. Default: None. js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads. Default: False. @@ -1249,6 +1253,10 @@ class CrawlerRunConfig(): If None, scrolls until the entire page is loaded. Default: None. process_iframes (bool): If True, attempts to process and inline iframe content. Default: False. + flatten_shadow_dom (bool): If True, flatten shadow DOM content into the light DOM + before HTML capture so page.content() includes it. + Also injects an init script to force-open closed shadow roots. + Default: False. remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML. Default: False. 
remove_consent_popups (bool): If True, remove GDPR/cookie consent popups (IAB TCF/CMP) @@ -1410,6 +1418,7 @@ class CrawlerRunConfig(): semaphore_count: int = 5, # Page Interaction Parameters js_code: Union[str, List[str]] = None, + js_code_before_wait: Union[str, List[str]] = None, c4a_script: Union[str, List[str]] = None, js_only: bool = False, ignore_body_visibility: bool = True, @@ -1417,6 +1426,7 @@ class CrawlerRunConfig(): scroll_delay: float = 0.2, max_scroll_steps: Optional[int] = None, process_iframes: bool = False, + flatten_shadow_dom: bool = False, remove_overlay_elements: bool = False, remove_consent_popups: bool = False, simulate_user: bool = False, @@ -1538,6 +1548,7 @@ class CrawlerRunConfig(): # Page Interaction Parameters self.js_code = js_code + self.js_code_before_wait = js_code_before_wait self.c4a_script = c4a_script self.js_only = js_only self.ignore_body_visibility = ignore_body_visibility @@ -1545,6 +1556,7 @@ class CrawlerRunConfig(): self.scroll_delay = scroll_delay self.max_scroll_steps = max_scroll_steps self.process_iframes = process_iframes + self.flatten_shadow_dom = flatten_shadow_dom self.remove_overlay_elements = remove_overlay_elements self.remove_consent_popups = remove_consent_popups self.simulate_user = simulate_user @@ -1887,12 +1899,14 @@ class CrawlerRunConfig(): "max_range": self.max_range, "semaphore_count": self.semaphore_count, "js_code": self.js_code, + "js_code_before_wait": self.js_code_before_wait, "js_only": self.js_only, "ignore_body_visibility": self.ignore_body_visibility, "scan_full_page": self.scan_full_page, "scroll_delay": self.scroll_delay, "max_scroll_steps": self.max_scroll_steps, "process_iframes": self.process_iframes, + "flatten_shadow_dom": self.flatten_shadow_dom, "remove_overlay_elements": self.remove_overlay_elements, "remove_consent_popups": self.remove_consent_popups, "simulate_user": self.simulate_user, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 
858f4cfc..198a62b7 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -581,6 +581,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.override_navigator or config.simulate_user or config.magic: await context.add_init_script(load_js_script("navigator_overrider")) + # Force-open closed shadow roots when flatten_shadow_dom is enabled + if config.flatten_shadow_dom: + await context.add_init_script(""" + const _origAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function(init) { + return _origAttachShadow.call(this, {...init, mode: 'open'}); + }; + """) + # Call hook after page creation await self.execute_hook("on_page_context_created", page, context=context, config=config) @@ -925,16 +934,46 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.virtual_scroll_config: await self._handle_virtual_scroll(page, config.virtual_scroll_config) - # Execute JavaScript if provided - # if config.js_code: - # if isinstance(config.js_code, str): - # await page.evaluate(config.js_code) - # elif isinstance(config.js_code, list): - # for js in config.js_code: - # await page.evaluate(js) + # --- Phase 1: Pre-wait JS and interaction --- + + # Execute js_code_before_wait (for triggering loading that wait_for checks) + if config.js_code_before_wait: + bw_result = await self.robust_execute_user_script( + page, config.js_code_before_wait + ) + if not bw_result["success"]: + self.logger.warning( + message="js_code_before_wait had issues: {error}", + tag="JS_EXEC", + params={"error": bw_result.get("error")}, + ) + + # Handle user simulation + if config.simulate_user or config.magic: + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press("ArrowDown") + + # --- Phase 2: Wait for page readiness --- + + if config.wait_for: + try: + timeout = config.wait_for_timeout if config.wait_for_timeout is not None else 
config.page_timeout + await self.smart_wait( + page, config.wait_for, timeout=timeout + ) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # Pre-content retrieval hooks and delay + await self.execute_hook("before_retrieve_html", page, context=context, config=config) + if config.delay_before_return_html: + await asyncio.sleep(config.delay_before_return_html) + + # --- Phase 3: Post-wait JS (runs on fully-loaded page) --- if config.js_code: - # execution_result = await self.execute_user_script(page, config.js_code) execution_result = await self.robust_execute_user_script( page, config.js_code ) @@ -949,28 +988,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await self.execute_hook("on_execution_started", page, context=context, config=config) await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result) - # Handle user simulation - if config.simulate_user or config.magic: - await page.mouse.move(100, 100) - await page.mouse.down() - await page.mouse.up() - await page.keyboard.press("ArrowDown") - - # Handle wait_for condition - # Todo: Decide how to handle this - if not config.wait_for and config.css_selector and False: - # if not config.wait_for and config.css_selector: - config.wait_for = f"css:{config.css_selector}" - - if config.wait_for: - try: - # Use wait_for_timeout if specified, otherwise fall back to page_timeout - timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout - await self.smart_wait( - page, config.wait_for, timeout=timeout - ) - except Exception as e: - raise RuntimeError(f"Wait condition failed: {str(e)}") + # --- Phase 4: DOM processing before HTML capture --- # Update image dimensions if needed if not self.browser_config.text_mode: @@ -992,11 +1010,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.process_iframes: page = await self.process_iframes(page) - # Pre-content retrieval 
hooks and delay - await self.execute_hook("before_retrieve_html", page, context=context, config=config) - if config.delay_before_return_html: - await asyncio.sleep(config.delay_before_return_html) - # Handle CMP/consent popup removal (before generic overlay removal) if config.remove_consent_popups: await self.remove_consent_popups(page) @@ -1005,12 +1018,24 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.remove_overlay_elements: await self.remove_overlay_elements(page) - if config.css_selector: + # --- Phase 5: HTML capture --- + + if config.flatten_shadow_dom: + # Use JS to serialize the full DOM including shadow roots + flatten_js = load_js_script("flatten_shadow_dom") + html = await self.adapter.evaluate(page, flatten_js) + if not html or not isinstance(html, str): + # Fallback to normal capture if JS returned nothing + self.logger.warning( + message="Shadow DOM flattening returned no content, falling back to page.content()", + tag="SCRAPE", + ) + html = await page.content() + elif config.css_selector: try: - # Handle comma-separated selectors by splitting them selectors = [s.strip() for s in config.css_selector.split(',')] html_parts = [] - + for selector in selectors: try: content = await self.adapter.evaluate(page, @@ -1021,16 +1046,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): html_parts.append(content) except Error as e: print(f"Warning: Could not get content for selector '{selector}': {str(e)}") - - # Wrap in a div to create a valid HTML structure - html = f"
<div>\n" + "\n".join(html_parts) + "\n</div>" + + html = f"<div>\n" + "\n".join(html_parts) + "\n</div>
" except Error as e: raise RuntimeError(f"Failed to extract HTML content: {str(e)}") else: html = await page.content() - - # # Get final HTML content - # html = await page.content() + await self.execute_hook( "before_return_html", page=page, html=html, context=context, config=config ) diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 2296d076..438c81ab 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -1220,6 +1220,15 @@ class BrowserManager: ): await context.add_init_script(load_js_script("navigator_overrider")) + # Force-open closed shadow roots when flatten_shadow_dom is enabled + if crawlerRunConfig and crawlerRunConfig.flatten_shadow_dom: + await context.add_init_script(""" + const _origAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function(init) { + return _origAttachShadow.call(this, {...init, mode: 'open'}); + }; + """) + # Apply custom init_scripts from BrowserConfig (for stealth evasions, etc.) if self.config.init_scripts: for script in self.config.init_scripts: diff --git a/crawl4ai/js_snippet/flatten_shadow_dom.js b/crawl4ai/js_snippet/flatten_shadow_dom.js new file mode 100644 index 00000000..e13f3f31 --- /dev/null +++ b/crawl4ai/js_snippet/flatten_shadow_dom.js @@ -0,0 +1,104 @@ +/** + * Flatten all open shadow DOM trees into the light DOM so that + * page.content() / outerHTML can serialize the full composed view. + * + * Uses manual recursive serialization with proper slot resolution. + * Resolves slots via the live DOM API (assignedNodes), skips only + * shadow-scoped styles, and produces clean HTML with no regex hacks. + * + * Returns the full HTML string including shadow content. + */ +(() => { + const VOID = new Set([ + 'area','base','br','col','embed','hr','img','input', + 'link','meta','param','source','track','wbr' + ]); + + // Serialize a DOM node. When it has a shadow root, switch to + // shadow-aware serialization that resolves elements. 
+ const serialize = (node) => { + if (node.nodeType === Node.TEXT_NODE) return node.textContent; + if (node.nodeType === Node.COMMENT_NODE) return ''; + if (node.nodeType !== Node.ELEMENT_NODE) return ''; + + const tag = node.tagName.toLowerCase(); + const attrs = serializeAttrs(node); + let inner = ''; + + if (node.shadowRoot) { + inner = serializeShadowRoot(node); + } else { + for (const child of node.childNodes) { + inner += serialize(child); + } + } + + if (VOID.has(tag)) return `<${tag}${attrs}>`; + return `<${tag}${attrs}>${inner}</${tag}>`; + }; + + // Serialize a shadow root's children, resolving slots against + // the host's light DOM children. + const serializeShadowRoot = (host) => { + let result = ''; + for (const child of host.shadowRoot.childNodes) { + result += serializeShadowChild(child, host); + } + return result; + }; + + // Serialize a node that lives inside a shadow root. + //