Add Shadow DOM flattening and reorder js_code execution pipeline
- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves <slot> projections and strips only shadow-scoped <style> tags. Also injects an init script to force-open closed shadow roots via attachShadow patching.
- Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting.
- Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
This commit is contained in:
84
tests/general/test_flatten_shadow_dom.py
Normal file
84
tests/general/test_flatten_shadow_dom.py
Normal file
@@ -0,0 +1,84 @@
|
||||
"""Test flatten_shadow_dom feature — full comparison."""
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
# Page crawled by every run_test() call; the content checks in run_test()
# look for text expected in this page's markdown output.
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
||||
|
||||
|
||||
async def run_test(label, bc, rc):
    """Crawl ``URL`` once and print a content report for that run.

    The report lists the sizes of the raw HTML, cleaned HTML and markdown,
    then a PASS/FAIL line for each expected snippet of product text, and
    finally up to 1500 characters of markdown starting at the
    "Product Description" heading when that heading is present.

    Args:
        label: Heading printed above the report.
        bc: Browser configuration passed to ``AsyncWebCrawler``.
        rc: Run configuration passed to ``crawler.arun``.

    Returns:
        The result object returned by ``crawler.arun``.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print(f"TEST: {label}")
    print(banner)

    async with AsyncWebCrawler(config=bc) as crawler:
        result = await crawler.arun(URL, config=rc)

    raw_html = result.html or ""
    cleaned_html = result.cleaned_html or ""

    # The markdown attribute may be empty or may not expose raw_markdown;
    # fall back to an empty string so the checks below still run.
    markdown_obj = result.markdown
    if markdown_obj and hasattr(markdown_obj, "raw_markdown"):
        md = markdown_obj.raw_markdown or ""
    else:
        md = ""
    md_lower = md.lower()

    print(f" Success: {result.success}")
    print(f" Raw HTML: {len(raw_html):>8} chars")
    print(f" Cleaned HTML: {len(cleaned_html):>8} chars")
    print(f" Markdown: {len(md):>8} chars")

    # Some snippets are matched case-insensitively (against md_lower),
    # the rest must appear verbatim in the markdown.
    checks = {
        "Product title": "HYDRAULIC CYLINDER" in md,
        "Part number (R900999011)": "R900999011" in md,
        "Product description": "mill type design" in md_lower,
        "Feature: 6 types of mounting": "6 types of mounting" in md,
        "Feature: safety vent": "safety vent" in md_lower,
        "Product Description heading": "Product Description" in md,
        "Technical Specs heading": "Technical Specs" in md,
        "Downloads heading": "Downloads" in md,
        "Specs table: CDH1": "CDH1" in md,
        "Specs table: 250 bar": "250" in md,
    }
    passed = sum(checks.values())

    print("\n Content checks:")
    for name, ok in checks.items():
        status = "PASS" if ok else "FAIL"
        print(f" {status} {name}")
    print(f"\n Result: {passed}/{len(checks)} checks passed")

    # Show product content section
    start = md.find("Product Description")
    if start != -1:
        print("\n --- Product content section ---")
        print(md[start:start + 1500])

    return result
|
||||
|
||||
|
||||
def _raw_markdown(result):
    """Return the raw markdown text of a crawl result, or "" if unavailable.

    Applies the same guards run_test() uses: the markdown attribute may be
    empty, may lack ``raw_markdown``, or ``raw_markdown`` may be None.
    """
    markdown = result.markdown
    if markdown and hasattr(markdown, "raw_markdown"):
        return markdown.raw_markdown or ""
    return ""


async def main():
    """Crawl ``URL`` with and without shadow DOM flattening and compare.

    Runs run_test() twice with the same headless browser configuration:
    first with a baseline run configuration, then with
    ``flatten_shadow_dom=True`` added. Afterwards prints the markdown length
    of each run and the ratio between them.
    """
    bc = BrowserConfig(headless=True)

    r1 = await run_test(
        "BASELINE (no shadow flattening)",
        bc,
        CrawlerRunConfig(wait_until="load", delay_before_return_html=3.0),
    )

    r2 = await run_test(
        "WITH flatten_shadow_dom=True",
        bc,
        CrawlerRunConfig(
            wait_until="load",
            delay_before_return_html=3.0,
            flatten_shadow_dom=True,
        ),
    )

    # Summary
    # Use the guarded extraction so a missing/None raw_markdown yields an
    # empty string instead of raising after both crawls have completed.
    md1 = _raw_markdown(r1)
    md2 = _raw_markdown(r2)
    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f" Baseline markdown: {len(md1):>6} chars")
    print(f" Flattened markdown: {len(md2):>6} chars")
    # max(..., 1) avoids division by zero when the baseline is empty.
    print(f" Improvement: {len(md2)/max(len(md1),1):.1f}x more content")
|
||||
|
||||
|
||||
# Script entry point: run the comparison only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
Reference in New Issue
Block a user