- Add `flatten_shadow_dom` option to CrawlerRunConfig that serializes shadow DOM content into the light DOM before HTML capture. Uses a recursive serializer that resolves <slot> projections and strips only shadow-scoped <style> tags. Also injects an init script to force-open closed shadow roots via attachShadow patching. - Move `js_code` execution to after `wait_for` + `delay_before_return_html` so user scripts run on the fully-hydrated page. Add `js_code_before_wait` for the less common case of triggering loading before waiting. - Add JS snippet (flatten_shadow_dom.js), integration test, example, and documentation across all relevant doc files.
85 lines
2.8 KiB
Python
85 lines
2.8 KiB
Python
"""Test flatten_shadow_dom feature — full comparison."""
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
|
|
|
# Page fetched by every run_test() call; the content checks in run_test() look
# for this product's title, part number, description and spec values.
# NOTE(review): presumably chosen because its product details render inside
# shadow DOM — confirm against the live page.
URL = "https://store.boschrexroth.com/en/us/p/hydraulic-cylinder-r900999011"
|
|
|
|
|
|
async def run_test(label, bc, rc):
    """Crawl URL once and print a content report for the run.

    Args:
        label: Human-readable name printed in the banner for this run.
        bc: Browser configuration handed to ``AsyncWebCrawler``.
        rc: Run configuration handed to ``crawler.arun``.

    Returns:
        The result object returned by ``crawler.arun``.
    """
    banner = f"{'='*70}"
    print(f"\n{banner}")
    print(f"TEST: {label}")
    print(banner)

    async with AsyncWebCrawler(config=bc) as crawler:
        result = await crawler.arun(URL, config=rc)

        # Normalise every text field to a string so len() and ``in`` are safe.
        raw_html = result.html or ""
        cleaned_html = result.cleaned_html or ""
        markdown = result.markdown
        if markdown and hasattr(markdown, "raw_markdown"):
            md = markdown.raw_markdown or ""
        else:
            md = ""

        print(f" Success: {result.success}")
        print(f" Raw HTML: {len(raw_html):>8} chars")
        print(f" Cleaned HTML: {len(cleaned_html):>8} chars")
        print(f" Markdown: {len(md):>8} chars")

        # Case-insensitive checks share one lowered copy of the markdown.
        md_lower = md.lower()
        checks = {
            "Product title": "HYDRAULIC CYLINDER" in md,
            "Part number (R900999011)": "R900999011" in md,
            "Product description": "mill type design" in md_lower,
            "Feature: 6 types of mounting": "6 types of mounting" in md,
            "Feature: safety vent": "safety vent" in md_lower,
            "Product Description heading": "Product Description" in md,
            "Technical Specs heading": "Technical Specs" in md,
            "Downloads heading": "Downloads" in md,
            "Specs table: CDH1": "CDH1" in md,
            "Specs table: 250 bar": "250" in md,
        }

        print("\n Content checks:")
        for name, ok in checks.items():
            status = "PASS" if ok else "FAIL"
            print(f" {status} {name}")
        # Values are bools, so summing them counts the passing checks.
        passed = sum(checks.values())
        print(f"\n Result: {passed}/{len(checks)} checks passed")

        # Show product content section
        start = md.find("Product Description")
        if start != -1:
            print("\n --- Product content section ---")
            print(md[start:start + 1500])

        return result
|
|
|
|
|
|
async def main():
    """Run the baseline and flattened crawls, then print a size comparison.

    Both runs use the same headless browser configuration and the same
    wait/delay settings; the second run additionally sets
    ``flatten_shadow_dom=True``.
    """
    bc = BrowserConfig(headless=True)
    # Settings shared by both runs so the only difference is the flattening flag.
    common = {"wait_until": "load", "delay_before_return_html": 3.0}

    r1 = await run_test(
        "BASELINE (no shadow flattening)",
        bc,
        CrawlerRunConfig(**common),
    )

    r2 = await run_test(
        "WITH flatten_shadow_dom=True",
        bc,
        CrawlerRunConfig(**common, flatten_shadow_dom=True),
    )

    # Summary
    # Fall back to "" when markdown is missing, lacks raw_markdown, or
    # raw_markdown is None, so len() below cannot raise after both crawls
    # have already completed (mirrors the defensive handling in run_test).
    md1 = getattr(r1.markdown, "raw_markdown", None) or ""
    md2 = getattr(r2.markdown, "raw_markdown", None) or ""

    banner = f"{'='*70}"
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    print(f" Baseline markdown: {len(md1):>6} chars")
    print(f" Flattened markdown: {len(md2):>6} chars")
    # max(..., 1) avoids division by zero when the baseline produced no markdown.
    print(f" Improvement: {len(md2)/max(len(md1),1):.1f}x more content")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async comparison to completion.
    asyncio.run(main())
|