From 7c1705712dddc0d80ad33fdacc6e37e9272d83aa Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Sat, 1 Mar 2025 18:17:11 +0530 Subject: [PATCH] fix: https://github.com/unclecode/crawl4ai/issues/756 --- crawl4ai/content_scraping_strategy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 46761013..719cab8e 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -471,6 +471,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): return False keep_element = False + # Special case for table elements - always preserve structure + if element.name in ["tr", "td", "th"]: + keep_element = True exclude_domains = kwargs.get("exclude_domains", []) # exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS)) @@ -1130,6 +1133,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): "source", "track", "wbr", + "tr", + "td", + "th", } for el in reversed(list(root.iterdescendants())):