Aravind Karnam
2025-03-01 18:17:11 +05:30
parent a9e24307cc
commit 7c1705712d

View File

@@ -471,6 +471,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return False
keep_element = False
# Special case for table elements - always preserve structure
if element.name in ["tr", "td", "th"]:
keep_element = True
exclude_domains = kwargs.get("exclude_domains", [])
# exclude_social_media_domains = kwargs.get('exclude_social_media_domains', set(SOCIAL_MEDIA_DOMAINS))
@@ -1130,6 +1133,9 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
"source",
"track",
"wbr",
"tr",
"td",
"th",
}
for el in reversed(list(root.iterdescendants())):