Merged next branch

This commit is contained in:
Aravind Karnam
2025-04-12 10:47:02 +05:30
62 changed files with 3225 additions and 7085 deletions

View File

@@ -866,6 +866,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if body is None:
raise Exception("'<body>' tag is not found in fetched html. Consider adding wait_for=\"css:body\" to wait for body tag to be loaded into DOM.")
base_domain = get_base_domain(url)
# Early removal of all images if exclude_all_images is set
# This happens before any processing to minimize memory usage
if kwargs.get("exclude_all_images", False):
for img in body.find_all('img'):
img.decompose()
try:
meta = extract_metadata("", soup)
@@ -1487,6 +1493,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
body = doc
base_domain = get_base_domain(url)
# Early removal of all images if exclude_all_images is set
# This is more efficient in lxml as we remove elements before any processing
if kwargs.get("exclude_all_images", False):
for img in body.xpath('//img'):
if img.getparent() is not None:
img.getparent().remove(img)
# Add comment removal
if kwargs.get("remove_comments", False):