This commit is contained in:
Aravind Karnam
2025-04-19 20:08:36 +05:30
parent c2902fd200
commit d2648eaa39

View File

@@ -28,6 +28,7 @@ from lxml import etree
 from lxml import html as lhtml
 from typing import List
 from .models import ScrapingResult, MediaItem, Link, Media, Links
+import copy
 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r"^og:")
@@ -911,7 +912,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
 for_content_targeted_element.extend(body.select(target_element))
 content_element = soup.new_tag("div")
 for el in for_content_targeted_element:
-content_element.append(el)
+content_element.append(copy.deepcopy(el))
 except Exception as e:
 self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
 return None
@@ -1539,7 +1540,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
 for target_element in target_elements:
 for_content_targeted_element.extend(body.cssselect(target_element))
 content_element = lhtml.Element("div")
-content_element.extend(for_content_targeted_element)
+content_element.extend(copy.deepcopy(for_content_targeted_element))
 except Exception as e:
 self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
 return None