feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410
Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
This commit is contained in:
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
||||
continue
|
||||
|
||||
try:
|
||||
normalized_href = normalize_url(href, url)
|
||||
normalized_href = normalize_url(
|
||||
href, url,
|
||||
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
|
||||
original_scheme=kwargs.get('original_scheme')
|
||||
)
|
||||
link_data = {
|
||||
"href": normalized_href,
|
||||
"text": link.text_content().strip(),
|
||||
|
||||
Reference in New Issue
Block a user