feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410

Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
2025-08-28 17:38:40 +08:00
parent ef174a4c7a
commit f566c5a376
5 changed files with 224 additions and 4 deletions
--- a/tests/test_preserve_https_for_internal_links.py
+++ b/tests/test_preserve_https_for_internal_links.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Final test and demo for HTTPS preservation feature (Issue #1410)
+
+This demonstrates how the preserve_https_for_internal_links flag
+prevents HTTPS downgrade when servers redirect to HTTP.
+"""
+
+import sys
+import os
+from urllib.parse import urljoin, urlparse
+
+def demonstrate_issue():
+    """Walk through the downgrade caused by an HTTPS -> HTTP redirect.
+
+    A relative link is resolved against the redirect target, the way
+    standard URL joining does it, and every step is printed.
+
+    Returns:
+        The resolved link, which carries the redirect target's scheme.
+    """
+    rule = "=" * 60
+    print(rule)
+    print("DEMONSTRATING THE ISSUE")
+    print(rule)
+
+    # The crawl was requested over HTTPS; the server answered from HTTP.
+    requested = "https://quotes.toscrape.com/tag/deep-thoughts"
+    landed_on = "http://quotes.toscrape.com/tag/deep-thoughts/"
+    href = "/author/Albert-Einstein"
+
+    # urljoin takes the scheme from the base it is given, here the HTTP one.
+    downgraded = urljoin(landed_on, href)
+
+    report = (
+        ("Original URL:    ", requested),
+        ("Redirected to:   ", landed_on),
+        ("Relative link:   ", href),
+        ("Resolved link:   ", downgraded),
+    )
+    for label, value in report:
+        print(f"{label}{value}")
+    print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
+
+    return downgraded
+
+def demonstrate_solution():
+    """Replay the redirect scenario with and without HTTPS preservation.
+
+    The same relative link is resolved twice against the HTTP redirect
+    target: once with preservation off and once with it on. Both results
+    are printed.
+
+    Returns:
+        The link resolved with preservation switched on.
+    """
+    print("\n" + "=" * 60)
+    print("DEMONSTRATING THE SOLUTION")
+    print("=" * 60)
+
+    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
+        """Resolve href against base_url, optionally upgrading it to HTTPS."""
+        resolved = urljoin(base_url, href.strip())
+
+        # Leave the link alone unless the feature is on and the crawl
+        # started out over HTTPS.
+        if not (preserve_https and original_scheme == 'https'):
+            return resolved
+
+        target = urlparse(resolved)
+        same_host = target.netloc == urlparse(base_url).netloc
+
+        # Only http links pointing back at the base's own host are upgraded.
+        if target.scheme == 'http' and same_host:
+            resolved = resolved.replace('http://', 'https://', 1)
+            print(f"  → Preserved HTTPS for {target.netloc}")
+
+        return resolved
+
+    # Same scenario as the issue demo: requested over HTTPS, served over HTTP.
+    requested = "https://quotes.toscrape.com/tag/deep-thoughts"
+    landed_on = "http://quotes.toscrape.com/tag/deep-thoughts/"
+    href = "/author/Albert-Einstein"
+    started_as = urlparse(requested).scheme
+
+    # Flag off: the link keeps the redirect target's scheme.
+    plain = normalize_url_with_preservation(
+        href, landed_on, preserve_https=False, original_scheme=started_as
+    )
+    print(f"\nWithout preservation:")
+    print(f"  Result: {plain}")
+
+    # Flag on: the same-host link is lifted back to HTTPS.
+    upgraded = normalize_url_with_preservation(
+        href, landed_on, preserve_https=True, original_scheme=started_as
+    )
+    print(f"\nWith preservation (preserve_https_for_internal_links=True):")
+    print(f"  Result: {upgraded}")
+    print(f"\n✅ Solution: Internal link stays HTTPS!")
+
+    return upgraded
+
+def test_edge_cases():
+    """Run the HTTPS-preservation rule over a table of edge cases.
+
+    Every case is printed with a pass/fail marker. After the whole table
+    has been reported, any mismatch is raised as an error, so both pytest
+    and the script entry point notice a regression.
+
+    Raises:
+        AssertionError: if any case ends up with the wrong scheme.
+    """
+
+    print("\n" + "=" * 60)
+    print("EDGE CASES")
+    print("=" * 60)
+
+    def preserve_https(href, base_url, original_scheme):
+        """Resolve href against base_url, upgrading same-host http links."""
+        # Strip once so resolution and the '//' check see the same value.
+        href = href.strip()
+        full_url = urljoin(base_url, href)
+
+        # A protocol-relative href takes its scheme from the page it was
+        # found on, so it is never rewritten.
+        if original_scheme != 'https' or href.startswith('//'):
+            return full_url
+
+        parsed_full = urlparse(full_url)
+        parsed_base = urlparse(base_url)
+        if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
+            full_url = full_url.replace('http://', 'https://', 1)
+
+        return full_url
+
+    test_cases = [
+        # (description, href, base_url, original_scheme, should_be_https)
+        ("External link", "http://other.com/page", "http://example.com", "https", False),
+        ("Already HTTPS", "/page", "https://example.com", "https", True),
+        ("No original HTTPS", "/page", "http://example.com", "http", False),
+        ("Subdomain", "/page", "http://sub.example.com", "https", True),
+        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
+    ]
+
+    failures = []
+    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
+        result = preserve_https(href, base_url, orig_scheme)
+        is_https = result.startswith('https://')
+        passed = is_https == should_be_https
+        status = "✅" if passed else "❌"
+
+        print(f"\n{status} {desc}:")
+        print(f"  Input: {href} + {base_url}")
+        print(f"  Result: {result}")
+        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")
+
+        if not passed:
+            failures.append(f"{desc}: got {result}")
+
+    # Report the whole table first, then fail loudly on any mismatch.
+    if failures:
+        raise AssertionError("HTTPS preservation edge cases failed: " + "; ".join(failures))
+
+def usage_example():
+    """Print a short guide to switching the flag on in crawl4ai."""
+    divider = "=" * 60
+    print("\n" + divider)
+    print("USAGE IN CRAWL4AI")
+    print(divider)
+
+    # The guide is emitted verbatim, fenced snippet included.
+    guide = """
+To enable HTTPS preservation in your crawl4ai code:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+    config = CrawlerRunConfig(
+        preserve_https_for_internal_links=True  # Enable HTTPS preservation
+    )
+    
+    result = await crawler.arun(
+        url="https://example.com",
+        config=config
+    )
+    
+    # All internal links will maintain HTTPS even if 
+    # the server redirects to HTTP
+```
+
+This is especially useful for:
+- Sites that redirect HTTPS to HTTP but still support HTTPS
+- Security-conscious crawling where you want to stay on HTTPS
+- Avoiding mixed content issues in downstream processing
+"""
+    print(guide)
+
+if __name__ == "__main__":
+    # Run every demonstration in order, then print the closing banner.
+    steps = (
+        demonstrate_issue,
+        demonstrate_solution,
+        test_edge_cases,
+        usage_example,
+    )
+    for step in steps:
+        step()
+
+    closing_rule = "=" * 60
+    print("\n" + closing_rule)
+    print("✅ All tests complete!")
+    print(closing_rule)