#!/usr/bin/env python3
"""
Final test and demo for the HTTPS preservation feature (Issue #1410).

This demonstrates how the preserve_https_for_internal_links flag
prevents HTTPS downgrade when servers redirect to HTTP.
"""

from urllib.parse import urljoin, urlparse


def demonstrate_issue():
    """Show the problem: an HTTPS -> HTTP redirect produces HTTP links."""
    print("=" * 60)
    print("DEMONSTRATING THE ISSUE")
    print("=" * 60)

    # Simulate what happens during crawling
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP

    # Extract a relative link from the page
    relative_link = "/author/Albert-Einstein"

    # Standard URL joining uses the redirected (HTTP) base
    resolved_url = urljoin(redirected_url, relative_link)

    print(f"Original URL: {original_url}")
    print(f"Redirected to: {redirected_url}")
    print(f"Relative link: {relative_link}")
    print(f"Resolved link: {resolved_url}")
    print("\n❌ Problem: Link is now HTTP instead of HTTPS!")

    return resolved_url


def demonstrate_solution():
    """Show the solution: preserve HTTPS for internal links."""
    print("\n" + "=" * 60)
    print("DEMONSTRATING THE SOLUTION")
    print("=" * 60)

    # normalize_url with HTTPS preservation
    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
        """Normalize a URL with optional HTTPS preservation."""
        # Standard resolution
        full_url = urljoin(base_url, href.strip())

        # Preserve HTTPS if requested
        if preserve_https and original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)

            # Only for same-domain links
            if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
                full_url = full_url.replace('http://', 'https://', 1)
                print(f"  → Preserved HTTPS for {parsed_full.netloc}")

        return full_url

    # Same scenario as before
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"

    # Without preservation (current behavior)
    resolved_without = normalize_url_with_preservation(
        relative_link, redirected_url, preserve_https=False, original_scheme='https'
    )
    print("\nWithout preservation:")
    print(f"  Result: {resolved_without}")

    # With preservation (new feature)
    resolved_with = normalize_url_with_preservation(
        relative_link, redirected_url, preserve_https=True, original_scheme='https'
    )
    print("\nWith preservation (preserve_https_for_internal_links=True):")
    print(f"  Result: {resolved_with}")

    print("\n✅ Solution: Internal link stays HTTPS!")

    return resolved_with


def test_edge_cases():
    """Test important edge cases."""
    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)

    def preserve_https(href, base_url, original_scheme):
        """Helper to test the preservation logic."""
        full_url = urljoin(base_url, href)

        if original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)

            # Fixed: skip protocol-relative URLs (//host/path), which keep the base scheme
            if (parsed_full.scheme == 'http' and
                    parsed_full.netloc == parsed_base.netloc and
                    not href.strip().startswith('//')):
                full_url = full_url.replace('http://', 'https://', 1)

        return full_url

    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
    ]

    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        status = "✅" if is_https == should_be_https else "❌"

        print(f"\n{status} {desc}:")
        print(f"  Input: {href} + {base_url}")
        print(f"  Result: {result}")
        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")


def usage_example():
    """Show how to use the feature in crawl4ai."""
    print("\n" + "=" * 60)
    print("USAGE IN CRAWL4AI")
    print("=" * 60)

    print("""
To enable HTTPS preservation in your crawl4ai code:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            preserve_https_for_internal_links=True  # Enable HTTPS preservation
        )

        result = await crawler.arun(
            url="https://example.com",
            config=config
        )

        # All internal links will maintain HTTPS even if
        # the server redirects to HTTP

asyncio.run(main())
```

This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
""")


if __name__ == "__main__":
    # Run all demonstrations
    demonstrate_issue()
    demonstrate_solution()
    test_edge_cases()
    usage_example()

    print("\n" + "=" * 60)
    print("✅ All tests complete!")
    print("=" * 60)