feat: update documentation for preserve_https_for_internal_links. ref #1410

feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410
Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
2025-08-28 17:48:12 +08:00 · 2025-08-28 17:38:40 +08:00
13 changed files with 255 additions and 247 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [Unreleased]
+
+### Added
+- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
+  - Maintains HTTPS scheme for internal links even when servers redirect to HTTP
+  - Prevents security downgrades during deep crawling
+  - Useful for security-conscious crawling and sites supporting both protocols
+  - Fully backward compatible: the flag is opt-in (default: `False`)
+  - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
+
 ## [0.7.3] - 2025-08-09

 ### Added
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -97,16 +97,13 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
                if value != param.default and not ignore_default_value:
                    current_values[name] = to_serializable_dict(value)
        
-        # Don't serialize private __slots__ - they're internal implementation details
-        # not constructor parameters. This was causing URLPatternFilter to fail
-        # because _simple_suffixes was being serialized as 'simple_suffixes'
-        # if hasattr(obj, '__slots__'):
-        #     for slot in obj.__slots__:
-        #         if slot.startswith('_'):  # Handle private slots
-        #             attr_name = slot[1:]  # Remove leading '_'
-        #             value = getattr(obj, slot, None)
-        #             if value is not None:
-        #                 current_values[attr_name] = to_serializable_dict(value)
+        if hasattr(obj, '__slots__'):
+            for slot in obj.__slots__:
+                if slot.startswith('_'):  # Handle private slots
+                    attr_name = slot[1:]  # Remove leading '_'
+                    value = getattr(obj, slot, None)
+                    if value is not None:
+                        current_values[attr_name] = to_serializable_dict(value)

            
        
@@ -1124,6 +1121,7 @@ class CrawlerRunConfig():
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
+        preserve_https_for_internal_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -1247,6 +1245,7 @@ class CrawlerRunConfig():
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links
+        self.preserve_https_for_internal_links = preserve_https_for_internal_links

        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1520,6 +1519,7 @@ class CrawlerRunConfig():
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
+            preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1626,6 +1626,7 @@ class CrawlerRunConfig():
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
+            "preserve_https_for_internal_links": self.preserve_https_for_internal_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
                    ###############################################################
                    # Process the HTML content, Call CrawlerStrategy.process_html #
                    ###############################################################
+                    from urllib.parse import urlparse
                    crawl_result: CrawlResult = await self.aprocess_html(
                        url=url,
                        html=html,
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        redirected_url=async_response.redirected_url,
+                        original_scheme=urlparse(url).scheme,
                        **kwargs,
                    )

--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                continue

            try:
-                normalized_href = normalize_url(href, url)
+                normalized_href = normalize_url(
+                    href, url,
+                    preserve_https=kwargs.get('preserve_https_for_internal_links', False),
+                    original_scheme=kwargs.get('original_scheme')
+                )
                link_data = {
                    "href": normalized_href,
                    "text": link.text_content().strip(),
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -47,13 +47,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        self.url_scorer = url_scorer
        self.include_external = include_external
        self.max_pages = max_pages
-        # self.logger = logger or logging.getLogger(__name__)
-        # Ensure logger is always a Logger instance, not a dict from serialization
-        if isinstance(logger, logging.Logger):
-            self.logger = logger
-        else:
-            # Create a new logger if logger is None, dict, or any other non-Logger type
-            self.logger = logging.getLogger(__name__)
+        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -38,13 +38,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        self.include_external = include_external
        self.score_threshold = score_threshold
        self.max_pages = max_pages
-        # self.logger = logger or logging.getLogger(__name__)
-        # Ensure logger is always a Logger instance, not a dict from serialization
-        if isinstance(logger, logging.Logger):
-            self.logger = logger
-        else:
-            # Create a new logger if logger is None, dict, or any other non-Logger type
-            self.logger = logging.getLogger(__name__)
+        self.logger = logger or logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -120,9 +120,6 @@ class URLPatternFilter(URLFilter):
    """Pattern filter balancing speed and completeness"""

    __slots__ = (
-        "patterns",  # Store original patterns for serialization
-        "use_glob",  # Store original use_glob for serialization  
-        "reverse",   # Store original reverse for serialization
        "_simple_suffixes",
        "_simple_prefixes",
        "_domain_patterns",
@@ -145,11 +142,6 @@ class URLPatternFilter(URLFilter):
        reverse: bool = False,
    ):
        super().__init__()
-        # Store original constructor params for serialization
-        self.patterns = patterns
-        self.use_glob = use_glob
-        self.reverse = reverse
-        
        self._reverse = reverse
        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns

--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -253,16 +253,6 @@ class CrawlResult(BaseModel):
        requirements change, this is where you would update the logic.
        """
        result = super().model_dump(*args, **kwargs)
-        
-        # Remove any property descriptors that might have been included
-        # These deprecated properties should not be in the serialized output
-        for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
-            if key in result and isinstance(result[key], property):
-                # del result[key]
-                # Nasrin: I decided to convert it to string instead of removing it.
-                result[key] = str(result[key])
-        
-        # Add the markdown field properly
        if self._markdown is not None:
            result["markdown"] = self._markdown.model_dump() 
        return result
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2146,7 +2146,9 @@ def normalize_url(
    drop_query_tracking=True,
    sort_query=True,
    keep_fragment=False,
-    extra_drop_params=None
+    extra_drop_params=None,
+    preserve_https=False,
+    original_scheme=None
 ):
    """
    Extended URL normalizer
@@ -2176,6 +2178,17 @@ def normalize_url(

    # Resolve relative paths first
    full_url = urljoin(base_url, href.strip())
+    
+    # Preserve HTTPS if requested and original scheme was HTTPS
+    if preserve_https and original_scheme == 'https':
+        parsed_full = urlparse(full_url)
+        parsed_base = urlparse(base_url)
+        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+        if (parsed_full.scheme == 'http' and 
+            parsed_full.netloc == parsed_base.netloc and
+            not href.strip().startswith('//')):
+            full_url = full_url.replace('http://', 'https://', 1)

    # Parse once, edit parts, then rebuild
    parsed = urlparse(full_url)
@@ -2225,7 +2238,7 @@ def normalize_url(
    return normalized


-def normalize_url_for_deep_crawl(href, base_url):
+def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

@@ -2236,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
    # Use urljoin to handle relative URLs
    full_url = urljoin(base_url, href.strip())
    
+    # Preserve HTTPS if requested and original scheme was HTTPS
+    if preserve_https and original_scheme == 'https':
+        parsed_full = urlparse(full_url)
+        parsed_base = urlparse(base_url)
+        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+        if (parsed_full.scheme == 'http' and 
+            parsed_full.netloc == parsed_base.netloc and
+            not href.strip().startswith('//')):
+            full_url = full_url.replace('http://', 'https://', 1)
+    
    # Parse the URL for normalization
    parsed = urlparse(full_url)
    
@@ -2273,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
    return normalized

@lru_cache(maxsize=10000)
-def efficient_normalize_url_for_deep_crawl(href, base_url):
+def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Efficient URL normalization with proper parsing"""
    from urllib.parse import urljoin
    
@@ -2283,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
    # Resolve relative URLs
    full_url = urljoin(base_url, href.strip())
    
+    # Preserve HTTPS if requested and original scheme was HTTPS
+    if preserve_https and original_scheme == 'https':
+        parsed_full = urlparse(full_url)
+        parsed_base = urlparse(base_url)
+        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
+        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
+        if (parsed_full.scheme == 'http' and 
+            parsed_full.netloc == parsed_base.netloc and
+            not href.strip().startswith('//')):
+            full_url = full_url.replace('http://', 'https://', 1)
+    
    # Use proper URL parsing
    parsed = urlparse(full_url)
    
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
 | **`exclude_external_links`** | `bool` (False)          | Removes all links pointing outside the current domain.                                                                      |
 | **`exclude_social_media_links`** | `bool` (False)      | Strips links specifically to social sites (like Facebook or Twitter).                                                      |
 | **`exclude_domains`**        | `list` ([])             | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`).                                            |
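+| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, keeps the HTTPS scheme on internal (same-host) links when the originally requested URL was HTTPS but the server redirected to HTTP. Protocol-relative links (`//host/path`) are not upgraded. Useful for security-conscious crawling. |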

 Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).

--- a/docs/md_v2/core/deep-crawling.md
+++ b/docs/md_v2/core/deep-crawling.md
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag

 5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.

+6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
+
+```python
+config = CrawlerRunConfig(
+    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
+    preserve_https_for_internal_links=True  # Keep HTTPS even if server redirects to HTTP
+)
+```
+
+This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
+
 ---

 ## 10. Summary & Next Steps
--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -1,201 +0,0 @@
-"""
-Test the complete fix for both the filter serialization and JSON serialization issues.
-"""
-
-import asyncio
-import httpx
-
-from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
-
-BASE_URL = "http://localhost:11234/"  # Adjust port as needed
-
-async def test_with_docker_client():
-    """Test using the Docker client (same as 1419.py)."""
-    from crawl4ai.docker_client import Crawl4aiDockerClient
-    
-    print("=" * 60)
-    print("Testing with Docker Client")
-    print("=" * 60)
-    
-    try:
-        async with Crawl4aiDockerClient(
-            base_url=BASE_URL,
-            verbose=True,
-        ) as client:
-            
-            # Create filter chain - testing the serialization fix
-            filter_chain = [
-                URLPatternFilter(
-                    # patterns=["*about*", "*privacy*", "*terms*"],
-                    patterns=["*advanced*"],
-                    reverse=True
-                ),
-            ]
-            
-            crawler_config = CrawlerRunConfig(
-                deep_crawl_strategy=BFSDeepCrawlStrategy(
-                    max_depth=2,  # Keep it shallow for testing
-                    # max_pages=5,  # Limit pages for testing
-                    filter_chain=FilterChain(filter_chain)
-                ),
-                cache_mode=CacheMode.BYPASS,
-            )
-            
-            print("\n1. Testing crawl with filters...")
-            results = await client.crawl(
-                ["https://docs.crawl4ai.com"],  # Simple test page
-                browser_config=BrowserConfig(headless=True),
-                crawler_config=crawler_config,
-            )
-            
-            if results:
-                print(f"✅ Crawl succeeded! Type: {type(results)}")
-                if hasattr(results, 'success'):
-                    print(f"✅ Results success: {results.success}")
-                    # Test that we can iterate results without JSON errors
-                    if hasattr(results, '__iter__'):
-                        for i, result in enumerate(results):
-                            if hasattr(result, 'url'):
-                                print(f"   Result {i}: {result.url[:50]}...")
-                            else:
-                                print(f"   Result {i}: {str(result)[:50]}...")
-                else:
-                    # Handle list of results
-                    print(f"✅ Got {len(results)} results")
-                    for i, result in enumerate(results[:3]):  # Show first 3
-                        print(f"   Result {i}: {result.url[:50]}...")
-            else:
-                print("❌ Crawl failed - no results returned")
-                return False
-                
-        print("\n✅ Docker client test completed successfully!")
-        return True
-        
-    except Exception as e:
-        print(f"❌ Docker client test failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-async def test_with_rest_api():
-    """Test using REST API directly."""
-    print("\n" + "=" * 60)
-    print("Testing with REST API")
-    print("=" * 60)
-    
-    # Create filter configuration
-    deep_crawl_strategy_payload = {
-        "type": "BFSDeepCrawlStrategy",
-        "params": {
-            "max_depth": 2,
-            # "max_pages": 5,
-            "filter_chain": {
-                "type": "FilterChain",
-                "params": {
-                    "filters": [
-                        {
-                            "type": "URLPatternFilter",
-                            "params": {
-                                "patterns": ["*advanced*"],
-                                "reverse": True
-                            }
-                        }
-                    ]
-                }
-            }
-        }
-    }
-    
-    crawl_payload = {
-        "urls": ["https://docs.crawl4ai.com"],
-        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {
-            "type": "CrawlerRunConfig",
-            "params": {
-                "deep_crawl_strategy": deep_crawl_strategy_payload,
-                "cache_mode": "bypass"
-            }
-        }
-    }
-    
-    try:
-        async with httpx.AsyncClient() as client:
-            print("\n1. Sending crawl request to REST API...")
-            response = await client.post(
-                f"{BASE_URL}crawl",
-                json=crawl_payload,
-                timeout=30
-            )
-            
-            if response.status_code == 200:
-                print(f"✅ REST API returned 200 OK")
-                data = response.json()
-                if data.get("success"):
-                    results = data.get("results", [])
-                    print(f"✅ Got {len(results)} results")
-                    for i, result in enumerate(results[:3]):
-                        print(f"   Result {i}: {result.get('url', 'unknown')[:50]}...")
-                else:
-                    print(f"❌ Crawl not successful: {data}")
-                    return False
-            else:
-                print(f"❌ REST API returned {response.status_code}")
-                print(f"   Response: {response.text[:500]}")
-                return False
-                
-        print("\n✅ REST API test completed successfully!")
-        return True
-        
-    except Exception as e:
-        print(f"❌ REST API test failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return False
-
-
-async def main():
-    """Run all tests."""
-    print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
-    print("=" * 60)
-    print("Make sure the server is running with the updated code!")
-    print("=" * 60)
-    
-    results = []
-    
-    # Test 1: Docker client
-    docker_passed = await test_with_docker_client()
-    results.append(("Docker Client", docker_passed))
-    
-    # Test 2: REST API
-    rest_passed = await test_with_rest_api()
-    results.append(("REST API", rest_passed))
-    
-    # Summary
-    print("\n" + "=" * 60)
-    print("FINAL TEST SUMMARY")
-    print("=" * 60)
-    
-    all_passed = True
-    for test_name, passed in results:
-        status = "✅ PASSED" if passed else "❌ FAILED"
-        print(f"{test_name:20} {status}")
-        if not passed:
-            all_passed = False
-    
-    print("=" * 60)
-    if all_passed:
-        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
-        print("\nThe fixes:")
-        print("1. Filter serialization: Fixed by not serializing private __slots__")
-        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
-    else:
-        print("⚠️ Some tests failed. Please check the server logs for details.")
-    
-    return 0 if all_passed else 1
-
-
-if __name__ == "__main__":
-    import sys
-    sys.exit(asyncio.run(main()))
--- a/tests/test_preserve_https_for_internal_links.py
+++ b/tests/test_preserve_https_for_internal_links.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+"""
+Final test and demo for HTTPS preservation feature (Issue #1410)
+
+This demonstrates how the preserve_https_for_internal_links flag
+prevents HTTPS downgrade when servers redirect to HTTP.
+"""
+
+import sys
+import os
+from urllib.parse import urljoin, urlparse
+
+def demonstrate_issue():
+    """Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
+    
+    print("=" * 60)
+    print("DEMONSTRATING THE ISSUE")
+    print("=" * 60)
+    
+    # Simulate what happens during crawling
+    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
+    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
+    
+    # Extract a relative link
+    relative_link = "/author/Albert-Einstein"
+    
+    # Standard URL joining uses the redirected (HTTP) base
+    resolved_url = urljoin(redirected_url, relative_link)
+    
+    print(f"Original URL:    {original_url}")
+    print(f"Redirected to:   {redirected_url}")
+    print(f"Relative link:   {relative_link}")
+    print(f"Resolved link:   {resolved_url}")
+    print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
+    
+    return resolved_url
+
+def demonstrate_solution():
+    """Show the solution: preserve HTTPS for internal links"""
+    
+    print("\n" + "=" * 60)
+    print("DEMONSTRATING THE SOLUTION")
+    print("=" * 60)
+    
+    # Our normalize_url with HTTPS preservation
+    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
+        """Normalize URL with optional HTTPS preservation"""
+        
+        # Standard resolution
+        full_url = urljoin(base_url, href.strip())
+        
+        # Preserve HTTPS if requested
+        if preserve_https and original_scheme == 'https':
+            parsed_full = urlparse(full_url)
+            parsed_base = urlparse(base_url)
+            
+            # Only for same-domain links
+            if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
+                full_url = full_url.replace('http://', 'https://', 1)
+                print(f"  → Preserved HTTPS for {parsed_full.netloc}")
+        
+        return full_url
+    
+    # Same scenario as before
+    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
+    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
+    relative_link = "/author/Albert-Einstein"
+    
+    # Without preservation (current behavior)
+    resolved_without = normalize_url_with_preservation(
+        relative_link, redirected_url,
+        preserve_https=False, original_scheme='https'
+    )
+    
+    print(f"\nWithout preservation:")
+    print(f"  Result: {resolved_without}")
+    
+    # With preservation (new feature)
+    resolved_with = normalize_url_with_preservation(
+        relative_link, redirected_url,
+        preserve_https=True, original_scheme='https'
+    )
+    
+    print(f"\nWith preservation (preserve_https_for_internal_links=True):")
+    print(f"  Result: {resolved_with}")
+    print(f"\n✅ Solution: Internal link stays HTTPS!")
+    
+    return resolved_with
+
+def test_edge_cases():
+    """Test important edge cases"""
+    
+    print("\n" + "=" * 60)
+    print("EDGE CASES")
+    print("=" * 60)
+    
+    from urllib.parse import urljoin, urlparse
+    
+    def preserve_https(href, base_url, original_scheme):
+        """Helper to test preservation logic"""
+        full_url = urljoin(base_url, href)
+        
+        if original_scheme == 'https':
+            parsed_full = urlparse(full_url)
+            parsed_base = urlparse(base_url)
+            # Skip protocol-relative hrefs (//host/path): they follow the base URL's scheme
+            if (parsed_full.scheme == 'http' and 
+                parsed_full.netloc == parsed_base.netloc and
+                not href.strip().startswith('//')):
+                full_url = full_url.replace('http://', 'https://', 1)
+        
+        return full_url
+    
+    test_cases = [
+        # (description, href, base_url, original_scheme, should_be_https)
+        ("External link", "http://other.com/page", "http://example.com", "https", False),
+        ("Already HTTPS", "/page", "https://example.com", "https", True),
+        ("No original HTTPS", "/page", "http://example.com", "http", False),
+        ("Subdomain", "/page", "http://sub.example.com", "https", True),
+        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
+    ]
+    
+    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
+        result = preserve_https(href, base_url, orig_scheme)
+        is_https = result.startswith('https://')
+        status = "✅" if is_https == should_be_https else "❌"
+        
+        print(f"\n{status} {desc}:")
+        print(f"  Input: {href} + {base_url}")
+        print(f"  Result: {result}")
+        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")
+
+def usage_example():
+    """Show how to use the feature in crawl4ai"""
+    
+    print("\n" + "=" * 60)
+    print("USAGE IN CRAWL4AI")
+    print("=" * 60)
+    
+    print("""
+To enable HTTPS preservation in your crawl4ai code:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+    config = CrawlerRunConfig(
+        preserve_https_for_internal_links=True  # Enable HTTPS preservation
+    )
+    
+    result = await crawler.arun(
+        url="https://example.com",
+        config=config
+    )
+    
+    # All internal links will maintain HTTPS even if 
+    # the server redirects to HTTP
+```
+
+This is especially useful for:
+- Sites that redirect HTTPS to HTTP but still support HTTPS
+- Security-conscious crawling where you want to stay on HTTPS
+- Avoiding mixed content issues in downstream processing
+""")
+
+if __name__ == "__main__":
+    # Run all demonstrations
+    demonstrate_issue()
+    demonstrate_solution() 
+    test_edge_cases()
+    usage_example()
+    
+    print("\n" + "=" * 60)
+    print("✅ All tests complete!")
+    print("=" * 60)
Author	SHA1	Message	Date
ntohidi	bdacf61ca9	feat: update documentation for preserve_https_for_internal_links. ref #1410	2025-08-28 17:48:12 +08:00
ntohidi	f566c5a376	feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410 Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.	2025-08-28 17:38:40 +08:00