fix(logger): ensure logger is a Logger instance in crawling strategies. ref #1437

fix(docker): resolve filter serialization and JSON encoding errors in deep crawl strategy (ref #1419)
- Fix URLPatternFilter serialization by preventing private __slots__ from being serialized as constructor params
- Add public attributes to URLPatternFilter to store original constructor parameters for proper serialization
- Handle property descriptors in CrawlResult.model_dump() to prevent JSON serialization errors
- Ensure filter chains work correctly with Docker client and REST API

The issue occurred because:
1. Private implementation details (_simple_suffixes, etc.) were being serialized and passed as constructor arguments during deserialization
2. Property descriptors were being included in the serialized output, causing "Object of type property is not JSON serializable" errors

Changes:
- async_configs.py: Comment out __slots__ serialization logic (lines 100-109)
- filters.py: Add patterns, use_glob, reverse to URLPatternFilter __slots__ and store as public attributes
- models.py: Convert property descriptors to strings in model_dump() instead of including them directly
2025-08-26 12:06:56 +08:00 · 2025-08-25 14:04:08 +08:00
13 changed files with 247 additions and 255 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,16 +5,6 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [Unreleased]
-
-### Added
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
-  - Maintains HTTPS scheme for internal links even when servers redirect to HTTP
-  - Prevents security downgrades during deep crawling
-  - Useful for security-conscious crawling and sites supporting both protocols
-  - Fully backward compatible with opt-in flag (default: `False`)
-  - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
-
 ## [0.7.3] - 2025-08-09

 ### Added
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
                if value != param.default and not ignore_default_value:
                    current_values[name] = to_serializable_dict(value)
        
-        if hasattr(obj, '__slots__'):
-            for slot in obj.__slots__:
-                if slot.startswith('_'):  # Handle private slots
-                    attr_name = slot[1:]  # Remove leading '_'
-                    value = getattr(obj, slot, None)
-                    if value is not None:
-                        current_values[attr_name] = to_serializable_dict(value)
+        # Don't serialize private __slots__ - they're internal implementation details
+        # not constructor parameters. This was causing URLPatternFilter to fail
+        # because _simple_suffixes was being serialized as 'simple_suffixes'
+        # if hasattr(obj, '__slots__'):
+        #     for slot in obj.__slots__:
+        #         if slot.startswith('_'):  # Handle private slots
+        #             attr_name = slot[1:]  # Remove leading '_'
+        #             value = getattr(obj, slot, None)
+        #             if value is not None:
+        #                 current_values[attr_name] = to_serializable_dict(value)

            
        
@@ -1121,7 +1124,6 @@ class CrawlerRunConfig():
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
-        preserve_https_for_internal_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -1245,7 +1247,6 @@ class CrawlerRunConfig():
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links
-        self.preserve_https_for_internal_links = preserve_https_for_internal_links

        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1519,7 +1520,6 @@ class CrawlerRunConfig():
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
-            preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1626,7 +1626,6 @@ class CrawlerRunConfig():
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
-            "preserve_https_for_internal_links": self.preserve_https_for_internal_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -354,7 +354,6 @@ class AsyncWebCrawler:
                    ###############################################################
                    # Process the HTML content, Call CrawlerStrategy.process_html #
                    ###############################################################
-                    from urllib.parse import urlparse
                    crawl_result: CrawlResult = await self.aprocess_html(
                        url=url,
                        html=html,
@@ -365,7 +364,6 @@ class AsyncWebCrawler:
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        redirected_url=async_response.redirected_url,
-                        original_scheme=urlparse(url).scheme,
                        **kwargs,
                    )

--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -258,11 +258,7 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                continue

            try:
-                normalized_href = normalize_url(
-                    href, url,
-                    preserve_https=kwargs.get('preserve_https_for_internal_links', False),
-                    original_scheme=kwargs.get('original_scheme')
-                )
+                normalized_href = normalize_url(href, url)
                link_data = {
                    "href": normalized_href,
                    "text": link.text_content().strip(),
--- a/crawl4ai/deep_crawling/bff_strategy.py
+++ b/crawl4ai/deep_crawling/bff_strategy.py
@@ -47,7 +47,13 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
        self.url_scorer = url_scorer
        self.include_external = include_external
        self.max_pages = max_pages
-        self.logger = logger or logging.getLogger(__name__)
+        # self.logger = logger or logging.getLogger(__name__)
+        # Ensure logger is always a Logger instance, not a dict from serialization
+        if isinstance(logger, logging.Logger):
+            self.logger = logger
+        else:
+            # Create a new logger if logger is None, dict, or any other non-Logger type
+            self.logger = logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -38,7 +38,13 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        self.include_external = include_external
        self.score_threshold = score_threshold
        self.max_pages = max_pages
-        self.logger = logger or logging.getLogger(__name__)
+        # self.logger = logger or logging.getLogger(__name__)
+        # Ensure logger is always a Logger instance, not a dict from serialization
+        if isinstance(logger, logging.Logger):
+            self.logger = logger
+        else:
+            # Create a new logger if logger is None, dict, or any other non-Logger type
+            self.logger = logging.getLogger(__name__)
        self.stats = TraversalStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self._pages_crawled = 0
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
    """Pattern filter balancing speed and completeness"""

    __slots__ = (
+        "patterns",  # Store original patterns for serialization
+        "use_glob",  # Store original use_glob for serialization  
+        "reverse",   # Store original reverse for serialization
        "_simple_suffixes",
        "_simple_prefixes",
        "_domain_patterns",
@@ -142,6 +145,11 @@ class URLPatternFilter(URLFilter):
        reverse: bool = False,
    ):
        super().__init__()
+        # Store original constructor params for serialization
+        self.patterns = patterns
+        self.use_glob = use_glob
+        self.reverse = reverse
+        
        self._reverse = reverse
        patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns

--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -253,6 +253,16 @@ class CrawlResult(BaseModel):
        requirements change, this is where you would update the logic.
        """
        result = super().model_dump(*args, **kwargs)
+        
+        # Stringify any property descriptors that might have been included:
+        # raw `property` objects for these deprecated fields are not JSON serializable
+        for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
+            if key in result and isinstance(result[key], property):
+                # del result[key]
+                # Nasrin: I decided to convert it to string instead of removing it.
+                result[key] = str(result[key])
+        
+        # Add the markdown field properly
        if self._markdown is not None:
            result["markdown"] = self._markdown.model_dump() 
        return result
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2146,9 +2146,7 @@ def normalize_url(
    drop_query_tracking=True,
    sort_query=True,
    keep_fragment=False,
-    extra_drop_params=None,
-    preserve_https=False,
-    original_scheme=None
+    extra_drop_params=None
 ):
    """
    Extended URL normalizer
@@ -2178,17 +2176,6 @@ def normalize_url(

    # Resolve relative paths first
    full_url = urljoin(base_url, href.strip())
-    
-    # Preserve HTTPS if requested and original scheme was HTTPS
-    if preserve_https and original_scheme == 'https':
-        parsed_full = urlparse(full_url)
-        parsed_base = urlparse(base_url)
-        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
-        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
-        if (parsed_full.scheme == 'http' and 
-            parsed_full.netloc == parsed_base.netloc and
-            not href.strip().startswith('//')):
-            full_url = full_url.replace('http://', 'https://', 1)

    # Parse once, edit parts, then rebuild
    parsed = urlparse(full_url)
@@ -2238,7 +2225,7 @@ def normalize_url(
    return normalized


-def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
+def normalize_url_for_deep_crawl(href, base_url):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

@@ -2249,17 +2236,6 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
    # Use urljoin to handle relative URLs
    full_url = urljoin(base_url, href.strip())
    
-    # Preserve HTTPS if requested and original scheme was HTTPS
-    if preserve_https and original_scheme == 'https':
-        parsed_full = urlparse(full_url)
-        parsed_base = urlparse(base_url)
-        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
-        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
-        if (parsed_full.scheme == 'http' and 
-            parsed_full.netloc == parsed_base.netloc and
-            not href.strip().startswith('//')):
-            full_url = full_url.replace('http://', 'https://', 1)
-    
    # Parse the URL for normalization
    parsed = urlparse(full_url)
    
@@ -2297,7 +2273,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
    return normalized

@lru_cache(maxsize=10000)
-def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
+def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Efficient URL normalization with proper parsing"""
    from urllib.parse import urljoin
    
@@ -2307,17 +2283,6 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
    # Resolve relative URLs
    full_url = urljoin(base_url, href.strip())
    
-    # Preserve HTTPS if requested and original scheme was HTTPS
-    if preserve_https and original_scheme == 'https':
-        parsed_full = urlparse(full_url)
-        parsed_base = urlparse(base_url)
-        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
-        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
-        if (parsed_full.scheme == 'http' and 
-            parsed_full.netloc == parsed_base.netloc and
-            not href.strip().startswith('//')):
-            full_url = full_url.replace('http://', 'https://', 1)
-    
    # Use proper URL parsing
    parsed = urlparse(full_url)
    
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -155,7 +155,6 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
 | **`exclude_external_links`** | `bool` (False)          | Removes all links pointing outside the current domain.                                                                      |
 | **`exclude_social_media_links`** | `bool` (False)      | Strips links specifically to social sites (like Facebook or Twitter).                                                      |
 | **`exclude_domains`**        | `list` ([])             | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`).                                            |
-| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |

 Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).

--- a/docs/md_v2/core/deep-crawling.md
+++ b/docs/md_v2/core/deep-crawling.md
@@ -472,17 +472,6 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag

 5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.

-6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
-
-```python
-config = CrawlerRunConfig(
-    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
-    preserve_https_for_internal_links=True  # Keep HTTPS even if server redirects to HTTP
-)
-```
-
-This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
-
 ---

 ## 10. Summary & Next Steps
--- a/tests/docker/test_filter_deep_crawl.py
+++ b/tests/docker/test_filter_deep_crawl.py
@@ -0,0 +1,201 @@
+"""
+Test the complete fix for both the filter serialization and JSON serialization issues.
+"""
+
+import asyncio
+import httpx
+
+from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
+
+BASE_URL = "http://localhost:11234/"  # Adjust port as needed
+
+async def test_with_docker_client():
+    """Test using the Docker client (same as 1419.py)."""
+    from crawl4ai.docker_client import Crawl4aiDockerClient
+    
+    print("=" * 60)
+    print("Testing with Docker Client")
+    print("=" * 60)
+    
+    try:
+        async with Crawl4aiDockerClient(
+            base_url=BASE_URL,
+            verbose=True,
+        ) as client:
+            
+            # Create filter chain - testing the serialization fix
+            filter_chain = [
+                URLPatternFilter(
+                    # patterns=["*about*", "*privacy*", "*terms*"],
+                    patterns=["*advanced*"],
+                    reverse=True
+                ),
+            ]
+            
+            crawler_config = CrawlerRunConfig(
+                deep_crawl_strategy=BFSDeepCrawlStrategy(
+                    max_depth=2,  # Keep it shallow for testing
+                    # max_pages=5,  # Limit pages for testing
+                    filter_chain=FilterChain(filter_chain)
+                ),
+                cache_mode=CacheMode.BYPASS,
+            )
+            
+            print("\n1. Testing crawl with filters...")
+            results = await client.crawl(
+                ["https://docs.crawl4ai.com"],  # Simple test page
+                browser_config=BrowserConfig(headless=True),
+                crawler_config=crawler_config,
+            )
+            
+            if results:
+                print(f"✅ Crawl succeeded! Type: {type(results)}")
+                if hasattr(results, 'success'):
+                    print(f"✅ Results success: {results.success}")
+                    # Test that we can iterate results without JSON errors
+                    if hasattr(results, '__iter__'):
+                        for i, result in enumerate(results):
+                            if hasattr(result, 'url'):
+                                print(f"   Result {i}: {result.url[:50]}...")
+                            else:
+                                print(f"   Result {i}: {str(result)[:50]}...")
+                else:
+                    # Handle list of results
+                    print(f"✅ Got {len(results)} results")
+                    for i, result in enumerate(results[:3]):  # Show first 3
+                        print(f"   Result {i}: {result.url[:50]}...")
+            else:
+                print("❌ Crawl failed - no results returned")
+                return False
+                
+        print("\n✅ Docker client test completed successfully!")
+        return True
+        
+    except Exception as e:
+        print(f"❌ Docker client test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+async def test_with_rest_api():
+    """Test using REST API directly."""
+    print("\n" + "=" * 60)
+    print("Testing with REST API")
+    print("=" * 60)
+    
+    # Create filter configuration
+    deep_crawl_strategy_payload = {
+        "type": "BFSDeepCrawlStrategy",
+        "params": {
+            "max_depth": 2,
+            # "max_pages": 5,
+            "filter_chain": {
+                "type": "FilterChain",
+                "params": {
+                    "filters": [
+                        {
+                            "type": "URLPatternFilter",
+                            "params": {
+                                "patterns": ["*advanced*"],
+                                "reverse": True
+                            }
+                        }
+                    ]
+                }
+            }
+        }
+    }
+    
+    crawl_payload = {
+        "urls": ["https://docs.crawl4ai.com"],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "deep_crawl_strategy": deep_crawl_strategy_payload,
+                "cache_mode": "bypass"
+            }
+        }
+    }
+    
+    try:
+        async with httpx.AsyncClient() as client:
+            print("\n1. Sending crawl request to REST API...")
+            response = await client.post(
+                f"{BASE_URL}crawl",
+                json=crawl_payload,
+                timeout=30
+            )
+            
+            if response.status_code == 200:
+                print(f"✅ REST API returned 200 OK")
+                data = response.json()
+                if data.get("success"):
+                    results = data.get("results", [])
+                    print(f"✅ Got {len(results)} results")
+                    for i, result in enumerate(results[:3]):
+                        print(f"   Result {i}: {result.get('url', 'unknown')[:50]}...")
+                else:
+                    print(f"❌ Crawl not successful: {data}")
+                    return False
+            else:
+                print(f"❌ REST API returned {response.status_code}")
+                print(f"   Response: {response.text[:500]}")
+                return False
+                
+        print("\n✅ REST API test completed successfully!")
+        return True
+        
+    except Exception as e:
+        print(f"❌ REST API test failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+async def main():
+    """Run all tests."""
+    print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
+    print("=" * 60)
+    print("Make sure the server is running with the updated code!")
+    print("=" * 60)
+    
+    results = []
+    
+    # Test 1: Docker client
+    docker_passed = await test_with_docker_client()
+    results.append(("Docker Client", docker_passed))
+    
+    # Test 2: REST API
+    rest_passed = await test_with_rest_api()
+    results.append(("REST API", rest_passed))
+    
+    # Summary
+    print("\n" + "=" * 60)
+    print("FINAL TEST SUMMARY")
+    print("=" * 60)
+    
+    all_passed = True
+    for test_name, passed in results:
+        status = "✅ PASSED" if passed else "❌ FAILED"
+        print(f"{test_name:20} {status}")
+        if not passed:
+            all_passed = False
+    
+    print("=" * 60)
+    if all_passed:
+        print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
+        print("\nThe fixes:")
+        print("1. Filter serialization: Fixed by not serializing private __slots__")
+        print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
+    else:
+        print("⚠️ Some tests failed. Please check the server logs for details.")
+    
+    return 0 if all_passed else 1
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(asyncio.run(main()))
--- a/tests/test_preserve_https_for_internal_links.py
+++ b/tests/test_preserve_https_for_internal_links.py
@@ -1,175 +0,0 @@
-#!/usr/bin/env python3
-"""
-Final test and demo for HTTPS preservation feature (Issue #1410)
-
-This demonstrates how the preserve_https_for_internal_links flag
-prevents HTTPS downgrade when servers redirect to HTTP.
-"""
-
-import sys
-import os
-from urllib.parse import urljoin, urlparse
-
-def demonstrate_issue():
-    """Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
-    
-    print("=" * 60)
-    print("DEMONSTRATING THE ISSUE")
-    print("=" * 60)
-    
-    # Simulate what happens during crawling
-    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
-    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
-    
-    # Extract a relative link
-    relative_link = "/author/Albert-Einstein"
-    
-    # Standard URL joining uses the redirected (HTTP) base
-    resolved_url = urljoin(redirected_url, relative_link)
-    
-    print(f"Original URL:    {original_url}")
-    print(f"Redirected to:   {redirected_url}")
-    print(f"Relative link:   {relative_link}")
-    print(f"Resolved link:   {resolved_url}")
-    print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
-    
-    return resolved_url
-
-def demonstrate_solution():
-    """Show the solution: preserve HTTPS for internal links"""
-    
-    print("\n" + "=" * 60)
-    print("DEMONSTRATING THE SOLUTION")
-    print("=" * 60)
-    
-    # Our normalize_url with HTTPS preservation
-    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
-        """Normalize URL with optional HTTPS preservation"""
-        
-        # Standard resolution
-        full_url = urljoin(base_url, href.strip())
-        
-        # Preserve HTTPS if requested
-        if preserve_https and original_scheme == 'https':
-            parsed_full = urlparse(full_url)
-            parsed_base = urlparse(base_url)
-            
-            # Only for same-domain links
-            if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
-                full_url = full_url.replace('http://', 'https://', 1)
-                print(f"  → Preserved HTTPS for {parsed_full.netloc}")
-        
-        return full_url
-    
-    # Same scenario as before
-    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
-    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
-    relative_link = "/author/Albert-Einstein"
-    
-    # Without preservation (current behavior)
-    resolved_without = normalize_url_with_preservation(
-        relative_link, redirected_url,
-        preserve_https=False, original_scheme='https'
-    )
-    
-    print(f"\nWithout preservation:")
-    print(f"  Result: {resolved_without}")
-    
-    # With preservation (new feature)
-    resolved_with = normalize_url_with_preservation(
-        relative_link, redirected_url,
-        preserve_https=True, original_scheme='https'
-    )
-    
-    print(f"\nWith preservation (preserve_https_for_internal_links=True):")
-    print(f"  Result: {resolved_with}")
-    print(f"\n✅ Solution: Internal link stays HTTPS!")
-    
-    return resolved_with
-
-def test_edge_cases():
-    """Test important edge cases"""
-    
-    print("\n" + "=" * 60)
-    print("EDGE CASES")
-    print("=" * 60)
-    
-    from urllib.parse import urljoin, urlparse
-    
-    def preserve_https(href, base_url, original_scheme):
-        """Helper to test preservation logic"""
-        full_url = urljoin(base_url, href)
-        
-        if original_scheme == 'https':
-            parsed_full = urlparse(full_url)
-            parsed_base = urlparse(base_url)
-            # Fixed: check for protocol-relative URLs
-            if (parsed_full.scheme == 'http' and 
-                parsed_full.netloc == parsed_base.netloc and
-                not href.strip().startswith('//')):
-                full_url = full_url.replace('http://', 'https://', 1)
-        
-        return full_url
-    
-    test_cases = [
-        # (description, href, base_url, original_scheme, should_be_https)
-        ("External link", "http://other.com/page", "http://example.com", "https", False),
-        ("Already HTTPS", "/page", "https://example.com", "https", True),
-        ("No original HTTPS", "/page", "http://example.com", "http", False),
-        ("Subdomain", "/page", "http://sub.example.com", "https", True),
-        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
-    ]
-    
-    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
-        result = preserve_https(href, base_url, orig_scheme)
-        is_https = result.startswith('https://')
-        status = "✅" if is_https == should_be_https else "❌"
-        
-        print(f"\n{status} {desc}:")
-        print(f"  Input: {href} + {base_url}")
-        print(f"  Result: {result}")
-        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")
-
-def usage_example():
-    """Show how to use the feature in crawl4ai"""
-    
-    print("\n" + "=" * 60)
-    print("USAGE IN CRAWL4AI")
-    print("=" * 60)
-    
-    print("""
-To enable HTTPS preservation in your crawl4ai code:
-
-```python
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-
-async with AsyncWebCrawler() as crawler:
-    config = CrawlerRunConfig(
-        preserve_https_for_internal_links=True  # Enable HTTPS preservation
-    )
-    
-    result = await crawler.arun(
-        url="https://example.com",
-        config=config
-    )
-    
-    # All internal links will maintain HTTPS even if 
-    # the server redirects to HTTP
-```
-
-This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
-""")
-
-if __name__ == "__main__":
-    # Run all demonstrations
-    demonstrate_issue()
-    demonstrate_solution() 
-    test_edge_cases()
-    usage_example()
-    
-    print("\n" + "=" * 60)
-    print("✅ All tests complete!")
-    print("=" * 60)