fix(docker): Fix LLM API key handling for multi-provider support

Previously, the system incorrectly used OPENAI_API_KEY for all LLM providers due to a hardcoded api_key_env fallback in config.yml. This caused authentication errors when using non-OpenAI providers like Gemini. Changes: - Remove api_key_env from config.yml to let litellm handle provider-specific env vars - Simplify get_llm_api_key() to return None, allowing litellm to auto-detect keys - Update validate_llm_provider() to trust litellm's built-in key detection - Update documentation to reflect the new automatic key handling The fix leverages litellm's existing capability to automatically find the correct environment variable for each provider (OPENAI_API_KEY, GEMINI_API_TOKEN, etc.) without manual configuration. ref #1291
2025-08-21 14:01:04 +08:00
13 changed files with 21 additions and 275 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,16 +5,6 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased]
 ### Added
 - **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
  - Maintains HTTPS scheme for internal links even when servers redirect to HTTP
  - Prevents security downgrades during deep crawling
  - Useful for security-conscious crawling and sites supporting both protocols
  - Fully backward compatible with opt-in flag (default: `False`)
  - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
 ## [0.7.3] - 2025-08-09
 ### Added
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1121,7 +1121,6 @@ class CrawlerRunConfig():
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
        preserve_https_for_internal_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -1245,7 +1244,6 @@ class CrawlerRunConfig():
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links
        self.preserve_https_for_internal_links = preserve_https_for_internal_links
        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1519,7 +1517,6 @@ class CrawlerRunConfig():
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
            preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1626,7 +1623,6 @@ class CrawlerRunConfig():
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
            "preserve_https_for_internal_links": self.preserve_https_for_internal_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -354,7 +354,6 @@ class AsyncWebCrawler:
                    ###############################################################
                    # Process the HTML content, Call CrawlerStrategy.process_html #
                    ###############################################################
                    from urllib.parse import urlparse
                    crawl_result: CrawlResult = await self.aprocess_html(
                        url=url,
                        html=html,
@@ -365,7 +364,6 @@ class AsyncWebCrawler:
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        redirected_url=async_response.redirected_url,
                        original_scheme=urlparse(url).scheme,
                        **kwargs,
                    )
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -258,11 +258,7 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                continue
            try:
-                normalized_href = normalize_url(
+                normalized_href = normalize_url(href, url)
                    href, url,
                    preserve_https=kwargs.get('preserve_https_for_internal_links', False),
                    original_scheme=kwargs.get('original_scheme')
                )
                link_data = {
                    "href": normalized_href,
                    "text": link.text_content().strip(),
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2146,9 +2146,7 @@ def normalize_url(
    drop_query_tracking=True,
    sort_query=True,
    keep_fragment=False,
-    extra_drop_params=None,
+    extra_drop_params=None
    preserve_https=False,
    original_scheme=None
 ):
    """
    Extended URL normalizer
@@ -2179,17 +2177,6 @@ def normalize_url(
    # Resolve relative paths first
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Parse once, edit parts, then rebuild
    parsed = urlparse(full_url)
@@ -2238,7 +2225,7 @@ def normalize_url(
    return normalized
-def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
+def normalize_url_for_deep_crawl(href, base_url):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
@@ -2249,17 +2236,6 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
    # Use urljoin to handle relative URLs
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Parse the URL for normalization
    parsed = urlparse(full_url)
@@ -2297,7 +2273,7 @@ def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_
    return normalized
@lru_cache(maxsize=10000)
-def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
+def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Efficient URL normalization with proper parsing"""
    from urllib.parse import urljoin
@@ -2307,17 +2283,6 @@ def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False,
    # Resolve relative URLs
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Use proper URL parsing
    parsed = urlparse(full_url)
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -692,8 +692,7 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"  # Can be overridden by LLM_PROVIDER env var
-  api_key_env: "OPENAI_API_KEY"
+  # api_key: sk-...  # If you pass the API key directly (not recommended)
  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration (Used by internal Redis server managed by supervisord)
 redis:
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -96,7 +96,7 @@ async def handle_llm_qa(
        response = perform_completion_with_backoff(
            provider=config["llm"]["provider"],
            prompt_with_variables=prompt,
-            api_token=get_llm_api_key(config)
+            api_token=get_llm_api_key(config)  # Returns None to let litellm handle it
        )
        return response.choices[0].message.content
@@ -127,7 +127,7 @@ async def process_llm_extraction(
                "error": error_msg
            })
            return
-        api_key = get_llm_api_key(config, provider)
+        api_key = get_llm_api_key(config, provider)  # Returns None to let litellm handle it
        llm_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=provider or config["llm"]["provider"],
@@ -203,7 +203,7 @@ async def handle_markdown_request(
                FilterType.LLM: LLMContentFilter(
                    llm_config=LLMConfig(
                        provider=provider or config["llm"]["provider"],
-                        api_token=get_llm_api_key(config, provider),
+                        api_token=get_llm_api_key(config, provider),  # Returns None to let litellm handle it
                    ),
                    instruction=query or "Extract main content"
                )
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -11,8 +11,7 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"
-  api_key_env: "OPENAI_API_KEY"
+  # api_key: sk-...  # If you pass the API key directly (not recommended)
  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration
 redis:
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -71,7 +71,7 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
-def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
+def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[str]:
    """Get the appropriate API key based on the LLM provider.
    Args:
@@ -79,19 +79,14 @@ def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
        provider: Optional provider override (e.g., "openai/gpt-4")
    Returns:
-        The API key for the provider, or empty string if not found
+        The API key if directly configured, otherwise None to let litellm handle it
    """
-        
+    # Check if direct API key is configured (for backward compatibility)
    # Use provided provider or fall back to config
    if not provider:
        provider = config["llm"]["provider"]
    # Check if direct API key is configured
    if "api_key" in config["llm"]:
        return config["llm"]["api_key"]
-    # Fall back to the configured api_key_env if no match
+    # Return None - litellm will automatically find the right environment variable
-    return os.environ.get(config["llm"].get("api_key_env", ""), "")
+    return None
 def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
@@ -104,16 +99,12 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
    Returns:
        Tuple of (is_valid, error_message)
    """
-    # Use provided provider or fall back to config
+    # If a direct API key is configured, validation passes
-    if not provider:
+    if "api_key" in config["llm"]:
-        provider = config["llm"]["provider"]
+        return True, ""
    # Get the API key for this provider
    api_key = get_llm_api_key(config, provider)
    if not api_key:
        return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
    # Otherwise, trust that litellm will find the appropriate environment variable
    # We can't easily validate this without reimplementing litellm's logic
    return True, ""
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -155,7 +155,6 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
 | **`exclude_external_links`** | `bool` (False)          | Removes all links pointing outside the current domain.                                                                      |
 | **`exclude_social_media_links`** | `bool` (False)      | Strips links specifically to social sites (like Facebook or Twitter).                                                      |
 | **`exclude_domains`**        | `list` ([])             | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`).                                            |
 | **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
 Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
--- a/docs/md_v2/core/deep-crawling.md
+++ b/docs/md_v2/core/deep-crawling.md
@@ -472,17 +472,6 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
 5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
 6.**Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
 ```python
 config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
    preserve_https_for_internal_links=True  # Keep HTTPS even if server redirects to HTTP
 )
 ```
 This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
 ---
 ## 10. Summary & Next Steps
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -176,7 +176,7 @@ The Docker setup now supports flexible LLM provider configuration through three
 3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
-The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
+The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
 #### 3. Build and Run with Compose
@@ -693,8 +693,7 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"  # Can be overridden by LLM_PROVIDER env var
-  api_key_env: "OPENAI_API_KEY"
+  # api_key: sk-...  # If you pass the API key directly (not recommended)
  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration (Used by internal Redis server managed by supervisord)
 redis:
--- a/tests/test_preserve_https_for_internal_links.py
+++ b/tests/test_preserve_https_for_internal_links.py
@@ -1,175 +0,0 @@
 #!/usr/bin/env python3
 """
 Final test and demo for HTTPS preservation feature (Issue #1410)
 This demonstrates how the preserve_https_for_internal_links flag
 prevents HTTPS downgrade when servers redirect to HTTP.
 """
 import sys
 import os
 from urllib.parse import urljoin, urlparse
 def demonstrate_issue():
    """Show the problem: HTTPS -> HTTP redirect causes HTTP links"""
    print("=" * 60)
    print("DEMONSTRATING THE ISSUE")
    print("=" * 60)
    # Simulate what happens during crawling
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
    # Extract a relative link
    relative_link = "/author/Albert-Einstein"
    # Standard URL joining uses the redirected (HTTP) base
    resolved_url = urljoin(redirected_url, relative_link)
    print(f"Original URL:    {original_url}")
    print(f"Redirected to:   {redirected_url}")
    print(f"Relative link:   {relative_link}")
    print(f"Resolved link:   {resolved_url}")
    print(f"\n❌ Problem: Link is now HTTP instead of HTTPS!")
    return resolved_url
 def demonstrate_solution():
    """Show the solution: preserve HTTPS for internal links"""
    print("\n" + "=" * 60)
    print("DEMONSTRATING THE SOLUTION")
    print("=" * 60)
    # Our normalize_url with HTTPS preservation
    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
        """Normalize URL with optional HTTPS preservation"""
        # Standard resolution
        full_url = urljoin(base_url, href.strip())
        # Preserve HTTPS if requested
        if preserve_https and original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            # Only for same-domain links
            if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
                full_url = full_url.replace('http://', 'https://', 1)
                print(f"  → Preserved HTTPS for {parsed_full.netloc}")
        return full_url
    # Same scenario as before
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"
    # Without preservation (current behavior)
    resolved_without = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=False, original_scheme='https'
    )
    print(f"\nWithout preservation:")
    print(f"  Result: {resolved_without}")
    # With preservation (new feature)
    resolved_with = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=True, original_scheme='https'
    )
    print(f"\nWith preservation (preserve_https_for_internal_links=True):")
    print(f"  Result: {resolved_with}")
    print(f"\n✅ Solution: Internal link stays HTTPS!")
    return resolved_with
 def test_edge_cases():
    """Test important edge cases"""
    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)
    from urllib.parse import urljoin, urlparse
    def preserve_https(href, base_url, original_scheme):
        """Helper to test preservation logic"""
        full_url = urljoin(base_url, href)
        if original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            # Fixed: check for protocol-relative URLs
            if (parsed_full.scheme == 'http' and 
                parsed_full.netloc == parsed_base.netloc and
                not href.strip().startswith('//')):
                full_url = full_url.replace('http://', 'https://', 1)
        return full_url
    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
    ]
    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        status = "✅" if is_https == should_be_https else "❌"
        print(f"\n{status} {desc}:")
        print(f"  Input: {href} + {base_url}")
        print(f"  Result: {result}")
        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")
 def usage_example():
    """Show how to use the feature in crawl4ai"""
    print("\n" + "=" * 60)
    print("USAGE IN CRAWL4AI")
    print("=" * 60)
    print("""
 To enable HTTPS preservation in your crawl4ai code:
 ```python
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 async with AsyncWebCrawler() as crawler:
    config = CrawlerRunConfig(
        preserve_https_for_internal_links=True  # Enable HTTPS preservation
    )
    result = await crawler.arun(
        url="https://example.com",
        config=config
    )
    # All internal links will maintain HTTPS even if 
    # the server redirects to HTTP
 ```
 This is especially useful for:
 - Sites that redirect HTTPS to HTTP but still support HTTPS
 - Security-conscious crawling where you want to stay on HTTPS
 - Avoiding mixed content issues in downstream processing
 """)
 if __name__ == "__main__":
    # Run all demonstrations
    demonstrate_issue()
    demonstrate_solution() 
    test_edge_cases()
    usage_example()
    print("\n" + "=" * 60)
    print("✅ All tests complete!")
    print("=" * 60)