feat: update documentation for preserve_https_for_internal_links. ref #1410

feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410
Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.
2025-08-28 17:48:12 +08:00 · 2025-08-28 17:38:40 +08:00
19 changed files with 302 additions and 630 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased]
 ### Added
 - **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
  - Maintains HTTPS scheme for internal links even when servers redirect to HTTP
  - Prevents security downgrades during deep crawling
  - Useful for security-conscious crawling and sites supporting both protocols
  - Fully backward compatible with opt-in flag (default: `False`)
  - Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
 ## [0.7.3] - 2025-08-09
 ### Added
--- a/README.md
+++ b/README.md
@@ -304,9 +304,9 @@ The new Docker implementation includes:
 ### Getting Started
 ```bash
-# Pull and run the latest release
+# Pull and run the latest release candidate
-docker pull unclecode/crawl4ai:latest
+docker pull unclecode/crawl4ai:0.7.0
-docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
+docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
 # Visit the playground at http://localhost:11235/playground
 ```
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1121,6 +1121,7 @@ class CrawlerRunConfig():
        exclude_domains: list = None,
        exclude_internal_links: bool = False,
        score_links: bool = False,
        preserve_https_for_internal_links: bool = False,
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
@@ -1244,6 +1245,7 @@ class CrawlerRunConfig():
        self.exclude_domains = exclude_domains or []
        self.exclude_internal_links = exclude_internal_links
        self.score_links = score_links
        self.preserve_https_for_internal_links = preserve_https_for_internal_links
        # Debugging and Logging Parameters
        self.verbose = verbose
@@ -1517,6 +1519,7 @@ class CrawlerRunConfig():
            exclude_domains=kwargs.get("exclude_domains", []),
            exclude_internal_links=kwargs.get("exclude_internal_links", False),
            score_links=kwargs.get("score_links", False),
            preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
@@ -1623,6 +1626,7 @@ class CrawlerRunConfig():
            "exclude_domains": self.exclude_domains,
            "exclude_internal_links": self.exclude_internal_links,
            "score_links": self.score_links,
            "preserve_https_for_internal_links": self.preserve_https_for_internal_links,
            "verbose": self.verbose,
            "log_console": self.log_console,
            "capture_network_requests": self.capture_network_requests,
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
                    ###############################################################
                    # Process the HTML content, Call CrawlerStrategy.process_html #
                    ###############################################################
                    from urllib.parse import urlparse
                    crawl_result: CrawlResult = await self.aprocess_html(
                        url=url,
                        html=html,
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
                        verbose=config.verbose,
                        is_raw_html=True if url.startswith("raw:") else False,
                        redirected_url=async_response.redirected_url,
                        original_scheme=urlparse(url).scheme,
                        **kwargs,
                    )
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
                continue
            try:
-                normalized_href = normalize_url(href, url)
+                normalized_href = normalize_url(
                    href, url,
                    preserve_https=kwargs.get('preserve_https_for_internal_links', False),
                    original_scheme=kwargs.get('original_scheme')
                )
                link_data = {
                    "href": normalized_href,
                    "text": link.text_content().strip(),
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -2146,7 +2146,9 @@ def normalize_url(
    drop_query_tracking=True,
    sort_query=True,
    keep_fragment=False,
-    extra_drop_params=None
+    extra_drop_params=None,
    preserve_https=False,
    original_scheme=None
 ):
    """
    Extended URL normalizer
@@ -2177,6 +2179,17 @@ def normalize_url(
    # Resolve relative paths first
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Parse once, edit parts, then rebuild
    parsed = urlparse(full_url)
@@ -2184,10 +2197,8 @@ def normalize_url(
    netloc = parsed.netloc.lower()
    # ── path ──
-    # Strip duplicate slashes and trailing "/" (except root)
+    # Strip duplicate slashes and trailing “/” (except root)
-    # IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
+    path = quote(unquote(parsed.path))
    # The path from urlparse is already properly encoded
    path = parsed.path
    if path.endswith('/') and path != '/':
        path = path.rstrip('/')
@@ -2227,7 +2238,7 @@ def normalize_url(
    return normalized
-def normalize_url_for_deep_crawl(href, base_url):
+def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Normalize URLs to ensure consistent format"""
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
@@ -2238,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
    # Use urljoin to handle relative URLs
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Parse the URL for normalization
    parsed = urlparse(full_url)
@@ -2275,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
    return normalized
@lru_cache(maxsize=10000)
-def efficient_normalize_url_for_deep_crawl(href, base_url):
+def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
    """Efficient URL normalization with proper parsing"""
    from urllib.parse import urljoin
@@ -2285,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
    # Resolve relative URLs
    full_url = urljoin(base_url, href.strip())
    # Preserve HTTPS if requested and original scheme was HTTPS
    if preserve_https and original_scheme == 'https':
        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Only preserve HTTPS for same-domain links (not protocol-relative URLs)
        # Protocol-relative URLs (//example.com) should follow the base URL's scheme
        if (parsed_full.scheme == 'http' and 
            parsed_full.netloc == parsed_base.netloc and
            not href.strip().startswith('//')):
            full_url = full_url.replace('http://', 'https://', 1)
    # Use proper URL parsing
    parsed = urlparse(full_url)
--- a/deploy/docker/.llm.env.example
+++ b/deploy/docker/.llm.env.example
@@ -11,22 +11,3 @@ GEMINI_API_TOKEN=your_gemini_key_here
 # Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
 # If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
 # LLM_PROVIDER=anthropic/claude-3-opus
 # Optional: Global LLM temperature setting (0.0-2.0)
 # Controls randomness in responses. Lower = more focused, Higher = more creative
 # LLM_TEMPERATURE=0.7
 # Optional: Global custom API base URL
 # Use this to point to custom endpoints or proxy servers
 # LLM_BASE_URL=https://api.custom.com/v1
 # Optional: Provider-specific temperature overrides
 # These take precedence over the global LLM_TEMPERATURE
 # OPENAI_TEMPERATURE=0.5
 # ANTHROPIC_TEMPERATURE=0.3
 # GROQ_TEMPERATURE=0.8
 # Optional: Provider-specific base URL overrides
 # Use for provider-specific proxy endpoints
 # OPENAI_BASE_URL=https://custom-openai.company.com/v1
 # GROQ_BASE_URL=https://custom-groq.company.com/v1
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -692,7 +692,8 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"  # Can be overridden by LLM_PROVIDER env var
-  # api_key: sk-...  # If you pass the API key directly (not recommended)
+  api_key_env: "OPENAI_API_KEY"
  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration (Used by internal Redis server managed by supervisord)
 redis:
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -42,9 +42,7 @@ from utils import (
    should_cleanup_task,
    decode_redis_hash,
    get_llm_api_key,
-    validate_llm_provider,
+    validate_llm_provider
    get_llm_temperature,
    get_llm_base_url
 )
 import psutil, time
@@ -98,9 +96,7 @@ async def handle_llm_qa(
        response = perform_completion_with_backoff(
            provider=config["llm"]["provider"],
            prompt_with_variables=prompt,
-            api_token=get_llm_api_key(config),  # Returns None to let litellm handle it
+            api_token=get_llm_api_key(config)
            temperature=get_llm_temperature(config),
            base_url=get_llm_base_url(config)
        )
        return response.choices[0].message.content
@@ -119,9 +115,7 @@ async def process_llm_extraction(
    instruction: str,
    schema: Optional[str] = None,
    cache: str = "0",
-    provider: Optional[str] = None,
+    provider: Optional[str] = None
    temperature: Optional[float] = None,
    base_url: Optional[str] = None
 ) -> None:
    """Process LLM extraction in background."""
    try:
@@ -133,13 +127,11 @@ async def process_llm_extraction(
                "error": error_msg
            })
            return
-        api_key = get_llm_api_key(config, provider)  # Returns None to let litellm handle it
+        api_key = get_llm_api_key(config, provider)
        llm_strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(
                provider=provider or config["llm"]["provider"],
-                api_token=api_key,
+                api_token=api_key
                temperature=temperature or get_llm_temperature(config, provider),
                base_url=base_url or get_llm_base_url(config, provider)
            ),
            instruction=instruction,
            schema=json.loads(schema) if schema else None,
@@ -186,9 +178,7 @@ async def handle_markdown_request(
    query: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None,
-    provider: Optional[str] = None,
+    provider: Optional[str] = None
    temperature: Optional[float] = None,
    base_url: Optional[str] = None
 ) -> str:
    """Handle markdown generation requests."""
    try:
@@ -213,9 +203,7 @@ async def handle_markdown_request(
                FilterType.LLM: LLMContentFilter(
                    llm_config=LLMConfig(
                        provider=provider or config["llm"]["provider"],
-                        api_token=get_llm_api_key(config, provider),  # Returns None to let litellm handle it
+                        api_token=get_llm_api_key(config, provider),
                        temperature=temperature or get_llm_temperature(config, provider),
                        base_url=base_url or get_llm_base_url(config, provider)
                    ),
                    instruction=query or "Extract main content"
                )
@@ -260,9 +248,7 @@ async def handle_llm_request(
    schema: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None,
-    provider: Optional[str] = None,
+    provider: Optional[str] = None
    temperature: Optional[float] = None,
    api_base_url: Optional[str] = None
 ) -> JSONResponse:
    """Handle LLM extraction requests."""
    base_url = get_base_url(request)
@@ -293,9 +279,7 @@ async def handle_llm_request(
            cache,
            base_url,
            config,
-            provider,
+            provider
            temperature,
            api_base_url
        )
    except Exception as e:
@@ -340,9 +324,7 @@ async def create_new_task(
    cache: str,
    base_url: str,
    config: dict,
-    provider: Optional[str] = None,
+    provider: Optional[str] = None
    temperature: Optional[float] = None,
    api_base_url: Optional[str] = None
 ) -> JSONResponse:
    """Create and initialize a new task."""
    decoded_url = unquote(input_path)
@@ -367,9 +349,7 @@ async def create_new_task(
        query,
        schema,
        cache,
-        provider,
+        provider
        temperature,
        api_base_url
    )
    return JSONResponse({
--- a/deploy/docker/config.yml
+++ b/deploy/docker/config.yml
@@ -11,7 +11,8 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"
-  # api_key: sk-...  # If you pass the API key directly (not recommended)
+  api_key_env: "OPENAI_API_KEY"
  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration
 redis:
--- a/deploy/docker/job.py
+++ b/deploy/docker/job.py
@@ -37,8 +37,6 @@ class LlmJobPayload(BaseModel):
    schema: Optional[str] = None
    cache:  bool = False
    provider: Optional[str] = None
    temperature: Optional[float] = None
    base_url: Optional[str] = None
 class CrawlJobPayload(BaseModel):
@@ -65,8 +63,6 @@ async def llm_job_enqueue(
        cache=payload.cache,
        config=_config,
        provider=payload.provider,
        temperature=payload.temperature,
        api_base_url=payload.base_url,
    )
@@ -76,7 +72,7 @@ async def llm_job_status(
    task_id: str,
    _td: Dict = Depends(lambda: _token_dep())
 ):
-    return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
+    return await handle_task_status(_redis, task_id)
 # ---------- CRAWL job -------------------------------------------------------
--- a/deploy/docker/schemas.py
+++ b/deploy/docker/schemas.py
@@ -16,8 +16,6 @@ class MarkdownRequest(BaseModel):
    q:   Optional[str] = Field(None,  description="Query string used by BM25/LLM filters")
    c:   Optional[str] = Field("0",   description="Cache‑bust / revision counter")
    provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
    temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
    base_url: Optional[str] = Field(None, description="LLM API base URL override")
 class RawCode(BaseModel):
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -241,8 +241,7 @@ async def get_markdown(
        raise HTTPException(
            400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
    markdown = await handle_markdown_request(
-        body.url, body.f, body.q, body.c, config, body.provider,
+        body.url, body.f, body.q, body.c, config, body.provider
        body.temperature, body.base_url
    )
    return JSONResponse({
        "url": body.url,
--- a/deploy/docker/utils.py
+++ b/deploy/docker/utils.py
@@ -71,7 +71,7 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
-def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[str]:
+def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
    """Get the appropriate API key based on the LLM provider.
    Args:
@@ -79,14 +79,19 @@ def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[st
        provider: Optional provider override (e.g., "openai/gpt-4")
    Returns:
-        The API key if directly configured, otherwise None to let litellm handle it
+        The API key for the provider, or empty string if not found
    """
-    # Check if direct API key is configured (for backward compatibility)
+        
    # Use provided provider or fall back to config
    if not provider:
        provider = config["llm"]["provider"]
    # Check if direct API key is configured
    if "api_key" in config["llm"]:
        return config["llm"]["api_key"]
-    # Return None - litellm will automatically find the right environment variable
+    # Fall back to the environment variable named by api_key_env when no direct api_key is configured
-    return None
+    return os.environ.get(config["llm"].get("api_key_env", ""), "")
 def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
@@ -99,77 +104,18 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
    Returns:
        Tuple of (is_valid, error_message)
    """
-    # If a direct API key is configured, validation passes
+    # Use provided provider or fall back to config
-    if "api_key" in config["llm"]:
+    if not provider:
        provider = config["llm"]["provider"]
    # Get the API key for this provider
    api_key = get_llm_api_key(config, provider)
    if not api_key:
        return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
    return True, ""
    # Otherwise, trust that litellm will find the appropriate environment variable
    # We can't easily validate this without reimplementing litellm's logic
    return True, ""
 def get_llm_temperature(config: Dict, provider: Optional[str] = None) -> Optional[float]:
    """Get temperature setting based on the LLM provider.
    Priority order:
    1. Provider-specific environment variable (e.g., OPENAI_TEMPERATURE)
    2. Global LLM_TEMPERATURE environment variable
    3. None (to use litellm/provider defaults)
    Args:
        config: The application configuration dictionary
        provider: Optional provider override (e.g., "openai/gpt-4")
    Returns:
        The temperature setting if configured, otherwise None
    """
    # Check provider-specific temperature first
    if provider:
        provider_name = provider.split('/')[0].upper()
        provider_temp = os.environ.get(f"{provider_name}_TEMPERATURE")
        if provider_temp:
            try:
                return float(provider_temp)
            except ValueError:
                logging.warning(f"Invalid temperature value for {provider_name}: {provider_temp}")
    # Check global LLM_TEMPERATURE
    global_temp = os.environ.get("LLM_TEMPERATURE")
    if global_temp:
        try:
            return float(global_temp)
        except ValueError:
            logging.warning(f"Invalid global temperature value: {global_temp}")
    # Return None to use litellm/provider defaults
    return None
 def get_llm_base_url(config: Dict, provider: Optional[str] = None) -> Optional[str]:
    """Get base URL setting based on the LLM provider.
    Priority order:
    1. Provider-specific environment variable (e.g., OPENAI_BASE_URL)
    2. Global LLM_BASE_URL environment variable
    3. None (to use default endpoints)
    Args:
        config: The application configuration dictionary
        provider: Optional provider override (e.g., "openai/gpt-4")
    Returns:
        The base URL if configured, otherwise None
    """
    # Check provider-specific base URL first
    if provider:
        provider_name = provider.split('/')[0].upper()
        provider_url = os.environ.get(f"{provider_name}_BASE_URL")
        if provider_url:
            return provider_url
    # Check global LLM_BASE_URL
    return os.environ.get("LLM_BASE_URL")
 def verify_email_domain(email: str) -> bool:
    try:
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
 | **`exclude_external_links`** | `bool` (False)          | Removes all links pointing outside the current domain.                                                                      |
 | **`exclude_social_media_links`** | `bool` (False)      | Strips links specifically to social sites (like Facebook or Twitter).                                                      |
 | **`exclude_domains`**        | `list` ([])             | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`).                                            |
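 | **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, keeps the `https://` scheme on same-domain links when the page was requested over HTTPS but the server redirected to HTTP. Protocol-relative links (`//host/...`) and links to other domains are not rewritten. Useful for security-conscious crawling. |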
 Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
--- a/docs/md_v2/core/deep-crawling.md
+++ b/docs/md_v2/core/deep-crawling.md
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
 5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
 6. **Preserve HTTPS for security.** If crawling HTTPS sites that redirect to HTTP, use `preserve_https_for_internal_links=True` to maintain secure connections:
 ```python
 config = CrawlerRunConfig(
    deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
    preserve_https_for_internal_links=True  # Keep HTTPS even if server redirects to HTTP
 )
 ```
 This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
 ---
 ## 10. Summary & Next Steps
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -89,16 +89,6 @@ ANTHROPIC_API_KEY=your-anthropic-key
 # TOGETHER_API_KEY=your-together-key
 # MISTRAL_API_KEY=your-mistral-key
 # GEMINI_API_TOKEN=your-gemini-token
 # Optional: Global LLM settings
 # LLM_PROVIDER=openai/gpt-4o-mini
 # LLM_TEMPERATURE=0.7
 # LLM_BASE_URL=https://api.custom.com/v1
 # Optional: Provider-specific overrides
 # OPENAI_TEMPERATURE=0.5
 # OPENAI_BASE_URL=https://custom-openai.com/v1
 # ANTHROPIC_TEMPERATURE=0.3
 EOL
 ```
 > 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
@@ -166,43 +156,27 @@ cp deploy/docker/.llm.env.example .llm.env
 **Flexible LLM Provider Configuration:**
-The Docker setup now supports flexible LLM provider configuration through a hierarchical system:
+The Docker setup now supports flexible LLM provider configuration through three methods:
-1. **API Request Parameters** (Highest Priority): Specify per request
+1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
   ```bash
   export LLM_PROVIDER="anthropic/claude-3-opus"
   # Or in your .llm.env file:
   # LLM_PROVIDER=anthropic/claude-3-opus
   ```
 2. **API Request Parameter**: Specify provider per request
   ```json
   {
     "url": "https://example.com",
     "f": "llm",
-     "provider": "groq/mixtral-8x7b",
+     "provider": "groq/mixtral-8x7b"
     "temperature": 0.7,
     "base_url": "https://api.custom.com/v1"
   }
   ```
-2. **Provider-Specific Environment Variables**: Override for specific providers
+3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
   ```bash
   # In your .llm.env file:
   OPENAI_TEMPERATURE=0.5
   OPENAI_BASE_URL=https://custom-openai.com/v1
   ANTHROPIC_TEMPERATURE=0.3
   ```
-3. **Global Environment Variables**: Set defaults for all providers
+The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
   ```bash
   # In your .llm.env file:
   LLM_PROVIDER=anthropic/claude-3-opus
   LLM_TEMPERATURE=0.7
   LLM_BASE_URL=https://api.proxy.com/v1
   ```
 4. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
 The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
 **Supported LLM Parameters:**
 - `provider`: LLM provider and model (e.g., "openai/gpt-4", "anthropic/claude-3-opus")
 - `temperature`: Controls randomness (0.0-2.0, lower = more focused, higher = more creative)
 - `base_url`: Custom API endpoint for proxy servers or alternative endpoints
 #### 3. Build and Run with Compose
@@ -581,101 +555,6 @@ Crucially, when sending configurations directly via JSON, they **must** follow t
 **LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
 *(Keep Deep Crawler Example)*
 ### LLM Configuration Examples
 The Docker API supports dynamic LLM configuration through multiple levels:
 #### Temperature Control
 Temperature affects the randomness of LLM responses (0.0 = deterministic, 2.0 = very creative):
 ```python
 import requests
 # Low temperature for factual extraction
 response = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com",
        "f": "llm",
        "q": "Extract all dates and numbers from this page",
        "temperature": 0.2  # Very focused, deterministic
    }
 )
 # High temperature for creative tasks
 response = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com", 
        "f": "llm",
        "q": "Write a creative summary of this content",
        "temperature": 1.2  # More creative, varied responses
    }
 )
 ```
 #### Custom API Endpoints
 Use custom base URLs for proxy servers or alternative API endpoints:
 ```python
 # Using a local LLM server
 response = requests.post(
    "http://localhost:11235/md",
    json={
        "url": "https://example.com",
        "f": "llm",
        "q": "Extract key information",
        "provider": "ollama/llama2",
        "base_url": "http://localhost:11434/v1"
    }
 )
 ```
 #### Dynamic Provider Selection
 Switch between providers based on task requirements:
 ```python
 async def smart_extraction(url: str, content_type: str):
    """Select provider and temperature based on content type"""
    configs = {
        "technical": {
            "provider": "openai/gpt-4",
            "temperature": 0.3,
            "query": "Extract technical specifications and code examples"
        },
        "creative": {
            "provider": "anthropic/claude-3-opus",
            "temperature": 0.9,
            "query": "Create an engaging narrative summary"
        },
        "quick": {
            "provider": "groq/mixtral-8x7b",
            "temperature": 0.5,
            "query": "Quick summary in bullet points"
        }
    }
    config = configs.get(content_type, configs["quick"])
    response = await httpx.post(
        "http://localhost:11235/md",
        json={
            "url": url,
            "f": "llm",
            "q": config["query"],
            "provider": config["provider"],
            "temperature": config["temperature"]
        }
    )
    return response.json()
 ```
 ### REST API Examples
 Update URLs to use port `11235`.
@@ -814,8 +693,8 @@ app:
 # Default LLM Configuration
 llm:
  provider: "openai/gpt-4o-mini"  # Can be overridden by LLM_PROVIDER env var
-  # api_key: sk-...  # If you pass the API key directly (not recommended)
+  api_key_env: "OPENAI_API_KEY"
-  # temperature and base_url are controlled via environment variables or request parameters
+  # api_key: sk-...  # If you pass the API key directly then api_key_env will be ignored
 # Redis Configuration (Used by internal Redis server managed by supervisord)
 redis:
--- a/tests/docker/test_llm_params.py
+++ b/tests/docker/test_llm_params.py
@@ -1,349 +0,0 @@
 #!/usr/bin/env python3
 """
 Test script for LLM temperature and base_url parameters in Crawl4AI Docker API.
 This demonstrates the new hierarchical configuration system:
 1. Request-level parameters (highest priority)
 2. Provider-specific environment variables
 3. Global environment variables
 4. System defaults (lowest priority)
 """
 import asyncio
 import httpx
 import json
 import os
 from rich.console import Console
 from rich.panel import Panel
 from rich.syntax import Syntax
 from rich.table import Table
 # Shared Rich console that the functions below use for their output
 console = Console()
 # Configuration
 BASE_URL = "http://localhost:11235"  # Docker API endpoint
 TEST_URL = "https://httpbin.org/html"     # Simple test page
 # --- Helper Functions ---
 async def check_server_health(client: httpx.AsyncClient) -> bool:
    """Probe ``/health`` and report whether the server answered successfully.

    Returns:
        ``True`` when the request succeeds with a non-error status,
        ``False`` (after printing a hint) on any exception.
    """
    console.print("[bold cyan]Checking server health...[/]", end="")
    try:
        health = await client.get("/health", timeout=10.0)
        health.raise_for_status()
        console.print(" [bold green]✓ Server is healthy![/]")
    except Exception as exc:
        # Any failure (connection, timeout, HTTP error) counts as unhealthy.
        console.print(f"\n[bold red]✗ Server health check failed: {exc}[/]")
        console.print(f"Is the server running at {BASE_URL}?")
        return False
    return True
 def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Print a panel showing the request line and its JSON payload.

    Args:
        endpoint: Path the payload is POSTed to (shown in the header line).
        payload: Request body; rendered as indented, syntax-highlighted JSON.
        title: Panel title.
    """
    # Imported here so the block does not depend on a file-level import change.
    from rich.console import Group

    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    # Interpolating the Syntax object into an f-string would print its repr
    # ("<rich.syntax.Syntax object at 0x...>"); grouping keeps it a renderable.
    console.print(Panel.fit(
        Group(f"[cyan]POST {endpoint}[/cyan]", syntax),
        title=f"[bold blue]{title}[/]",
        border_style="blue"
    ))
 def print_response(response: dict, title: str = "Response"):
    """Print a panel with the interesting fields of an API response.

    Only ``markdown`` (truncated to 200 characters plus ``...``), ``success``,
    ``url`` and ``filter`` are shown, and only when present in ``response``.
    """
    summary = {}
    if "markdown" in response:
        markdown = response["markdown"]
        if len(markdown) > 200:
            summary["markdown"] = markdown[:200] + "..."
        else:
            summary["markdown"] = markdown
    # Remaining fields are copied through unchanged, in display order.
    for field in ("success", "url", "filter"):
        if field in response:
            summary[field] = response[field]
    body = Syntax(json.dumps(summary, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        body,
        title=f"[bold green]{title}[/]",
        border_style="green"
    ))
 # --- Test Functions ---
 async def test_default_no_params(client: httpx.AsyncClient):
    """Test 1: call ``/md`` without temperature or base_url in the request."""
    console.rule("[bold yellow]Test 1: Default Configuration (No Parameters)[/]")
    request_body = dict(
        url=TEST_URL,
        f="llm",
        q="What is the main heading of this page? Answer in exactly 5 words.",
    )
    print_request("/md", request_body, "Request without temperature/base_url")
    try:
        reply = await client.post("/md", json=request_body, timeout=30.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response (using system defaults)")
        console.print("[dim]→ This used system defaults or environment variables if set[/]")
    except Exception as exc:
        # Failures are reported, not raised, so later tests still run.
        console.print(f"[red]Error: {exc}[/]")
 async def test_request_temperature(client: httpx.AsyncClient):
    """Test 2: send the same prompt with a low and then a high temperature."""
    console.rule("[bold yellow]Test 2: Request-Level Temperature[/]")
    # (temperature, request title, response title, explanatory note)
    runs = (
        (
            0.1,  # Very low - should be less creative
            "Low Temperature (0.1)",
            "Response with Low Temperature",
            "[dim]→ Low temperature (0.1) should produce focused, less creative output[/]",
        ),
        (
            1.5,  # High - should be more creative
            "High Temperature (1.5)",
            "Response with High Temperature",
            "[dim]→ High temperature (1.5) should produce more creative, varied output[/]",
        ),
    )
    for index, (temperature, request_title, response_title, note) in enumerate(runs):
        if index:
            console.print()  # blank line between the two runs
        body = {
            "url": TEST_URL,
            "f": "llm",
            "q": "What is the main heading? Be creative and poetic.",
            "temperature": temperature,
        }
        print_request("/md", body, request_title)
        try:
            reply = await client.post("/md", json=body, timeout=30.0)
            reply.raise_for_status()
            print_response(reply.json(), response_title)
            console.print(note)
        except Exception as exc:
            console.print(f"[red]Error: {exc}[/]")
 async def test_provider_override(client: httpx.AsyncClient):
    """Test 3: name the provider explicitly and set a temperature with it."""
    console.rule("[bold yellow]Test 3: Provider Override with Temperature[/]")
    provider = "gemini/gemini-2.5-flash-lite"
    request_body = dict(
        url=TEST_URL,
        f="llm",
        q="Summarize this page in one sentence.",
        provider=provider,  # explicit provider for this request
        temperature=0.7,
    )
    print_request("/md", request_body, "Provider + Temperature Override")
    try:
        reply = await client.post("/md", json=request_body, timeout=30.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response with Provider Override")
        console.print(f"[dim]→ This explicitly uses {provider} with temperature 0.7[/]")
    except Exception as exc:
        console.print(f"[red]Error: {exc}[/]")
 async def test_base_url_custom(client: httpx.AsyncClient):
    """Test 4: send a custom ``base_url``; failure is the expected outcome here."""
    console.rule("[bold yellow]Test 4: Custom Base URL (Demo Only)[/]")
    request_body = dict(
        url=TEST_URL,
        f="llm",
        q="What is this page about?",
        base_url="https://api.custom-endpoint.com/v1",  # placeholder endpoint
        temperature=0.5,
    )
    print_request("/md", request_body, "Custom Base URL Request")
    console.print("[yellow]Note: This will fail unless you have a custom endpoint set up[/]")
    try:
        reply = await client.post("/md", json=request_body, timeout=10.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response from Custom Endpoint")
    except httpx.HTTPStatusError as status_error:
        # The server answered, but with an error status.
        console.print(f"[yellow]Expected failure (no custom endpoint): Status {status_error.response.status_code}[/]")
    except Exception as exc:
        console.print(f"[yellow]Expected error: {exc}[/]")
 async def test_llm_job_endpoint(client: httpx.AsyncClient):
    """Test 5: submit a job to ``/llm/job`` with a temperature and poll it once."""
    console.rule("[bold yellow]Test 5: LLM Job Endpoint with Parameters[/]")
    request_body = {
        "url": TEST_URL,
        "q": "Extract the main title and any key information",
        "temperature": 0.3,
        # "base_url": "https://api.openai.com/v1"  # Optional
    }
    print_request("/llm/job", request_body, "LLM Job with Temperature")
    try:
        submission = await client.post("/llm/job", json=request_body, timeout=30.0)
        submission.raise_for_status()
        job = submission.json()
        if "task_id" not in job:
            console.print(f"[red]Unexpected response: {job}[/]")
            return
        task_id = job["task_id"]
        console.print(f"[green]Job created with task_id: {task_id}[/]")
        # Single fixed-delay poll (simplified - in production use proper polling)
        await asyncio.sleep(3)
        poll = await client.get(f"/llm/job/{task_id}")
        status = poll.json()
        if status.get("status") != "completed":
            console.print(f"[yellow]Job status: {status.get('status', 'unknown')}[/]")
            return
        console.print("[green]Job completed successfully![/]")
        if "result" in status:
            rendered = Syntax(json.dumps(status["result"], indent=2), "json", theme="monokai")
            console.print(Panel.fit(
                rendered,
                title="Extraction Result",
                border_style="green"
            ))
    except Exception as exc:
        console.print(f"[red]Error: {exc}[/]")
 async def test_llm_endpoint(client: httpx.AsyncClient):
    """
    Quick QA round-trip with /llm.
    Asks a trivial question about a fixed page URL and prints the status,
    the elapsed time and the returned answer.
    """
    import time
    import urllib.parse
    target = "https://kidocode.com"
    question = "What is the title of this page?"
    # The page URL travels as a path segment, so it is fully percent-encoded.
    encoded = urllib.parse.quote_plus(target, safe="")
    console.print(f"GET /llm/{encoded}?q={question}")
    try:
        started = time.time()
        resp = await client.get(f"/llm/{encoded}", params={"q": question})
        elapsed = time.time() - started
        colour = 'green' if resp.is_success else 'red'
        console.print(
            f"Response Status: [bold {colour}]{resp.status_code}[/] (took {elapsed:.2f}s)")
        resp.raise_for_status()
        answer = resp.json().get("answer", "")
        console.print(Panel(answer or "No answer returned",
                      title="LLM answer", border_style="magenta", expand=False))
    except Exception as exc:
        console.print(f"[bold red]Error hitting /llm:[/] {exc}")
 async def show_environment_info():
    """Print a table of the LLM-related environment variables.

    Unset variables are shown as ``[not set]``. Values of variables whose
    name contains ``API_KEY`` are masked: the first 10 characters followed
    by ``...``, or ``***`` when the value is 10 characters or shorter.
    """
    # Imported here so the block does not depend on a file-level import change.
    from rich.markup import escape

    console.rule("[bold cyan]Current Environment Configuration[/]")
    table = Table(title="LLM Environment Variables", show_header=True, header_style="bold magenta")
    table.add_column("Variable", style="cyan", width=30)
    table.add_column("Value", style="yellow")
    table.add_column("Description", style="dim")
    env_vars = [
        ("LLM_PROVIDER", "Global default provider"),
        ("LLM_TEMPERATURE", "Global default temperature"),
        ("LLM_BASE_URL", "Global custom API endpoint"),
        ("OPENAI_API_KEY", "OpenAI API key"),
        ("OPENAI_TEMPERATURE", "OpenAI-specific temperature"),
        ("OPENAI_BASE_URL", "OpenAI-specific endpoint"),
        ("ANTHROPIC_API_KEY", "Anthropic API key"),
        ("ANTHROPIC_TEMPERATURE", "Anthropic-specific temperature"),
        ("GROQ_API_KEY", "Groq API key"),
        ("GROQ_TEMPERATURE", "Groq-specific temperature"),
    ]
    for var, desc in env_vars:
        value = os.environ.get(var)
        if value is None:
            shown = "[not set]"
        elif "API_KEY" in var:
            # Mask API keys for security
            shown = value[:10] + "..." if len(value) > 10 else "***"
        else:
            shown = value
        # Cell strings are parsed as Rich markup: an unescaped "[not set]"
        # (or any bracketed text in a value) is consumed as a style tag and
        # renders as nothing, so the text is escaped first.
        table.add_row(var, escape(shown), desc)
    console.print(table)
    console.print()
 # --- Main Test Runner ---
 async def main():
    """Run the health check, then every test in order, then print a summary."""
    banner = (
        "[bold cyan]Crawl4AI LLM Parameters Test Suite[/]\n"
        "Testing temperature and base_url configuration hierarchy"
    )
    console.print(Panel.fit(banner, border_style="cyan"))
    # Environment table is currently disabled:
    # await show_environment_info()
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
        # Nothing below can work without a reachable server.
        healthy = await check_server_health(client)
        if not healthy:
            console.print("[red]Server is not available. Please ensure the Docker container is running.[/]")
            return
        suite = [
            ("Default Configuration", test_default_no_params),
            ("Request Temperature", test_request_temperature),
            ("Provider Override", test_provider_override),
            ("Custom Base URL", test_base_url_custom),
            ("LLM Job Endpoint", test_llm_job_endpoint),
            ("LLM Endpoint", test_llm_endpoint),
        ]
        for position, (label, runner) in enumerate(suite):
            if position:
                console.print()  # Add spacing between tests
            try:
                await runner(client)
            except Exception as exc:
                # One failing test must not stop the remaining ones.
                console.print(f"[red]Test '{label}' failed with error: {exc}[/]")
                console.print_exception(show_locals=False)
        console.rule("[bold green]All Tests Complete![/]", style="green")
        summary_lines = (
            "\n[bold cyan]Configuration Hierarchy Summary:[/]",
            "1. [yellow]Request parameters[/] - Highest priority (temperature, base_url in API call)",
            "2. [yellow]Provider-specific env[/] - e.g., OPENAI_TEMPERATURE, GROQ_BASE_URL",
            "3. [yellow]Global env variables[/] - LLM_TEMPERATURE, LLM_BASE_URL",
            "4. [yellow]System defaults[/] - Lowest priority (provider/litellm defaults)",
        )
        for line in summary_lines:
            console.print(line)
        console.print()
 if __name__ == "__main__":
    # Script entry point: run the async suite and report how it ended.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        console.print("\n[yellow]Tests interrupted by user.[/]")
    except Exception:
        # The traceback itself is rendered by print_exception below.
        console.print("\n[bold red]An error occurred:[/]")
        console.print_exception(show_locals=False)
--- a/tests/test_preserve_https_for_internal_links.py
+++ b/tests/test_preserve_https_for_internal_links.py
@@ -0,0 +1,175 @@
 #!/usr/bin/env python3
 """
 Final test and demo for HTTPS preservation feature (Issue #1410)
 This demonstrates how the preserve_https_for_internal_links flag
 prevents HTTPS downgrade when servers redirect to HTTP.
 """
 import sys
 import os
 from urllib.parse import urljoin, urlparse
 def demonstrate_issue():
    """Print how an HTTPS -> HTTP redirect turns a relative link into HTTP.

    Returns:
        The relative link resolved against the redirected (HTTP) URL.
    """
    divider = "=" * 60
    print(divider)
    print("DEMONSTRATING THE ISSUE")
    print(divider)
    # The crawl starts on HTTPS, but the server answers from an HTTP URL.
    requested = "https://quotes.toscrape.com/tag/deep-thoughts"
    landed = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP
    href = "/author/Albert-Einstein"
    # Plain urljoin inherits the scheme of the redirected base.
    joined = urljoin(landed, href)
    report = (
        ("Original URL:    ", requested),
        ("Redirected to:   ", landed),
        ("Relative link:   ", href),
        ("Resolved link:   ", joined),
    )
    for label, value in report:
        print(f"{label}{value}")
    print("\n❌ Problem: Link is now HTTP instead of HTTPS!")
    return joined
 def demonstrate_solution():
    """Print the same redirect scenario with and without HTTPS preservation.

    Returns:
        The relative link resolved with preservation enabled.
    """
    print("\n" + "=" * 60)
    print("DEMONSTRATING THE SOLUTION")
    print("=" * 60)

    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
        """Resolve ``href`` against ``base_url``, optionally keeping HTTPS.

        The scheme is upgraded only when preservation is on, the crawl
        started on HTTPS, the resolved link is HTTP on the same host as
        ``base_url``, and ``href`` is not protocol-relative (``//host/...``),
        which is left on the base URL's scheme.
        """
        # Standard resolution
        full_url = urljoin(base_url, href.strip())
        if preserve_https and original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            same_host_http = (
                parsed_full.scheme == 'http'
                and parsed_full.netloc == parsed_base.netloc
            )
            if same_host_http and not href.strip().startswith('//'):
                full_url = full_url.replace('http://', 'https://', 1)
                print(f"  → Preserved HTTPS for {parsed_full.netloc}")
        return full_url

    # Same scenario as the issue demo: HTTPS request, HTTP redirect target.
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"
    # Take the scheme from the URL that was actually requested.
    original_scheme = urlparse(original_url).scheme
    # Without preservation (current behavior)
    resolved_without = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=False, original_scheme=original_scheme
    )
    print("\nWithout preservation:")
    print(f"  Result: {resolved_without}")
    # With preservation (new feature)
    resolved_with = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=True, original_scheme=original_scheme
    )
    print("\nWith preservation (preserve_https_for_internal_links=True):")
    print(f"  Result: {resolved_with}")
    print("\n✅ Solution: Internal link stays HTTPS!")
    return resolved_with
 def test_edge_cases():
    """Check the HTTPS-preservation rule against a table of edge cases.

    Each case is printed with a ✅/❌ marker, and the function fails with an
    ``AssertionError`` listing every case whose result did not match.

    Raises:
        AssertionError: if any case resolves to the wrong scheme.
    """
    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)

    def preserve_https(href, base_url, original_scheme):
        """Resolve ``href`` and upgrade same-host HTTP links to HTTPS."""
        full_url = urljoin(base_url, href)
        if original_scheme == 'https':
            parsed_full = urlparse(full_url)
            parsed_base = urlparse(base_url)
            # Protocol-relative hrefs (//host/...) keep the base URL's scheme.
            if (parsed_full.scheme == 'http' and
                    parsed_full.netloc == parsed_base.netloc and
                    not href.strip().startswith('//')):
                full_url = full_url.replace('http://', 'https://', 1)
        return full_url

    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
    ]
    failures = []
    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        passed = is_https == should_be_https
        if not passed:
            failures.append(desc)
        status = "✅" if passed else "❌"
        print(f"\n{status} {desc}:")
        print(f"  Input: {href} + {base_url}")
        print(f"  Result: {result}")
        print(f"  Expected HTTPS: {should_be_https}, Got: {is_https}")
    # Printing alone never fails: without this, pytest reports success even
    # when a case is marked ❌.
    assert not failures, f"HTTPS preservation edge cases failed: {failures}"
 def usage_example():
    """Print a banner followed by a usage snippet for the feature.

    The snippet is a fixed text block showing
    ``preserve_https_for_internal_links=True`` being passed to
    ``CrawlerRunConfig``; nothing from crawl4ai is imported or run here.
    """
    print("\n" + "=" * 60)
    print("USAGE IN CRAWL4AI")
    print("=" * 60)
    # The text below is printed verbatim, including its Markdown code fence.
    print("""
 To enable HTTPS preservation in your crawl4ai code:
 ```python
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
 async with AsyncWebCrawler() as crawler:
    config = CrawlerRunConfig(
        preserve_https_for_internal_links=True  # Enable HTTPS preservation
    )
    result = await crawler.arun(
        url="https://example.com",
        config=config
    )
    # All internal links will maintain HTTPS even if 
    # the server redirects to HTTP
 ```
 This is especially useful for:
 - Sites that redirect HTTPS to HTTP but still support HTTPS
 - Security-conscious crawling where you want to stay on HTTPS
 - Avoiding mixed content issues in downstream processing
 """)
 if __name__ == "__main__":
    # Run every demonstration in order, then print a closing banner.
    steps = (demonstrate_issue, demonstrate_solution, test_edge_cases, usage_example)
    for step in steps:
        step()
    divider = "=" * 60
    print("\n" + divider)
    print("✅ All tests complete!")
    print(divider)
Author	SHA1	Message	Date
ntohidi	bdacf61ca9	feat: update documentation for preserve_https_for_internal_links. ref #1410	2025-08-28 17:48:12 +08:00
ntohidi	f566c5a376	feat: add preserve_https_for_internal_links flag to maintain HTTPS during crawling. Ref #1410 Added a new `preserve_https_for_internal_links` configuration flag that preserves the original HTTPS scheme for same-domain links even when the server redirects to HTTP.	2025-08-28 17:38:40 +08:00