Compare commits
2 Commits
feature/do
...
fix/https-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bdacf61ca9 | ||
|
|
f566c5a376 |
10
CHANGELOG.md
10
CHANGELOG.md
@@ -5,6 +5,16 @@ All notable changes to Crawl4AI will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **🔒 HTTPS Preservation for Internal Links**: New `preserve_https_for_internal_links` configuration flag
|
||||||
|
- Maintains HTTPS scheme for internal links even when servers redirect to HTTP
|
||||||
|
- Prevents security downgrades during deep crawling
|
||||||
|
- Useful for security-conscious crawling and sites supporting both protocols
|
||||||
|
- Fully backward compatible with opt-in flag (default: `False`)
|
||||||
|
- Fixes issue #1410 where HTTPS URLs were being downgraded to HTTP
|
||||||
|
|
||||||
## [0.7.3] - 2025-08-09
|
## [0.7.3] - 2025-08-09
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -304,9 +304,9 @@ The new Docker implementation includes:
|
|||||||
### Getting Started
|
### Getting Started
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull and run the latest release
|
# Pull and run the latest release candidate
|
||||||
docker pull unclecode/crawl4ai:latest
|
docker pull unclecode/crawl4ai:0.7.0
|
||||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
|
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
|
||||||
|
|
||||||
# Visit the playground at http://localhost:11235/playground
|
# Visit the playground at http://localhost:11235/playground
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -1121,6 +1121,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains: list = None,
|
exclude_domains: list = None,
|
||||||
exclude_internal_links: bool = False,
|
exclude_internal_links: bool = False,
|
||||||
score_links: bool = False,
|
score_links: bool = False,
|
||||||
|
preserve_https_for_internal_links: bool = False,
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
@@ -1244,6 +1245,7 @@ class CrawlerRunConfig():
|
|||||||
self.exclude_domains = exclude_domains or []
|
self.exclude_domains = exclude_domains or []
|
||||||
self.exclude_internal_links = exclude_internal_links
|
self.exclude_internal_links = exclude_internal_links
|
||||||
self.score_links = score_links
|
self.score_links = score_links
|
||||||
|
self.preserve_https_for_internal_links = preserve_https_for_internal_links
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
@@ -1517,6 +1519,7 @@ class CrawlerRunConfig():
|
|||||||
exclude_domains=kwargs.get("exclude_domains", []),
|
exclude_domains=kwargs.get("exclude_domains", []),
|
||||||
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||||
score_links=kwargs.get("score_links", False),
|
score_links=kwargs.get("score_links", False),
|
||||||
|
preserve_https_for_internal_links=kwargs.get("preserve_https_for_internal_links", False),
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
@@ -1623,6 +1626,7 @@ class CrawlerRunConfig():
|
|||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
"exclude_internal_links": self.exclude_internal_links,
|
"exclude_internal_links": self.exclude_internal_links,
|
||||||
"score_links": self.score_links,
|
"score_links": self.score_links,
|
||||||
|
"preserve_https_for_internal_links": self.preserve_https_for_internal_links,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"capture_network_requests": self.capture_network_requests,
|
"capture_network_requests": self.capture_network_requests,
|
||||||
|
|||||||
@@ -354,6 +354,7 @@ class AsyncWebCrawler:
|
|||||||
###############################################################
|
###############################################################
|
||||||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
###############################################################
|
###############################################################
|
||||||
|
from urllib.parse import urlparse
|
||||||
crawl_result: CrawlResult = await self.aprocess_html(
|
crawl_result: CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
@@ -364,6 +365,7 @@ class AsyncWebCrawler:
|
|||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
redirected_url=async_response.redirected_url,
|
redirected_url=async_response.redirected_url,
|
||||||
|
original_scheme=urlparse(url).scheme,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -258,7 +258,11 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
normalized_href = normalize_url(href, url)
|
normalized_href = normalize_url(
|
||||||
|
href, url,
|
||||||
|
preserve_https=kwargs.get('preserve_https_for_internal_links', False),
|
||||||
|
original_scheme=kwargs.get('original_scheme')
|
||||||
|
)
|
||||||
link_data = {
|
link_data = {
|
||||||
"href": normalized_href,
|
"href": normalized_href,
|
||||||
"text": link.text_content().strip(),
|
"text": link.text_content().strip(),
|
||||||
|
|||||||
@@ -2146,7 +2146,9 @@ def normalize_url(
|
|||||||
drop_query_tracking=True,
|
drop_query_tracking=True,
|
||||||
sort_query=True,
|
sort_query=True,
|
||||||
keep_fragment=False,
|
keep_fragment=False,
|
||||||
extra_drop_params=None
|
extra_drop_params=None,
|
||||||
|
preserve_https=False,
|
||||||
|
original_scheme=None
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Extended URL normalizer
|
Extended URL normalizer
|
||||||
@@ -2177,6 +2179,17 @@ def normalize_url(
|
|||||||
# Resolve relative paths first
|
# Resolve relative paths first
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse once, edit parts, then rebuild
|
# Parse once, edit parts, then rebuild
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
@@ -2184,10 +2197,8 @@ def normalize_url(
|
|||||||
netloc = parsed.netloc.lower()
|
netloc = parsed.netloc.lower()
|
||||||
|
|
||||||
# ── path ──
|
# ── path ──
|
||||||
# Strip duplicate slashes and trailing "/" (except root)
|
# Strip duplicate slashes and trailing “/” (except root)
|
||||||
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
|
path = quote(unquote(parsed.path))
|
||||||
# The path from urlparse is already properly encoded
|
|
||||||
path = parsed.path
|
|
||||||
if path.endswith('/') and path != '/':
|
if path.endswith('/') and path != '/':
|
||||||
path = path.rstrip('/')
|
path = path.rstrip('/')
|
||||||
|
|
||||||
@@ -2227,7 +2238,7 @@ def normalize_url(
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
def normalize_url_for_deep_crawl(href, base_url):
|
def normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||||
|
|
||||||
@@ -2238,6 +2249,17 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Use urljoin to handle relative URLs
|
# Use urljoin to handle relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Parse the URL for normalization
|
# Parse the URL for normalization
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
@@ -2275,7 +2297,7 @@ def normalize_url_for_deep_crawl(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
def efficient_normalize_url_for_deep_crawl(href, base_url, preserve_https=False, original_scheme=None):
|
||||||
"""Efficient URL normalization with proper parsing"""
|
"""Efficient URL normalization with proper parsing"""
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
@@ -2285,6 +2307,17 @@ def efficient_normalize_url_for_deep_crawl(href, base_url):
|
|||||||
# Resolve relative URLs
|
# Resolve relative URLs
|
||||||
full_url = urljoin(base_url, href.strip())
|
full_url = urljoin(base_url, href.strip())
|
||||||
|
|
||||||
|
# Preserve HTTPS if requested and original scheme was HTTPS
|
||||||
|
if preserve_https and original_scheme == 'https':
|
||||||
|
parsed_full = urlparse(full_url)
|
||||||
|
parsed_base = urlparse(base_url)
|
||||||
|
# Only preserve HTTPS for same-domain links (not protocol-relative URLs)
|
||||||
|
# Protocol-relative URLs (//example.com) should follow the base URL's scheme
|
||||||
|
if (parsed_full.scheme == 'http' and
|
||||||
|
parsed_full.netloc == parsed_base.netloc and
|
||||||
|
not href.strip().startswith('//')):
|
||||||
|
full_url = full_url.replace('http://', 'https://', 1)
|
||||||
|
|
||||||
# Use proper URL parsing
|
# Use proper URL parsing
|
||||||
parsed = urlparse(full_url)
|
parsed = urlparse(full_url)
|
||||||
|
|
||||||
|
|||||||
@@ -11,22 +11,3 @@ GEMINI_API_TOKEN=your_gemini_key_here
|
|||||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
|
||||||
# Optional: Global LLM temperature setting (0.0-2.0)
|
|
||||||
# Controls randomness in responses. Lower = more focused, Higher = more creative
|
|
||||||
# LLM_TEMPERATURE=0.7
|
|
||||||
|
|
||||||
# Optional: Global custom API base URL
|
|
||||||
# Use this to point to custom endpoints or proxy servers
|
|
||||||
# LLM_BASE_URL=https://api.custom.com/v1
|
|
||||||
|
|
||||||
# Optional: Provider-specific temperature overrides
|
|
||||||
# These take precedence over the global LLM_TEMPERATURE
|
|
||||||
# OPENAI_TEMPERATURE=0.5
|
|
||||||
# ANTHROPIC_TEMPERATURE=0.3
|
|
||||||
# GROQ_TEMPERATURE=0.8
|
|
||||||
|
|
||||||
# Optional: Provider-specific base URL overrides
|
|
||||||
# Use for provider-specific proxy endpoints
|
|
||||||
# OPENAI_BASE_URL=https://custom-openai.company.com/v1
|
|
||||||
# GROQ_BASE_URL=https://custom-groq.company.com/v1
|
|
||||||
@@ -692,7 +692,8 @@ app:
|
|||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
api_key_env: "OPENAI_API_KEY"
|
||||||
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||||
redis:
|
redis:
|
||||||
|
|||||||
@@ -42,9 +42,7 @@ from utils import (
|
|||||||
should_cleanup_task,
|
should_cleanup_task,
|
||||||
decode_redis_hash,
|
decode_redis_hash,
|
||||||
get_llm_api_key,
|
get_llm_api_key,
|
||||||
validate_llm_provider,
|
validate_llm_provider
|
||||||
get_llm_temperature,
|
|
||||||
get_llm_base_url
|
|
||||||
)
|
)
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
@@ -98,9 +96,7 @@ async def handle_llm_qa(
|
|||||||
response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
provider=config["llm"]["provider"],
|
provider=config["llm"]["provider"],
|
||||||
prompt_with_variables=prompt,
|
prompt_with_variables=prompt,
|
||||||
api_token=get_llm_api_key(config), # Returns None to let litellm handle it
|
api_token=get_llm_api_key(config)
|
||||||
temperature=get_llm_temperature(config),
|
|
||||||
base_url=get_llm_base_url(config)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
@@ -119,9 +115,7 @@ async def process_llm_extraction(
|
|||||||
instruction: str,
|
instruction: str,
|
||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None
|
||||||
temperature: Optional[float] = None,
|
|
||||||
base_url: Optional[str] = None
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
try:
|
try:
|
||||||
@@ -133,13 +127,11 @@ async def process_llm_extraction(
|
|||||||
"error": error_msg
|
"error": error_msg
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
|
api_key = get_llm_api_key(config, provider)
|
||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=provider or config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=api_key,
|
api_token=api_key
|
||||||
temperature=temperature or get_llm_temperature(config, provider),
|
|
||||||
base_url=base_url or get_llm_base_url(config, provider)
|
|
||||||
),
|
),
|
||||||
instruction=instruction,
|
instruction=instruction,
|
||||||
schema=json.loads(schema) if schema else None,
|
schema=json.loads(schema) if schema else None,
|
||||||
@@ -186,9 +178,7 @@ async def handle_markdown_request(
|
|||||||
query: Optional[str] = None,
|
query: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None
|
||||||
temperature: Optional[float] = None,
|
|
||||||
base_url: Optional[str] = None
|
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Handle markdown generation requests."""
|
"""Handle markdown generation requests."""
|
||||||
try:
|
try:
|
||||||
@@ -213,9 +203,7 @@ async def handle_markdown_request(
|
|||||||
FilterType.LLM: LLMContentFilter(
|
FilterType.LLM: LLMContentFilter(
|
||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=provider or config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
|
api_token=get_llm_api_key(config, provider),
|
||||||
temperature=temperature or get_llm_temperature(config, provider),
|
|
||||||
base_url=base_url or get_llm_base_url(config, provider)
|
|
||||||
),
|
),
|
||||||
instruction=query or "Extract main content"
|
instruction=query or "Extract main content"
|
||||||
)
|
)
|
||||||
@@ -260,9 +248,7 @@ async def handle_llm_request(
|
|||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None
|
||||||
temperature: Optional[float] = None,
|
|
||||||
api_base_url: Optional[str] = None
|
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Handle LLM extraction requests."""
|
"""Handle LLM extraction requests."""
|
||||||
base_url = get_base_url(request)
|
base_url = get_base_url(request)
|
||||||
@@ -293,9 +279,7 @@ async def handle_llm_request(
|
|||||||
cache,
|
cache,
|
||||||
base_url,
|
base_url,
|
||||||
config,
|
config,
|
||||||
provider,
|
provider
|
||||||
temperature,
|
|
||||||
api_base_url
|
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -340,9 +324,7 @@ async def create_new_task(
|
|||||||
cache: str,
|
cache: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
provider: Optional[str] = None,
|
provider: Optional[str] = None
|
||||||
temperature: Optional[float] = None,
|
|
||||||
api_base_url: Optional[str] = None
|
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Create and initialize a new task."""
|
"""Create and initialize a new task."""
|
||||||
decoded_url = unquote(input_path)
|
decoded_url = unquote(input_path)
|
||||||
@@ -367,9 +349,7 @@ async def create_new_task(
|
|||||||
query,
|
query,
|
||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
provider,
|
provider
|
||||||
temperature,
|
|
||||||
api_base_url
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
|
|||||||
@@ -11,7 +11,8 @@ app:
|
|||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini"
|
provider: "openai/gpt-4o-mini"
|
||||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
api_key_env: "OPENAI_API_KEY"
|
||||||
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
# Redis Configuration
|
# Redis Configuration
|
||||||
redis:
|
redis:
|
||||||
|
|||||||
@@ -37,8 +37,6 @@ class LlmJobPayload(BaseModel):
|
|||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None
|
||||||
temperature: Optional[float] = None
|
|
||||||
base_url: Optional[str] = None
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlJobPayload(BaseModel):
|
class CrawlJobPayload(BaseModel):
|
||||||
@@ -65,8 +63,6 @@ async def llm_job_enqueue(
|
|||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
provider=payload.provider,
|
provider=payload.provider,
|
||||||
temperature=payload.temperature,
|
|
||||||
api_base_url=payload.base_url,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -76,7 +72,7 @@ async def llm_job_status(
|
|||||||
task_id: str,
|
task_id: str,
|
||||||
_td: Dict = Depends(lambda: _token_dep())
|
_td: Dict = Depends(lambda: _token_dep())
|
||||||
):
|
):
|
||||||
return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
|
return await handle_task_status(_redis, task_id)
|
||||||
|
|
||||||
|
|
||||||
# ---------- CRAWL job -------------------------------------------------------
|
# ---------- CRAWL job -------------------------------------------------------
|
||||||
|
|||||||
@@ -16,8 +16,6 @@ class MarkdownRequest(BaseModel):
|
|||||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||||
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
|
|
||||||
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
|
||||||
|
|
||||||
|
|
||||||
class RawCode(BaseModel):
|
class RawCode(BaseModel):
|
||||||
|
|||||||
@@ -241,8 +241,7 @@ async def get_markdown(
|
|||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
||||||
markdown = await handle_markdown_request(
|
markdown = await handle_markdown_request(
|
||||||
body.url, body.f, body.q, body.c, config, body.provider,
|
body.url, body.f, body.q, body.c, config, body.provider
|
||||||
body.temperature, body.base_url
|
|
||||||
)
|
)
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"url": body.url,
|
"url": body.url,
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[str]:
|
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
|
||||||
"""Get the appropriate API key based on the LLM provider.
|
"""Get the appropriate API key based on the LLM provider.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -79,14 +79,19 @@ def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[st
|
|||||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
provider: Optional provider override (e.g., "openai/gpt-4")
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The API key if directly configured, otherwise None to let litellm handle it
|
The API key for the provider, or empty string if not found
|
||||||
"""
|
"""
|
||||||
# Check if direct API key is configured (for backward compatibility)
|
|
||||||
|
# Use provided provider or fall back to config
|
||||||
|
if not provider:
|
||||||
|
provider = config["llm"]["provider"]
|
||||||
|
|
||||||
|
# Check if direct API key is configured
|
||||||
if "api_key" in config["llm"]:
|
if "api_key" in config["llm"]:
|
||||||
return config["llm"]["api_key"]
|
return config["llm"]["api_key"]
|
||||||
|
|
||||||
# Return None - litellm will automatically find the right environment variable
|
# Fall back to the configured api_key_env if no match
|
||||||
return None
|
return os.environ.get(config["llm"].get("api_key_env", ""), "")
|
||||||
|
|
||||||
|
|
||||||
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
|
||||||
@@ -99,77 +104,18 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (is_valid, error_message)
|
Tuple of (is_valid, error_message)
|
||||||
"""
|
"""
|
||||||
# If a direct API key is configured, validation passes
|
# Use provided provider or fall back to config
|
||||||
if "api_key" in config["llm"]:
|
if not provider:
|
||||||
|
provider = config["llm"]["provider"]
|
||||||
|
|
||||||
|
# Get the API key for this provider
|
||||||
|
api_key = get_llm_api_key(config, provider)
|
||||||
|
|
||||||
|
if not api_key:
|
||||||
|
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
|
||||||
|
|
||||||
return True, ""
|
return True, ""
|
||||||
|
|
||||||
# Otherwise, trust that litellm will find the appropriate environment variable
|
|
||||||
# We can't easily validate this without reimplementing litellm's logic
|
|
||||||
return True, ""
|
|
||||||
|
|
||||||
|
|
||||||
def get_llm_temperature(config: Dict, provider: Optional[str] = None) -> Optional[float]:
|
|
||||||
"""Get temperature setting based on the LLM provider.
|
|
||||||
|
|
||||||
Priority order:
|
|
||||||
1. Provider-specific environment variable (e.g., OPENAI_TEMPERATURE)
|
|
||||||
2. Global LLM_TEMPERATURE environment variable
|
|
||||||
3. None (to use litellm/provider defaults)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config: The application configuration dictionary
|
|
||||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The temperature setting if configured, otherwise None
|
|
||||||
"""
|
|
||||||
# Check provider-specific temperature first
|
|
||||||
if provider:
|
|
||||||
provider_name = provider.split('/')[0].upper()
|
|
||||||
provider_temp = os.environ.get(f"{provider_name}_TEMPERATURE")
|
|
||||||
if provider_temp:
|
|
||||||
try:
|
|
||||||
return float(provider_temp)
|
|
||||||
except ValueError:
|
|
||||||
logging.warning(f"Invalid temperature value for {provider_name}: {provider_temp}")
|
|
||||||
|
|
||||||
# Check global LLM_TEMPERATURE
|
|
||||||
global_temp = os.environ.get("LLM_TEMPERATURE")
|
|
||||||
if global_temp:
|
|
||||||
try:
|
|
||||||
return float(global_temp)
|
|
||||||
except ValueError:
|
|
||||||
logging.warning(f"Invalid global temperature value: {global_temp}")
|
|
||||||
|
|
||||||
# Return None to use litellm/provider defaults
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_llm_base_url(config: Dict, provider: Optional[str] = None) -> Optional[str]:
|
|
||||||
"""Get base URL setting based on the LLM provider.
|
|
||||||
|
|
||||||
Priority order:
|
|
||||||
1. Provider-specific environment variable (e.g., OPENAI_BASE_URL)
|
|
||||||
2. Global LLM_BASE_URL environment variable
|
|
||||||
3. None (to use default endpoints)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
config: The application configuration dictionary
|
|
||||||
provider: Optional provider override (e.g., "openai/gpt-4")
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The base URL if configured, otherwise None
|
|
||||||
"""
|
|
||||||
# Check provider-specific base URL first
|
|
||||||
if provider:
|
|
||||||
provider_name = provider.split('/')[0].upper()
|
|
||||||
provider_url = os.environ.get(f"{provider_name}_BASE_URL")
|
|
||||||
if provider_url:
|
|
||||||
return provider_url
|
|
||||||
|
|
||||||
# Check global LLM_BASE_URL
|
|
||||||
return os.environ.get("LLM_BASE_URL")
|
|
||||||
|
|
||||||
|
|
||||||
def verify_email_domain(email: str) -> bool:
|
def verify_email_domain(email: str) -> bool:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -155,6 +155,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
|
|||||||
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
| **`exclude_external_links`** | `bool` (False) | Removes all links pointing outside the current domain. |
|
||||||
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
| **`exclude_social_media_links`** | `bool` (False) | Strips links specifically to social sites (like Facebook or Twitter). |
|
||||||
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
| **`exclude_domains`** | `list` ([]) | Provide a custom list of domains to exclude (like `["ads.com", "trackers.io"]`). |
|
||||||
|
| **`preserve_https_for_internal_links`** | `bool` (False) | If `True`, preserves HTTPS scheme for internal links even when the server redirects to HTTP. Useful for security-conscious crawling. |
|
||||||
|
|
||||||
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
Use these for link-level content filtering (often to keep crawls “internal” or to remove spammy domains).
|
||||||
|
|
||||||
|
|||||||
@@ -472,6 +472,17 @@ Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pag
|
|||||||
|
|
||||||
5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
5. **Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
|
||||||
|
|
||||||
|
6. **Preserve HTTPS for security.** If you crawl HTTPS sites that redirect to HTTP, set `preserve_https_for_internal_links=True` so that internal links keep the HTTPS scheme:
|
||||||
|
|
||||||
|
```python
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=2),
|
||||||
|
preserve_https_for_internal_links=True # Keep HTTPS even if server redirects to HTTP
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This is especially useful for security-conscious crawling or when dealing with sites that support both protocols.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Summary & Next Steps
|
## 10. Summary & Next Steps
|
||||||
|
|||||||
@@ -89,16 +89,6 @@ ANTHROPIC_API_KEY=your-anthropic-key
|
|||||||
# TOGETHER_API_KEY=your-together-key
|
# TOGETHER_API_KEY=your-together-key
|
||||||
# MISTRAL_API_KEY=your-mistral-key
|
# MISTRAL_API_KEY=your-mistral-key
|
||||||
# GEMINI_API_TOKEN=your-gemini-token
|
# GEMINI_API_TOKEN=your-gemini-token
|
||||||
|
|
||||||
# Optional: Global LLM settings
|
|
||||||
# LLM_PROVIDER=openai/gpt-4o-mini
|
|
||||||
# LLM_TEMPERATURE=0.7
|
|
||||||
# LLM_BASE_URL=https://api.custom.com/v1
|
|
||||||
|
|
||||||
# Optional: Provider-specific overrides
|
|
||||||
# OPENAI_TEMPERATURE=0.5
|
|
||||||
# OPENAI_BASE_URL=https://custom-openai.com/v1
|
|
||||||
# ANTHROPIC_TEMPERATURE=0.3
|
|
||||||
EOL
|
EOL
|
||||||
```
|
```
|
||||||
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
||||||
@@ -166,43 +156,27 @@ cp deploy/docker/.llm.env.example .llm.env
|
|||||||
|
|
||||||
**Flexible LLM Provider Configuration:**
|
**Flexible LLM Provider Configuration:**
|
||||||
|
|
||||||
The Docker setup now supports flexible LLM provider configuration through a hierarchical system:
|
The Docker setup now supports flexible LLM provider configuration through three methods:
|
||||||
|
|
||||||
1. **API Request Parameters** (Highest Priority): Specify per request
|
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
||||||
|
```bash
|
||||||
|
export LLM_PROVIDER="anthropic/claude-3-opus"
|
||||||
|
# Or in your .llm.env file:
|
||||||
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **API Request Parameter**: Specify provider per request
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"url": "https://example.com",
|
"url": "https://example.com",
|
||||||
"f": "llm",
|
"f": "llm",
|
||||||
"provider": "groq/mixtral-8x7b",
|
"provider": "groq/mixtral-8x7b"
|
||||||
"temperature": 0.7,
|
|
||||||
"base_url": "https://api.custom.com/v1"
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
2. **Provider-Specific Environment Variables**: Override for specific providers
|
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
||||||
```bash
|
|
||||||
# In your .llm.env file:
|
|
||||||
OPENAI_TEMPERATURE=0.5
|
|
||||||
OPENAI_BASE_URL=https://custom-openai.com/v1
|
|
||||||
ANTHROPIC_TEMPERATURE=0.3
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Global Environment Variables**: Set defaults for all providers
|
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file.
|
||||||
```bash
|
|
||||||
# In your .llm.env file:
|
|
||||||
LLM_PROVIDER=anthropic/claude-3-opus
|
|
||||||
LLM_TEMPERATURE=0.7
|
|
||||||
LLM_BASE_URL=https://api.proxy.com/v1
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
|
||||||
|
|
||||||
The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
|
|
||||||
|
|
||||||
**Supported LLM Parameters:**
|
|
||||||
- `provider`: LLM provider and model (e.g., "openai/gpt-4", "anthropic/claude-3-opus")
|
|
||||||
- `temperature`: Controls randomness (0.0-2.0, lower = more focused, higher = more creative)
|
|
||||||
- `base_url`: Custom API endpoint for proxy servers or alternative endpoints
|
|
||||||
|
|
||||||
#### 3. Build and Run with Compose
|
#### 3. Build and Run with Compose
|
||||||
|
|
||||||
@@ -581,101 +555,6 @@ Crucially, when sending configurations directly via JSON, they **must** follow t
|
|||||||
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
||||||
*(Keep Deep Crawler Example)*
|
*(Keep Deep Crawler Example)*
|
||||||
|
|
||||||
### LLM Configuration Examples
|
|
||||||
|
|
||||||
The Docker API supports dynamic LLM configuration through multiple levels:
|
|
||||||
|
|
||||||
#### Temperature Control
|
|
||||||
|
|
||||||
Temperature affects the randomness of LLM responses (0.0 = deterministic, 2.0 = very creative):
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
|
|
||||||
# Low temperature for factual extraction
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/md",
|
|
||||||
json={
|
|
||||||
"url": "https://example.com",
|
|
||||||
"f": "llm",
|
|
||||||
"q": "Extract all dates and numbers from this page",
|
|
||||||
"temperature": 0.2 # Very focused, deterministic
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# High temperature for creative tasks
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/md",
|
|
||||||
json={
|
|
||||||
"url": "https://example.com",
|
|
||||||
"f": "llm",
|
|
||||||
"q": "Write a creative summary of this content",
|
|
||||||
"temperature": 1.2 # More creative, varied responses
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Custom API Endpoints
|
|
||||||
|
|
||||||
Use custom base URLs for proxy servers or alternative API endpoints:
|
|
||||||
|
|
||||||
```python
|
|
||||||
|
|
||||||
# Using a local LLM server
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/md",
|
|
||||||
json={
|
|
||||||
"url": "https://example.com",
|
|
||||||
"f": "llm",
|
|
||||||
"q": "Extract key information",
|
|
||||||
"provider": "ollama/llama2",
|
|
||||||
"base_url": "http://localhost:11434/v1"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Dynamic Provider Selection
|
|
||||||
|
|
||||||
Switch between providers based on task requirements:
|
|
||||||
|
|
||||||
```python
|
|
||||||
async def smart_extraction(url: str, content_type: str):
    """Select provider and temperature based on content type.

    Args:
        url: Page to send to the Docker API's ``/md`` endpoint.
        content_type: ``"technical"``, ``"creative"`` or ``"quick"``; any
            other value falls back to the ``"quick"`` profile.

    Returns:
        The decoded JSON body of the ``/md`` response.
    """
    import httpx

    configs = {
        "technical": {
            "provider": "openai/gpt-4",
            "temperature": 0.3,
            "query": "Extract technical specifications and code examples",
        },
        "creative": {
            "provider": "anthropic/claude-3-opus",
            "temperature": 0.9,
            "query": "Create an engaging narrative summary",
        },
        "quick": {
            "provider": "groq/mixtral-8x7b",
            "temperature": 0.5,
            "query": "Quick summary in bullet points",
        },
    }

    config = configs.get(content_type, configs["quick"])

    # httpx.post() is synchronous and its Response cannot be awaited; an
    # AsyncClient is needed to issue the request from a coroutine. The
    # explicit timeout leaves room for the LLM call behind /md.
    async with httpx.AsyncClient(timeout=60.0) as client:
        response = await client.post(
            "http://localhost:11235/md",
            json={
                "url": url,
                "f": "llm",
                "q": config["query"],
                "provider": config["provider"],
                "temperature": config["temperature"],
            },
        )

    return response.json()
|
|
||||||
```
|
|
||||||
|
|
||||||
### REST API Examples
|
### REST API Examples
|
||||||
|
|
||||||
Update URLs to use port `11235`.
|
Update URLs to use port `11235`.
|
||||||
@@ -814,8 +693,8 @@ app:
|
|||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
api_key_env: "OPENAI_API_KEY"
|
||||||
# temperature and base_url are controlled via environment variables or request parameters
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||||
redis:
|
redis:
|
||||||
|
|||||||
@@ -1,349 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Test script for LLM temperature and base_url parameters in Crawl4AI Docker API.
|
|
||||||
This demonstrates the new hierarchical configuration system:
|
|
||||||
1. Request-level parameters (highest priority)
|
|
||||||
2. Provider-specific environment variables
|
|
||||||
3. Global environment variables
|
|
||||||
4. System defaults (lowest priority)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import httpx
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
from rich.console import Console
|
|
||||||
from rich.panel import Panel
|
|
||||||
from rich.syntax import Syntax
|
|
||||||
from rich.table import Table
|
|
||||||
|
|
||||||
|
|
||||||
console = Console()
|
|
||||||
|
|
||||||
# Configuration
|
|
||||||
BASE_URL = "http://localhost:11235" # Docker API endpoint
|
|
||||||
TEST_URL = "https://httpbin.org/html" # Simple test page
|
|
||||||
|
|
||||||
# --- Helper Functions ---
|
|
||||||
|
|
||||||
async def check_server_health(client: httpx.AsyncClient) -> bool:
    """Probe ``GET /health`` and report whether the server answered.

    Progress and failures are printed to the module-level console.

    Returns:
        True when the request succeeds with a non-error status, False when
        the request raises or the status is an error.
    """
    console.print("[bold cyan]Checking server health...[/]", end="")
    try:
        health = await client.get("/health", timeout=10.0)
        health.raise_for_status()
        console.print(" [bold green]✓ Server is healthy![/]")
    except Exception as e:
        console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
        console.print(f"Is the server running at {BASE_URL}?")
        return False
    return True
|
|
||||||
|
|
||||||
def print_request(endpoint: str, payload: dict, title: str = "Request"):
    """Render an outgoing POST request (endpoint plus JSON payload) in a panel.

    Args:
        endpoint: Path the request is sent to, shown after ``POST``.
        payload: JSON-serialisable request body.
        title: Panel title.
    """
    from rich.console import Group  # only this helper needs it

    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
    # A Syntax object is a renderable, not a string: interpolating it into an
    # f-string embeds str(syntax) instead of the highlighted JSON. Group
    # stacks the markup header and the renderable inside one panel.
    console.print(Panel.fit(
        Group(f"[cyan]POST {endpoint}[/cyan]", syntax),
        title=f"[bold blue]{title}[/]",
        border_style="blue",
    ))
|
|
||||||
|
|
||||||
def print_response(response: dict, title: str = "Response"):
    """Print a trimmed view of an API response inside a green panel.

    Only the ``markdown``, ``success``, ``url`` and ``filter`` keys are shown;
    ``markdown`` is cut to its first 200 characters followed by ``...``.
    """
    summary = {}
    if "markdown" in response:
        markdown = response["markdown"]
        summary["markdown"] = markdown[:200] + "..." if len(markdown) > 200 else markdown
    for key in ("success", "url", "filter"):
        if key in response:
            summary[key] = response[key]

    rendered = Syntax(json.dumps(summary, indent=2), "json", theme="monokai")
    console.print(Panel.fit(
        rendered,
        title=f"[bold green]{title}[/]",
        border_style="green",
    ))
|
|
||||||
|
|
||||||
# --- Test Functions ---
|
|
||||||
|
|
||||||
async def test_default_no_params(client: httpx.AsyncClient):
    """Test 1: call ``/md`` without temperature or base_url in the payload.

    Errors are printed rather than raised.
    """
    console.rule("[bold yellow]Test 1: Default Configuration (No Parameters)[/]")

    payload = {
        "url": TEST_URL,
        "f": "llm",
        "q": "What is the main heading of this page? Answer in exactly 5 words.",
    }
    print_request("/md", payload, "Request without temperature/base_url")

    try:
        reply = await client.post("/md", json=payload, timeout=30.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response (using system defaults)")
        console.print("[dim]→ This used system defaults or environment variables if set[/]")
    except Exception as e:
        console.print(f"[red]Error: {e}[/]")
|
|
||||||
|
|
||||||
async def test_request_temperature(client: httpx.AsyncClient):
    """Test 2: send the same question to ``/md`` at a low and a high temperature.

    Each run prints its request, then its response or the error it hit; a
    blank line separates the two runs.
    """
    console.rule("[bold yellow]Test 2: Request-Level Temperature[/]")

    question = "What is the main heading? Be creative and poetic."
    # (temperature, request title, response title, explanatory note)
    runs = (
        (
            0.1,
            "Low Temperature (0.1)",
            "Response with Low Temperature",
            "[dim]→ Low temperature (0.1) should produce focused, less creative output[/]",
        ),
        (
            1.5,
            "High Temperature (1.5)",
            "Response with High Temperature",
            "[dim]→ High temperature (1.5) should produce more creative, varied output[/]",
        ),
    )

    for position, (temperature, request_title, response_title, note) in enumerate(runs):
        if position:
            console.print()

        payload = {
            "url": TEST_URL,
            "f": "llm",
            "q": question,
            "temperature": temperature,
        }
        print_request("/md", payload, request_title)

        try:
            reply = await client.post("/md", json=payload, timeout=30.0)
            reply.raise_for_status()
            print_response(reply.json(), response_title)
            console.print(note)
        except Exception as e:
            console.print(f"[red]Error: {e}[/]")
|
|
||||||
|
|
||||||
async def test_provider_override(client: httpx.AsyncClient):
    """Test 3: call ``/md`` with an explicit provider and temperature.

    Errors are printed rather than raised.
    """
    console.rule("[bold yellow]Test 3: Provider Override with Temperature[/]")

    provider = "gemini/gemini-2.5-flash-lite"
    payload = {
        "url": TEST_URL,
        "f": "llm",
        "q": "Summarize this page in one sentence.",
        "provider": provider,
        "temperature": 0.7,
    }
    print_request("/md", payload, "Provider + Temperature Override")

    try:
        reply = await client.post("/md", json=payload, timeout=30.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response with Provider Override")
        console.print(f"[dim]→ This explicitly uses {provider} with temperature 0.7[/]")
    except Exception as e:
        console.print(f"[red]Error: {e}[/]")
|
|
||||||
|
|
||||||
async def test_base_url_custom(client: httpx.AsyncClient):
    """Test 4: call ``/md`` with a placeholder ``base_url``.

    The endpoint in the payload is a demo value, so a failure is the
    anticipated outcome and is reported in yellow rather than red.
    """
    console.rule("[bold yellow]Test 4: Custom Base URL (Demo Only)[/]")

    payload = {
        "url": TEST_URL,
        "f": "llm",
        "q": "What is this page about?",
        "base_url": "https://api.custom-endpoint.com/v1",
        "temperature": 0.5,
    }
    print_request("/md", payload, "Custom Base URL Request")
    console.print("[yellow]Note: This will fail unless you have a custom endpoint set up[/]")

    try:
        reply = await client.post("/md", json=payload, timeout=10.0)
        reply.raise_for_status()
        print_response(reply.json(), "Response from Custom Endpoint")
    except httpx.HTTPStatusError as e:
        console.print(f"[yellow]Expected failure (no custom endpoint): Status {e.response.status_code}[/]")
    except Exception as e:
        console.print(f"[yellow]Expected error: {e}[/]")
|
|
||||||
|
|
||||||
async def test_llm_job_endpoint(client: httpx.AsyncClient):
    """Test 5: submit a job to ``/llm/job`` with a temperature and check it once.

    After submission the function waits three seconds and fetches the job
    status a single time; it does not keep polling. Errors are printed
    rather than raised.
    """
    console.rule("[bold yellow]Test 5: LLM Job Endpoint with Parameters[/]")

    payload = {
        "url": TEST_URL,
        "q": "Extract the main title and any key information",
        "temperature": 0.3,
        # "base_url": "https://api.openai.com/v1"  # Optional
    }
    print_request("/llm/job", payload, "LLM Job with Temperature")

    try:
        submission = await client.post("/llm/job", json=payload, timeout=30.0)
        submission.raise_for_status()
        job_data = submission.json()

        if "task_id" not in job_data:
            console.print(f"[red]Unexpected response: {job_data}[/]")
            return

        task_id = job_data["task_id"]
        console.print(f"[green]Job created with task_id: {task_id}[/]")

        # Single delayed status check (simplified stand-in for real polling).
        await asyncio.sleep(3)
        status_reply = await client.get(f"/llm/job/{task_id}")
        status_data = status_reply.json()

        if status_data.get("status") != "completed":
            console.print(f"[yellow]Job status: {status_data.get('status', 'unknown')}[/]")
            return

        console.print("[green]Job completed successfully![/]")
        if "result" in status_data:
            console.print(Panel.fit(
                Syntax(json.dumps(status_data["result"], indent=2), "json", theme="monokai"),
                title="Extraction Result",
                border_style="green",
            ))
    except Exception as e:
        console.print(f"[red]Error: {e}[/]")
|
|
||||||
|
|
||||||
|
|
||||||
async def test_llm_endpoint(client: httpx.AsyncClient):
    """Quick QA round-trip with ``GET /llm/{url}``.

    Asks a trivial question about a fixed page just to show the wiring,
    printing the status code, the elapsed time and the returned answer.
    Errors are printed rather than raised.
    """
    import time
    import urllib.parse

    page_url = "https://kidocode.com"
    question = "What is the title of this page?"

    # The target URL travels as a single path segment, so it is fully encoded.
    enc = urllib.parse.quote_plus(page_url, safe="")
    console.print(f"GET /llm/{enc}?q={question}")

    try:
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by system clock adjustments.
        t0 = time.perf_counter()
        resp = await client.get(f"/llm/{enc}", params={"q": question})
        dt = time.perf_counter() - t0
        console.print(
            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
        resp.raise_for_status()
        answer = resp.json().get("answer", "")
        console.print(Panel(answer or "No answer returned",
                            title="LLM answer", border_style="magenta", expand=False))
    except Exception as e:
        console.print(f"[bold red]Error hitting /llm:[/] {e}")
|
|
||||||
|
|
||||||
|
|
||||||
async def show_environment_info():
    """Print a table of LLM-related environment variables and their values.

    Variables whose name contains ``API_KEY`` are shortened to their first
    ten characters (or ``***`` when not longer than ten) before display.
    """
    console.rule("[bold cyan]Current Environment Configuration[/]")

    table = Table(title="LLM Environment Variables", show_header=True, header_style="bold magenta")
    table.add_column("Variable", style="cyan", width=30)
    table.add_column("Value", style="yellow")
    table.add_column("Description", style="dim")

    descriptions = {
        "LLM_PROVIDER": "Global default provider",
        "LLM_TEMPERATURE": "Global default temperature",
        "LLM_BASE_URL": "Global custom API endpoint",
        "OPENAI_API_KEY": "OpenAI API key",
        "OPENAI_TEMPERATURE": "OpenAI-specific temperature",
        "OPENAI_BASE_URL": "OpenAI-specific endpoint",
        "ANTHROPIC_API_KEY": "Anthropic API key",
        "ANTHROPIC_TEMPERATURE": "Anthropic-specific temperature",
        "GROQ_API_KEY": "Groq API key",
        "GROQ_TEMPERATURE": "Groq-specific temperature",
    }

    unset = "[not set]"
    for name, description in descriptions.items():
        shown = os.environ.get(name, unset)
        if "API_KEY" in name and shown != unset:
            # Never print a full key.
            shown = shown[:10] + "..." if len(shown) > 10 else "***"
        table.add_row(name, shown, description)

    console.print(table)
    console.print()
|
|
||||||
|
|
||||||
# --- Main Test Runner ---
|
|
||||||
|
|
||||||
async def main():
    """Run every test in order against the server at ``BASE_URL``.

    Stops early when the health check fails. A test that raises is reported
    with its traceback and does not prevent the remaining tests from running.
    """
    banner = (
        "[bold cyan]Crawl4AI LLM Parameters Test Suite[/]\n"
        "Testing temperature and base_url configuration hierarchy"
    )
    console.print(Panel.fit(banner, border_style="cyan"))

    # Show current environment
    # await show_environment_info()

    async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
        if not await check_server_health(client):
            console.print("[red]Server is not available. Please ensure the Docker container is running.[/]")
            return

        tests = [
            ("Default Configuration", test_default_no_params),
            ("Request Temperature", test_request_temperature),
            ("Provider Override", test_provider_override),
            ("Custom Base URL", test_base_url_custom),
            ("LLM Job Endpoint", test_llm_job_endpoint),
            ("LLM Endpoint", test_llm_endpoint),
        ]

        for position, (name, test_func) in enumerate(tests):
            if position > 0:
                console.print()  # spacing between tests

            try:
                await test_func(client)
            except Exception as e:
                console.print(f"[red]Test '{name}' failed with error: {e}[/]")
                console.print_exception(show_locals=False)

    console.rule("[bold green]All Tests Complete![/]", style="green")

    # Summary of the precedence the tests above illustrate
    console.print("\n[bold cyan]Configuration Hierarchy Summary:[/]")
    for line in (
        "1. [yellow]Request parameters[/] - Highest priority (temperature, base_url in API call)",
        "2. [yellow]Provider-specific env[/] - e.g., OPENAI_TEMPERATURE, GROQ_BASE_URL",
        "3. [yellow]Global env variables[/] - LLM_TEMPERATURE, LLM_BASE_URL",
        "4. [yellow]System defaults[/] - Lowest priority (provider/litellm defaults)",
    ):
        console.print(line)
    console.print()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: run the async suite; Ctrl-C gets a short notice,
    # any other failure is reported together with its traceback.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        console.print("\n[yellow]Tests interrupted by user.[/]")
    except Exception:
        console.print("\n[bold red]An error occurred:[/]")
        console.print_exception(show_locals=False)
|
|
||||||
175
tests/test_preserve_https_for_internal_links.py
Normal file
175
tests/test_preserve_https_for_internal_links.py
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Final test and demo for HTTPS preservation feature (Issue #1410)
|
||||||
|
|
||||||
|
This demonstrates how the preserve_https_for_internal_links flag
|
||||||
|
prevents HTTPS downgrade when servers redirect to HTTP.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
def demonstrate_issue():
    """Show the problem: an HTTPS -> HTTP redirect yields HTTP links.

    Resolves a relative link against the redirected (HTTP) URL with plain
    ``urljoin`` and prints each step.

    Returns:
        The resolved link, which carries the HTTP scheme of the redirected base.
    """
    print("=" * 60)
    print("DEMONSTRATING THE ISSUE")
    print("=" * 60)

    # Simulate what happens during crawling
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"  # Server redirects to HTTP

    # A relative link extracted from the page
    relative_link = "/author/Albert-Einstein"

    # Standard URL joining uses the redirected (HTTP) base
    resolved_url = urljoin(redirected_url, relative_link)

    print(f"Original URL: {original_url}")
    print(f"Redirected to: {redirected_url}")
    print(f"Relative link: {relative_link}")
    print(f"Resolved link: {resolved_url}")
    print("\n❌ Problem: Link is now HTTP instead of HTTPS!")

    return resolved_url
|
||||||
|
|
||||||
|
def demonstrate_solution():
    """Show the solution: preserve HTTPS for internal links.

    Resolves one relative link against an HTTP-redirected base twice, first
    without and then with HTTPS preservation, printing both results.

    Returns:
        The link resolved with preservation enabled.
    """
    print("\n" + "=" * 60)
    print("DEMONSTRATING THE SOLUTION")
    print("=" * 60)

    def normalize_url_with_preservation(href, base_url, preserve_https=False, original_scheme=None):
        """Resolve *href* against *base_url*, optionally upgrading it to HTTPS.

        The upgrade applies only when the crawl started on HTTPS and the
        resolved link is an HTTP link on the same host as *base_url*.
        """
        full_url = urljoin(base_url, href.strip())

        if not (preserve_https and original_scheme == 'https'):
            return full_url

        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)

        # Only for same-domain links
        if parsed_full.scheme == 'http' and parsed_full.netloc == parsed_base.netloc:
            # Swap just the leading scheme. str.replace('http://', ...) would
            # miss an upper-case scheme and rewrite a URL embedded later in
            # the string (e.g. in the query) instead.
            full_url = "https" + full_url[len(parsed_full.scheme):]
            print(f" → Preserved HTTPS for {parsed_full.netloc}")

        return full_url

    # Same scenario as before
    original_url = "https://quotes.toscrape.com/tag/deep-thoughts"
    redirected_url = "http://quotes.toscrape.com/tag/deep-thoughts/"
    relative_link = "/author/Albert-Einstein"
    # The scheme the crawl started with decides whether links are upgraded.
    original_scheme = urlparse(original_url).scheme

    # Without preservation (current behavior)
    resolved_without = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=False, original_scheme=original_scheme
    )

    print("\nWithout preservation:")
    print(f" Result: {resolved_without}")

    # With preservation (new feature)
    resolved_with = normalize_url_with_preservation(
        relative_link, redirected_url,
        preserve_https=True, original_scheme=original_scheme
    )

    print("\nWith preservation (preserve_https_for_internal_links=True):")
    print(f" Result: {resolved_with}")
    print("\n✅ Solution: Internal link stays HTTPS!")

    return resolved_with
|
||||||
|
|
||||||
|
def test_edge_cases():
    """Exercise the HTTPS-preservation rule on edge cases and fail on mismatch.

    Every case is printed with a pass/fail marker. After all cases have been
    reported, an AssertionError is raised naming those whose resulting
    scheme did not match the expectation.
    """
    print("\n" + "=" * 60)
    print("EDGE CASES")
    print("=" * 60)

    def preserve_https(href, base_url, original_scheme):
        """Resolve *href* against *base_url*, upgrading same-host HTTP links."""
        href = href.strip()
        full_url = urljoin(base_url, href)

        if original_scheme != 'https':
            return full_url

        parsed_full = urlparse(full_url)
        parsed_base = urlparse(base_url)
        # Protocol-relative hrefs ("//host/path") are excluded from the upgrade.
        if (parsed_full.scheme == 'http'
                and parsed_full.netloc == parsed_base.netloc
                and not href.startswith('//')):
            # Swap just the leading scheme; str.replace('http://', ...) could
            # rewrite a URL embedded in the query when the scheme is upper-case.
            full_url = "https" + full_url[len(parsed_full.scheme):]

        return full_url

    test_cases = [
        # (description, href, base_url, original_scheme, should_be_https)
        ("External link", "http://other.com/page", "http://example.com", "https", False),
        ("Already HTTPS", "/page", "https://example.com", "https", True),
        ("No original HTTPS", "/page", "http://example.com", "http", False),
        ("Subdomain", "/page", "http://sub.example.com", "https", True),
        ("Protocol-relative", "//example.com/page", "http://example.com", "https", False),
        ("Upper-case scheme with URL in query", "HTTP://example.com/?next=http://x",
         "https://example.com", "https", True),
    ]

    failures = []
    for desc, href, base_url, orig_scheme, should_be_https in test_cases:
        result = preserve_https(href, base_url, orig_scheme)
        is_https = result.startswith('https://')
        passed = is_https == should_be_https
        status = "✅" if passed else "❌"

        print(f"\n{status} {desc}:")
        print(f" Input: {href} + {base_url}")
        print(f" Result: {result}")
        print(f" Expected HTTPS: {should_be_https}, Got: {is_https}")

        if not passed:
            failures.append(desc)

    # Without this the function only prints and can never fail under pytest.
    assert not failures, f"HTTPS preservation mismatches: {failures}"
|
||||||
|
|
||||||
|
def usage_example():
    """Print a runnable snippet showing how to enable the flag in crawl4ai."""
    print("\n" + "=" * 60)
    print("USAGE IN CRAWL4AI")
    print("=" * 60)

    # The snippet wraps the crawl in a coroutine: "async with" is only valid
    # inside an async function, so a bare module-level version cannot be run.
    print("""
To enable HTTPS preservation in your crawl4ai code:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            preserve_https_for_internal_links=True  # Enable HTTPS preservation
        )

        result = await crawler.arun(
            url="https://example.com",
            config=config
        )

        # All internal links will maintain HTTPS even if
        # the server redirects to HTTP

asyncio.run(main())
```

This is especially useful for:
- Sites that redirect HTTPS to HTTP but still support HTTPS
- Security-conscious crawling where you want to stay on HTTPS
- Avoiding mixed content issues in downstream processing
""")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run every demonstration in order, then print a closing banner.
    for step in (demonstrate_issue, demonstrate_solution, test_edge_cases, usage_example):
        step()

    print("\n" + "=" * 60)
    print("✅ All tests complete!")
    print("=" * 60)
|
||||||
Reference in New Issue
Block a user