Compare commits

...

8 Commits

Author SHA1 Message Date
ntohidi
95051020f4 fix(docker): Fix LLM API key handling for multi-provider support
Previously, the system incorrectly used OPENAI_API_KEY for all LLM providers
due to a hardcoded api_key_env fallback in config.yml. This caused authentication
errors when using non-OpenAI providers like Gemini.

Changes:
- Remove api_key_env from config.yml to let litellm handle provider-specific env vars
- Simplify get_llm_api_key() to return None, allowing litellm to auto-detect keys
- Update validate_llm_provider() to trust litellm's built-in key detection
- Update documentation to reflect the new automatic key handling

The fix leverages litellm's existing capability to automatically find the correct
environment variable for each provider (OPENAI_API_KEY, GEMINI_API_KEY, etc.)
without manual configuration.

ref #1291
2025-08-21 14:01:04 +08:00
Nasrin
ef174a4c7a Merge pull request #1104 from emmanuel-ferdman/main
fix(docker-api): migrate to modern datetime library API
2025-08-20 10:57:39 +08:00
Nasrin
f4206d6ba1 Merge pull request #1369 from NezarAli/main
Fix examples in README.md
2025-08-18 14:22:54 +08:00
Nasrin
dad7c51481 Merge pull request #1398 from unclecode/fix/update-url-seeding-docs
Update URL seeding examples to use proper async context managers
2025-08-18 13:00:26 +08:00
Soham Kukreti
ecbe5ffb84 docs: Update URL seeding examples to use proper async context managers
- Wrap all AsyncUrlSeeder usage with async context managers
- Update URL seeding adventure example to use "sitemap+cc" source, focus on course posts, and add stream=True parameter to fix runtime error
2025-08-13 18:16:46 +05:30
Nezar Ali
7a8190ecb6 Fix examples in README.md 2025-08-06 11:58:29 +03:00
Emmanuel Ferdman
8e3c411a3e Merge branch 'main' into main 2025-07-29 14:05:35 +03:00
Emmanuel Ferdman
1e1c887a2f fix(docker-api): migrate to modern datetime library API
Signed-off-by: Emmanuel Ferdman <emmanuelferdman@gmail.com>
2025-05-13 00:04:58 -07:00
7 changed files with 54 additions and 64 deletions

View File

@@ -373,7 +373,7 @@ async def main():
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun( result = await crawler.arun(
url="https://docs.micronaut.io/4.7.6/guide/", url="https://docs.micronaut.io/4.9.9/guide/",
config=run_config config=run_config
) )
print(len(result.markdown.raw_markdown)) print(len(result.markdown.raw_markdown))
@@ -425,7 +425,7 @@ async def main():
"type": "attribute", "type": "attribute",
"attribute": "src" "attribute": "src"
} }
} ]
} }
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

View File

@@ -692,8 +692,7 @@ app:
# Default LLM Configuration # Default LLM Configuration
llm: llm:
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
api_key_env: "OPENAI_API_KEY" # api_key: sk-... # If you pass the API key directly (not recommended)
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration (Used by internal Redis server managed by supervisord) # Redis Configuration (Used by internal Redis server managed by supervisord)
redis: redis:

View File

@@ -4,7 +4,7 @@ import asyncio
from typing import List, Tuple, Dict from typing import List, Tuple, Dict
from functools import partial from functools import partial
from uuid import uuid4 from uuid import uuid4
from datetime import datetime from datetime import datetime, timezone
from base64 import b64encode from base64 import b64encode
import logging import logging
@@ -96,7 +96,7 @@ async def handle_llm_qa(
response = perform_completion_with_backoff( response = perform_completion_with_backoff(
provider=config["llm"]["provider"], provider=config["llm"]["provider"],
prompt_with_variables=prompt, prompt_with_variables=prompt,
api_token=get_llm_api_key(config) api_token=get_llm_api_key(config) # Returns None to let litellm handle it
) )
return response.choices[0].message.content return response.choices[0].message.content
@@ -127,7 +127,7 @@ async def process_llm_extraction(
"error": error_msg "error": error_msg
}) })
return return
api_key = get_llm_api_key(config, provider) api_key = get_llm_api_key(config, provider) # Returns None to let litellm handle it
llm_strategy = LLMExtractionStrategy( llm_strategy = LLMExtractionStrategy(
llm_config=LLMConfig( llm_config=LLMConfig(
provider=provider or config["llm"]["provider"], provider=provider or config["llm"]["provider"],
@@ -203,7 +203,7 @@ async def handle_markdown_request(
FilterType.LLM: LLMContentFilter( FilterType.LLM: LLMContentFilter(
llm_config=LLMConfig( llm_config=LLMConfig(
provider=provider or config["llm"]["provider"], provider=provider or config["llm"]["provider"],
api_token=get_llm_api_key(config, provider), api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
), ),
instruction=query or "Extract main content" instruction=query or "Extract main content"
) )
@@ -576,7 +576,7 @@ async def handle_crawl_job(
task_id = f"crawl_{uuid4().hex[:8]}" task_id = f"crawl_{uuid4().hex[:8]}"
await redis.hset(f"task:{task_id}", mapping={ await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING, # <-- keep enum values consistent "status": TaskStatus.PROCESSING, # <-- keep enum values consistent
"created_at": datetime.utcnow().isoformat(), "created_at": datetime.now(timezone.utc).replace(tzinfo=None).isoformat(),
"url": json.dumps(urls), # store list as JSON string "url": json.dumps(urls), # store list as JSON string
"result": "", "result": "",
"error": "", "error": "",

View File

@@ -11,8 +11,7 @@ app:
# Default LLM Configuration # Default LLM Configuration
llm: llm:
provider: "openai/gpt-4o-mini" provider: "openai/gpt-4o-mini"
api_key_env: "OPENAI_API_KEY" # api_key: sk-... # If you pass the API key directly (not recommended)
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration # Redis Configuration
redis: redis:

View File

@@ -71,7 +71,7 @@ def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str: def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> Optional[str]:
"""Get the appropriate API key based on the LLM provider. """Get the appropriate API key based on the LLM provider.
Args: Args:
@@ -79,19 +79,14 @@ def get_llm_api_key(config: Dict, provider: Optional[str] = None) -> str:
provider: Optional provider override (e.g., "openai/gpt-4") provider: Optional provider override (e.g., "openai/gpt-4")
Returns: Returns:
The API key for the provider, or empty string if not found The API key if directly configured, otherwise None to let litellm handle it
""" """
# Check if direct API key is configured (for backward compatibility)
# Use provided provider or fall back to config
if not provider:
provider = config["llm"]["provider"]
# Check if direct API key is configured
if "api_key" in config["llm"]: if "api_key" in config["llm"]:
return config["llm"]["api_key"] return config["llm"]["api_key"]
# Fall back to the configured api_key_env if no match # Return None - litellm will automatically find the right environment variable
return os.environ.get(config["llm"].get("api_key_env", ""), "") return None
def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]: def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple[bool, str]:
@@ -104,16 +99,12 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
Returns: Returns:
Tuple of (is_valid, error_message) Tuple of (is_valid, error_message)
""" """
# Use provided provider or fall back to config # If a direct API key is configured, validation passes
if not provider: if "api_key" in config["llm"]:
provider = config["llm"]["provider"] return True, ""
# Get the API key for this provider
api_key = get_llm_api_key(config, provider)
if not api_key:
return False, f"No API key found for provider '{provider}'. Please set the appropriate environment variable."
# Otherwise, trust that litellm will find the appropriate environment variable
# We can't easily validate this without reimplementing litellm's logic
return True, "" return True, ""

View File

@@ -176,7 +176,7 @@ The Docker setup now supports flexible LLM provider configuration through three
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`) 3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
The system automatically selects the appropriate API key based on the configured `api_key_env` in the config file. The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
#### 3. Build and Run with Compose #### 3. Build and Run with Compose
@@ -693,8 +693,7 @@ app:
# Default LLM Configuration # Default LLM Configuration
llm: llm:
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
api_key_env: "OPENAI_API_KEY" # api_key: sk-... # If you pass the API key directly (not recommended)
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration (Used by internal Redis server managed by supervisord) # Redis Configuration (Used by internal Redis server managed by supervisord)
redis: redis:

View File

@@ -102,16 +102,16 @@ async def smart_blog_crawler():
# Step 2: Configure discovery - let's find all blog posts # Step 2: Configure discovery - let's find all blog posts
config = SeedingConfig( config = SeedingConfig(
source="sitemap", # Use the website's sitemap source="sitemap+cc", # Use the website's sitemap+cc
pattern="*/blog/*.html", # Only blog posts pattern="*/courses/*", # Only courses related posts
extract_head=True, # Get page metadata extract_head=True, # Get page metadata
max_urls=100 # Limit for this example max_urls=100 # Limit for this example
) )
# Step 3: Discover URLs from the Python blog # Step 3: Discover URLs from the Python blog
print("🔍 Discovering blog posts...") print("🔍 Discovering course posts...")
urls = await seeder.urls("realpython.com", config) urls = await seeder.urls("realpython.com", config)
print(f"✅ Found {len(urls)} blog posts") print(f"✅ Found {len(urls)} course posts")
# Step 4: Filter for Python tutorials (using metadata!) # Step 4: Filter for Python tutorials (using metadata!)
tutorials = [ tutorials = [
@@ -134,7 +134,8 @@ async def smart_blog_crawler():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
only_text=True, only_text=True,
word_count_threshold=300 # Only substantial articles word_count_threshold=300, # Only substantial articles
stream=True
) )
# Extract URLs and crawl them # Extract URLs and crawl them
@@ -155,7 +156,7 @@ asyncio.run(smart_blog_crawler())
**What just happened?** **What just happened?**
1. We discovered all blog URLs from the sitemap 1. We discovered all blog URLs from the sitemap+cc
2. We filtered using metadata (no crawling needed!) 2. We filtered using metadata (no crawling needed!)
3. We crawled only the relevant tutorials 3. We crawled only the relevant tutorials
4. We saved tons of time and bandwidth 4. We saved tons of time and bandwidth
@@ -282,8 +283,8 @@ config = SeedingConfig(
live_check=True, # Verify each URL is accessible live_check=True, # Verify each URL is accessible
concurrency=20 # Check 20 URLs in parallel concurrency=20 # Check 20 URLs in parallel
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# Now you can filter by status # Now you can filter by status
live_urls = [u for u in urls if u["status"] == "valid"] live_urls = [u for u in urls if u["status"] == "valid"]
@@ -311,8 +312,8 @@ This is where URL seeding gets really powerful. Instead of crawling entire pages
config = SeedingConfig( config = SeedingConfig(
extract_head=True # Extract metadata from <head> section extract_head=True # Extract metadata from <head> section
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# Now each URL has rich metadata # Now each URL has rich metadata
for url in urls[:3]: for url in urls[:3]:
@@ -387,8 +388,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.3 score_threshold=0.3
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
# URLs are scored based on: # URLs are scored based on:
# 1. Domain parts matching (e.g., 'python' in python.example.com) # 1. Domain parts matching (e.g., 'python' in python.example.com)
@@ -429,8 +430,8 @@ config = SeedingConfig(
extract_head=True, extract_head=True,
live_check=True live_check=True
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("blog.example.com", config) urls = await seeder.urls("blog.example.com", config)
# Analyze the results # Analyze the results
for url in urls[:5]: for url in urls[:5]:
@@ -488,8 +489,8 @@ config = SeedingConfig(
scoring_method="bm25", # Use BM25 algorithm scoring_method="bm25", # Use BM25 algorithm
score_threshold=0.3 # Minimum relevance score score_threshold=0.3 # Minimum relevance score
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("realpython.com", config) urls = await seeder.urls("realpython.com", config)
# Results are automatically sorted by relevance! # Results are automatically sorted by relevance!
for url in urls[:5]: for url in urls[:5]:
@@ -511,8 +512,8 @@ config = SeedingConfig(
score_threshold=0.5, score_threshold=0.5,
max_urls=20 max_urls=20
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("docs.example.com", config) urls = await seeder.urls("docs.example.com", config)
# The highest scoring URLs will be API docs! # The highest scoring URLs will be API docs!
``` ```
@@ -529,8 +530,8 @@ config = SeedingConfig(
score_threshold=0.4, score_threshold=0.4,
pattern="*/product/*" # Combine with pattern matching pattern="*/product/*" # Combine with pattern matching
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("shop.example.com", config) urls = await seeder.urls("shop.example.com", config)
# Filter further by price (from metadata) # Filter further by price (from metadata)
affordable = [ affordable = [
@@ -550,8 +551,8 @@ config = SeedingConfig(
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.35 score_threshold=0.35
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("technews.com", config) urls = await seeder.urls("technews.com", config)
# Filter by date # Filter by date
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -591,8 +592,8 @@ for query in queries:
score_threshold=0.4, score_threshold=0.4,
max_urls=10 # Top 10 per topic max_urls=10 # Top 10 per topic
) )
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("learning-platform.com", config) urls = await seeder.urls("learning-platform.com", config)
all_tutorials.extend(urls) all_tutorials.extend(urls)
# Remove duplicates while preserving order # Remove duplicates while preserving order
@@ -625,7 +626,8 @@ config = SeedingConfig(
) )
# Returns a dictionary: {domain: [urls]} # Returns a dictionary: {domain: [urls]}
results = await seeder.many_urls(domains, config) async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(domains, config)
# Process results # Process results
for domain, urls in results.items(): for domain, urls in results.items():
@@ -654,8 +656,8 @@ config = SeedingConfig(
pattern="*/blog/*", pattern="*/blog/*",
max_urls=100 max_urls=100
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(competitors, config) results = await seeder.many_urls(competitors, config)
# Analyze content types # Analyze content types
for domain, urls in results.items(): for domain, urls in results.items():
@@ -690,8 +692,8 @@ config = SeedingConfig(
score_threshold=0.3, score_threshold=0.3,
max_urls=20 # Per site max_urls=20 # Per site
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(educational_sites, config) results = await seeder.many_urls(educational_sites, config)
# Find the best beginner tutorials # Find the best beginner tutorials
all_tutorials = [] all_tutorials = []
@@ -731,8 +733,8 @@ config = SeedingConfig(
score_threshold=0.5, # High threshold for relevance score_threshold=0.5, # High threshold for relevance
max_urls=10 max_urls=10
) )
async with AsyncUrlSeeder() as seeder:
results = await seeder.many_urls(news_sites, config) results = await seeder.many_urls(news_sites, config)
# Collect all mentions # Collect all mentions
mentions = [] mentions = []