Compare commits
13 Commits
fix/docker
...
fix/docker
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6e728096fa | ||
|
|
4e1c4bd24e | ||
|
|
cce3390a2d | ||
|
|
4fe2d01361 | ||
|
|
159207b86f | ||
|
|
38f3ea42a7 | ||
|
|
102352eac4 | ||
|
|
40ab287c90 | ||
|
|
c09a57644f | ||
|
|
90af453506 | ||
|
|
8bb0e68cce | ||
|
|
69961cf40b | ||
|
|
9447054a65 |
@@ -304,9 +304,9 @@ The new Docker implementation includes:
|
|||||||
### Getting Started
|
### Getting Started
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Pull and run the latest release candidate
|
# Pull and run the latest release
|
||||||
docker pull unclecode/crawl4ai:0.7.0
|
docker pull unclecode/crawl4ai:latest
|
||||||
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:0.7.0
|
docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
|
||||||
|
|
||||||
# Visit the playground at http://localhost:11235/playground
|
# Visit the playground at http://localhost:11235/playground
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -97,13 +97,16 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
|||||||
if value != param.default and not ignore_default_value:
|
if value != param.default and not ignore_default_value:
|
||||||
current_values[name] = to_serializable_dict(value)
|
current_values[name] = to_serializable_dict(value)
|
||||||
|
|
||||||
if hasattr(obj, '__slots__'):
|
# Don't serialize private __slots__ - they're internal implementation details
|
||||||
for slot in obj.__slots__:
|
# not constructor parameters. This was causing URLPatternFilter to fail
|
||||||
if slot.startswith('_'): # Handle private slots
|
# because _simple_suffixes was being serialized as 'simple_suffixes'
|
||||||
attr_name = slot[1:] # Remove leading '_'
|
# if hasattr(obj, '__slots__'):
|
||||||
value = getattr(obj, slot, None)
|
# for slot in obj.__slots__:
|
||||||
if value is not None:
|
# if slot.startswith('_'): # Handle private slots
|
||||||
current_values[attr_name] = to_serializable_dict(value)
|
# attr_name = slot[1:] # Remove leading '_'
|
||||||
|
# value = getattr(obj, slot, None)
|
||||||
|
# if value is not None:
|
||||||
|
# current_values[attr_name] = to_serializable_dict(value)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,13 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
|||||||
self.url_scorer = url_scorer
|
self.url_scorer = url_scorer
|
||||||
self.include_external = include_external
|
self.include_external = include_external
|
||||||
self.max_pages = max_pages
|
self.max_pages = max_pages
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
# self.logger = logger or logging.getLogger(__name__)
|
||||||
|
# Ensure logger is always a Logger instance, not a dict from serialization
|
||||||
|
if isinstance(logger, logging.Logger):
|
||||||
|
self.logger = logger
|
||||||
|
else:
|
||||||
|
# Create a new logger if logger is None, dict, or any other non-Logger type
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
self.stats = TraversalStats(start_time=datetime.now())
|
self.stats = TraversalStats(start_time=datetime.now())
|
||||||
self._cancel_event = asyncio.Event()
|
self._cancel_event = asyncio.Event()
|
||||||
self._pages_crawled = 0
|
self._pages_crawled = 0
|
||||||
|
|||||||
@@ -38,7 +38,13 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.include_external = include_external
|
self.include_external = include_external
|
||||||
self.score_threshold = score_threshold
|
self.score_threshold = score_threshold
|
||||||
self.max_pages = max_pages
|
self.max_pages = max_pages
|
||||||
self.logger = logger or logging.getLogger(__name__)
|
# self.logger = logger or logging.getLogger(__name__)
|
||||||
|
# Ensure logger is always a Logger instance, not a dict from serialization
|
||||||
|
if isinstance(logger, logging.Logger):
|
||||||
|
self.logger = logger
|
||||||
|
else:
|
||||||
|
# Create a new logger if logger is None, dict, or any other non-Logger type
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
self.stats = TraversalStats(start_time=datetime.now())
|
self.stats = TraversalStats(start_time=datetime.now())
|
||||||
self._cancel_event = asyncio.Event()
|
self._cancel_event = asyncio.Event()
|
||||||
self._pages_crawled = 0
|
self._pages_crawled = 0
|
||||||
|
|||||||
@@ -120,6 +120,9 @@ class URLPatternFilter(URLFilter):
|
|||||||
"""Pattern filter balancing speed and completeness"""
|
"""Pattern filter balancing speed and completeness"""
|
||||||
|
|
||||||
__slots__ = (
|
__slots__ = (
|
||||||
|
"patterns", # Store original patterns for serialization
|
||||||
|
"use_glob", # Store original use_glob for serialization
|
||||||
|
"reverse", # Store original reverse for serialization
|
||||||
"_simple_suffixes",
|
"_simple_suffixes",
|
||||||
"_simple_prefixes",
|
"_simple_prefixes",
|
||||||
"_domain_patterns",
|
"_domain_patterns",
|
||||||
@@ -142,6 +145,11 @@ class URLPatternFilter(URLFilter):
|
|||||||
reverse: bool = False,
|
reverse: bool = False,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
# Store original constructor params for serialization
|
||||||
|
self.patterns = patterns
|
||||||
|
self.use_glob = use_glob
|
||||||
|
self.reverse = reverse
|
||||||
|
|
||||||
self._reverse = reverse
|
self._reverse = reverse
|
||||||
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
||||||
|
|
||||||
|
|||||||
@@ -253,6 +253,16 @@ class CrawlResult(BaseModel):
|
|||||||
requirements change, this is where you would update the logic.
|
requirements change, this is where you would update the logic.
|
||||||
"""
|
"""
|
||||||
result = super().model_dump(*args, **kwargs)
|
result = super().model_dump(*args, **kwargs)
|
||||||
|
|
||||||
|
# Remove any property descriptors that might have been included
|
||||||
|
# These deprecated properties should not be in the serialized output
|
||||||
|
for key in ['fit_html', 'fit_markdown', 'markdown_v2']:
|
||||||
|
if key in result and isinstance(result[key], property):
|
||||||
|
# del result[key]
|
||||||
|
# Nasrin: I decided to convert it to string instead of removing it.
|
||||||
|
result[key] = str(result[key])
|
||||||
|
|
||||||
|
# Add the markdown field properly
|
||||||
if self._markdown is not None:
|
if self._markdown is not None:
|
||||||
result["markdown"] = self._markdown.model_dump()
|
result["markdown"] = self._markdown.model_dump()
|
||||||
return result
|
return result
|
||||||
|
|||||||
@@ -2184,8 +2184,10 @@ def normalize_url(
|
|||||||
netloc = parsed.netloc.lower()
|
netloc = parsed.netloc.lower()
|
||||||
|
|
||||||
# ── path ──
|
# ── path ──
|
||||||
# Strip duplicate slashes and trailing “/” (except root)
|
# Strip duplicate slashes and trailing "/" (except root)
|
||||||
path = quote(unquote(parsed.path))
|
# IMPORTANT: Don't use quote(unquote()) as it mangles + signs in URLs
|
||||||
|
# The path from urlparse is already properly encoded
|
||||||
|
path = parsed.path
|
||||||
if path.endswith('/') and path != '/':
|
if path.endswith('/') and path != '/':
|
||||||
path = path.rstrip('/')
|
path = path.rstrip('/')
|
||||||
|
|
||||||
|
|||||||
@@ -10,4 +10,23 @@ GEMINI_API_TOKEN=your_gemini_key_here
|
|||||||
# Optional: Override the default LLM provider
|
# Optional: Override the default LLM provider
|
||||||
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
# Examples: "openai/gpt-4", "anthropic/claude-3-opus", "deepseek/chat", etc.
|
||||||
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
# If not set, uses the provider specified in config.yml (default: openai/gpt-4o-mini)
|
||||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
# LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
|
||||||
|
# Optional: Global LLM temperature setting (0.0-2.0)
|
||||||
|
# Controls randomness in responses: lower = more focused, higher = more creative
|
||||||
|
# LLM_TEMPERATURE=0.7
|
||||||
|
|
||||||
|
# Optional: Global custom API base URL
|
||||||
|
# Use this to point to custom endpoints or proxy servers
|
||||||
|
# LLM_BASE_URL=https://api.custom.com/v1
|
||||||
|
|
||||||
|
# Optional: Provider-specific temperature overrides
|
||||||
|
# These take precedence over the global LLM_TEMPERATURE
|
||||||
|
# OPENAI_TEMPERATURE=0.5
|
||||||
|
# ANTHROPIC_TEMPERATURE=0.3
|
||||||
|
# GROQ_TEMPERATURE=0.8
|
||||||
|
|
||||||
|
# Optional: Provider-specific base URL overrides
|
||||||
|
# Use for provider-specific proxy endpoints
|
||||||
|
# OPENAI_BASE_URL=https://custom-openai.company.com/v1
|
||||||
|
# GROQ_BASE_URL=https://custom-groq.company.com/v1
|
||||||
@@ -42,7 +42,9 @@ from utils import (
|
|||||||
should_cleanup_task,
|
should_cleanup_task,
|
||||||
decode_redis_hash,
|
decode_redis_hash,
|
||||||
get_llm_api_key,
|
get_llm_api_key,
|
||||||
validate_llm_provider
|
validate_llm_provider,
|
||||||
|
get_llm_temperature,
|
||||||
|
get_llm_base_url
|
||||||
)
|
)
|
||||||
|
|
||||||
import psutil, time
|
import psutil, time
|
||||||
@@ -96,7 +98,9 @@ async def handle_llm_qa(
|
|||||||
response = perform_completion_with_backoff(
|
response = perform_completion_with_backoff(
|
||||||
provider=config["llm"]["provider"],
|
provider=config["llm"]["provider"],
|
||||||
prompt_with_variables=prompt,
|
prompt_with_variables=prompt,
|
||||||
api_token=get_llm_api_key(config) # Returns None to let litellm handle it
|
api_token=get_llm_api_key(config), # Returns None to let litellm handle it
|
||||||
|
temperature=get_llm_temperature(config),
|
||||||
|
base_url=get_llm_base_url(config)
|
||||||
)
|
)
|
||||||
|
|
||||||
return response.choices[0].message.content
|
return response.choices[0].message.content
|
||||||
@@ -115,7 +119,9 @@ async def process_llm_extraction(
|
|||||||
instruction: str,
|
instruction: str,
|
||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
base_url: Optional[str] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Process LLM extraction in background."""
|
"""Process LLM extraction in background."""
|
||||||
try:
|
try:
|
||||||
@@ -131,7 +137,9 @@ async def process_llm_extraction(
|
|||||||
llm_strategy = LLMExtractionStrategy(
|
llm_strategy = LLMExtractionStrategy(
|
||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=provider or config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=api_key
|
api_token=api_key,
|
||||||
|
temperature=temperature or get_llm_temperature(config, provider),
|
||||||
|
base_url=base_url or get_llm_base_url(config, provider)
|
||||||
),
|
),
|
||||||
instruction=instruction,
|
instruction=instruction,
|
||||||
schema=json.loads(schema) if schema else None,
|
schema=json.loads(schema) if schema else None,
|
||||||
@@ -178,7 +186,9 @@ async def handle_markdown_request(
|
|||||||
query: Optional[str] = None,
|
query: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
base_url: Optional[str] = None
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Handle markdown generation requests."""
|
"""Handle markdown generation requests."""
|
||||||
try:
|
try:
|
||||||
@@ -204,6 +214,8 @@ async def handle_markdown_request(
|
|||||||
llm_config=LLMConfig(
|
llm_config=LLMConfig(
|
||||||
provider=provider or config["llm"]["provider"],
|
provider=provider or config["llm"]["provider"],
|
||||||
api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
|
api_token=get_llm_api_key(config, provider), # Returns None to let litellm handle it
|
||||||
|
temperature=temperature or get_llm_temperature(config, provider),
|
||||||
|
base_url=base_url or get_llm_base_url(config, provider)
|
||||||
),
|
),
|
||||||
instruction=query or "Extract main content"
|
instruction=query or "Extract main content"
|
||||||
)
|
)
|
||||||
@@ -248,7 +260,9 @@ async def handle_llm_request(
|
|||||||
schema: Optional[str] = None,
|
schema: Optional[str] = None,
|
||||||
cache: str = "0",
|
cache: str = "0",
|
||||||
config: Optional[dict] = None,
|
config: Optional[dict] = None,
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Handle LLM extraction requests."""
|
"""Handle LLM extraction requests."""
|
||||||
base_url = get_base_url(request)
|
base_url = get_base_url(request)
|
||||||
@@ -279,7 +293,9 @@ async def handle_llm_request(
|
|||||||
cache,
|
cache,
|
||||||
base_url,
|
base_url,
|
||||||
config,
|
config,
|
||||||
provider
|
provider,
|
||||||
|
temperature,
|
||||||
|
api_base_url
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -324,7 +340,9 @@ async def create_new_task(
|
|||||||
cache: str,
|
cache: str,
|
||||||
base_url: str,
|
base_url: str,
|
||||||
config: dict,
|
config: dict,
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None,
|
||||||
|
temperature: Optional[float] = None,
|
||||||
|
api_base_url: Optional[str] = None
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Create and initialize a new task."""
|
"""Create and initialize a new task."""
|
||||||
decoded_url = unquote(input_path)
|
decoded_url = unquote(input_path)
|
||||||
@@ -349,7 +367,9 @@ async def create_new_task(
|
|||||||
query,
|
query,
|
||||||
schema,
|
schema,
|
||||||
cache,
|
cache,
|
||||||
provider
|
provider,
|
||||||
|
temperature,
|
||||||
|
api_base_url
|
||||||
)
|
)
|
||||||
|
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
|
|||||||
@@ -28,25 +28,43 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
|||||||
signing_key = get_jwk_from_secret(SECRET_KEY)
|
signing_key = get_jwk_from_secret(SECRET_KEY)
|
||||||
return instance.encode(to_encode, signing_key, alg='HS256')
|
return instance.encode(to_encode, signing_key, alg='HS256')
|
||||||
|
|
||||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
|
||||||
"""Verify the JWT token from the Authorization header."""
|
"""Verify the JWT token from the Authorization header."""
|
||||||
|
|
||||||
if credentials is None:
|
if not credentials or not credentials.credentials:
|
||||||
return None
|
raise HTTPException(
|
||||||
|
status_code=401,
|
||||||
|
detail="No token provided",
|
||||||
|
headers={"WWW-Authenticate": "Bearer"}
|
||||||
|
)
|
||||||
|
|
||||||
token = credentials.credentials
|
token = credentials.credentials
|
||||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||||
try:
|
try:
|
||||||
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
||||||
return payload
|
return payload
|
||||||
except Exception:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
raise HTTPException(
|
||||||
|
status_code=401,
|
||||||
|
detail=f"Invalid or expired token: {str(e)}",
|
||||||
|
headers={"WWW-Authenticate": "Bearer"}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_token_dependency(config: Dict):
|
def get_token_dependency(config: Dict):
|
||||||
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||||
|
|
||||||
if config.get("security", {}).get("jwt_enabled", False):
|
if config.get("security", {}).get("jwt_enabled", False):
|
||||||
return verify_token
|
def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||||
|
"""Enforce JWT authentication when enabled."""
|
||||||
|
if credentials is None:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=401,
|
||||||
|
detail="Authentication required. Please provide a valid Bearer token.",
|
||||||
|
headers={"WWW-Authenticate": "Bearer"}
|
||||||
|
)
|
||||||
|
return verify_token(credentials)
|
||||||
|
return jwt_required
|
||||||
else:
|
else:
|
||||||
return lambda: None
|
return lambda: None
|
||||||
|
|
||||||
|
|||||||
@@ -38,8 +38,8 @@ rate_limiting:
|
|||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
security:
|
security:
|
||||||
enabled: false
|
enabled: false
|
||||||
jwt_enabled: false
|
jwt_enabled: false
|
||||||
https_redirect: false
|
https_redirect: false
|
||||||
trusted_hosts: ["*"]
|
trusted_hosts: ["*"]
|
||||||
headers:
|
headers:
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ class LlmJobPayload(BaseModel):
|
|||||||
schema: Optional[str] = None
|
schema: Optional[str] = None
|
||||||
cache: bool = False
|
cache: bool = False
|
||||||
provider: Optional[str] = None
|
provider: Optional[str] = None
|
||||||
|
temperature: Optional[float] = None
|
||||||
|
base_url: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class CrawlJobPayload(BaseModel):
|
class CrawlJobPayload(BaseModel):
|
||||||
@@ -63,6 +65,8 @@ async def llm_job_enqueue(
|
|||||||
cache=payload.cache,
|
cache=payload.cache,
|
||||||
config=_config,
|
config=_config,
|
||||||
provider=payload.provider,
|
provider=payload.provider,
|
||||||
|
temperature=payload.temperature,
|
||||||
|
api_base_url=payload.base_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -72,7 +76,7 @@ async def llm_job_status(
|
|||||||
task_id: str,
|
task_id: str,
|
||||||
_td: Dict = Depends(lambda: _token_dep())
|
_td: Dict = Depends(lambda: _token_dep())
|
||||||
):
|
):
|
||||||
return await handle_task_status(_redis, task_id)
|
return await handle_task_status(_redis, task_id, base_url=str(request.base_url))
|
||||||
|
|
||||||
|
|
||||||
# ---------- CRAWL job -------------------------------------------------------
|
# ---------- CRAWL job -------------------------------------------------------
|
||||||
|
|||||||
@@ -16,6 +16,8 @@ class MarkdownRequest(BaseModel):
|
|||||||
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
|
||||||
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
c: Optional[str] = Field("0", description="Cache‑bust / revision counter")
|
||||||
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
provider: Optional[str] = Field(None, description="LLM provider override (e.g., 'anthropic/claude-3-opus')")
|
||||||
|
temperature: Optional[float] = Field(None, description="LLM temperature override (0.0-2.0)")
|
||||||
|
base_url: Optional[str] = Field(None, description="LLM API base URL override")
|
||||||
|
|
||||||
|
|
||||||
class RawCode(BaseModel):
|
class RawCode(BaseModel):
|
||||||
|
|||||||
@@ -241,7 +241,8 @@ async def get_markdown(
|
|||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
||||||
markdown = await handle_markdown_request(
|
markdown = await handle_markdown_request(
|
||||||
body.url, body.f, body.q, body.c, config, body.provider
|
body.url, body.f, body.q, body.c, config, body.provider,
|
||||||
|
body.temperature, body.base_url
|
||||||
)
|
)
|
||||||
return JSONResponse({
|
return JSONResponse({
|
||||||
"url": body.url,
|
"url": body.url,
|
||||||
|
|||||||
@@ -108,6 +108,69 @@ def validate_llm_provider(config: Dict, provider: Optional[str] = None) -> tuple
|
|||||||
return True, ""
|
return True, ""
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm_temperature(config: Dict, provider: Optional[str] = None) -> Optional[float]:
    """Get temperature setting based on the LLM provider.

    Priority order:
    1. Provider-specific environment variable (e.g., OPENAI_TEMPERATURE)
    2. Global LLM_TEMPERATURE environment variable
    3. None (to use litellm/provider defaults)

    A value that cannot be parsed as a float, is not finite ("nan", "inf"),
    or is negative is logged as a warning and skipped, so the lookup falls
    through to the next priority level instead of forwarding a nonsensical
    temperature.

    Args:
        config: The application configuration dictionary. Accepted for
            signature parity with the other ``get_llm_*`` helpers; it is
            not read here.
        provider: Optional provider override (e.g., "openai/gpt-4"). Only the
            part before the first "/" is used, upper-cased, to build the
            environment variable name.

    Returns:
        The temperature setting if configured, otherwise None
    """
    # Check provider-specific temperature first
    if provider:
        provider_name = provider.split('/')[0].upper()
        provider_temp = _parse_temperature(
            os.environ.get(f"{provider_name}_TEMPERATURE"),
            f"temperature value for {provider_name}",
        )
        if provider_temp is not None:
            return provider_temp

    # Check global LLM_TEMPERATURE; None means "use litellm/provider defaults"
    return _parse_temperature(
        os.environ.get("LLM_TEMPERATURE"),
        "global temperature value",
    )


def _parse_temperature(raw: Optional[str], label: str) -> Optional[float]:
    """Parse a raw environment value into a usable temperature, or None.

    Args:
        raw: The raw environment variable value (None or "" when unset).
        label: Human-readable description used in the warning message.

    Returns:
        A finite, non-negative float, or None when the value is unset or
        invalid (a warning is logged in the invalid case).
    """
    import math  # local import: the file-level import block is not visible here

    if not raw:
        return None
    try:
        value = float(raw)
    except ValueError:
        value = None
    # float() happily parses "nan", "inf" and negative numbers; none of them
    # is a meaningful sampling temperature.
    if value is None or not math.isfinite(value) or value < 0:
        logging.warning(f"Invalid {label}: {raw}")
        return None
    return value
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm_base_url(config: Dict, provider: Optional[str] = None) -> Optional[str]:
    """Get base URL setting based on the LLM provider.

    Priority order:
    1. Provider-specific environment variable (e.g., OPENAI_BASE_URL)
    2. Global LLM_BASE_URL environment variable
    3. None (to use default endpoints)

    Values are stripped of surrounding whitespace, and a variable that is set
    but blank (e.g. ``LLM_BASE_URL=`` in an env file) is treated as unset, so
    callers receive None rather than an empty string.

    Args:
        config: The application configuration dictionary. Accepted for
            signature parity with the other ``get_llm_*`` helpers; it is
            not read here.
        provider: Optional provider override (e.g., "openai/gpt-4"). Only the
            part before the first "/" is used, upper-cased, to build the
            environment variable name.

    Returns:
        The base URL if configured, otherwise None
    """
    # Build the lookup order: provider-specific first, then the global one.
    env_names = []
    if provider:
        provider_name = provider.split('/')[0].upper()
        env_names.append(f"{provider_name}_BASE_URL")
    env_names.append("LLM_BASE_URL")

    for env_name in env_names:
        url = os.environ.get(env_name, "").strip()
        if url:
            return url

    # Nothing usable configured: let the caller use the default endpoint.
    return None
|
||||||
|
|
||||||
|
|
||||||
def verify_email_domain(email: str) -> bool:
|
def verify_email_domain(email: str) -> bool:
|
||||||
try:
|
try:
|
||||||
domain = email.split('@')[1]
|
domain = email.split('@')[1]
|
||||||
|
|||||||
@@ -126,30 +126,6 @@ Factors:
|
|||||||
- URL depth (fewer slashes = higher authority)
|
- URL depth (fewer slashes = higher authority)
|
||||||
- Clean URL structure
|
- Clean URL structure
|
||||||
|
|
||||||
### Custom Link Scoring
|
|
||||||
|
|
||||||
```python
|
|
||||||
class CustomLinkScorer:
|
|
||||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
|
||||||
# Prioritize specific URL patterns
|
|
||||||
if "/api/reference/" in link.href:
|
|
||||||
return 2.0 # Double the score
|
|
||||||
|
|
||||||
# Deprioritize certain sections
|
|
||||||
if "/archive/" in link.href:
|
|
||||||
return 0.1 # Reduce score by 90%
|
|
||||||
|
|
||||||
# Default scoring
|
|
||||||
return 1.0
|
|
||||||
|
|
||||||
# Use with adaptive crawler
|
|
||||||
adaptive = AdaptiveCrawler(
|
|
||||||
crawler,
|
|
||||||
config=config,
|
|
||||||
link_scorer=CustomLinkScorer()
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Domain-Specific Configurations
|
## Domain-Specific Configurations
|
||||||
|
|
||||||
### Technical Documentation
|
### Technical Documentation
|
||||||
@@ -230,8 +206,12 @@ config = AdaptiveConfig(
|
|||||||
|
|
||||||
# Periodically clean state
|
# Periodically clean state
|
||||||
if len(state.knowledge_base) > 1000:
|
if len(state.knowledge_base) > 1000:
|
||||||
# Keep only most relevant
|
# Keep only the top 500 most relevant docs
|
||||||
state.knowledge_base = get_top_relevant(state.knowledge_base, 500)
|
top_content = adaptive.get_relevant_content(top_k=500)
|
||||||
|
keep_indices = {d["index"] for d in top_content}
|
||||||
|
state.knowledge_base = [
|
||||||
|
doc for i, doc in enumerate(state.knowledge_base) if i in keep_indices
|
||||||
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Parallel Processing
|
### Parallel Processing
|
||||||
@@ -252,18 +232,6 @@ tasks = [
|
|||||||
results = await asyncio.gather(*tasks)
|
results = await asyncio.gather(*tasks)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Caching Strategy
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Enable caching for repeated crawls
|
|
||||||
async with AsyncWebCrawler(
|
|
||||||
config=BrowserConfig(
|
|
||||||
cache_mode=CacheMode.ENABLED
|
|
||||||
)
|
|
||||||
) as crawler:
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Debugging & Analysis
|
## Debugging & Analysis
|
||||||
|
|
||||||
### Enable Verbose Logging
|
### Enable Verbose Logging
|
||||||
@@ -322,9 +290,9 @@ with open("crawl_analysis.json", "w") as f:
|
|||||||
### Implementing a Custom Strategy
|
### Implementing a Custom Strategy
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
from crawl4ai.adaptive_crawler import CrawlStrategy
|
||||||
|
|
||||||
class DomainSpecificStrategy(BaseStrategy):
|
class DomainSpecificStrategy(CrawlStrategy):
|
||||||
def calculate_coverage(self, state: CrawlState) -> float:
|
def calculate_coverage(self, state: CrawlState) -> float:
|
||||||
# Custom coverage calculation
|
# Custom coverage calculation
|
||||||
# e.g., weight certain terms more heavily
|
# e.g., weight certain terms more heavily
|
||||||
@@ -351,7 +319,7 @@ adaptive = AdaptiveCrawler(
|
|||||||
### Combining Strategies
|
### Combining Strategies
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class HybridStrategy(BaseStrategy):
|
class HybridStrategy(CrawlStrategy):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.strategies = [
|
self.strategies = [
|
||||||
TechnicalDocStrategy(),
|
TechnicalDocStrategy(),
|
||||||
|
|||||||
@@ -89,6 +89,16 @@ ANTHROPIC_API_KEY=your-anthropic-key
|
|||||||
# TOGETHER_API_KEY=your-together-key
|
# TOGETHER_API_KEY=your-together-key
|
||||||
# MISTRAL_API_KEY=your-mistral-key
|
# MISTRAL_API_KEY=your-mistral-key
|
||||||
# GEMINI_API_TOKEN=your-gemini-token
|
# GEMINI_API_TOKEN=your-gemini-token
|
||||||
|
|
||||||
|
# Optional: Global LLM settings
|
||||||
|
# LLM_PROVIDER=openai/gpt-4o-mini
|
||||||
|
# LLM_TEMPERATURE=0.7
|
||||||
|
# LLM_BASE_URL=https://api.custom.com/v1
|
||||||
|
|
||||||
|
# Optional: Provider-specific overrides
|
||||||
|
# OPENAI_TEMPERATURE=0.5
|
||||||
|
# OPENAI_BASE_URL=https://custom-openai.com/v1
|
||||||
|
# ANTHROPIC_TEMPERATURE=0.3
|
||||||
EOL
|
EOL
|
||||||
```
|
```
|
||||||
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
||||||
@@ -156,28 +166,44 @@ cp deploy/docker/.llm.env.example .llm.env
|
|||||||
|
|
||||||
**Flexible LLM Provider Configuration:**
|
**Flexible LLM Provider Configuration:**
|
||||||
|
|
||||||
The Docker setup now supports flexible LLM provider configuration through three methods:
|
The Docker setup now supports flexible LLM provider configuration through a hierarchical system:
|
||||||
|
|
||||||
1. **Environment Variable** (Highest Priority): Set `LLM_PROVIDER` to override the default
|
1. **API Request Parameters** (Highest Priority): Specify per request
|
||||||
```bash
|
|
||||||
export LLM_PROVIDER="anthropic/claude-3-opus"
|
|
||||||
# Or in your .llm.env file:
|
|
||||||
# LLM_PROVIDER=anthropic/claude-3-opus
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **API Request Parameter**: Specify provider per request
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"url": "https://example.com",
|
"url": "https://example.com",
|
||||||
"f": "llm",
|
"f": "llm",
|
||||||
"provider": "groq/mixtral-8x7b"
|
"provider": "groq/mixtral-8x7b",
|
||||||
|
"temperature": 0.7,
|
||||||
|
"base_url": "https://api.custom.com/v1"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Config File Default**: Falls back to `config.yml` (default: `openai/gpt-4o-mini`)
|
2. **Provider-Specific Environment Variables**: Override for specific providers
|
||||||
|
```bash
|
||||||
|
# In your .llm.env file:
|
||||||
|
OPENAI_TEMPERATURE=0.5
|
||||||
|
OPENAI_BASE_URL=https://custom-openai.com/v1
|
||||||
|
ANTHROPIC_TEMPERATURE=0.3
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Global Environment Variables**: Set defaults for all providers
|
||||||
|
```bash
|
||||||
|
# In your .llm.env file:
|
||||||
|
LLM_PROVIDER=anthropic/claude-3-opus
|
||||||
|
LLM_TEMPERATURE=0.7
|
||||||
|
LLM_BASE_URL=https://api.proxy.com/v1
|
||||||
|
```
|
||||||
|
|
||||||
|
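4. **Config File Default**: The provider falls back to `config.yml` (default: `openai/gpt-4o-mini`). `temperature` and `base_url` have no config-file default; when unset they fall back to the LiteLLM/provider defaults.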
|
||||||
|
|
||||||
The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
|
The system automatically selects the appropriate API key based on the provider. LiteLLM handles finding the correct environment variable for each provider (e.g., OPENAI_API_KEY for OpenAI, GEMINI_API_TOKEN for Google Gemini, etc.).
|
||||||
|
|
||||||
|
**Supported LLM Parameters:**
|
||||||
|
- `provider`: LLM provider and model (e.g., "openai/gpt-4", "anthropic/claude-3-opus")
|
||||||
|
- `temperature`: Controls randomness (0.0-2.0, lower = more focused, higher = more creative)
|
||||||
|
- `base_url`: Custom API endpoint for proxy servers or alternative endpoints
|
||||||
|
|
||||||
#### 3. Build and Run with Compose
|
#### 3. Build and Run with Compose
|
||||||
|
|
||||||
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
|
||||||
@@ -555,6 +581,101 @@ Crucially, when sending configurations directly via JSON, they **must** follow t
|
|||||||
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
||||||
*(Keep Deep Crawler Example)*
|
*(Keep Deep Crawler Example)*
|
||||||
|
|
||||||
|
### LLM Configuration Examples
|
||||||
|
|
||||||
|
The Docker API supports dynamic LLM configuration through multiple levels:
|
||||||
|
|
||||||
|
#### Temperature Control
|
||||||
|
|
||||||
|
Temperature affects the randomness of LLM responses (0.0 = deterministic, 2.0 = very creative):
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Low temperature for factual extraction
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/md",
|
||||||
|
json={
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Extract all dates and numbers from this page",
|
||||||
|
"temperature": 0.2 # Very focused, deterministic
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# High temperature for creative tasks
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/md",
|
||||||
|
json={
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Write a creative summary of this content",
|
||||||
|
"temperature": 1.2 # More creative, varied responses
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Custom API Endpoints
|
||||||
|
|
||||||
|
Use custom base URLs for proxy servers or alternative API endpoints:
|
||||||
|
|
||||||
|
```python
|
||||||
|
|
||||||
|
# Using a local LLM server
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/md",
|
||||||
|
json={
|
||||||
|
"url": "https://example.com",
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Extract key information",
|
||||||
|
"provider": "ollama/llama2",
|
||||||
|
"base_url": "http://localhost:11434/v1"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Dynamic Provider Selection
|
||||||
|
|
||||||
|
Switch between providers based on task requirements:
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def smart_extraction(url: str, content_type: str):
|
||||||
|
"""Select provider and temperature based on content type"""
|
||||||
|
|
||||||
|
configs = {
|
||||||
|
"technical": {
|
||||||
|
"provider": "openai/gpt-4",
|
||||||
|
"temperature": 0.3,
|
||||||
|
"query": "Extract technical specifications and code examples"
|
||||||
|
},
|
||||||
|
"creative": {
|
||||||
|
"provider": "anthropic/claude-3-opus",
|
||||||
|
"temperature": 0.9,
|
||||||
|
"query": "Create an engaging narrative summary"
|
||||||
|
},
|
||||||
|
"quick": {
|
||||||
|
"provider": "groq/mixtral-8x7b",
|
||||||
|
"temperature": 0.5,
|
||||||
|
"query": "Quick summary in bullet points"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
config = configs.get(content_type, configs["quick"])
|
||||||
|
|
||||||
|
async with httpx.AsyncClient() as client:
response = await client.post(
|
||||||
|
"http://localhost:11235/md",
|
||||||
|
json={
|
||||||
|
"url": url,
|
||||||
|
"f": "llm",
|
||||||
|
"q": config["query"],
|
||||||
|
"provider": config["provider"],
|
||||||
|
"temperature": config["temperature"]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return response.json()
|
||||||
|
```
|
||||||
|
|
||||||
### REST API Examples
|
### REST API Examples
|
||||||
|
|
||||||
Update URLs to use port `11235`.
|
Update URLs to use port `11235`.
|
||||||
@@ -694,6 +815,7 @@ app:
|
|||||||
llm:
|
llm:
|
||||||
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
provider: "openai/gpt-4o-mini" # Can be overridden by LLM_PROVIDER env var
|
||||||
# api_key: sk-... # If you pass the API key directly (not recommended)
|
# api_key: sk-... # If you pass the API key directly (not recommended)
|
||||||
|
# temperature and base_url are controlled via environment variables or request parameters
|
||||||
|
|
||||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||||
redis:
|
redis:
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ if __name__ == "__main__":
|
|||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`
|
> IMPORTANT: By default, the cache mode is `CacheMode.BYPASS`, so every crawl fetches fresh content. Set it to `CacheMode.ENABLED` to enable caching.
|
||||||
|
|
||||||
We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
||||||
|
|
||||||
|
|||||||
201
tests/docker/test_filter_deep_crawl.py
Normal file
201
tests/docker/test_filter_deep_crawl.py
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
"""
|
||||||
|
Test the complete fix for both the filter serialization and JSON serialization issues.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from crawl4ai import BrowserConfig, CacheMode, CrawlerRunConfig
|
||||||
|
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, FilterChain, URLPatternFilter
|
||||||
|
|
||||||
|
BASE_URL = "http://localhost:11234/" # Adjust port as needed
|
||||||
|
|
||||||
|
async def test_with_docker_client():
|
||||||
|
"""Test using the Docker client (same as 1419.py)."""
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
print("Testing with Docker Client")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with Crawl4aiDockerClient(
|
||||||
|
base_url=BASE_URL,
|
||||||
|
verbose=True,
|
||||||
|
) as client:
|
||||||
|
|
||||||
|
# Create filter chain - testing the serialization fix
|
||||||
|
filter_chain = [
|
||||||
|
URLPatternFilter(
|
||||||
|
# patterns=["*about*", "*privacy*", "*terms*"],
|
||||||
|
patterns=["*advanced*"],
|
||||||
|
reverse=True
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
crawler_config = CrawlerRunConfig(
|
||||||
|
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||||
|
max_depth=2, # Keep it shallow for testing
|
||||||
|
# max_pages=5, # Limit pages for testing
|
||||||
|
filter_chain=FilterChain(filter_chain)
|
||||||
|
),
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n1. Testing crawl with filters...")
|
||||||
|
results = await client.crawl(
|
||||||
|
["https://docs.crawl4ai.com"], # Simple test page
|
||||||
|
browser_config=BrowserConfig(headless=True),
|
||||||
|
crawler_config=crawler_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
if results:
|
||||||
|
print(f"✅ Crawl succeeded! Type: {type(results)}")
|
||||||
|
if hasattr(results, 'success'):
|
||||||
|
print(f"✅ Results success: {results.success}")
|
||||||
|
# Test that we can iterate results without JSON errors
|
||||||
|
if hasattr(results, '__iter__'):
|
||||||
|
for i, result in enumerate(results):
|
||||||
|
if hasattr(result, 'url'):
|
||||||
|
print(f" Result {i}: {result.url[:50]}...")
|
||||||
|
else:
|
||||||
|
print(f" Result {i}: {str(result)[:50]}...")
|
||||||
|
else:
|
||||||
|
# Handle list of results
|
||||||
|
print(f"✅ Got {len(results)} results")
|
||||||
|
for i, result in enumerate(results[:3]): # Show first 3
|
||||||
|
print(f" Result {i}: {result.url[:50]}...")
|
||||||
|
else:
|
||||||
|
print("❌ Crawl failed - no results returned")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("\n✅ Docker client test completed successfully!")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Docker client test failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def test_with_rest_api():
|
||||||
|
"""Test using REST API directly."""
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("Testing with REST API")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
# Create filter configuration
|
||||||
|
deep_crawl_strategy_payload = {
|
||||||
|
"type": "BFSDeepCrawlStrategy",
|
||||||
|
"params": {
|
||||||
|
"max_depth": 2,
|
||||||
|
# "max_pages": 5,
|
||||||
|
"filter_chain": {
|
||||||
|
"type": "FilterChain",
|
||||||
|
"params": {
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"type": "URLPatternFilter",
|
||||||
|
"params": {
|
||||||
|
"patterns": ["*advanced*"],
|
||||||
|
"reverse": True
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
crawl_payload = {
|
||||||
|
"urls": ["https://docs.crawl4ai.com"],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"deep_crawl_strategy": deep_crawl_strategy_payload,
|
||||||
|
"cache_mode": "bypass"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
print("\n1. Sending crawl request to REST API...")
|
||||||
|
response = await client.post(
|
||||||
|
f"{BASE_URL}crawl",
|
||||||
|
json=crawl_payload,
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
print(f"✅ REST API returned 200 OK")
|
||||||
|
data = response.json()
|
||||||
|
if data.get("success"):
|
||||||
|
results = data.get("results", [])
|
||||||
|
print(f"✅ Got {len(results)} results")
|
||||||
|
for i, result in enumerate(results[:3]):
|
||||||
|
print(f" Result {i}: {result.get('url', 'unknown')[:50]}...")
|
||||||
|
else:
|
||||||
|
print(f"❌ Crawl not successful: {data}")
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
print(f"❌ REST API returned {response.status_code}")
|
||||||
|
print(f" Response: {response.text[:500]}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("\n✅ REST API test completed successfully!")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ REST API test failed: {e}")
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Run all tests."""
|
||||||
|
print("\n🧪 TESTING COMPLETE FIX FOR DOCKER FILTER AND JSON ISSUES")
|
||||||
|
print("=" * 60)
|
||||||
|
print("Make sure the server is running with the updated code!")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
# Test 1: Docker client
|
||||||
|
docker_passed = await test_with_docker_client()
|
||||||
|
results.append(("Docker Client", docker_passed))
|
||||||
|
|
||||||
|
# Test 2: REST API
|
||||||
|
rest_passed = await test_with_rest_api()
|
||||||
|
results.append(("REST API", rest_passed))
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
print("\n" + "=" * 60)
|
||||||
|
print("FINAL TEST SUMMARY")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
all_passed = True
|
||||||
|
for test_name, passed in results:
|
||||||
|
status = "✅ PASSED" if passed else "❌ FAILED"
|
||||||
|
print(f"{test_name:20} {status}")
|
||||||
|
if not passed:
|
||||||
|
all_passed = False
|
||||||
|
|
||||||
|
print("=" * 60)
|
||||||
|
if all_passed:
|
||||||
|
print("🎉 ALL TESTS PASSED! Both issues are fully resolved!")
|
||||||
|
print("\nThe fixes:")
|
||||||
|
print("1. Filter serialization: Fixed by not serializing private __slots__")
|
||||||
|
print("2. JSON serialization: Fixed by removing property descriptors from model_dump()")
|
||||||
|
else:
|
||||||
|
print("⚠️ Some tests failed. Please check the server logs for details.")
|
||||||
|
|
||||||
|
return 0 if all_passed else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import sys
|
||||||
|
sys.exit(asyncio.run(main()))
|
||||||
349
tests/docker/test_llm_params.py
Executable file
349
tests/docker/test_llm_params.py
Executable file
@@ -0,0 +1,349 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for LLM temperature and base_url parameters in Crawl4AI Docker API.
|
||||||
|
This demonstrates the new hierarchical configuration system:
|
||||||
|
1. Request-level parameters (highest priority)
|
||||||
|
2. Provider-specific environment variables
|
||||||
|
3. Global environment variables
|
||||||
|
4. System defaults (lowest priority)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import httpx
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.panel import Panel
|
||||||
|
from rich.syntax import Syntax
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
|
||||||
|
console = Console()
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
BASE_URL = "http://localhost:11235" # Docker API endpoint
|
||||||
|
TEST_URL = "https://httpbin.org/html" # Simple test page
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
async def check_server_health(client: httpx.AsyncClient) -> bool:
|
||||||
|
"""Check if the server is healthy."""
|
||||||
|
console.print("[bold cyan]Checking server health...[/]", end="")
|
||||||
|
try:
|
||||||
|
response = await client.get("/health", timeout=10.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
console.print(" [bold green]✓ Server is healthy![/]")
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"\n[bold red]✗ Server health check failed: {e}[/]")
|
||||||
|
console.print(f"Is the server running at {BASE_URL}?")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def print_request(endpoint: str, payload: dict, title: str = "Request"):
|
||||||
|
"""Pretty print the request."""
|
||||||
|
syntax = Syntax(json.dumps(payload, indent=2), "json", theme="monokai")
|
||||||
|
console.print(Panel.fit(
|
||||||
|
f"[cyan]POST {endpoint}[/cyan]\n{syntax}",
|
||||||
|
title=f"[bold blue]{title}[/]",
|
||||||
|
border_style="blue"
|
||||||
|
))
|
||||||
|
|
||||||
|
def print_response(response: dict, title: str = "Response"):
|
||||||
|
"""Pretty print relevant parts of the response."""
|
||||||
|
# Extract only the relevant parts
|
||||||
|
relevant = {}
|
||||||
|
if "markdown" in response:
|
||||||
|
relevant["markdown"] = response["markdown"][:200] + "..." if len(response.get("markdown", "")) > 200 else response.get("markdown", "")
|
||||||
|
if "success" in response:
|
||||||
|
relevant["success"] = response["success"]
|
||||||
|
if "url" in response:
|
||||||
|
relevant["url"] = response["url"]
|
||||||
|
if "filter" in response:
|
||||||
|
relevant["filter"] = response["filter"]
|
||||||
|
|
||||||
|
console.print(Panel.fit(
|
||||||
|
Syntax(json.dumps(relevant, indent=2), "json", theme="monokai"),
|
||||||
|
title=f"[bold green]{title}[/]",
|
||||||
|
border_style="green"
|
||||||
|
))
|
||||||
|
|
||||||
|
# --- Test Functions ---
|
||||||
|
|
||||||
|
async def test_default_no_params(client: httpx.AsyncClient):
|
||||||
|
"""Test 1: No temperature or base_url specified - uses defaults"""
|
||||||
|
console.rule("[bold yellow]Test 1: Default Configuration (No Parameters)[/]")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"f": "llm",
|
||||||
|
"q": "What is the main heading of this page? Answer in exactly 5 words."
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/md", payload, "Request without temperature/base_url")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await client.post("/md", json=payload, timeout=30.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
print_response(data, "Response (using system defaults)")
|
||||||
|
console.print("[dim]→ This used system defaults or environment variables if set[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
async def test_request_temperature(client: httpx.AsyncClient):
|
||||||
|
"""Test 2: Request-level temperature (highest priority)"""
|
||||||
|
console.rule("[bold yellow]Test 2: Request-Level Temperature[/]")
|
||||||
|
|
||||||
|
# Test with low temperature (more focused)
|
||||||
|
payload_low = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"f": "llm",
|
||||||
|
"q": "What is the main heading? Be creative and poetic.",
|
||||||
|
"temperature": 0.1 # Very low - should be less creative
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/md", payload_low, "Low Temperature (0.1)")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await client.post("/md", json=payload_low, timeout=30.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
data_low = response.json()
|
||||||
|
print_response(data_low, "Response with Low Temperature")
|
||||||
|
console.print("[dim]→ Low temperature (0.1) should produce focused, less creative output[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
console.print()
|
||||||
|
|
||||||
|
# Test with high temperature (more creative)
|
||||||
|
payload_high = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"f": "llm",
|
||||||
|
"q": "What is the main heading? Be creative and poetic.",
|
||||||
|
"temperature": 1.5 # High - should be more creative
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/md", payload_high, "High Temperature (1.5)")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await client.post("/md", json=payload_high, timeout=30.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
data_high = response.json()
|
||||||
|
print_response(data_high, "Response with High Temperature")
|
||||||
|
console.print("[dim]→ High temperature (1.5) should produce more creative, varied output[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
async def test_provider_override(client: httpx.AsyncClient):
|
||||||
|
"""Test 3: Provider override with temperature"""
|
||||||
|
console.rule("[bold yellow]Test 3: Provider Override with Temperature[/]")
|
||||||
|
|
||||||
|
provider = "gemini/gemini-2.5-flash-lite"
|
||||||
|
payload = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"f": "llm",
|
||||||
|
"q": "Summarize this page in one sentence.",
|
||||||
|
"provider": provider, # Explicitly set provider
|
||||||
|
"temperature": 0.7
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/md", payload, "Provider + Temperature Override")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await client.post("/md", json=payload, timeout=30.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
print_response(data, "Response with Provider Override")
|
||||||
|
console.print(f"[dim]→ This explicitly uses {provider} with temperature 0.7[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
async def test_base_url_custom(client: httpx.AsyncClient):
|
||||||
|
"""Test 4: Custom base_url (will fail unless you have a custom endpoint)"""
|
||||||
|
console.rule("[bold yellow]Test 4: Custom Base URL (Demo Only)[/]")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"f": "llm",
|
||||||
|
"q": "What is this page about?",
|
||||||
|
"base_url": "https://api.custom-endpoint.com/v1", # Custom endpoint
|
||||||
|
"temperature": 0.5
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/md", payload, "Custom Base URL Request")
|
||||||
|
console.print("[yellow]Note: This will fail unless you have a custom endpoint set up[/]")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await client.post("/md", json=payload, timeout=10.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
print_response(data, "Response from Custom Endpoint")
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
console.print(f"[yellow]Expected failure (no custom endpoint): Status {e.response.status_code}[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[yellow]Expected error: {e}[/]")
|
||||||
|
|
||||||
|
async def test_llm_job_endpoint(client: httpx.AsyncClient):
|
||||||
|
"""Test 5: Test the /llm/job endpoint with temperature and base_url"""
|
||||||
|
console.rule("[bold yellow]Test 5: LLM Job Endpoint with Parameters[/]")
|
||||||
|
|
||||||
|
payload = {
|
||||||
|
"url": TEST_URL,
|
||||||
|
"q": "Extract the main title and any key information",
|
||||||
|
"temperature": 0.3,
|
||||||
|
# "base_url": "https://api.openai.com/v1" # Optional
|
||||||
|
}
|
||||||
|
|
||||||
|
print_request("/llm/job", payload, "LLM Job with Temperature")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Submit the job
|
||||||
|
response = await client.post("/llm/job", json=payload, timeout=30.0)
|
||||||
|
response.raise_for_status()
|
||||||
|
job_data = response.json()
|
||||||
|
|
||||||
|
if "task_id" in job_data:
|
||||||
|
task_id = job_data["task_id"]
|
||||||
|
console.print(f"[green]Job created with task_id: {task_id}[/]")
|
||||||
|
|
||||||
|
# Poll for result (simplified - in production use proper polling)
|
||||||
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
|
status_response = await client.get(f"/llm/job/{task_id}")
|
||||||
|
status_data = status_response.json()
|
||||||
|
|
||||||
|
if status_data.get("status") == "completed":
|
||||||
|
console.print("[green]Job completed successfully![/]")
|
||||||
|
if "result" in status_data:
|
||||||
|
console.print(Panel.fit(
|
||||||
|
Syntax(json.dumps(status_data["result"], indent=2), "json", theme="monokai"),
|
||||||
|
title="Extraction Result",
|
||||||
|
border_style="green"
|
||||||
|
))
|
||||||
|
else:
|
||||||
|
console.print(f"[yellow]Job status: {status_data.get('status', 'unknown')}[/]")
|
||||||
|
else:
|
||||||
|
console.print(f"[red]Unexpected response: {job_data}[/]")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Error: {e}[/]")
|
||||||
|
|
||||||
|
|
||||||
|
async def test_llm_endpoint(client: httpx.AsyncClient):
|
||||||
|
"""
|
||||||
|
Quick QA round-trip with /llm.
|
||||||
|
Asks a trivial question about a fixed page URL just to show the wiring.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
|
||||||
|
page_url = "https://kidocode.com"
|
||||||
|
question = "What is the title of this page?"
|
||||||
|
|
||||||
|
enc = urllib.parse.quote_plus(page_url, safe="")
|
||||||
|
console.print(f"GET /llm/{enc}?q={question}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
t0 = time.time()
|
||||||
|
resp = await client.get(f"/llm/{enc}", params={"q": question})
|
||||||
|
dt = time.time() - t0
|
||||||
|
console.print(
|
||||||
|
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
|
||||||
|
resp.raise_for_status()
|
||||||
|
answer = resp.json().get("answer", "")
|
||||||
|
console.print(Panel(answer or "No answer returned",
|
||||||
|
title="LLM answer", border_style="magenta", expand=False))
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[bold red]Error hitting /llm:[/] {e}")
|
||||||
|
|
||||||
|
|
||||||
|
async def show_environment_info():
|
||||||
|
"""Display current environment configuration"""
|
||||||
|
console.rule("[bold cyan]Current Environment Configuration[/]")
|
||||||
|
|
||||||
|
table = Table(title="LLM Environment Variables", show_header=True, header_style="bold magenta")
|
||||||
|
table.add_column("Variable", style="cyan", width=30)
|
||||||
|
table.add_column("Value", style="yellow")
|
||||||
|
table.add_column("Description", style="dim")
|
||||||
|
|
||||||
|
env_vars = [
|
||||||
|
("LLM_PROVIDER", "Global default provider"),
|
||||||
|
("LLM_TEMPERATURE", "Global default temperature"),
|
||||||
|
("LLM_BASE_URL", "Global custom API endpoint"),
|
||||||
|
("OPENAI_API_KEY", "OpenAI API key"),
|
||||||
|
("OPENAI_TEMPERATURE", "OpenAI-specific temperature"),
|
||||||
|
("OPENAI_BASE_URL", "OpenAI-specific endpoint"),
|
||||||
|
("ANTHROPIC_API_KEY", "Anthropic API key"),
|
||||||
|
("ANTHROPIC_TEMPERATURE", "Anthropic-specific temperature"),
|
||||||
|
("GROQ_API_KEY", "Groq API key"),
|
||||||
|
("GROQ_TEMPERATURE", "Groq-specific temperature"),
|
||||||
|
]
|
||||||
|
|
||||||
|
for var, desc in env_vars:
|
||||||
|
value = os.environ.get(var, "[not set]")
|
||||||
|
if "API_KEY" in var and value != "[not set]":
|
||||||
|
# Mask API keys for security
|
||||||
|
value = value[:10] + "..." if len(value) > 10 else "***"
|
||||||
|
table.add_row(var, value, desc)
|
||||||
|
|
||||||
|
console.print(table)
|
||||||
|
console.print()
|
||||||
|
|
||||||
|
# --- Main Test Runner ---
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
"""Run all tests"""
|
||||||
|
console.print(Panel.fit(
|
||||||
|
"[bold cyan]Crawl4AI LLM Parameters Test Suite[/]\n" +
|
||||||
|
"Testing temperature and base_url configuration hierarchy",
|
||||||
|
border_style="cyan"
|
||||||
|
))
|
||||||
|
|
||||||
|
# Show current environment
|
||||||
|
# await show_environment_info()
|
||||||
|
|
||||||
|
# Create HTTP client
|
||||||
|
async with httpx.AsyncClient(base_url=BASE_URL, timeout=60.0) as client:
|
||||||
|
# Check server health
|
||||||
|
if not await check_server_health(client):
|
||||||
|
console.print("[red]Server is not available. Please ensure the Docker container is running.[/]")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Run tests
|
||||||
|
tests = [
|
||||||
|
("Default Configuration", test_default_no_params),
|
||||||
|
("Request Temperature", test_request_temperature),
|
||||||
|
("Provider Override", test_provider_override),
|
||||||
|
("Custom Base URL", test_base_url_custom),
|
||||||
|
("LLM Job Endpoint", test_llm_job_endpoint),
|
||||||
|
("LLM Endpoint", test_llm_endpoint),
|
||||||
|
]
|
||||||
|
|
||||||
|
for i, (name, test_func) in enumerate(tests, 1):
|
||||||
|
if i > 1:
|
||||||
|
console.print() # Add spacing between tests
|
||||||
|
|
||||||
|
try:
|
||||||
|
await test_func(client)
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"[red]Test '{name}' failed with error: {e}[/]")
|
||||||
|
console.print_exception(show_locals=False)
|
||||||
|
|
||||||
|
console.rule("[bold green]All Tests Complete![/]", style="green")
|
||||||
|
|
||||||
|
# Summary
|
||||||
|
console.print("\n[bold cyan]Configuration Hierarchy Summary:[/]")
|
||||||
|
console.print("1. [yellow]Request parameters[/] - Highest priority (temperature, base_url in API call)")
|
||||||
|
console.print("2. [yellow]Provider-specific env[/] - e.g., OPENAI_TEMPERATURE, GROQ_BASE_URL")
|
||||||
|
console.print("3. [yellow]Global env variables[/] - LLM_TEMPERATURE, LLM_BASE_URL")
|
||||||
|
console.print("4. [yellow]System defaults[/] - Lowest priority (provider/litellm defaults)")
|
||||||
|
console.print()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
try:
|
||||||
|
asyncio.run(main())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
console.print("\n[yellow]Tests interrupted by user.[/]")
|
||||||
|
except Exception as e:
|
||||||
|
console.print(f"\n[bold red]An error occurred:[/]")
|
||||||
|
console.print_exception(show_locals=False)
|
||||||
Reference in New Issue
Block a user