Compare commits
2 Commits
fix/adapti
...
fix/reques
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1874a7b8d2 | ||
|
|
6a3b3e9d38 |
@@ -19,7 +19,7 @@ import re
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
|
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
|
||||||
from crawl4ai.models import Link, CrawlResult
|
from crawl4ai.models import Link, CrawlResult
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -178,7 +178,7 @@ class AdaptiveConfig:
|
|||||||
|
|
||||||
# Embedding strategy parameters
|
# Embedding strategy parameters
|
||||||
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
|
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
|
||||||
n_query_variations: int = 10
|
n_query_variations: int = 10
|
||||||
coverage_threshold: float = 0.85
|
coverage_threshold: float = 0.85
|
||||||
alpha_shape_alpha: float = 0.5
|
alpha_shape_alpha: float = 0.5
|
||||||
@@ -250,30 +250,6 @@ class AdaptiveConfig:
|
|||||||
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
||||||
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
||||||
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
||||||
|
|
||||||
@property
|
|
||||||
def _embedding_llm_config_dict(self) -> Optional[Dict]:
|
|
||||||
"""Convert LLMConfig to dict format for backward compatibility."""
|
|
||||||
if self.embedding_llm_config is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if isinstance(self.embedding_llm_config, dict):
|
|
||||||
# Already a dict - return as-is for backward compatibility
|
|
||||||
return self.embedding_llm_config
|
|
||||||
|
|
||||||
# Convert LLMConfig object to dict format
|
|
||||||
return {
|
|
||||||
'provider': self.embedding_llm_config.provider,
|
|
||||||
'api_token': self.embedding_llm_config.api_token,
|
|
||||||
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
|
|
||||||
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
|
|
||||||
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
|
|
||||||
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
|
|
||||||
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
|
|
||||||
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
|
|
||||||
'stop': getattr(self.embedding_llm_config, 'stop', None),
|
|
||||||
'n': getattr(self.embedding_llm_config, 'n', None),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class CrawlStrategy(ABC):
|
class CrawlStrategy(ABC):
|
||||||
@@ -617,7 +593,7 @@ class StatisticalStrategy(CrawlStrategy):
|
|||||||
class EmbeddingStrategy(CrawlStrategy):
|
class EmbeddingStrategy(CrawlStrategy):
|
||||||
"""Embedding-based adaptive crawling using semantic space coverage"""
|
"""Embedding-based adaptive crawling using semantic space coverage"""
|
||||||
|
|
||||||
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
|
def __init__(self, embedding_model: str = None, llm_config: Dict = None):
|
||||||
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
||||||
self.llm_config = llm_config
|
self.llm_config = llm_config
|
||||||
self._embedding_cache = {}
|
self._embedding_cache = {}
|
||||||
@@ -629,24 +605,14 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
self._kb_embeddings_hash = None # Track KB changes
|
self._kb_embeddings_hash = None # Track KB changes
|
||||||
self._validation_embeddings_cache = None # Cache validation query embeddings
|
self._validation_embeddings_cache = None # Cache validation query embeddings
|
||||||
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
||||||
|
|
||||||
def _get_embedding_llm_config_dict(self) -> Dict:
|
|
||||||
"""Get embedding LLM config as dict with fallback to default."""
|
|
||||||
if hasattr(self, 'config') and self.config:
|
|
||||||
config_dict = self.config._embedding_llm_config_dict
|
|
||||||
if config_dict:
|
|
||||||
return config_dict
|
|
||||||
|
|
||||||
# Fallback to default if no config provided
|
|
||||||
return {
|
|
||||||
'provider': 'openai/text-embedding-3-small',
|
|
||||||
'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
}
|
|
||||||
|
|
||||||
async def _get_embeddings(self, texts: List[str]) -> Any:
|
async def _get_embeddings(self, texts: List[str]) -> Any:
|
||||||
"""Get embeddings using configured method"""
|
"""Get embeddings using configured method"""
|
||||||
from .utils import get_text_embeddings
|
from .utils import get_text_embeddings
|
||||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
embedding_llm_config = {
|
||||||
|
'provider': 'openai/text-embedding-3-small',
|
||||||
|
'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
}
|
||||||
return await get_text_embeddings(
|
return await get_text_embeddings(
|
||||||
texts,
|
texts,
|
||||||
embedding_llm_config,
|
embedding_llm_config,
|
||||||
@@ -713,20 +679,8 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
Return as a JSON array of strings."""
|
Return as a JSON array of strings."""
|
||||||
|
|
||||||
# Use the LLM for query generation
|
# Use the LLM for query generation
|
||||||
# Convert LLMConfig to dict if needed
|
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
|
||||||
llm_config_dict = None
|
api_token = self.llm_config.get('api_token') if self.llm_config else None
|
||||||
if self.llm_config:
|
|
||||||
if isinstance(self.llm_config, dict):
|
|
||||||
llm_config_dict = self.llm_config
|
|
||||||
else:
|
|
||||||
# Convert LLMConfig object to dict
|
|
||||||
llm_config_dict = {
|
|
||||||
'provider': self.llm_config.provider,
|
|
||||||
'api_token': self.llm_config.api_token
|
|
||||||
}
|
|
||||||
|
|
||||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
|
||||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
|
||||||
|
|
||||||
# response = perform_completion_with_backoff(
|
# response = perform_completion_with_backoff(
|
||||||
# provider=provider,
|
# provider=provider,
|
||||||
@@ -889,7 +843,10 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
|
|
||||||
# Batch embed only uncached links
|
# Batch embed only uncached links
|
||||||
if texts_to_embed:
|
if texts_to_embed:
|
||||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
embedding_llm_config = {
|
||||||
|
'provider': 'openai/text-embedding-3-small',
|
||||||
|
'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
}
|
||||||
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
||||||
|
|
||||||
# Cache the new embeddings
|
# Cache the new embeddings
|
||||||
@@ -1227,7 +1184,10 @@ class EmbeddingStrategy(CrawlStrategy):
|
|||||||
return
|
return
|
||||||
|
|
||||||
# Get embeddings for new texts
|
# Get embeddings for new texts
|
||||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
embedding_llm_config = {
|
||||||
|
'provider': 'openai/text-embedding-3-small',
|
||||||
|
'api_token': os.getenv('OPENAI_API_KEY')
|
||||||
|
}
|
||||||
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
||||||
|
|
||||||
# Deduplicate embeddings before adding to KB
|
# Deduplicate embeddings before adding to KB
|
||||||
@@ -1296,12 +1256,10 @@ class AdaptiveCrawler:
|
|||||||
if strategy_name == "statistical":
|
if strategy_name == "statistical":
|
||||||
return StatisticalStrategy()
|
return StatisticalStrategy()
|
||||||
elif strategy_name == "embedding":
|
elif strategy_name == "embedding":
|
||||||
strategy = EmbeddingStrategy(
|
return EmbeddingStrategy(
|
||||||
embedding_model=self.config.embedding_model,
|
embedding_model=self.config.embedding_model,
|
||||||
llm_config=self.config.embedding_llm_config
|
llm_config=self.config.embedding_llm_config
|
||||||
)
|
)
|
||||||
strategy.config = self.config # Pass config to strategy
|
|
||||||
return strategy
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown strategy: {strategy_name}")
|
raise ValueError(f"Unknown strategy: {strategy_name}")
|
||||||
|
|
||||||
|
|||||||
@@ -28,43 +28,25 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
|||||||
signing_key = get_jwk_from_secret(SECRET_KEY)
|
signing_key = get_jwk_from_secret(SECRET_KEY)
|
||||||
return instance.encode(to_encode, signing_key, alg='HS256')
|
return instance.encode(to_encode, signing_key, alg='HS256')
|
||||||
|
|
||||||
def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
|
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||||
"""Verify the JWT token from the Authorization header."""
|
"""Verify the JWT token from the Authorization header."""
|
||||||
|
|
||||||
if not credentials or not credentials.credentials:
|
if credentials is None:
|
||||||
raise HTTPException(
|
return None
|
||||||
status_code=401,
|
|
||||||
detail="No token provided",
|
|
||||||
headers={"WWW-Authenticate": "Bearer"}
|
|
||||||
)
|
|
||||||
|
|
||||||
token = credentials.credentials
|
token = credentials.credentials
|
||||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||||
try:
|
try:
|
||||||
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
||||||
return payload
|
return payload
|
||||||
except Exception as e:
|
except Exception:
|
||||||
raise HTTPException(
|
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||||
status_code=401,
|
|
||||||
detail=f"Invalid or expired token: {str(e)}",
|
|
||||||
headers={"WWW-Authenticate": "Bearer"}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_token_dependency(config: Dict):
|
def get_token_dependency(config: Dict):
|
||||||
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||||
|
|
||||||
if config.get("security", {}).get("jwt_enabled", False):
|
if config.get("security", {}).get("jwt_enabled", False):
|
||||||
def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
return verify_token
|
||||||
"""Enforce JWT authentication when enabled."""
|
|
||||||
if credentials is None:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=401,
|
|
||||||
detail="Authentication required. Please provide a valid Bearer token.",
|
|
||||||
headers={"WWW-Authenticate": "Bearer"}
|
|
||||||
)
|
|
||||||
return verify_token(credentials)
|
|
||||||
return jwt_required
|
|
||||||
else:
|
else:
|
||||||
return lambda: None
|
return lambda: None
|
||||||
|
|
||||||
|
|||||||
@@ -38,8 +38,8 @@ rate_limiting:
|
|||||||
|
|
||||||
# Security Configuration
|
# Security Configuration
|
||||||
security:
|
security:
|
||||||
enabled: false
|
enabled: false
|
||||||
jwt_enabled: false
|
jwt_enabled: false
|
||||||
https_redirect: false
|
https_redirect: false
|
||||||
trusted_hosts: ["*"]
|
trusted_hosts: ["*"]
|
||||||
headers:
|
headers:
|
||||||
|
|||||||
@@ -482,9 +482,14 @@ async def crawl(
|
|||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Crawl a list of URLs and return the results as JSON.
|
Crawl a list of URLs and return the results as JSON.
|
||||||
|
For streaming responses, use /crawl/stream endpoint.
|
||||||
"""
|
"""
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
|
# Check whether it is a redirection for a streaming request
|
||||||
|
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
|
||||||
|
if crawler_config.stream:
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
results = await handle_crawl_request(
|
results = await handle_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
@@ -506,12 +511,16 @@ async def crawl_stream(
|
|||||||
):
|
):
|
||||||
if not crawl_request.urls:
|
if not crawl_request.urls:
|
||||||
raise HTTPException(400, "At least one URL required")
|
raise HTTPException(400, "At least one URL required")
|
||||||
|
|
||||||
|
return await stream_process(crawl_request=crawl_request)
|
||||||
|
|
||||||
|
async def stream_process(crawl_request: CrawlRequest):
|
||||||
crawler, gen = await handle_stream_crawl_request(
|
crawler, gen = await handle_stream_crawl_request(
|
||||||
urls=crawl_request.urls,
|
urls=crawl_request.urls,
|
||||||
browser_config=crawl_request.browser_config,
|
browser_config=crawl_request.browser_config,
|
||||||
crawler_config=crawl_request.crawler_config,
|
crawler_config=crawl_request.crawler_config,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
stream_results(crawler, gen),
|
stream_results(crawler, gen),
|
||||||
media_type="application/x-ndjson",
|
media_type="application/x-ndjson",
|
||||||
|
|||||||
@@ -371,7 +371,7 @@
|
|||||||
|
|
||||||
<div class="flex items-center">
|
<div class="flex items-center">
|
||||||
<input id="st-stream" type="checkbox" class="mr-2">
|
<input id="st-stream" type="checkbox" class="mr-2">
|
||||||
<label for="st-stream" class="text-sm">Use /crawl/stream</label>
|
<label for="st-stream" class="text-sm">Enable streaming mode</label>
|
||||||
<button id="st-run"
|
<button id="st-run"
|
||||||
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
class="ml-auto bg-accent text-dark px-4 py-2 rounded hover:bg-opacity-90 font-medium">
|
||||||
Run Stress Test
|
Run Stress Test
|
||||||
@@ -596,6 +596,14 @@
|
|||||||
forceHighlightElement(curlCodeEl);
|
forceHighlightElement(curlCodeEl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Detect if stream is requested inside payload
|
||||||
|
function shouldUseStream(payload) {
|
||||||
|
const toBool = (v) => v === true || (typeof v === 'string' && v.toLowerCase() === 'true');
|
||||||
|
const fromCrawler = payload && payload.crawler_config && payload.crawler_config.params && payload.crawler_config.params.stream;
|
||||||
|
const direct = payload && payload.stream;
|
||||||
|
return toBool(fromCrawler) || toBool(direct);
|
||||||
|
}
|
||||||
|
|
||||||
// Main run function
|
// Main run function
|
||||||
async function runCrawl() {
|
async function runCrawl() {
|
||||||
const endpoint = document.getElementById('endpoint').value;
|
const endpoint = document.getElementById('endpoint').value;
|
||||||
@@ -611,16 +619,24 @@
|
|||||||
: { browser_config: cfgJson };
|
: { browser_config: cfgJson };
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
updateStatus('error');
|
const codeText = cm.getValue();
|
||||||
document.querySelector('#response-content code').textContent =
|
const streamFlag = /stream\s*=\s*True/i.test(codeText);
|
||||||
JSON.stringify({ error: err.message }, null, 2);
|
const isCrawlEndpoint = document.getElementById('endpoint').value === 'crawl';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
if (isCrawlEndpoint && streamFlag) {
|
||||||
return; // stop run
|
// Fallback: proceed with minimal config only for stream
|
||||||
|
advConfig = { crawler_config: { stream: true } };
|
||||||
|
} else {
|
||||||
|
updateStatus('error');
|
||||||
|
document.querySelector('#response-content code').textContent =
|
||||||
|
JSON.stringify({ error: err.message }, null, 2);
|
||||||
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
|
return; // stop run
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const endpointMap = {
|
const endpointMap = {
|
||||||
crawl: '/crawl',
|
crawl: '/crawl',
|
||||||
// crawl_stream: '/crawl/stream',
|
crawl_stream: '/crawl/stream', // Keep for backward compatibility
|
||||||
md: '/md',
|
md: '/md',
|
||||||
llm: '/llm'
|
llm: '/llm'
|
||||||
};
|
};
|
||||||
@@ -647,7 +663,7 @@
|
|||||||
// This will be handled directly in the fetch below
|
// This will be handled directly in the fetch below
|
||||||
payload = null;
|
payload = null;
|
||||||
} else {
|
} else {
|
||||||
// Default payload for /crawl and /crawl/stream
|
// Default payload for /crawl (supports both streaming and batch modes)
|
||||||
payload = {
|
payload = {
|
||||||
urls,
|
urls,
|
||||||
...advConfig
|
...advConfig
|
||||||
@@ -659,6 +675,7 @@
|
|||||||
try {
|
try {
|
||||||
const startTime = performance.now();
|
const startTime = performance.now();
|
||||||
let response, responseData;
|
let response, responseData;
|
||||||
|
const useStreamOverride = (endpoint === 'crawl') && shouldUseStream(payload);
|
||||||
|
|
||||||
if (endpoint === 'llm') {
|
if (endpoint === 'llm') {
|
||||||
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
// Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query}
|
||||||
@@ -681,8 +698,8 @@
|
|||||||
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
|
||||||
document.querySelector('#response-content code').className = 'json hljs';
|
document.querySelector('#response-content code').className = 'json hljs';
|
||||||
forceHighlightElement(document.querySelector('#response-content code'));
|
forceHighlightElement(document.querySelector('#response-content code'));
|
||||||
} else if (endpoint === 'crawl_stream') {
|
} else if (endpoint === 'crawl_stream' || useStreamOverride) {
|
||||||
// Stream processing
|
// Stream processing - now handled directly by /crawl endpoint
|
||||||
response = await fetch(api, {
|
response = await fetch(api, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
@@ -757,6 +774,7 @@
|
|||||||
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
const question = document.getElementById('llm-question').value.trim() || "What is this page about?";
|
||||||
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET');
|
||||||
} else {
|
} else {
|
||||||
|
// Use the same API endpoint for both streaming and non-streaming
|
||||||
generateSnippets(api, payload);
|
generateSnippets(api, payload);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -786,7 +804,7 @@
|
|||||||
document.getElementById('stress-avg-time').textContent = '0';
|
document.getElementById('stress-avg-time').textContent = '0';
|
||||||
document.getElementById('stress-peak-mem').textContent = '0';
|
document.getElementById('stress-peak-mem').textContent = '0';
|
||||||
|
|
||||||
const api = useStream ? '/crawl/stream' : '/crawl';
|
const api = '/crawl'; // Always use /crawl - backend handles streaming internally
|
||||||
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
const urls = Array.from({ length: total }, (_, i) => `https://httpbin.org/anything/stress-${i}-${Date.now()}`);
|
||||||
const chunks = [];
|
const chunks = [];
|
||||||
|
|
||||||
|
|||||||
@@ -1,154 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
|
||||||
"""Test a specific configuration"""
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"Configuration: {name}")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config)
|
|
||||||
result = await adaptive.digest(start_url=url, query=query)
|
|
||||||
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("CRAWL STATISTICS")
|
|
||||||
print("="*50)
|
|
||||||
adaptive.print_stats(detailed=False)
|
|
||||||
|
|
||||||
# Get the most relevant content found
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("MOST RELEVANT PAGES")
|
|
||||||
print("="*50)
|
|
||||||
|
|
||||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
|
||||||
for i, page in enumerate(relevant_pages, 1):
|
|
||||||
print(f"\n{i}. {page['url']}")
|
|
||||||
print(f" Relevance Score: {page['score']:.2%}")
|
|
||||||
|
|
||||||
# Show a snippet of the content
|
|
||||||
content = page['content'] or ""
|
|
||||||
if content:
|
|
||||||
snippet = content[:200].replace('\n', ' ')
|
|
||||||
if len(content) > 200:
|
|
||||||
snippet += "..."
|
|
||||||
print(f" Preview: {snippet}")
|
|
||||||
|
|
||||||
print(f"\n{'='*50}")
|
|
||||||
print(f"Pages crawled: {len(result.crawled_urls)}")
|
|
||||||
print(f"Final confidence: {adaptive.confidence:.1%}")
|
|
||||||
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
|
||||||
|
|
||||||
if result.metrics.get('is_irrelevant', False):
|
|
||||||
print("⚠️ Query detected as irrelevant!")
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
async def llm_embedding():
|
|
||||||
"""Demonstrate various embedding configurations"""
|
|
||||||
|
|
||||||
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Base URL and query for testing
|
|
||||||
test_url = "https://docs.python.org/3/library/asyncio.html"
|
|
||||||
|
|
||||||
openai_llm_config = LLMConfig(
|
|
||||||
provider='openai/text-embedding-3-small',
|
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
|
||||||
temperature=0.7,
|
|
||||||
max_tokens=2000
|
|
||||||
)
|
|
||||||
config_openai = AdaptiveConfig(
|
|
||||||
strategy="embedding",
|
|
||||||
max_pages=10,
|
|
||||||
|
|
||||||
# Use OpenAI embeddings
|
|
||||||
embedding_llm_config=openai_llm_config,
|
|
||||||
# embedding_llm_config={
|
|
||||||
# 'provider': 'openai/text-embedding-3-small',
|
|
||||||
# 'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
# },
|
|
||||||
|
|
||||||
# OpenAI embeddings are high quality, can be stricter
|
|
||||||
embedding_k_exp=4.0,
|
|
||||||
n_query_variations=12
|
|
||||||
)
|
|
||||||
|
|
||||||
await test_configuration(
|
|
||||||
"OpenAI Embeddings",
|
|
||||||
config_openai,
|
|
||||||
test_url,
|
|
||||||
# "event-driven architecture patterns"
|
|
||||||
"async await context managers coroutines"
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def basic_adaptive_crawling():
|
|
||||||
"""Basic adaptive crawling example"""
|
|
||||||
|
|
||||||
# Initialize the crawler
|
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
||||||
# Create an adaptive crawler with default settings (statistical strategy)
|
|
||||||
adaptive = AdaptiveCrawler(crawler)
|
|
||||||
|
|
||||||
# Note: You can also use embedding strategy for semantic understanding:
|
|
||||||
# from crawl4ai import AdaptiveConfig
|
|
||||||
# config = AdaptiveConfig(strategy="embedding")
|
|
||||||
# adaptive = AdaptiveCrawler(crawler, config)
|
|
||||||
|
|
||||||
# Start adaptive crawling
|
|
||||||
print("Starting adaptive crawl for Python async programming information...")
|
|
||||||
result = await adaptive.digest(
|
|
||||||
start_url="https://docs.python.org/3/library/asyncio.html",
|
|
||||||
query="async await context managers coroutines"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Display crawl statistics
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("CRAWL STATISTICS")
|
|
||||||
print("="*50)
|
|
||||||
adaptive.print_stats(detailed=False)
|
|
||||||
|
|
||||||
# Get the most relevant content found
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("MOST RELEVANT PAGES")
|
|
||||||
print("="*50)
|
|
||||||
|
|
||||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
|
||||||
for i, page in enumerate(relevant_pages, 1):
|
|
||||||
print(f"\n{i}. {page['url']}")
|
|
||||||
print(f" Relevance Score: {page['score']:.2%}")
|
|
||||||
|
|
||||||
# Show a snippet of the content
|
|
||||||
content = page['content'] or ""
|
|
||||||
if content:
|
|
||||||
snippet = content[:200].replace('\n', ' ')
|
|
||||||
if len(content) > 200:
|
|
||||||
snippet += "..."
|
|
||||||
print(f" Preview: {snippet}")
|
|
||||||
|
|
||||||
# Show final confidence
|
|
||||||
print(f"\n{'='*50}")
|
|
||||||
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
|
||||||
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
|
||||||
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
|
||||||
|
|
||||||
|
|
||||||
if adaptive.confidence >= 0.8:
|
|
||||||
print("✓ High confidence - can answer detailed questions about async Python")
|
|
||||||
elif adaptive.confidence >= 0.6:
|
|
||||||
print("~ Moderate confidence - can answer basic questions")
|
|
||||||
else:
|
|
||||||
print("✗ Low confidence - need more information")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
asyncio.run(llm_embedding())
|
|
||||||
# asyncio.run(basic_adaptive_crawling())
|
|
||||||
@@ -108,19 +108,7 @@ config = AdaptiveConfig(
|
|||||||
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
||||||
)
|
)
|
||||||
|
|
||||||
# With custom LLM provider for query expansion (recommended)
|
# With custom embedding provider (e.g., OpenAI)
|
||||||
from crawl4ai import LLMConfig
|
|
||||||
|
|
||||||
config = AdaptiveConfig(
|
|
||||||
strategy="embedding",
|
|
||||||
embedding_llm_config=LLMConfig(
|
|
||||||
provider='openai/text-embedding-3-small',
|
|
||||||
api_token='your-api-key',
|
|
||||||
temperature=0.7
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Alternative: Dictionary format (backward compatible)
|
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
strategy="embedding",
|
strategy="embedding",
|
||||||
embedding_llm_config={
|
embedding_llm_config={
|
||||||
|
|||||||
@@ -1,154 +0,0 @@
|
|||||||
import asyncio
|
|
||||||
import os
|
|
||||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
|
||||||
|
|
||||||
|
|
||||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
|
||||||
"""Test a specific configuration"""
|
|
||||||
print(f"\n{'='*60}")
|
|
||||||
print(f"Configuration: {name}")
|
|
||||||
print(f"{'='*60}")
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config)
|
|
||||||
result = await adaptive.digest(start_url=url, query=query)
|
|
||||||
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("CRAWL STATISTICS")
|
|
||||||
print("="*50)
|
|
||||||
adaptive.print_stats(detailed=False)
|
|
||||||
|
|
||||||
# Get the most relevant content found
|
|
||||||
print("\n" + "="*50)
|
|
||||||
print("MOST RELEVANT PAGES")
|
|
||||||
print("="*50)
|
|
||||||
|
|
||||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
|
||||||
for i, page in enumerate(relevant_pages, 1):
|
|
||||||
print(f"\n{i}. {page['url']}")
|
|
||||||
print(f" Relevance Score: {page['score']:.2%}")
|
|
||||||
|
|
||||||
# Show a snippet of the content
|
|
||||||
content = page['content'] or ""
|
|
||||||
if content:
|
|
||||||
snippet = content[:200].replace('\n', ' ')
|
|
||||||
if len(content) > 200:
|
|
||||||
snippet += "..."
|
|
||||||
print(f" Preview: {snippet}")
|
|
||||||
|
|
||||||
print(f"\n{'='*50}")
|
|
||||||
print(f"Pages crawled: {len(result.crawled_urls)}")
|
|
||||||
print(f"Final confidence: {adaptive.confidence:.1%}")
|
|
||||||
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
|
||||||
|
|
||||||
if result.metrics.get('is_irrelevant', False):
|
|
||||||
print("⚠️ Query detected as irrelevant!")
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
async def llm_embedding():
|
|
||||||
"""Demonstrate various embedding configurations"""
|
|
||||||
|
|
||||||
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Base URL and query for testing
|
|
||||||
test_url = "https://docs.python.org/3/library/asyncio.html"
|
|
||||||
|
|
||||||
openai_llm_config = LLMConfig(
|
|
||||||
provider='openai/text-embedding-3-small',
|
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
|
||||||
temperature=0.7,
|
|
||||||
max_tokens=2000
|
|
||||||
)
|
|
||||||
config_openai = AdaptiveConfig(
|
|
||||||
strategy="embedding",
|
|
||||||
max_pages=10,
|
|
||||||
|
|
||||||
# Use OpenAI embeddings
|
|
||||||
embedding_llm_config=openai_llm_config,
|
|
||||||
# embedding_llm_config={
|
|
||||||
# 'provider': 'openai/text-embedding-3-small',
|
|
||||||
# 'api_token': os.getenv('OPENAI_API_KEY')
|
|
||||||
# },
|
|
||||||
|
|
||||||
# OpenAI embeddings are high quality, can be stricter
|
|
||||||
embedding_k_exp=4.0,
|
|
||||||
n_query_variations=12
|
|
||||||
)
|
|
||||||
|
|
||||||
await test_configuration(
|
|
||||||
"OpenAI Embeddings",
|
|
||||||
config_openai,
|
|
||||||
test_url,
|
|
||||||
# "event-driven architecture patterns"
|
|
||||||
"async await context managers coroutines"
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def basic_adaptive_crawling():
    """Run a basic adaptive crawl and print a report of what was found.

    Opens an ``AsyncWebCrawler``, wraps it in an ``AdaptiveCrawler`` created
    with default settings, digests the asyncio documentation page for a fixed
    query, and then prints: the crawl statistics, the top five relevant pages
    (URL, relevance score and a preview of up to 200 characters), the final
    confidence, the number of crawled URLs, the knowledge-base size, and a
    one-line verdict chosen from the confidence value.

    Returns:
        None.
    """
    rule = "=" * 50
    preview_limit = 200
    start_url = "https://docs.python.org/3/library/asyncio.html"
    query = "async await context managers coroutines"

    # Confidence floors checked from highest to lowest; the first one that
    # is met decides the verdict line.
    verdicts = (
        (0.8, "✓ High confidence - can answer detailed questions about async Python"),
        (0.6, "~ Moderate confidence - can answer basic questions"),
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        # Default settings (the original comment calls this the statistical
        # strategy).
        adaptive = AdaptiveCrawler(crawler)

        # Note from the original example: the embedding strategy can be
        # selected instead for semantic understanding:
        #   from crawl4ai import AdaptiveConfig
        #   config = AdaptiveConfig(strategy="embedding")
        #   adaptive = AdaptiveCrawler(crawler, config)

        print("Starting adaptive crawl for Python async programming information...")
        result = await adaptive.digest(start_url=start_url, query=query)

        # --- Crawl statistics ---
        print("\n" + rule)
        print("CRAWL STATISTICS")
        print(rule)
        adaptive.print_stats(detailed=False)

        # --- Most relevant pages ---
        print("\n" + rule)
        print("MOST RELEVANT PAGES")
        print(rule)

        top_pages = adaptive.get_relevant_content(top_k=5)
        for rank, page in enumerate(top_pages, 1):
            print(f"\n{rank}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            # Pages without content get no preview line.
            text = page['content'] or ""
            if not text:
                continue
            preview = text[:preview_limit].replace('\n', ' ')
            ellipsis = "..." if len(text) > preview_limit else ""
            print(f" Preview: {preview}{ellipsis}")

        # --- Final summary ---
        print(f"\n{rule}")
        print(f"Final Confidence: {adaptive.confidence:.2%}")
        print(f"Total Pages Crawled: {len(result.crawled_urls)}")
        print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")

        for floor, message in verdicts:
            if adaptive.confidence >= floor:
                print(message)
                break
        else:
            # No floor was met.
            print("✗ Low confidence - need more information")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Demo to execute when the file is run as a script. Swap in
    # ``basic_adaptive_crawling`` to run the basic example instead.
    selected_demo = llm_embedding
    asyncio.run(selected_demo())
|
|
||||||
@@ -143,7 +143,40 @@ class TestCrawlEndpoints:
|
|||||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||||
# It might be null, missing, or populated depending on the server's default behavior
|
# It might be null, missing, or populated depending on the server's default behavior
|
||||||
|
async def test_crawl_with_stream_direct(self, async_client: httpx.AsyncClient):
    """Check that POST /crawl streams directly when ``stream`` is true.

    Sends a single-URL payload whose crawler config sets ``stream`` to
    ``True`` and asserts that the response has status 200, an
    ``application/x-ndjson`` content type and an ``x-stream-status`` header
    of ``"active"``. The streamed body must yield exactly one successful
    result for the requested URL whose HTML contains the expected heading.
    """
    browser_config = {
        "type": "BrowserConfig",
        "params": {
            "headless": True,
        },
    }
    crawler_config = {
        "type": "CrawlerRunConfig",
        "params": {
            "stream": True,  # ask for direct streaming from /crawl
            "screenshot": False,
            "cache_mode": CacheMode.BYPASS.value,
        },
    }
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": browser_config,
        "crawler_config": crawler_config,
    }

    # The /crawl endpoint itself is expected to answer with the stream.
    async with async_client.stream("POST", "/crawl", json=payload) as resp:
        assert resp.status_code == 200
        assert resp.headers["content-type"] == "application/x-ndjson"
        assert resp.headers.get("x-stream-status") == "active"

        streamed = await process_streaming_response(resp)

    assert len(streamed) == 1
    first = streamed[0]
    await assert_crawl_result_structure(first)
    assert first["success"] is True
    assert first["url"] == SIMPLE_HTML_URL
    assert "<h1>Herman Melville - Moby-Dick</h1>" in first["html"]
|
||||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||||
"""Test /crawl/stream with a single URL and simple config values."""
|
"""Test /crawl/stream with a single URL and simple config values."""
|
||||||
payload = {
|
payload = {
|
||||||
|
|||||||
Reference in New Issue
Block a user