chore: Update configuration values for chunk token threshold, overlap rate, and minimum word threshold. Create a new example for LLMExtraction Strategy, and update the Dockerfile and README

This commit is contained in:
unclecode
2024-06-19 18:32:20 +08:00
parent 3f0e265baf
commit 539263a8ba
11 changed files with 212 additions and 130 deletions

View File

@@ -21,7 +21,9 @@ PROVIDER_MODELS = {
# Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 1000
CHUNK_TOKEN_THRESHOLD = 500
OVERLAP_RATE = 0.1
WORD_TOKEN_RATE = 1.3
# Threshold for the minimum number of words in an HTML tag to be considered
MIN_WORD_THRESHOLD = 5
MIN_WORD_THRESHOLD = 1