[v0.3.71] Enhance chunking strategies and improve overall performance

- Add OverlappingWindowChunking and improve SlidingWindowChunking
- Update CHUNK_TOKEN_THRESHOLD to 2048 tokens
- Optimize AsyncPlaywrightCrawlerStrategy close method
- Enhance flexibility in CosineStrategy with generic embedding model loading
- Improve JSON-based extraction strategies
- Add knowledge graph generation example
2024-10-19 18:36:59 +08:00
parent b309bc34e1
commit 4e2852d5ff
7 changed files with 118 additions and 18 deletions
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -21,7 +21,7 @@ PROVIDER_MODELS = {


 # Chunk token threshold
-CHUNK_TOKEN_THRESHOLD = 500
+CHUNK_TOKEN_THRESHOLD = 2 ** 11 # 2048 tokens
 OVERLAP_RATE = 0.1
 WORD_TOKEN_RATE = 1.3