[v0.3.71] Enhance chunking strategies and improve overall performance

- Add OverlappingWindowChunking and improve SlidingWindowChunking
- Update CHUNK_TOKEN_THRESHOLD to 2048 tokens
- Optimize AsyncPlaywrightCrawlerStrategy close method
- Enhance flexibility in CosineStrategy with generic embedding model loading
- Improve JSON-based extraction strategies
- Add knowledge graph generation example
This commit is contained in:
UncleCode
2024-10-19 18:36:59 +08:00
parent b309bc34e1
commit 4e2852d5ff
7 changed files with 118 additions and 18 deletions

View File

@@ -234,11 +234,12 @@ class CosineStrategy(ExtractionStrategy):
"""
Initialize the strategy with clustering parameters.
:param semantic_filter: A keyword filter for document filtering.
:param word_count_threshold: Minimum number of words per cluster.
:param max_dist: The maximum cophenetic distance on the dendrogram to form clusters.
:param linkage_method: The linkage method for hierarchical clustering.
:param top_k: Number of top categories to extract.
Args:
semantic_filter (str): A keyword filter for document filtering.
word_count_threshold (int): Minimum number of words per cluster.
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
linkage_method (str): The linkage method for hierarchical clustering.
top_k (int): Number of top categories to extract.
"""
super().__init__()
@@ -257,8 +258,8 @@ class CosineStrategy(ExtractionStrategy):
self.get_embedding_method = "direct"
self.device = get_device()
import torch
self.device = torch.device('cpu')
# import torch
# self.device = torch.device('cpu')
self.default_batch_size = calculate_batch_size(self.device)
@@ -271,7 +272,7 @@ class CosineStrategy(ExtractionStrategy):
# self.get_embedding_method = "direct"
# else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.tokenizer, self.model = load_HF_embedding_model(model_name)
self.model.to(self.device)
self.model.eval()
@@ -738,7 +739,6 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
combined_html = self.DEL.join(sections)
return self.extract(url, combined_html, **kwargs)
class JsonXPATHExtractionStrategy(ExtractionStrategy):
def __init__(self, schema: Dict[str, Any], **kwargs):
super().__init__(**kwargs)