[v0.3.71] Enhance chunking strategies and improve overall performance

- Add OverlappingWindowChunking and improve SlidingWindowChunking
- Update CHUNK_TOKEN_THRESHOLD to 2048 tokens
- Optimize AsyncPlaywrightCrawlerStrategy close method
- Enhance flexibility in CosineStrategy with generic embedding model loading
- Improve JSON-based extraction strategies
- Add knowledge graph generation example
2024-10-19 18:36:59 +08:00
parent b309bc34e1
commit 4e2852d5ff
7 changed files with 118 additions and 18 deletions
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -234,11 +234,12 @@ class CosineStrategy(ExtractionStrategy):
        """
        Initialize the strategy with clustering parameters.

-        :param semantic_filter: A keyword filter for document filtering.
-        :param word_count_threshold: Minimum number of words per cluster.
-        :param max_dist: The maximum cophenetic distance on the dendrogram to form clusters.
-        :param linkage_method: The linkage method for hierarchical clustering.
-        :param top_k: Number of top categories to extract.
+        Args:
+            semantic_filter (str): A keyword filter for document filtering.
+            word_count_threshold (int): Minimum number of words per cluster.
+            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
+            linkage_method (str): The linkage method for hierarchical clustering.
+            top_k (int): Number of top categories to extract.
        """
        super().__init__()
        
@@ -257,8 +258,8 @@ class CosineStrategy(ExtractionStrategy):
        self.get_embedding_method = "direct"
        
        self.device = get_device()
-        import torch
-        self.device = torch.device('cpu')
+        # import torch
+        # self.device = torch.device('cpu')
        
        self.default_batch_size = calculate_batch_size(self.device)

@@ -271,7 +272,7 @@ class CosineStrategy(ExtractionStrategy):
        #     self.get_embedding_method = "direct"
        # else:

-        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.tokenizer, self.model = load_HF_embedding_model(model_name)
        self.model.to(self.device)
        self.model.eval()  
        
@@ -738,7 +739,6 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
    
-
 class JsonXPATHExtractionStrategy(ExtractionStrategy):
    def __init__(self, schema: Dict[str, Any], **kwargs):
        super().__init__(**kwargs)