Update:

- Text Categorization - Crawler, Extraction, and Chunking strategies - Clustering for semantic segmentation
2024-05-12 22:37:21 +08:00
parent 7039e3c1ee
commit 82706129f5
19 changed files with 84568 additions and 102 deletions
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -0,0 +1,95 @@
+from abc import ABC, abstractmethod
+import re
+import spacy
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize, TextTilingTokenizer
+from collections import Counter
+import string
+
+# Define the abstract base class for chunking strategies
+class ChunkingStrategy(ABC):
+    
+    @abstractmethod
+    def chunk(self, text: str) -> list:
+        """
+        Abstract method to chunk the given text.
+        """
+        pass
+    
+# Regex-based chunking
+class RegexChunking(ChunkingStrategy):
+    def __init__(self, patterns=None):
+        if patterns is None:
+            patterns = [r'\n\n']  # Default split pattern
+        self.patterns = patterns
+
+    def chunk(self, text: str) -> list:
+        paragraphs = [text]
+        for pattern in self.patterns:
+            new_paragraphs = []
+            for paragraph in paragraphs:
+                new_paragraphs.extend(re.split(pattern, paragraph))
+            paragraphs = new_paragraphs
+        return paragraphs
+    
+# NLP-based sentence chunking using spaCy
+class NlpSentenceChunking(ChunkingStrategy):
+    def __init__(self, model='en_core_web_sm'):
+        self.nlp = spacy.load(model)
+
+    def chunk(self, text: str) -> list:
+        doc = self.nlp(text)
+        return [sent.text.strip() for sent in doc.sents]
+    
+# Topic-based segmentation using TextTiling
+class TopicSegmentationChunking(ChunkingStrategy):
+    def __init__(self, num_keywords=3):
+        self.tokenizer = TextTilingTokenizer()
+        self.num_keywords = num_keywords
+
+    def chunk(self, text: str) -> list:
+        # Use the TextTilingTokenizer to segment the text
+        segmented_topics = self.tokenizer.tokenize(text)
+        return segmented_topics
+
+    def extract_keywords(self, text: str) -> list:
+        # Tokenize and remove stopwords and punctuation
+        tokens = word_tokenize(text)
+        tokens = [token.lower() for token in tokens if token not in stopwords.words('english') and token not in string.punctuation]
+
+        # Calculate frequency distribution
+        freq_dist = Counter(tokens)
+        keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
+        return keywords
+
+    def chunk_with_topics(self, text: str) -> list:
+        # Segment the text into topics
+        segments = self.chunk(text)
+        # Extract keywords for each topic segment
+        segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
+        return segments_with_topics
+    
+# Fixed-length word chunks
+class FixedLengthWordChunking(ChunkingStrategy):
+    def __init__(self, chunk_size=100):
+        self.chunk_size = chunk_size
+
+    def chunk(self, text: str) -> list:
+        words = text.split()
+        return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
+    
+# Sliding window chunking
+class SlidingWindowChunking(ChunkingStrategy):
+    def __init__(self, window_size=100, step=50):
+        self.window_size = window_size
+        self.step = step
+
+    def chunk(self, text: str) -> list:
+        words = text.split()
+        chunks = []
+        for i in range(0, len(words), self.step):
+            chunks.append(' '.join(words[i:i + self.window_size]))
+        return chunks
+    
+