- Add OverlappingWindowChunking and improve SlidingWindowChunking - Update CHUNK_TOKEN_THRESHOLD to 2048 tokens - Optimize AsyncPlaywrightCrawlerStrategy close method - Enhance flexibility in CosineStrategy with generic embedding model loading - Improve JSON-based extraction strategies - Add knowledge graph generation example
162 lines
5.4 KiB
Python
162 lines
5.4 KiB
Python
from abc import ABC, abstractmethod
|
|
import re
|
|
from collections import Counter
|
|
import string
|
|
from .model_loader import load_nltk_punkt
|
|
from .utils import *
|
|
|
|
# Define the abstract base class for chunking strategies
|
|
class ChunkingStrategy(ABC):
    """Common interface shared by every text-chunking strategy.

    Concrete strategies subclass this and implement :meth:`chunk`.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """Split ``text`` into a list of chunks.

        Args:
            text: The input text to split.

        Returns:
            The chunks produced by the concrete strategy. This base
            implementation does nothing and returns ``None``.
        """
|
|
|
|
# Regex-based chunking
|
|
class RegexChunking(ChunkingStrategy):
    """Split text by applying a sequence of regular expressions in order."""

    def __init__(self, patterns=None, **kwargs):
        """Store the split patterns.

        Args:
            patterns: Regular expressions applied one after another. When
                ``None``, a single blank-line pattern (``\\n\\n``) is used.
            **kwargs: Accepted and ignored.
        """
        self.patterns = [r'\n\n'] if patterns is None else patterns

    def chunk(self, text: str) -> list:
        """Split ``text`` on every configured pattern, one pattern at a time.

        Each pattern is applied to all pieces produced by the previous one.

        Args:
            text: The text to split.

        Returns:
            The pieces left after the final pattern, in document order.
            Pieces are returned exactly as ``re.split`` yields them, so
            empty strings are not filtered out.
        """
        pieces = [text]
        for splitter in self.patterns:
            # Flatten the result of splitting every current piece.
            pieces = [
                part
                for piece in pieces
                for part in re.split(splitter, piece)
            ]
        return pieces
|
|
|
|
# NLP-based sentence chunking
|
|
class NlpSentenceChunking(ChunkingStrategy):
    """Split text into sentences with NLTK's ``sent_tokenize``."""

    def __init__(self, **kwargs):
        """Prepare the tokenizer resources.

        Args:
            **kwargs: Accepted and ignored.
        """
        # Presumably makes sure the punkt tokenizer data is available;
        # see load_nltk_punkt in model_loader for the exact behaviour.
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        """Return the unique sentences of ``text`` in document order.

        Args:
            text: The text to split into sentences.

        Returns:
            Whitespace-stripped sentences. Duplicates are dropped, keeping
            the first occurrence of each sentence.
        """
        # Imported lazily so nltk is only required when this strategy is used.
        from nltk.tokenize import sent_tokenize

        stripped = (sentence.strip() for sentence in sent_tokenize(text))
        # dict.fromkeys de-duplicates while preserving first-seen order; a
        # plain set() would shuffle the sentences unpredictably.
        return list(dict.fromkeys(stripped))
|
|
|
|
# Topic-based segmentation using TextTiling
|
|
class TopicSegmentationChunking(ChunkingStrategy):
    """Segment text into topical sections with NLTK's TextTilingTokenizer."""

    def __init__(self, num_keywords=3, **kwargs):
        """Create the tokenizer and store the keyword count.

        Args:
            num_keywords: Number of most frequent words reported per segment
                by :meth:`extract_keywords`.
            **kwargs: Accepted and ignored.
        """
        import nltk as nl

        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        """Segment ``text`` with the TextTiling tokenizer.

        Args:
            text: The text to segment.

        Returns:
            The segments produced by ``TextTilingTokenizer.tokenize``.
        """
        return self.tokenizer.tokenize(text)

    def extract_keywords(self, text: str) -> list:
        """Return the most frequent non-stop-word tokens of ``text``.

        Args:
            text: The text to analyse.

        Returns:
            Up to ``num_keywords`` lower-cased tokens, most frequent first.
        """
        import nltk as nl

        # Build the stop-word set once instead of re-reading the corpus list
        # for every token.
        stop_words = set(nl.corpus.stopwords.words('english'))

        # Lower-case before filtering so capitalised stop words ("The") are
        # removed as well.
        lowered = (token.lower() for token in nl.tokenize.word_tokenize(text))
        tokens = [
            token
            for token in lowered
            if token not in stop_words and token not in string.punctuation
        ]

        # Rank the remaining tokens by frequency.
        frequencies = Counter(tokens)
        return [word for word, _ in frequencies.most_common(self.num_keywords)]

    def chunk_with_topics(self, text: str) -> list:
        """Segment ``text`` and attach keywords to each segment.

        Args:
            text: The text to segment.

        Returns:
            A list of ``(segment, keywords)`` tuples, one per segment.
        """
        return [
            (segment, self.extract_keywords(segment))
            for segment in self.chunk(text)
        ]
|
|
|
|
# Fixed-length word chunks
|
|
class FixedLengthWordChunking(ChunkingStrategy):
    """Cut text into consecutive, non-overlapping runs of words."""

    def __init__(self, chunk_size=100, **kwargs):
        """Set up the strategy.

        Args:
            chunk_size (int): Number of words per chunk.
            **kwargs: Accepted and ignored.
        """
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        """Group the whitespace-separated words of ``text`` into chunks.

        Args:
            text: The text to split.

        Returns:
            Chunks of ``chunk_size`` words joined by single spaces; the last
            chunk may be shorter. Text without any words yields an empty list.
        """
        tokens = text.split()
        size = self.chunk_size

        pieces = []
        for offset in range(0, len(tokens), size):
            pieces.append(' '.join(tokens[offset:offset + size]))
        return pieces
|
|
|
|
# Sliding window chunking
|
|
class SlidingWindowChunking(ChunkingStrategy):
    """Produce word windows of fixed size that advance by a fixed step."""

    def __init__(self, window_size=100, step=50, **kwargs):
        """Set up the strategy.

        Args:
            window_size (int): Number of words in each window.
            step (int): Number of words the window advances each time.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        """Slide a window over the words of ``text``.

        Args:
            text: The text to split.

        Returns:
            ``[text]`` unchanged when it holds no more than ``window_size``
            words. Otherwise the space-joined windows, followed by one
            extra window holding the final ``window_size`` words whenever
            the stepped windows stop short of the end of the text.
        """
        tokens = text.split()
        total = len(tokens)
        size = self.window_size

        # Short input: nothing to slide over.
        if total <= size:
            return [text]

        windows = []
        for offset in range(0, total - size + 1, self.step):
            windows.append(' '.join(tokens[offset:offset + size]))

        # The stepped windows may not reach the tail; cover it explicitly.
        if offset + size < total:
            windows.append(' '.join(tokens[-size:]))

        return windows
|
|
|
|
|
|
class OverlappingWindowChunking(ChunkingStrategy):
    """Produce word windows where consecutive windows share some words."""

    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """Initialize the overlapping window chunking strategy.

        Args:
            window_size (int): The size of the window in words.
            overlap (int): The number of words shared by consecutive chunks.
                Must be smaller than ``window_size`` for texts longer than
                one window.
            **kwargs: Accepted and ignored.
        """
        self.window_size = window_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        """Split ``text`` into overlapping windows of words.

        Args:
            text: The text to split.

        Returns:
            ``[text]`` unchanged when it holds no more than ``window_size``
            words; otherwise the space-joined windows, each starting
            ``window_size - overlap`` words after the previous one. The last
            window may be shorter than ``window_size``.

        Raises:
            ValueError: If the text needs more than one window and
                ``overlap`` is not smaller than ``window_size``.
        """
        words = text.split()
        total = len(words)

        if total <= self.window_size:
            return [text]

        # Each window must start strictly after the previous one; otherwise
        # the loop below would never reach the end of the text.
        stride = self.window_size - self.overlap
        if stride <= 0:
            raise ValueError(
                "overlap must be smaller than window_size "
                f"(got overlap={self.overlap}, window_size={self.window_size})"
            )

        chunks = []
        start = 0
        while start < total:
            end = start + self.window_size
            chunks.append(' '.join(words[start:end]))

            # Stop as soon as a window reaches the end of the text.
            if end >= total:
                break

            start += stride

        return chunks