crawl4ai/crawl4ai/chunking_strategy.py
unclecode 82706129f5 Update:
- Text Categorization
- Crawler, Extraction, and Chunking strategies
- Clustering for semantic segmentation
2024-05-12 22:37:21 +08:00


from abc import ABC, abstractmethod
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TextTilingTokenizer
from collections import Counter
import string

# NOTE: the NLTK tokenizers above require the 'punkt' and 'stopwords' data
# packages (e.g. nltk.download('punkt'); nltk.download('stopwords')).


# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.
        """
        pass


# Regex-based chunking
class RegexChunking(ChunkingStrategy):
    def __init__(self, patterns=None):
        if patterns is None:
            patterns = [r'\n\n']  # Default split pattern
        self.patterns = patterns

    def chunk(self, text: str) -> list:
        # Apply each pattern in turn, re-splitting the pieces produced so far
        paragraphs = [text]
        for pattern in self.patterns:
            new_paragraphs = []
            for paragraph in paragraphs:
                new_paragraphs.extend(re.split(pattern, paragraph))
            paragraphs = new_paragraphs
        return paragraphs
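
# Illustrative usage (an added sketch, not part of the original file):
if __name__ == "__main__":
    chunker = RegexChunking()  # default pattern splits on blank lines
    print(chunker.chunk("First paragraph.\n\nSecond paragraph."))
    # -> ['First paragraph.', 'Second paragraph.']
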
# NLP-based sentence chunking using spaCy
class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, model='en_core_web_sm'):
        # Load the spaCy pipeline that provides sentence segmentation
        self.nlp = spacy.load(model)

    def chunk(self, text: str) -> list:
        doc = self.nlp(text)
        return [sent.text.strip() for sent in doc.sents]
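
# Illustrative usage (an added sketch; assumes the en_core_web_sm model has
# been installed, e.g. via `python -m spacy download en_core_web_sm`):
if __name__ == "__main__":
    chunker = NlpSentenceChunking()
    print(chunker.chunk("Crawlers fetch pages. Extractors parse them."))
    # e.g. ['Crawlers fetch pages.', 'Extractors parse them.']
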
# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):
    def __init__(self, num_keywords=3):
        self.tokenizer = TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        # Use the TextTilingTokenizer to segment the text
        segmented_topics = self.tokenizer.tokenize(text)
        return segmented_topics

    def extract_keywords(self, text: str) -> list:
        # Tokenize, lowercase, and remove stopwords and punctuation
        # (lowercasing before the stopword check, since the NLTK stopword
        # list is lowercase and would otherwise miss capitalized tokens)
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        tokens = [
            token.lower()
            for token in tokens
            if token.lower() not in stop_words and token not in string.punctuation
        ]
        # Calculate the frequency distribution and keep the most common words
        freq_dist = Counter(tokens)
        keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
        return keywords

    def chunk_with_topics(self, text: str) -> list:
        # Segment the text into topics
        segments = self.chunk(text)
        # Extract keywords for each topic segment
        segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
        return segments_with_topics
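
# Illustrative usage (an added sketch; TextTiling itself needs a long,
# multi-paragraph input, so only the keyword helper is shown here, and it
# assumes the NLTK 'punkt' and 'stopwords' data are available):
if __name__ == "__main__":
    chunker = TopicSegmentationChunking(num_keywords=2)
    print(chunker.extract_keywords("Web crawling and web scraping gather web data."))
    # -> ['web', 'crawling']
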
# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
    def __init__(self, chunk_size=100):
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        # Split into consecutive, non-overlapping chunks of chunk_size words
        words = text.split()
        return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
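
# Illustrative usage (an added sketch, not part of the original file):
if __name__ == "__main__":
    chunker = FixedLengthWordChunking(chunk_size=3)
    print(chunker.chunk("one two three four five"))
    # -> ['one two three', 'four five']
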
# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
    def __init__(self, window_size=100, step=50):
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        # Slide a window of window_size words across the text, advancing by step
        words = text.split()
        chunks = []
        for i in range(0, len(words), self.step):
            chunks.append(' '.join(words[i:i + self.window_size]))
        return chunks
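
# Illustrative usage (an added sketch; note that windows near the end of the
# word list can be shorter than window_size, since the loop runs to the end):
if __name__ == "__main__":
    chunker = SlidingWindowChunking(window_size=4, step=2)
    print(chunker.chunk("a b c d e f"))
    # -> ['a b c d', 'c d e f', 'e f']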