[v0.3.71] Enhance chunking strategies and improve overall performance

- Add OverlappingWindowChunking and improve SlidingWindowChunking
- Update CHUNK_TOKEN_THRESHOLD to 2048 tokens
- Optimize AsyncPlaywrightCrawlerStrategy close method
- Enhance flexibility in CosineStrategy with generic embedding model loading
- Improve JSON-based extraction strategies
- Add knowledge graph generation example
Author: UncleCode
Date:   2024-10-19 18:36:59 +08:00
parent  b309bc34e1
commit  4e2852d5ff

7 changed files with 118 additions and 18 deletions

.gitignore (vendored)
@@ -206,3 +206,4 @@ git_issues.py
 git_issues.md
 .tests/
+.issues/

@@ -134,7 +134,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
     async def close(self):
         if self.sleep_on_close:
-            await asyncio.sleep(500)
+            await asyncio.sleep(0.5)
         if self.browser:
             await self.browser.close()
             self.browser = None
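Note: asyncio.sleep takes seconds, so the old call paused shutdown for over eight minutes whenever sleep_on_close was set; the new value waits half a second. A minimal sketch of exercising the flag (the crawler_strategy parameter and the import path are assumptions based on the surrounding codebase, not shown in this diff):

import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

async def demo():
    # With the fix, close() sleeps 0.5 s (not 500 s) before shutting the browser down.
    strategy = AsyncPlaywrightCrawlerStrategy(sleep_on_close=True)
    async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
        result = await crawler.arun(url="https://example.com")
        print(len(result.html))

asyncio.run(demo())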

@@ -84,6 +84,12 @@ class TopicSegmentationChunking(ChunkingStrategy):
 # Fixed-length word chunks
 class FixedLengthWordChunking(ChunkingStrategy):
     def __init__(self, chunk_size=100, **kwargs):
+        """
+        Initialize the fixed-length word chunking strategy with the given chunk size.
+
+        Args:
+            chunk_size (int): The size of each chunk in words.
+        """
         self.chunk_size = chunk_size

     def chunk(self, text: str) -> list:
@@ -93,14 +99,64 @@ class FixedLengthWordChunking(ChunkingStrategy):
 # Sliding window chunking
 class SlidingWindowChunking(ChunkingStrategy):
     def __init__(self, window_size=100, step=50, **kwargs):
+        """
+        Initialize the sliding window chunking strategy with the given window size and
+        step size.
+
+        Args:
+            window_size (int): The size of the sliding window in words.
+            step (int): The step size for sliding the window in words.
+        """
         self.window_size = window_size
         self.step = step

     def chunk(self, text: str) -> list:
         words = text.split()
         chunks = []
-        for i in range(0, len(words), self.step):
-            chunks.append(' '.join(words[i:i + self.window_size]))
+
+        if len(words) <= self.window_size:
+            return [text]
+
+        for i in range(0, len(words) - self.window_size + 1, self.step):
+            chunk = ' '.join(words[i:i + self.window_size])
+            chunks.append(chunk)
+
+        # Handle the last chunk if it doesn't align perfectly
+        if i + self.window_size < len(words):
+            chunks.append(' '.join(words[-self.window_size:]))
+
         return chunks
+
+class OverlappingWindowChunking(ChunkingStrategy):
+    def __init__(self, window_size=1000, overlap=100, **kwargs):
+        """
+        Initialize the overlapping window chunking strategy with the given window size and
+        overlap size.
+
+        Args:
+            window_size (int): The size of the window in words.
+            overlap (int): The size of the overlap between consecutive chunks in words.
+        """
+        self.window_size = window_size
+        self.overlap = overlap
+
+    def chunk(self, text: str) -> list:
+        words = text.split()
+        chunks = []
+
+        if len(words) <= self.window_size:
+            return [text]
+
+        start = 0
+        while start < len(words):
+            end = start + self.window_size
+            chunk = ' '.join(words[start:end])
+            chunks.append(chunk)
+
+            if end >= len(words):
+                break
+
+            start = end - self.overlap
+
+        return chunks
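Note: the two strategies parameterize overlap differently. SlidingWindowChunking advances by a fixed step (so the overlap is window_size - step), while OverlappingWindowChunking specifies the overlap directly and uses a much larger default window (1000 words vs. 100). Both now return the text as a single chunk when it fits in one window. A quick sketch of the chunk() implementations added above (small numbers chosen for illustration; the import path assumes the library's chunking_strategy module):

from crawl4ai.chunking_strategy import SlidingWindowChunking, OverlappingWindowChunking

text = " ".join(f"w{i}" for i in range(23))  # 23 dummy words

slider = SlidingWindowChunking(window_size=10, step=5)
# Windows start at words 0, 5, and 10; since the last window ends before
# word 22, the final if-branch appends the last 10 words as a tail chunk.
print(slider.chunk(text))

overlapper = OverlappingWindowChunking(window_size=10, overlap=3)
# Each chunk re-reads the last 3 words of the previous one:
# words 0-9, then 7-16, then 14-22.
print(overlapper.chunk(text))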

@@ -21,7 +21,7 @@ PROVIDER_MODELS = {

 # Chunk token threshold
-CHUNK_TOKEN_THRESHOLD = 500
+CHUNK_TOKEN_THRESHOLD = 2 ** 11  # 2048 tokens
 OVERLAP_RATE = 0.1
 WORD_TOKEN_RATE = 1.3
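Note: 2 ** 11 = 2048, roughly a 4x increase over the old 500-token threshold, so extraction works on fewer, larger chunks. A rough sketch of how these constants relate, assuming WORD_TOKEN_RATE is the estimated number of tokens per word (an inference from the name, not spelled out in this diff):

CHUNK_TOKEN_THRESHOLD = 2 ** 11  # 2048 tokens
WORD_TOKEN_RATE = 1.3
OVERLAP_RATE = 0.1

max_words_per_chunk = int(CHUNK_TOKEN_THRESHOLD / WORD_TOKEN_RATE)  # 1575 words
overlap_words = int(max_words_per_chunk * OVERLAP_RATE)             # 157 words
print(max_words_per_chunk, overlap_words)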

@@ -234,11 +234,12 @@ class CosineStrategy(ExtractionStrategy):
         """
         Initialize the strategy with clustering parameters.
-        :param semantic_filter: A keyword filter for document filtering.
-        :param word_count_threshold: Minimum number of words per cluster.
-        :param max_dist: The maximum cophenetic distance on the dendrogram to form clusters.
-        :param linkage_method: The linkage method for hierarchical clustering.
-        :param top_k: Number of top categories to extract.
+
+        Args:
+            semantic_filter (str): A keyword filter for document filtering.
+            word_count_threshold (int): Minimum number of words per cluster.
+            max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
+            linkage_method (str): The linkage method for hierarchical clustering.
+            top_k (int): Number of top categories to extract.
         """
         super().__init__()
@@ -257,8 +258,8 @@ class CosineStrategy(ExtractionStrategy):
             self.get_embedding_method = "direct"

         self.device = get_device()
-        import torch
-        self.device = torch.device('cpu')
+        # import torch
+        # self.device = torch.device('cpu')

         self.default_batch_size = calculate_batch_size(self.device)
@@ -271,7 +272,7 @@ class CosineStrategy(ExtractionStrategy):
             # self.get_embedding_method = "direct"
             # else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
+            self.tokenizer, self.model = load_HF_embedding_model(model_name)
             self.model.to(self.device)
             self.model.eval()
@@ -738,7 +739,6 @@ class JsonCssExtractionStrategy(ExtractionStrategy):
         combined_html = self.DEL.join(sections)
         return self.extract(url, combined_html, **kwargs)

-
 class JsonXPATHExtractionStrategy(ExtractionStrategy):
     def __init__(self, schema: Dict[str, Any], **kwargs):
         super().__init__(**kwargs)
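Note: with load_HF_embedding_model(model_name) replacing the hard-coded load_bge_small_en_v1_5(), and the device no longer forced to CPU, CosineStrategy is no longer tied to one embedding model. A hedged usage sketch (the model_name constructor argument is inferred from the call above; the other parameters come from the revised docstring):

from crawl4ai.extraction_strategy import CosineStrategy

strategy = CosineStrategy(
    semantic_filter="technology, startups",  # keyword filter for documents
    word_count_threshold=10,                 # minimum words per cluster
    max_dist=0.2,                            # dendrogram cut distance
    linkage_method="ward",                   # hierarchical clustering linkage
    top_k=3,                                 # top categories to extract
    model_name="BAAI/bge-small-en-v1.5",     # inferred parameter; any HF embedding model
)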

@@ -72,10 +72,18 @@ def load_bert_base_uncased():
     return tokenizer, model

 @lru_cache()
-def load_bge_small_en_v1_5():
+def load_HF_embedding_model(model_name="BAAI/bge-small-en-v1.5") -> tuple:
+    """Load the Hugging Face model for embedding.
+
+    Args:
+        model_name (str, optional): The model name to load. Defaults to "BAAI/bge-small-en-v1.5".
+
+    Returns:
+        tuple: The tokenizer and model.
+    """
     from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
-    tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
-    model = AutoModel.from_pretrained('BAAI/bge-small-en-v1.5', resume_download=None)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=None)
+    model = AutoModel.from_pretrained(model_name, resume_download=None)
     model.eval()
     model, device = set_model_device(model)
     return tokenizer, model
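Note: the renamed loader keeps the old checkpoint as its default, so existing callers behave identically, and @lru_cache means each distinct model_name is downloaded and initialized only once per process. For example (the alternative checkpoint below is only an illustration):

from crawl4ai.model_loader import load_HF_embedding_model

tokenizer, model = load_HF_embedding_model()  # default: BAAI/bge-small-en-v1.5
tokenizer, model = load_HF_embedding_model("sentence-transformers/all-MiniLM-L6-v2")
tokenizer, model = load_HF_embedding_model()  # cached; no reload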

@@ -10,7 +10,7 @@ import time
 import json
 import os
 import re
-from typing import Dict
+from typing import Dict, List
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler
@@ -456,6 +456,41 @@ async def speed_comparison():
     print("If you run these tests in an environment with better network conditions,")
     print("you may observe an even more significant speed advantage for Crawl4AI.")

+async def generate_knowledge_graph():
+    class Entity(BaseModel):
+        name: str
+        description: str
+
+    class Relationship(BaseModel):
+        entity1: Entity
+        entity2: Entity
+        description: str
+        relation_type: str
+
+    class KnowledgeGraph(BaseModel):
+        entities: List[Entity]
+        relationships: List[Relationship]
+
+    extraction_strategy = LLMExtractionStrategy(
+        provider='openai/gpt-4o-mini',
+        api_token=os.getenv('OPENAI_API_KEY'),
+        schema=KnowledgeGraph.model_json_schema(),
+        extraction_type="schema",
+        instruction="""Extract entities and relationships from the given text."""
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        url = "https://paulgraham.com/love.html"
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            extraction_strategy=extraction_strategy,
+            # magic=True
+        )
+        # print(result.extracted_content)
+        with open(os.path.join(__location__, "kb.json"), "w") as f:
+            f.write(result.extracted_content)
+
 async def main():
     await simple_crawl()
     await simple_example_with_running_js_code()
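Note: generate_knowledge_graph() is defined here but not shown being added to main() in this hunk. A sketch of trying it on its own from quickstart_async.py (it needs OPENAI_API_KEY set in the environment, and it relies on __location__ and LLMExtractionStrategy already being defined earlier in the script):

import asyncio

# Writes the extracted entities and relationships to kb.json next to the script.
asyncio.run(generate_knowledge_graph())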