[v0.3.71] Enhance chunking strategies and improve overall performance

- Add OverlappingWindowChunking and improve SlidingWindowChunking
- Update CHUNK_TOKEN_THRESHOLD to 2048 tokens
- Optimize AsyncPlaywrightCrawlerStrategy close method
- Enhance flexibility in CosineStrategy with generic embedding model loading
- Improve JSON-based extraction strategies
- Add knowledge graph generation example
2024-10-19 18:36:59 +08:00
parent b309bc34e1
commit 4e2852d5ff
7 changed files with 118 additions and 18 deletions
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -10,7 +10,7 @@ import time
 import json
 import os
 import re
-from typing import Dict
+from typing import Dict, List
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, Field
 from crawl4ai import AsyncWebCrawler
@@ -456,6 +456,41 @@ async def speed_comparison():
    print("If you run these tests in an environment with better network conditions,")
    print("you may observe an even more significant speed advantage for Crawl4AI.")

+
+async def generate_knowledge_graph():
+    """Crawl paulgraham.com/love.html, extract entities/relationships via an LLM strategy, write them to kb.json."""
+    class Entity(BaseModel):
+        name: str
+        description: str
+        
+    class Relationship(BaseModel):
+        entity1: Entity
+        entity2: Entity
+        description: str
+        relation_type: str
+
+    class KnowledgeGraph(BaseModel):
+        entities: List[Entity]
+        relationships: List[Relationship]
+
+    extraction_strategy = LLMExtractionStrategy(
+            provider='openai/gpt-4o-mini',
+            api_token=os.getenv('OPENAI_API_KEY'),
+            schema=KnowledgeGraph.model_json_schema(),
+            extraction_type="schema",
+            instruction="""Extract entities and relationships from the given text."""
+    )
+    async with AsyncWebCrawler() as crawler:
+        url = "https://paulgraham.com/love.html"
+        result = await crawler.arun(
+            url=url,
+            bypass_cache=True,
+            extraction_strategy=extraction_strategy,
+        )
+        # Persist the raw extraction output in the directory named by __location__ (defined outside this view).
+        with open(os.path.join(__location__, "kb.json"), "w") as f:
+            f.write(result.extracted_content)
+
 async def main():
    await simple_crawl()
    await simple_example_with_running_js_code()