From 1fcb573909364afdf751f5caaa85bc1fbc0bde87 Mon Sep 17 00:00:00 2001
From: unclecode
Date: Wed, 19 Jun 2024 18:53:22 +0800
Subject: [PATCH 1/2] chore: Update table of contents in README.md

---
 README.md | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 1e9ccb87..0e2f4fd5 100644
--- a/README.md
+++ b/README.md
@@ -169,30 +169,15 @@ With Crawl4AI, you can perform advanced web crawling and data extraction tasks w
 
 ## Table of Contents
 
-1. [Features](#features-)
-2. [Installation](#installation-)
-3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
-4. [Python Library Usage](#python-library-usage-)
-5. [Parameters](#parameters-)
-6. [Chunking Strategies](#chunking-strategies-)
-7. [Extraction Strategies](#extraction-strategies-)
-8. [Contributing](#contributing-)
-9. [License](#license-)
-10. [Contact](#contact-)
-
-
-## Features ✨
-
-- 🕷️ Efficient web crawling to extract valuable data from websites
-- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
-- 🌍 Supports crawling multiple URLs simultaneously
-- 🌃 Replace media tags with ALT.
-- 🆓 Completely free to use and open-source
-- 📜 Execute custom JavaScript before crawling
-- 📚 Chunking strategies: topic-based, regex, sentence, and more
-- 🧠 Extraction strategies: cosine clustering, LLM, and more
-- 🎯 CSS selector support
-- 📝 Pass instructions/keywords to refine extraction
+1. [Installation](#installation-)
+2. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
+3. [Python Library Usage](#python-library-usage-)
+4. [Parameters](#parameters-)
+5. [Chunking Strategies](#chunking-strategies-)
+6. [Extraction Strategies](#extraction-strategies-)
+7. [Contributing](#contributing-)
+8. [License](#license-)
+9. [Contact](#contact-)
 
 ## Installation 💻
 

From 21b110bfd72682c229ce4814a13ad076b7229602 Mon Sep 17 00:00:00 2001
From: unclecode
Date: Wed, 19 Jun 2024 19:03:35 +0800
Subject: [PATCH 2/2] Update LLMExtractionStrategy to disable chunking if
 specified, Add example of summarization for a web page.

---
 crawl4ai/extraction_strategy.py |  3 +++
 docs/examples/summarize_page.py | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 docs/examples/summarize_page.py

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index dca61350..aadcda20 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -77,6 +77,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
         self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
         self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
         self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
+        self.apply_chunking = kwargs.get("apply_chunking", True)
+        if not self.apply_chunking:
+            self.chunk_token_threshold = 1e9
         self.verbose = kwargs.get("verbose", False)
 
 
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
new file mode 100644
index 00000000..31098e8e
--- /dev/null
+++ b/docs/examples/summarize_page.py
@@ -0,0 +1,46 @@
+import os
+import time
+import json
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+
+url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+
+crawler = WebCrawler()
+crawler.warmup()
+
+from pydantic import BaseModel, Field
+
+class PageSummary(BaseModel):
+    title: str = Field(..., description="Title of the page.")
+    summary: str = Field(..., description="Summary of the page.")
+    brief_summary: str = Field(..., description="Brief summary of the page.")
+    keywords: list = Field(..., description="Keywords assigned to the page.")
+
+result = crawler.run(
+    url=url,
+    word_count_threshold=1,
+    extraction_strategy= LLMExtractionStrategy(
+        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
+        schema=PageSummary.model_json_schema(),
+        extraction_type="schema",
+        apply_chunking =False,
+        instruction="From the crawled content, extract the following details: "\
+        "1. Title of the page "\
+        "2. Summary of the page, which is a detailed summary "\
+        "3. Brief summary of the page, which is a paragraph text "\
+        "4. Keywords assigned to the page, which is a list of keywords. "\
+        'The extracted JSON format should look like this: '\
+        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+    ),
+    bypass_cache=True,
+)
+
+page_summary = json.loads(result.extracted_content)
+
+print(page_summary)
+
+with open(".data/page_summary.json", "w") as f:
+    f.write(result.extracted_content)