From 1cc67df3019089f350d669af688f0a577156c0d1 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Fri, 17 May 2024 16:53:03 +0800
Subject: [PATCH] chore: Update pip installation command and requirements, add
 new dependencies

---
 README.md                       | 42 ++++++++++++++-------------------
 docs/examples/quickstart.py     | 15 ++----------
 pages/app.js                    | 27 +++++++++++----------
 pages/partial/installation.html |  4 ++--
 pages/partial/try_it.html       | 18 +++++++-------
 5 files changed, 46 insertions(+), 60 deletions(-)
diff --git a/README.md b/README.md
index 9f1ef712..48124f2d 100644
--- a/README.md
+++ b/README.md
@@ -22,32 +22,26 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 ## Power and Simplicity of Crawl4AI 🚀
 
-Crawl4AI makes even complex web crawling tasks simple and intuitive. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
+To show the simplicity, take a look at the first example:
 
-**Example Task:**
+```python
+from crawl4ai import WebCrawler
+
+# Create the WebCrawler instance 
+crawler = WebCrawler()
+
+# Run the crawler on a single URL with default settings
+result = crawler.run(url="https://www.example.com")
+print(result) # {url, html, markdown, extracted_content, metadata}
+```
+
+Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific content—all in one go!
 
 1. Instantiate a WebCrawler object.
 2. Execute custom JavaScript to click a "Load More" button.
-3. Filter the data to include only content related to "technology".
+3. Extract semantic chunks of content and filter the data to include only content related to technology.
 4. Use a CSS selector to extract only paragraphs (`<p>` tags).
 
-**Example Code:**
-
-Simply, firtsy install the package:
-```bash
-virtualenv venv
-source venv/bin/activate
-# Install Crawl4AI
-pip install git+https://github.com/unclecode/crawl4ai.git
-```
-
-Run the following command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
-```bash
-crawl4ai-download-models
-```
-
-Now, you can run the following code:
-
 ```python
 # Import necessary modules
 from crawl4ai import WebCrawler
@@ -137,7 +131,7 @@ To install Crawl4AI as a library, follow these steps:
 ```bash
 virtualenv venv
 source venv/bin/activate
-pip install git+https://github.com/unclecode/crawl4ai.git
+pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
 ```
 
     💡 Better to run the following CLI-command to load the required models. This is optional, but it will boost the performance and speed of the crawler. You need to do this only once.
@@ -150,12 +144,12 @@ virtualenv venv
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install -e .
+pip install -e .[all]
 ```
 
 3. Use docker to run the local server:
 ```bash
-docker build -t crawl4ai . 
+docker build -t crawl4ai .
 # For Mac users
 # docker build --platform linux/amd64 -t crawl4ai .
 docker run -d -p 8000:80 crawl4ai
@@ -349,7 +343,7 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 | `include_raw_html`    | Whether to include the raw HTML content in the response.                                              | No       | `false`             |
 | `bypass_cache`        | Whether to force a fresh crawl even if the URL has been previously crawled.                           | No       | `false`             |
 | `word_count_threshold`| The minimum number of words a block must contain to be considered meaningful (minimum value is 5).    | No       | `5`                 |
-| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `CosineStrategy`    |
+| `extraction_strategy` | The strategy to use for extracting content from the HTML (e.g., "CosineStrategy").                    | No       | `NoExtractionStrategy`    |
 | `chunking_strategy`   | The strategy to use for chunking the text before processing (e.g., "RegexChunking").                  | No       | `RegexChunking`     |
 | `css_selector`        | The CSS selector to target specific parts of the HTML for extraction.                                 | No       | `None`              |
 | `verbose`             | Whether to enable verbose logging.                                                                    | No       | `true`              |
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index de30c7f5..fa3e9045 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -170,22 +170,11 @@ def main():
     cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.")
 
     crawler = create_crawler()
-    
-    cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
-    crawler.always_by_pass_cache = True
-
-
-    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
-    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
-    result = crawler.run(
-        url="https://www.nbcnews.com/business",
-        chunking_strategy=NlpSentenceChunking()
-    )
-    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
-    print_result(result)
 
     basic_usage(crawler)
     understanding_parameters(crawler)
+    
+    crawler.always_by_pass_cache = True
     add_chunking_strategy(crawler)
     add_extraction_strategy(crawler)
     add_llm_extraction_strategy(crawler)
diff --git a/pages/app.js b/pages/app.js
index a30581a5..200e29d3 100644
--- a/pages/app.js
+++ b/pages/app.js
@@ -69,9 +69,12 @@ axios
 // Handle crawl button click
 document.getElementById("crawl-btn").addEventListener("click", () => {
     // validate input to have both URL and API token
-    if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
-        alert("Please enter both URL(s) and API token.");
-        return;
+    // if selected extraction strategy is LLMExtractionStrategy, then API token is required
+    if (document.getElementById("extraction-strategy-select").value === "LLMExtractionStrategy") {
+        if (!document.getElementById("url-input").value || !document.getElementById("token-input").value) {
+            alert("Please enter both URL(s) and API token.");
+            return;
+        }
     }
 
     const selectedProviderModel = document.getElementById("provider-model-select").value;
@@ -87,8 +90,6 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
     const urls = urlsInput.split(",").map((url) => url.trim());
     const data = {
         urls: urls,
-        provider_model: selectedProviderModel,
-        api_token: apiToken,
         include_raw_html: true,
         bypass_cache: bypassCache,
         extract_blocks: extractBlocks,
@@ -112,8 +113,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
     localStorage.setItem("api_token", document.getElementById("token-input").value);
 
     document.getElementById("loading").classList.remove("hidden");
-    document.getElementById("result").classList.add("hidden");
-    document.getElementById("code_help").classList.add("hidden");
+    document.getElementById("result").style.visibility = "hidden";
+    document.getElementById("code_help").style.visibility = "hidden";
 
     axios
         .post("/crawl", data)
@@ -128,18 +129,20 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
             const extractionStrategy = data.extraction_strategy;
             const isLLMExtraction = extractionStrategy === "LLMExtractionStrategy";
 
+            // REMOVE API TOKEN FROM CODE EXAMPLES
+            data.extraction_strategy_args.api_token = "your_api_token";
             document.getElementById(
                 "curl-code"
             ).textContent = `curl -X POST -H "Content-Type: application/json" -d '${JSON.stringify({
                 ...data,
                 api_token: isLLMExtraction ? "your_api_token" : undefined,
-            })}' http://crawl4ai.uccode.io/crawl`;
+            }, null, 2)}' http://crawl4ai.com/crawl`;
 
             document.getElementById("python-code").textContent = `import requests\n\ndata = ${JSON.stringify(
                 { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                 null,
                 2
-            )}\n\nresponse = requests.post("http://crawl4ai.uccode.io/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
+            )}\n\nresponse = requests.post("http://crawl4ai.com/crawl", json=data) # OR local host if your run locally \nprint(response.json())`;
 
             document.getElementById(
                 "nodejs-code"
@@ -147,7 +150,7 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
                 { ...data, api_token: isLLMExtraction ? "your_api_token" : undefined },
                 null,
                 2
-            )};\n\naxios.post("http://crawl4ai.uccode.io/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
+            )};\n\naxios.post("http://crawl4ai.com/crawl", data) // OR local host if your run locally \n    .then(response => console.log(response.data))\n    .catch(error => console.error(error));`;
 
             document.getElementById(
                 "library-code"
@@ -169,8 +172,8 @@ document.getElementById("crawl-btn").addEventListener("click", () => {
 
             document.getElementById("loading").classList.add("hidden");
 
-            document.getElementById("result").classList.remove("hidden");
-            document.getElementById("code_help").classList.remove("hidden");
+            document.getElementById("result").style.visibility = "visible";
+            document.getElementById("code_help").style.visibility = "visible";
 
             // increment the total count
             document.getElementById("total-count").textContent =
diff --git a/pages/partial/installation.html b/pages/partial/installation.html
index 01ff715b..6a6561cd 100644
--- a/pages/partial/installation.html
+++ b/pages/partial/installation.html
@@ -29,7 +29,7 @@
                 class="bg-zinc-800 p-4 rounded mt-2 text-zinc-100"
             ><code>virtualenv venv
 source venv/bin/activate
-pip install git+https://github.com/unclecode/crawl4ai.git
+pip install "crawl4ai[all] @ git+https://github.com/unclecode/crawl4ai.git"
             </code></pre>
         </li>
         <li class="mb-4">
@@ -46,7 +46,7 @@ pip install git+https://github.com/unclecode/crawl4ai.git
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install -e .
+pip install -e .[all]
 </code></pre>
         </li>
         <li class="">
diff --git a/pages/partial/try_it.html b/pages/partial/try_it.html
index 56f85062..b7fa2a13 100644
--- a/pages/partial/try_it.html
+++ b/pages/partial/try_it.html
@@ -46,9 +46,9 @@
                             id="extraction-strategy-select"
                             class="border border-zinc-700 rounded px-4 py-1 bg-zinc-900 text-zinc-300"
                         >
+                            <option value="NoExtractionStrategy" selected>NoExtractionStrategy</option>
                             <option value="CosineStrategy">CosineStrategy</option>
                             <option value="LLMExtractionStrategy">LLMExtractionStrategy</option>
-                            <option value="NoExtractionStrategy">NoExtractionStrategy</option>
                         </select>
                     </div>
                     <div class="flex flex-col">
@@ -99,7 +99,7 @@
                 </div>
                 <div  class="flex gap-2">
                     <!-- Add two textarea one for getting Keyword Filter and another one Instruction, make both grow whole with-->
-                    <div id = "semantic_filter_div" class="flex flex-col flex-1">
+                    <div id = "semantic_filter_div" class="flex flex-col flex-1 hidden">
                         <label for="keyword-filter" class="text-lime-500 font-bold text-xs">Keyword Filter</label>
                         <textarea
                             id="semantic_filter"
@@ -131,10 +131,10 @@
                 </div>
             </div>
 
+            <div id="loading" class="hidden">
+                <p class="text-white">Loading... Please wait.</p>
+            </div>
             <div id="result" class="flex-1">
-                <div id="loading" class="hidden">
-                    <p class="text-white">Loading... Please wait.</p>
-                </div>
                 <div class="tab-buttons flex gap-2">
                     <button class="tab-btn px-4 py-1 text-sm bg-zinc-700 rounded-t text-lime-500" data-tab="json">
                         JSON
@@ -181,19 +181,19 @@
                     </button> -->
                 </div>
                 <div class="tab-content result bg-zinc-900 p-2 rounded h-full border border-zinc-700 text-sm">
-                    <pre class="h-full flex relative">
+                    <pre class="h-full flex relative overflow-x-auto">
                         <code id="curl-code" class="language-bash"></code>
                         <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="curl-code">Copy</button>
                     </pre>
-                    <pre class="hidden h-full flex relative">
+                    <pre class="hidden h-full flex relative overflow-x-auto">
                         <code id="python-code" class="language-python"></code>
                         <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="python-code">Copy</button>
                     </pre>
-                    <pre class="hidden h-full flex relative">
+                    <pre class="hidden h-full flex relative overflow-x-auto">
                         <code id="nodejs-code" class="language-javascript"></code>
                         <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="nodejs-code">Copy</button>
                     </pre>
-                    <pre class="hidden h-full flex relative">
+                    <pre class="hidden h-full flex relative overflow-x-auto">
                         <code id="library-code" class="language-python"></code>
                         <button class="absolute top-2 right-2 bg-zinc-700 text-white px-2 py-1 rounded copy-btn" data-target="library-code">Copy</button>
                     </pre>