From b6023a51fbbb9294e6d6104e4bbb4d072df05757 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 24 Jun 2024 22:47:46 +0800 Subject: [PATCH 1/4] Add star chart --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c75dd07d..ebea270c 100644 --- a/README.md +++ b/README.md @@ -119,3 +119,7 @@ For questions, suggestions, or feedback, feel free to reach out: - Website: [crawl4ai.com](https://crawl4ai.com) Happy Crawling! πŸ•ΈοΈπŸš€ + +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file From f51b0780423ceadd626f2b747bbf291cd0587dfa Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 24 Jun 2024 22:54:29 +0800 Subject: [PATCH 2/4] Update readme example. --- README.md | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index ebea270c..39f7cc9c 100644 --- a/README.md +++ b/README.md @@ -60,19 +60,30 @@ Crawl all OpenAI models and their fees from the official page. import os from crawl4ai import WebCrawler from crawl4ai.extraction_strategy import LLMExtractionStrategy +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") url = 'https://openai.com/api/pricing/' crawler = WebCrawler() crawler.warmup() result = crawler.run( - url=url, - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4", - api_token=os.getenv('OPENAI_API_KEY'), - instruction="Extract all model names and their fees for input and output tokens." 
- ), -) + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.schema(), + extraction_type="schema", + instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. + Do not miss any models in the entire content. One extracted model JSON format should look like this: + {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""" + ), + bypass_cache=True, + ) print(result.extracted_content) ``` From 1fffeeedd200c71d2500ea8a20faf68e5d3237d2 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 24 Jun 2024 23:02:08 +0800 Subject: [PATCH 3/4] Update Readme: Showcase the speed --- README.md | 29 +++++++++++++++++++++++++++++ crawl4ai/web_crawler.py | 6 ++---- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 39f7cc9c..e45abd98 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,35 @@ result = crawler.run(url="https://www.nbcnews.com/business") print(result.markdown) ``` +### Speed-First Design πŸš€ + +Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing. 
+ +```python +import time +from crawl4ai.web_crawler import WebCrawler +crawler = WebCrawler() +crawler.warmup() + +start = time.time() +url = r"https://www.nbcnews.com/business" +result = crawler.run( url, word_count_threshold=10, bypass_cache=True) +end = time.time() +print(f"Time taken: {end - start}") +``` + +Let's take a look the calculated time for the above code snippet: + +```bash +[LOG] πŸš€ Crawling done, success: True, time taken: 0.05835 seconds +[LOG] πŸ”₯ Extracting semantic blocks, Strategy: NoExtractionStrategy +[LOG] πŸš€ Extraction, time taken: 0.0588 seconds. +Time taken: 4.29332 +``` + +It took around 4.29 seconds to crawl the page, extract the content, and return the result. + + ### Extract Structured Data from Web Pages πŸ“Š Crawl all OpenAI models and their fees from the official page. diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index ad687231..57caed50 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -189,7 +189,8 @@ class WebCrawler: # print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") t1 = time.time() result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) - print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") + if verbose: + print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") if result is None: raise ValueError(f"Failed to extract content from the website: {url}") @@ -201,9 +202,6 @@ class WebCrawler: media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) - - if verbose: - print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t} seconds") if extracted_content is None: if verbose: From a0dff192aee50446c8f4899a3c1c17f5ea29465f Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 24 Jun 2024 
23:06:12 +0800 Subject: [PATCH 4/4] Update README for speed example --- README.md | 12 +++++------- crawl4ai/web_crawler.py | 8 ++++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e45abd98..191614f4 100644 --- a/README.md +++ b/README.md @@ -72,14 +72,12 @@ print(f"Time taken: {end - start}") Let's take a look the calculated time for the above code snippet: ```bash -[LOG] πŸš€ Crawling done, success: True, time taken: 0.05835 seconds -[LOG] πŸ”₯ Extracting semantic blocks, Strategy: NoExtractionStrategy -[LOG] πŸš€ Extraction, time taken: 0.0588 seconds. -Time taken: 4.29332 +[LOG] πŸš€ Crawling done, success: True, time taken: 1.3623387813568115 seconds +[LOG] πŸš€ Content extracted, success: True, time taken: 0.05715131759643555 seconds +[LOG] πŸš€ Extraction, time taken: 0.05750393867492676 seconds. +Time taken: 1.439958095550537 ``` - -It took around 4.29 seconds to crawl the page, extract the content, and return the result. - +Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 
πŸš€ ### Extract Structured Data from Web Pages πŸ“Š diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 57caed50..a33663e8 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -47,7 +47,7 @@ class WebCrawler: extraction_strategy= NoExtractionStrategy(), bypass_cache=False, verbose = False, - warmup=True + # warmup=True ) self.ready = True print("[LOG] 🌞 WebCrawler is ready to crawl") @@ -160,7 +160,11 @@ class WebCrawler: if not cached or not html: if user_agent: self.crawler_strategy.update_user_agent(user_agent) + t1 = time.time() html = self.crawler_strategy.crawl(url) + t2 = time.time() + if verbose: + print(f"[LOG] πŸš€ Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") if screenshot: screenshot_data = self.crawler_strategy.take_screenshot() @@ -190,7 +194,7 @@ class WebCrawler: t1 = time.time() result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) if verbose: - print(f"[LOG] πŸš€ Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds") + print(f"[LOG] πŸš€ Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds") if result is None: raise ValueError(f"Failed to extract content from the website: {url}")