Some updated ins utils.py

Switch to ChromeDriverManager due some issues with download the chrome driver
Update README for speed example
2024-06-26 13:03:03 +08:00 · 2024-06-26 13:00:17 +08:00 · 2024-06-24 23:06:12 +08:00 · 2024-06-24 23:02:08 +08:00 · 2024-06-24 22:54:29 +08:00 · 2024-06-24 22:47:46 +08:00
6 changed files with 74 additions and 17 deletions
--- a/README.md
+++ b/README.md
@@ -52,6 +52,33 @@ result = crawler.run(url="https://www.nbcnews.com/business")
 print(result.markdown)
 ```

+### Speed-First Design 🚀
+
+Perhaps the most important design principle for this library is speed. We need to ensure it can handle many links and resources in parallel as quickly as possible. By combining this speed with fast LLMs like Groq, the results will be truly amazing.
+
+```python
+import time
+from crawl4ai.web_crawler import WebCrawler
+crawler = WebCrawler()
+crawler.warmup()
+
+start = time.time()
+url = r"https://www.nbcnews.com/business"
+result = crawler.run( url, word_count_threshold=10, bypass_cache=True)
+end = time.time()
+print(f"Time taken: {end - start}")
+```
+
+Let's take a look at the measured time for the above code snippet:
+
+```bash
+[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds
+[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds
+[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds.
+Time taken: 1.439958095550537
+```
+Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀
+
 ### Extract Structured Data from Web Pages 📊

 Crawl all OpenAI models and their fees from the official page.
@@ -60,19 +87,30 @@ Crawl all OpenAI models and their fees from the official page.
 import os
 from crawl4ai import WebCrawler
 from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from pydantic import BaseModel, Field
+
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

 url = 'https://openai.com/api/pricing/'
 crawler = WebCrawler()
 crawler.warmup()

 result = crawler.run(
-    url=url,
-    extraction_strategy=LLMExtractionStrategy(
-        provider="openai/gpt-4",
-        api_token=os.getenv('OPENAI_API_KEY'),
-        instruction="Extract all model names and their fees for input and output tokens."
-    ),
-)
+        url=url,
+        word_count_threshold=1,
+        extraction_strategy= LLMExtractionStrategy(
+            provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
+            schema=OpenAIModelFee.schema(),
+            extraction_type="schema",
+            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. 
+            Do not miss any models in the entire content. One extracted model JSON format should look like this: 
+            {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}."""
+        ),            
+        bypass_cache=True,
+    )

 print(result.extracted_content)
 ```
@@ -119,3 +157,7 @@ For questions, suggestions, or feedback, feel free to reach out:
 - Website: [crawl4ai.com](https://crawl4ai.com)

 Happy Crawling! 🕸️🚀
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date)
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -6,6 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+
 import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        }

        # chromedriver_autoinstaller.install()
-        import chromedriver_autoinstaller
-        crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
-        chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
+        # import chromedriver_autoinstaller
+        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
+        # chromedriver_path = chromedriver_autoinstaller.install()
+        # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
        # self.service = Service(chromedriver_autoinstaller.install())
+        
+        
+        chromedriver_path = ChromeDriverManager().install()
        self.service = Service(chromedriver_path)
        self.service.log_path = "NUL"
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -770,4 +770,6 @@ def wrap_text(draw, text, font, max_width):

 def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
-    return soup.prettify()
+    return soup.prettify()
+
+
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -47,7 +47,7 @@ class WebCrawler:
            extraction_strategy= NoExtractionStrategy(),
            bypass_cache=False,
            verbose = False,
-            warmup=True
+            # warmup=True
        )
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -160,7 +160,11 @@ class WebCrawler:
            if not cached or not html:
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
+                t1 = time.time()
                html = self.crawler_strategy.crawl(url)
+                t2 = time.time()
+                if verbose:
+                    print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
                if screenshot:
                    screenshot_data = self.crawler_strategy.take_screenshot()

@@ -189,7 +193,8 @@ class WebCrawler:
                # print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
                t1 = time.time()
                result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
-                print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
+                if verbose:
+                    print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
                
                if result is None:
                    raise ValueError(f"Failed to extract content from the website: {url}")
@@ -201,9 +206,6 @@ class WebCrawler:
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})
-
-            if verbose:
-                print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")
                        
            if extracted_content is None:
                if verbose:
--- a/main.py
+++ b/main.py
@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
@lru_cache()
 def get_crawler():
    # Initialize and return a WebCrawler instance
-    return WebCrawler(verbose = True)
+    crawler = WebCrawler(verbose = True)
+    crawler.warmup()
+    return crawler

 class CrawlRequest(BaseModel):
    urls: List[str]
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ torch==2.3.1
 onnxruntime==1.18.0
 tokenizers==0.19.1
 pillow==10.3.0
+webdriver-manager==4.0.1
Author	SHA1	Message	Date
unclecode	96d1eb0d0d	Some updated ins utils.py	2024-06-26 13:03:03 +08:00
unclecode	144cfa0eda	Switch to ChromeDriverManager due some issues with download the chrome driver	2024-06-26 13:00:17 +08:00
unclecode	a0dff192ae	Update README for speed example	2024-06-24 23:06:12 +08:00
unclecode	1fffeeedd2	Update Readme: Showcase the speed	2024-06-24 23:02:08 +08:00
unclecode	f51b078042	Update reame example.	2024-06-24 22:54:29 +08:00
unclecode	b6023a51fb	Add star chart	2024-06-24 22:47:46 +08:00