From ea16dec587b751b67d9b84a44ee7b84765656add Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Thu, 16 May 2024 21:19:02 +0800
Subject: [PATCH] Improve library loading

---
 README.md                       | 4 ----
 crawl4ai/crawler_strategy.py    | 4 +++-
 crawl4ai/extraction_strategy.py | 7 +++++--
 crawl4ai/model_loader.py        | 8 +++-----
 crawl4ai/web_crawler.py         | 4 ++--
 pages/partial/installation.html | 2 --
 6 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 942da285..f4e7f37a 100644
--- a/README.md
+++ b/README.md
@@ -35,8 +35,6 @@ Simply, firtsy install the package:
 ```bash
 virtualenv venv
 source venv/bin/activate
-# Install the required packages
-pip install transformers torch chromedriver_autoinstaller
 # Install Crawl4AI
 pip install git+https://github.com/unclecode/crawl4ai.git
 ```
@@ -137,7 +135,6 @@ To install Crawl4AI as a library, follow these steps:
 ```bash
 virtualenv venv
 source venv/bin/activate
-pip install transformers torch chromedriver_autoinstaller
 pip install git+https://github.com/unclecode/crawl4ai.git
 ```
 
@@ -151,7 +148,6 @@ virtualenv venv
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install transformers torch chromedriver_autoinstaller
 pip install -e .
 ```
 
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index c1a06072..24add103 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -6,7 +6,7 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
-import chromedriver_autoinstaller
+
 from typing import List
 import requests
 import os
@@ -38,6 +38,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
 class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
     def __init__(self, use_cached_html=False, js_code=None):
         super().__init__()
+        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
         self.options = Options()
         self.options.headless = True
         self.options.add_argument("--no-sandbox")
@@ -49,6 +50,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.js_code = js_code
 
         # chromedriver_autoinstaller.install()
+        import chromedriver_autoinstaller
         self.service = Service(chromedriver_autoinstaller.install())
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
 
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index c9074eb2..59aa81a5 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -8,8 +8,8 @@ from .config import *
 from .utils import *
 from functools import partial
 from .model_loader import load_bert_base_uncased, load_bge_small_en_v1_5, load_spacy_model
-from transformers import pipeline
-from sklearn.metrics.pairwise import cosine_similarity
+
+
 import numpy as np
 class ExtractionStrategy(ABC):
     """
@@ -165,6 +165,7 @@ class CosineStrategy(ExtractionStrategy):
         :param top_k: Number of top categories to extract.
         """
         super().__init__()
+        
         from transformers import BertTokenizer, BertModel, pipeline
         from transformers import AutoTokenizer, AutoModel     
         import spacy  
@@ -196,6 +197,7 @@ class CosineStrategy(ExtractionStrategy):
         :param threshold: Cosine similarity threshold for filtering documents.
         :return: Filtered list of documents.
         """
+        from sklearn.metrics.pairwise import cosine_similarity
         if not semantic_filter:
             return documents
         # Compute embedding for the keyword filter
@@ -409,6 +411,7 @@ class ContentSummarizationStrategy(ExtractionStrategy):
 
         :param model_name: The model to use for summarization.
         """
+        from transformers import pipeline
         self.summarizer = pipeline("summarization", model=model_name)
 
     def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py
index 9c50c524..50288631 100644
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -2,7 +2,7 @@ from functools import lru_cache
 from pathlib import Path
 import subprocess, os
 import shutil
-from .config import MODEL_REPO_BRANCH
+from crawl4ai.config import MODEL_REPO_BRANCH
 import argparse
 
 def get_home_folder():
@@ -40,7 +40,6 @@ def load_spacy_en_core_web_sm():
     print("[LOG] ✅ spaCy model loaded successfully")
     return nlp
 
-
 @lru_cache()
 def load_spacy_model():
     import spacy
@@ -92,11 +91,8 @@ def load_spacy_model():
 
     return spacy.load(model_folder)
 
-
 def download_all_models(remove_existing=False):
     """Download all models required for Crawl4AI."""
-    print("[LOG] Welcome to the Crawl4AI Model Downloader!")
-    print("[LOG] This script will download all the models required for Crawl4AI.")
     if remove_existing:
         print("[LOG] Removing existing models...")
         home_folder = get_home_folder()
@@ -121,6 +117,8 @@ def download_all_models(remove_existing=False):
     print("[LOG] ✅ All models downloaded successfully.")
 
 def main():
+    print("[LOG] Welcome to the Crawl4AI Model Downloader!")
+    print("[LOG] This script will download all the models required for Crawl4AI.")
     parser = argparse.ArgumentParser(description="Crawl4AI Model Downloader")
     parser.add_argument('--remove-existing', action='store_true', help="Remove existing models before downloading")
     args = parser.parse_args()
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 88996c44..7fa4304f 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -18,11 +18,11 @@ class WebCrawler:
     def __init__(
         self,
         # db_path: str = None,
-        crawler_strategy: CrawlerStrategy = LocalSeleniumCrawlerStrategy(),
+        crawler_strategy: CrawlerStrategy = None,
         always_by_pass_cache: bool = False,
     ):
         # self.db_path = db_path
-        self.crawler_strategy = crawler_strategy
+        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy()
         self.always_by_pass_cache = always_by_pass_cache
 
         # Create the .crawl4ai folder in the user's home directory if it doesn't exist
diff --git a/pages/partial/installation.html b/pages/partial/installation.html
index 6a80987e..01ff715b 100644
--- a/pages/partial/installation.html
+++ b/pages/partial/installation.html
@@ -30,7 +30,6 @@
             ><code>virtualenv venv
 source venv/bin/activate
 pip install git+https://github.com/unclecode/crawl4ai.git
-pip install transformers torch chromedriver_autoinstaller
             </code></pre>
         </li>
         <li class="mb-4">
@@ -47,7 +46,6 @@ pip install transformers torch chromedriver_autoinstaller
 source venv/bin/activate
 git clone https://github.com/unclecode/crawl4ai.git
 cd crawl4ai
-pip install transformers torch chromedriver_autoinstaller
 pip install -e .
 </code></pre>
         </li>