Update for v0.2.2

- Support multiple JS scripts - Fix some bugs - Resolve a few issues related to Colab installation
2024-06-02 15:40:18 +08:00
parent f1b60b2016
commit 51f26d12fe
8 changed files with 79 additions and 32 deletions
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
            )
            
            # Execute JS code if provided
-            if self.js_code:
+            if self.js_code and type(self.js_code) == str:
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
                WebDriverWait(self.driver, 10).until(
                    lambda driver: driver.execute_script("return document.readyState") == "complete"
                )
+            elif self.js_code and type(self.js_code) == list:
+                for js in self.js_code:
+                    self.driver.execute_script(js)
+                    WebDriverWait(self.driver, 10).until(
+                        lambda driver: driver.execute_script("return document.readyState") == "complete"
+                    )
            
            html = self.driver.page_source
            
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
        if self.verbose:
            print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()  
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()  
+        self.get_embedding_method = "batch"
        
        self.buffer_embeddings = np.array([])

--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -35,8 +35,7 @@ def calculate_batch_size(device):
        else:
            return 32
    else:
-        return 16  # Default batch size
-    
+        return 16  # Default batch size   
    
@lru_cache()
 def get_device():
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
    # load_bert_base_uncased()
    # print("[LOG] Downloading BGE Small EN v1.5...")
    # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
    print("[LOG] Downloading text classifier...")
    _, device = load_text_multilabel_classifier()
    print(f"[LOG] Text classifier loaded on {device}")