Update for v0.2.2

- Support multiple JS scripts
- Fixed some bugs
- Resolved a few issues related to Colab installation
This commit is contained in:
unclecode
2024-06-02 15:40:18 +08:00
parent f1b60b2016
commit 51f26d12fe
8 changed files with 79 additions and 32 deletions

View File

@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
)
# Execute JS code if provided
if self.js_code:
if self.js_code and type(self.js_code) == str:
self.driver.execute_script(self.js_code)
# Optionally, wait for some condition after executing the JS code
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
elif self.js_code and type(self.js_code) == list:
for js in self.js_code:
self.driver.execute_script(js)
WebDriverWait(self.driver, 10).until(
lambda driver: driver.execute_script("return document.readyState") == "complete"
)
html = self.driver.page_source

View File

@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
if self.verbose:
print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
if False and self.device.type == "cpu":
self.model = load_onnx_all_MiniLM_l6_v2()
self.tokenizer = self.model.tokenizer
self.get_embedding_method = "direct"
else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval()
self.get_embedding_method = "batch"
# if False and self.device.type == "cpu":
# self.model = load_onnx_all_MiniLM_l6_v2()
# self.tokenizer = self.model.tokenizer
# self.get_embedding_method = "direct"
# else:
self.tokenizer, self.model = load_bge_small_en_v1_5()
self.model.eval()
self.get_embedding_method = "batch"
self.buffer_embeddings = np.array([])

View File

@@ -35,8 +35,7 @@ def calculate_batch_size(device):
else:
return 32
else:
return 16 # Default batch size
return 16 # Default batch size
@lru_cache()
def get_device():
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
# load_bert_base_uncased()
# print("[LOG] Downloading BGE Small EN v1.5...")
# load_bge_small_en_v1_5()
print("[LOG] Downloading ONNX model...")
load_onnx_all_MiniLM_l6_v2()
# print("[LOG] Downloading ONNX model...")
# load_onnx_all_MiniLM_l6_v2()
print("[LOG] Downloading text classifier...")
_, device = load_text_multilabel_classifier()
print(f"[LOG] Text classifier loaded on {device}")