From eb6423875f3e50ffe9294c742b7dbef611bd314a Mon Sep 17 00:00:00 2001
From: unclecode
Date: Sat, 18 May 2024 14:13:06 +0800
Subject: [PATCH] chore: Update Selenium options in crawler_strategy.py and add verbose logging in CosineStrategy

---
 crawl4ai/crawler_strategy.py    | 14 +++++++++++++-
 crawl4ai/extraction_strategy.py | 19 ++++++++++++++++---
 docs/examples/quickstart.py     |  2 +-
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 0b189e67..33988dec 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -42,10 +42,22 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.options = Options()
         self.options.headless = True
         self.options.add_argument("--no-sandbox")
+        self.options.add_argument("--headless")
         self.options.add_argument("--disable-dev-shm-usage")
         self.options.add_argument("--disable-gpu")
         self.options.add_argument("--disable-extensions")
-        self.options.add_argument("--headless")
+        self.options.add_argument("--disable-infobars")
+        self.options.add_argument("--disable-logging")
+        self.options.add_argument("--disable-popup-blocking")
+        self.options.add_argument("--disable-translate")
+        self.options.add_argument("--disable-default-apps")
+        self.options.add_argument("--disable-background-networking")
+        self.options.add_argument("--disable-sync")
+        self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
+        self.options.add_argument("--disable-browser-side-navigation")
+        self.options.add_argument("--dns-prefetch-disable")
+        self.options.add_argument("--disable-web-security")
+        self.options.add_argument("--log-level=3")
         self.use_cached_html = use_cached_html
         self.js_code = js_code
         self.verbose = kwargs.get("verbose", False)
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index e76c1084..a5d4b447 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -141,7 +141,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
         if self.provider.startswith("groq/"):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
-                extracted_content.extend(self.extract(ix, url, section))
+                extract_func = partial(self.extract, url)
+                extracted_content.extend(extract_func(ix, section))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
@@ -315,6 +316,9 @@ class CosineStrategy(ExtractionStrategy):
         # Convert filtered clusters to a sorted list of dictionaries
         cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
 
+        if self.verbose:
+            print(f"[LOG] 🚀 Assign tags using {self.device}")
+
         if self.device == "gpu":
             labels = self.nlp([cluster['content'] for cluster in cluster_list])
 
@@ -322,11 +326,20 @@
                 cluster['tags'] = label
         elif self.device == "cpu":
             # Process the text with the loaded model
-            for cluster in cluster_list:
-                doc = self.nlp(cluster['content'])
+            texts = [cluster['content'] for cluster in cluster_list]
+            # Batch process texts
+            docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
+
+            for doc, cluster in zip(docs, cluster_list):
                 tok_k = self.top_k
                 top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
                 cluster['tags'] = [cat for cat, _ in top_categories]
+
+            # for cluster in cluster_list:
+            #     doc = self.nlp(cluster['content'])
+            #     tok_k = self.top_k
+            #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+            #     cluster['tags'] = [cat for cat, _ in top_categories]
 
         if self.verbose:
             print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index 012ea65a..c0fe9468 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -171,7 +171,7 @@ def main():
     crawler = create_crawler()
 
-    # basic_usage(crawler)
+    basic_usage(crawler)
     understanding_parameters(crawler)
 
     crawler.always_by_pass_cache = True
 