chore: Update Selenium options in crawler_strategy.py and add verbose logging in CosineStrategy
@@ -42,10 +42,23 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.options = Options()
         self.options.headless = True
         self.options.add_argument("--no-sandbox")
+        self.options.add_argument("--headless")
         self.options.add_argument("--disable-dev-shm-usage")
         self.options.add_argument("--disable-gpu")
         self.options.add_argument("--disable-extensions")
-        self.options.add_argument("--headless")
+        self.options.add_argument("--disable-infobars")
+        self.options.add_argument("--disable-logging")
+        self.options.add_argument("--disable-popup-blocking")
+        self.options.add_argument("--disable-translate")
+        self.options.add_argument("--disable-default-apps")
+        self.options.add_argument("--disable-background-networking")
+        self.options.add_argument("--disable-sync")
+        self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
+        self.options.add_argument("--disable-browser-side-navigation")
+        self.options.add_argument("--dns-prefetch-disable")
+        self.options.add_argument("--disable-web-security")
+        self.options.add_argument("--log-level=3")
+        self.use_cached_html = use_cached_html
         self.use_cached_html = use_cached_html
         self.js_code = js_code
         self.verbose = kwargs.get("verbose", False)
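The hunk above amounts to a hardened headless Chrome profile. A minimal standalone sketch of how the same kind of flag set is assembled, assuming Selenium 4.x with a chromedriver resolvable on PATH; the URL and the subset of flags shown are illustrative, not the strategy's full configuration:

    # Minimal sketch: build a headless Chrome driver with a few of the same flags.
    # Assumes Selenium 4.x; the target URL is only an example.
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm exhaustion in containers
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--log-level=3")            # only fatal Chrome logs

    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://example.com")
        print(len(driver.page_source), "bytes of HTML")
    finally:
        driver.quit()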
@@ -141,7 +141,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
         if self.provider.startswith("groq/"):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
-                extracted_content.extend(self.extract(ix, url, section))
+                extract_func = partial(self.extract, url)
+                extracted_content.extend(extract_func(ix, section))
                 time.sleep(0.5) # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
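The partial(self.extract, url) change only pre-binds the URL, so each loop iteration supplies just the section index and text. A minimal sketch of the same pattern with a stand-in extract function (the names and return shape below are illustrative; the 500 ms pacing mirrors the groq/ branch above):

    # Sketch: pre-bind the fixed argument with functools.partial, then call the
    # bound function once per section with a short delay between calls.
    import time
    from functools import partial

    def extract(url, ix, section):          # stand-in for the strategy's extract method
        return [{"url": url, "index": ix, "content": section}]

    url = "https://example.com"
    merged_sections = ["section one", "section two"]

    extracted_content = []
    extract_func = partial(extract, url)    # url is fixed for every call
    for ix, section in enumerate(merged_sections):
        extracted_content.extend(extract_func(ix, section))
        time.sleep(0.5)                     # 500 ms delay between each processing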
@@ -315,6 +316,9 @@ class CosineStrategy(ExtractionStrategy):
         # Convert filtered clusters to a sorted list of dictionaries
         cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
 
+        if self.verbose:
+            print(f"[LOG] 🚀 Assign tags using {self.device}")
+
         if self.device == "gpu":
             labels = self.nlp([cluster['content'] for cluster in cluster_list])
 
@@ -322,12 +326,21 @@ class CosineStrategy(ExtractionStrategy):
                 cluster['tags'] = label
         elif self.device == "cpu":
             # Process the text with the loaded model
-            for cluster in cluster_list:
-                doc = self.nlp(cluster['content'])
+            texts = [cluster['content'] for cluster in cluster_list]
+            # Batch process texts
+            docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
+
+            for doc, cluster in zip(docs, cluster_list):
                 tok_k = self.top_k
                 top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
                 cluster['tags'] = [cat for cat, _ in top_categories]
+
+            # for cluster in cluster_list:
+            #     doc = self.nlp(cluster['content'])
+            #     tok_k = self.top_k
+            #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+            #     cluster['tags'] = [cat for cat, _ in top_categories]
 
         if self.verbose:
             print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
 
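The cpu branch now streams every cluster's text through nlp.pipe in one pass instead of calling nlp() per cluster. A minimal sketch of that batching pattern, assuming a spaCy v3 pipeline; en_core_web_sm is only a placeholder model and ships without a text categorizer, so doc.cats (and therefore the tags) stay empty unless a trained textcat component is present:

    # Sketch: batch texts through nlp.pipe, then read the top-k categories per doc.
    # Assumes spaCy v3 and a downloaded placeholder model; tags are empty without textcat.
    import spacy

    nlp = spacy.load("en_core_web_sm")
    cluster_list = [
        {"index": 0, "tags": [], "content": "GPUs accelerate matrix multiplication."},
        {"index": 1, "tags": [], "content": "The recipe needs two cups of flour."},
    ]
    top_k = 3

    texts = [cluster["content"] for cluster in cluster_list]
    docs = nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])

    for doc, cluster in zip(docs, cluster_list):
        top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:top_k]
        cluster["tags"] = [cat for cat, _ in top_categories]

    print(cluster_list)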
@@ -171,7 +171,7 @@ def main():
 
     crawler = create_crawler()
 
-    # basic_usage(crawler)
+    basic_usage(crawler)
     understanding_parameters(crawler)
 
     crawler.always_by_pass_cache = True