chore: Update Selenium options in crawler_strategy.py; batch spaCy processing and add verbose logging in CosineStrategy; enable basic_usage in demo script

This commit is contained in:
unclecode
2024-05-18 14:13:06 +08:00
parent e3524a10a7
commit eb6423875f
3 changed files with 31 additions and 5 deletions

View File

@@ -42,10 +42,23 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
self.options = Options()
self.options.headless = True
self.options.add_argument("--no-sandbox")
self.options.add_argument("--headless")
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-gpu")
self.options.add_argument("--disable-extensions")
self.options.add_argument("--headless")
self.options.add_argument("--disable-infobars")
self.options.add_argument("--disable-logging")
self.options.add_argument("--disable-popup-blocking")
self.options.add_argument("--disable-translate")
self.options.add_argument("--disable-default-apps")
self.options.add_argument("--disable-background-networking")
self.options.add_argument("--disable-sync")
self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
self.options.add_argument("--disable-browser-side-navigation")
self.options.add_argument("--dns-prefetch-disable")
self.options.add_argument("--disable-web-security")
self.options.add_argument("--log-level=3")
self.use_cached_html = use_cached_html
self.use_cached_html = use_cached_html
self.js_code = js_code
self.verbose = kwargs.get("verbose", False)

View File

@@ -141,7 +141,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
if self.provider.startswith("groq/"):
# Sequential processing with a delay
for ix, section in enumerate(merged_sections):
extracted_content.extend(self.extract(ix, url, section))
extract_func = partial(self.extract, url)
extracted_content.extend(extract_func(ix, section))
time.sleep(0.5) # 500 ms delay between each processing
else:
# Parallel processing using ThreadPoolExecutor
@@ -315,6 +316,9 @@ class CosineStrategy(ExtractionStrategy):
# Convert filtered clusters to a sorted list of dictionaries
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
if self.verbose:
print(f"[LOG] 🚀 Assign tags using {self.device}")
if self.device == "gpu":
labels = self.nlp([cluster['content'] for cluster in cluster_list])
@@ -322,11 +326,20 @@ class CosineStrategy(ExtractionStrategy):
cluster['tags'] = label
elif self.device == "cpu":
# Process the text with the loaded model
for cluster in cluster_list:
doc = self.nlp(cluster['content'])
texts = [cluster['content'] for cluster in cluster_list]
# Batch process texts
docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
for doc, cluster in zip(docs, cluster_list):
tok_k = self.top_k
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
cluster['tags'] = [cat for cat, _ in top_categories]
# for cluster in cluster_list:
# doc = self.nlp(cluster['content'])
# tok_k = self.top_k
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
# cluster['tags'] = [cat for cat, _ in top_categories]
if self.verbose:
print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")

View File

@@ -171,7 +171,7 @@ def main():
crawler = create_crawler()
# basic_usage(crawler)
basic_usage(crawler)
understanding_parameters(crawler)
crawler.always_by_pass_cache = True