chore: Update Selenium options in crawler_strategy.py and add verbose logging in CosineStrategy
This commit is contained in:
@@ -42,10 +42,23 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
|
||||
self.options = Options()
|
||||
self.options.headless = True
|
||||
self.options.add_argument("--no-sandbox")
|
||||
self.options.add_argument("--headless")
|
||||
self.options.add_argument("--disable-dev-shm-usage")
|
||||
self.options.add_argument("--disable-gpu")
|
||||
self.options.add_argument("--disable-extensions")
|
||||
self.options.add_argument("--headless")
|
||||
self.options.add_argument("--disable-infobars")
|
||||
self.options.add_argument("--disable-logging")
|
||||
self.options.add_argument("--disable-popup-blocking")
|
||||
self.options.add_argument("--disable-translate")
|
||||
self.options.add_argument("--disable-default-apps")
|
||||
self.options.add_argument("--disable-background-networking")
|
||||
self.options.add_argument("--disable-sync")
|
||||
self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
|
||||
self.options.add_argument("--disable-browser-side-navigation")
|
||||
self.options.add_argument("--dns-prefetch-disable")
|
||||
self.options.add_argument("--disable-web-security")
|
||||
self.options.add_argument("--log-level=3")
|
||||
self.use_cached_html = use_cached_html
|
||||
self.use_cached_html = use_cached_html
|
||||
self.js_code = js_code
|
||||
self.verbose = kwargs.get("verbose", False)
|
||||
|
||||
@@ -141,7 +141,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
if self.provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for ix, section in enumerate(merged_sections):
|
||||
extracted_content.extend(self.extract(ix, url, section))
|
||||
extract_func = partial(self.extract, url)
|
||||
extracted_content.extend(extract_func(ix, section))
|
||||
time.sleep(0.5) # 500 ms delay between each processing
|
||||
else:
|
||||
# Parallel processing using ThreadPoolExecutor
|
||||
@@ -315,6 +316,9 @@ class CosineStrategy(ExtractionStrategy):
|
||||
# Convert filtered clusters to a sorted list of dictionaries
|
||||
cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Assign tags using {self.device}")
|
||||
|
||||
if self.device == "gpu":
|
||||
labels = self.nlp([cluster['content'] for cluster in cluster_list])
|
||||
|
||||
@@ -322,11 +326,20 @@ class CosineStrategy(ExtractionStrategy):
|
||||
cluster['tags'] = label
|
||||
elif self.device == "cpu":
|
||||
# Process the text with the loaded model
|
||||
for cluster in cluster_list:
|
||||
doc = self.nlp(cluster['content'])
|
||||
texts = [cluster['content'] for cluster in cluster_list]
|
||||
# Batch process texts
|
||||
docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
|
||||
|
||||
for doc, cluster in zip(docs, cluster_list):
|
||||
tok_k = self.top_k
|
||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
# for cluster in cluster_list:
|
||||
# doc = self.nlp(cluster['content'])
|
||||
# tok_k = self.top_k
|
||||
# top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||
# cluster['tags'] = [cat for cat, _ in top_categories]
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
|
||||
|
||||
@@ -171,7 +171,7 @@ def main():
|
||||
|
||||
crawler = create_crawler()
|
||||
|
||||
# basic_usage(crawler)
|
||||
basic_usage(crawler)
|
||||
understanding_parameters(crawler)
|
||||
|
||||
crawler.always_by_pass_cache = True
|
||||
|
||||
Reference in New Issue
Block a user