From eb6423875f3e50ffe9294c742b7dbef611bd314a Mon Sep 17 00:00:00 2001
From: unclecode
Date: Sat, 18 May 2024 14:13:06 +0800
Subject: [PATCH] chore: Update Selenium options in crawler_strategy.py and add verbose logging in CosineStrategy

---
 crawl4ai/crawler_strategy.py    | 14 +++++++++++++-
 crawl4ai/extraction_strategy.py | 19 ++++++++++++++++---
 docs/examples/quickstart.py     |  2 +-
 3 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 0b189e67..33988dec 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -42,10 +42,22 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.options = Options()
         self.options.headless = True
         self.options.add_argument("--no-sandbox")
+        self.options.add_argument("--headless")
         self.options.add_argument("--disable-dev-shm-usage")
         self.options.add_argument("--disable-gpu")
         self.options.add_argument("--disable-extensions")
-        self.options.add_argument("--headless")
+        self.options.add_argument("--disable-infobars")
+        self.options.add_argument("--disable-logging")
+        self.options.add_argument("--disable-popup-blocking")
+        self.options.add_argument("--disable-translate")
+        self.options.add_argument("--disable-default-apps")
+        self.options.add_argument("--disable-background-networking")
+        self.options.add_argument("--disable-sync")
+        self.options.add_argument("--disable-features=NetworkService,NetworkServiceInProcess")
+        self.options.add_argument("--disable-browser-side-navigation")
+        self.options.add_argument("--dns-prefetch-disable")
+        self.options.add_argument("--disable-web-security")
+        self.options.add_argument("--log-level=3")
         self.use_cached_html = use_cached_html
         self.js_code = js_code
         self.verbose = kwargs.get("verbose", False)
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index e76c1084..a5d4b447 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -141,7 +141,8 @@ class LLMExtractionStrategy(ExtractionStrategy):
         if self.provider.startswith("groq/"):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
-                extracted_content.extend(self.extract(ix, url, section))
+                extract_func = partial(self.extract, url)
+                extracted_content.extend(extract_func(ix, section))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
@@ -315,6 +316,9 @@ class CosineStrategy(ExtractionStrategy):
         # Convert filtered clusters to a sorted list of dictionaries
         cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
 
+        if self.verbose:
+            print(f"[LOG] 🚀 Assign tags using {self.device}")
+
         if self.device == "gpu":
             labels = self.nlp([cluster['content'] for cluster in cluster_list])
 
@@ -322,11 +326,20 @@
                 cluster['tags'] = label
         elif self.device == "cpu":
             # Process the text with the loaded model
-            for cluster in cluster_list:
-                doc = self.nlp(cluster['content'])
+            texts = [cluster['content'] for cluster in cluster_list]
+            # Batch process texts
+            docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
+
+            for doc, cluster in zip(docs, cluster_list):
                 tok_k = self.top_k
                 top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
                 cluster['tags'] = [cat for cat, _ in top_categories]
+
+            # for cluster in cluster_list:
+            #     doc = self.nlp(cluster['content'])
+            #     tok_k = self.top_k
+            #     top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
+            #     cluster['tags'] = [cat for cat, _ in top_categories]
 
         if self.verbose:
             print(f"[LOG] 🚀 Categorization done in {time.time() - t:.2f} seconds")
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index 012ea65a..c0fe9468 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -171,7 +171,7 @@ def main():
     crawler = create_crawler()
 
-    # basic_usage(crawler)
+    basic_usage(crawler)
     understanding_parameters(crawler)
 
     crawler.always_by_pass_cache = True
 