diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 648631a0..d1f68a37 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -198,12 +198,15 @@ class HierarchicalClusteringStrategy(ExtractionStrategy): :return: A list of dictionaries representing the clusters. """ # Assume `html` is a list of text chunks for this strategy + t = time.time() text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed # Perform clustering labels = self.hierarchical_clustering(text_chunks) + print(f"Clustering done in {time.time() - t:.2f} seconds") # Organize texts by their cluster labels, retaining order + t = time.time() clusters = {} for index, label in enumerate(labels): clusters.setdefault(label, []).append(text_chunks[index]) @@ -221,6 +224,8 @@ class HierarchicalClusteringStrategy(ExtractionStrategy): top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] cluster['tags'] = [cat for cat, _ in top_categories] + print(f"Processing done in {time.time() - t:.2f} seconds") + return cluster_list def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index a5dfd4ee..19ad8af5 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -76,13 +76,16 @@ class WebCrawler: parsed_json = [] if extract_blocks_flag: print(f"[LOG] 🚀 Extracting semantic blocks for {url_model.url}") + t = time.time() # Split markdown into sections sections = chunking_strategy.chunk(markdown) # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD) parsed_json = extraction_strategy.run(str(url_model.url), sections, provider, api_token) parsed_json = json.dumps(parsed_json) - print(f"[LOG] 🚀 Extraction done for {url_model.url}") + + + print(f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds.") else: parsed_json = "{}" print(f"[LOG] 🚀 Skipping extraction for {url_model.url}")