Add time measurements.
This commit is contained in:
@@ -198,12 +198,15 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
|
|||||||
:return: A list of dictionaries representing the clusters.
|
:return: A list of dictionaries representing the clusters.
|
||||||
"""
|
"""
|
||||||
# Assume `html` is a list of text chunks for this strategy
|
# Assume `html` is a list of text chunks for this strategy
|
||||||
|
t = time.time()
|
||||||
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
|
text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
|
||||||
|
|
||||||
# Perform clustering
|
# Perform clustering
|
||||||
labels = self.hierarchical_clustering(text_chunks)
|
labels = self.hierarchical_clustering(text_chunks)
|
||||||
|
print(f"Clustering done in {time.time() - t:.2f} seconds")
|
||||||
|
|
||||||
# Organize texts by their cluster labels, retaining order
|
# Organize texts by their cluster labels, retaining order
|
||||||
|
t = time.time()
|
||||||
clusters = {}
|
clusters = {}
|
||||||
for index, label in enumerate(labels):
|
for index, label in enumerate(labels):
|
||||||
clusters.setdefault(label, []).append(text_chunks[index])
|
clusters.setdefault(label, []).append(text_chunks[index])
|
||||||
@@ -221,6 +224,8 @@ class HierarchicalClusteringStrategy(ExtractionStrategy):
|
|||||||
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
|
||||||
cluster['tags'] = [cat for cat, _ in top_categories]
|
cluster['tags'] = [cat for cat, _ in top_categories]
|
||||||
|
|
||||||
|
print(f"Processing done in {time.time() - t:.2f} seconds")
|
||||||
|
|
||||||
return cluster_list
|
return cluster_list
|
||||||
|
|
||||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
|
||||||
|
|||||||
@@ -76,13 +76,16 @@ class WebCrawler:
|
|||||||
parsed_json = []
|
parsed_json = []
|
||||||
if extract_blocks_flag:
|
if extract_blocks_flag:
|
||||||
print(f"[LOG] 🚀 Extracting semantic blocks for {url_model.url}")
|
print(f"[LOG] 🚀 Extracting semantic blocks for {url_model.url}")
|
||||||
|
t = time.time()
|
||||||
# Split markdown into sections
|
# Split markdown into sections
|
||||||
sections = chunking_strategy.chunk(markdown)
|
sections = chunking_strategy.chunk(markdown)
|
||||||
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
|
# sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)
|
||||||
|
|
||||||
parsed_json = extraction_strategy.run(str(url_model.url), sections, provider, api_token)
|
parsed_json = extraction_strategy.run(str(url_model.url), sections, provider, api_token)
|
||||||
parsed_json = json.dumps(parsed_json)
|
parsed_json = json.dumps(parsed_json)
|
||||||
print(f"[LOG] 🚀 Extraction done for {url_model.url}")
|
|
||||||
|
|
||||||
|
print(f"[LOG] 🚀 Extraction done for {url_model.url}, time taken: {time.time() - t} seconds.")
|
||||||
else:
|
else:
|
||||||
parsed_json = "{}"
|
parsed_json = "{}"
|
||||||
print(f"[LOG] 🚀 Skipping extraction for {url_model.url}")
|
print(f"[LOG] 🚀 Skipping extraction for {url_model.url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user