chore: Update version to 0.2.7 and improve extraction function speed

This commit is contained in:
unclecode
2024-06-24 22:39:56 +08:00
parent 68b3dff74a
commit 78cfad8b2f
6 changed files with 164 additions and 16 deletions

View File

@@ -46,7 +46,8 @@ class WebCrawler:
word_count_threshold=5,
extraction_strategy= NoExtractionStrategy(),
bypass_cache=False,
verbose = False
verbose = False,
warmup=True
)
self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -145,6 +146,9 @@ class WebCrawler:
if not bypass_cache and not self.always_by_pass_cache:
cached = get_cached_url(url)
if kwargs.get("warmup", True) and not self.ready:
return None
if cached:
html = cached[1]
extracted_content = cached[4]
@@ -180,8 +184,13 @@ class WebCrawler:
t = time.time()
# Extract content from HTML
try:
result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
metadata = extract_metadata(html)
# t1 = time.time()
# result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
# print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
t1 = time.time()
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")
except InvalidCSSSelectorError as e:
@@ -191,6 +200,7 @@ class WebCrawler:
markdown = result.get("markdown", "")
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")