Update README for speed example

This commit is contained in:
unclecode
2024-06-24 23:06:12 +08:00
parent 1fffeeedd2
commit a0dff192ae
2 changed files with 11 additions and 9 deletions

View File

@@ -72,14 +72,12 @@ print(f"Time taken: {end - start}")
Let's take a look at the calculated time for the above code snippet:
```bash
[LOG] 🚀 Crawling done, success: True, time taken: 0.05835 seconds
[LOG] 🔥 Extracting semantic blocks, Strategy: NoExtractionStrategy
[LOG] 🚀 Extraction, time taken: 0.0588 seconds.
Time taken: 4.29332
[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds
[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds
[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds.
Time taken: 1.439958095550537
```
It took around 4.29 seconds to crawl the page, extract the content, and return the result.
Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀
### Extract Structured Data from Web Pages 📊

View File

@@ -47,7 +47,7 @@ class WebCrawler:
extraction_strategy= NoExtractionStrategy(),
bypass_cache=False,
verbose = False,
warmup=True
# warmup=True
)
self.ready = True
print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -160,7 +160,11 @@ class WebCrawler:
if not cached or not html:
if user_agent:
self.crawler_strategy.update_user_agent(user_agent)
t1 = time.time()
html = self.crawler_strategy.crawl(url)
t2 = time.time()
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
if screenshot:
screenshot_data = self.crawler_strategy.take_screenshot()
@@ -190,7 +194,7 @@ class WebCrawler:
t1 = time.time()
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
if verbose:
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
if result is None:
raise ValueError(f"Failed to extract content from the website: {url}")