Update README for speed example
This commit is contained in:
12
README.md
12
README.md
@@ -72,14 +72,12 @@ print(f"Time taken: {end - start}")
|
|||||||
Let's take a look at the calculated time for the above code snippet:
|
Let's take a look at the calculated time for the above code snippet:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
[LOG] 🚀 Crawling done, success: True, time taken: 0.05835 seconds
|
[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds
|
||||||
[LOG] 🔥 Extracting semantic blocks, Strategy: NoExtractionStrategy
|
[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds
|
||||||
[LOG] 🚀 Extraction, time taken: 0.0588 seconds.
|
[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds.
|
||||||
Time taken: 4.29332
|
Time taken: 1.439958095550537
|
||||||
```
|
```
|
||||||
|
Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀
|
||||||
It took around 4.29 seconds to crawl the page, extract the content, and return the result.
|
|
||||||
|
|
||||||
|
|
||||||
### Extract Structured Data from Web Pages 📊
|
### Extract Structured Data from Web Pages 📊
|
||||||
|
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ class WebCrawler:
|
|||||||
extraction_strategy= NoExtractionStrategy(),
|
extraction_strategy= NoExtractionStrategy(),
|
||||||
bypass_cache=False,
|
bypass_cache=False,
|
||||||
verbose = False,
|
verbose = False,
|
||||||
warmup=True
|
# warmup=True
|
||||||
)
|
)
|
||||||
self.ready = True
|
self.ready = True
|
||||||
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
print("[LOG] 🌞 WebCrawler is ready to crawl")
|
||||||
@@ -160,7 +160,11 @@ class WebCrawler:
|
|||||||
if not cached or not html:
|
if not cached or not html:
|
||||||
if user_agent:
|
if user_agent:
|
||||||
self.crawler_strategy.update_user_agent(user_agent)
|
self.crawler_strategy.update_user_agent(user_agent)
|
||||||
|
t1 = time.time()
|
||||||
html = self.crawler_strategy.crawl(url)
|
html = self.crawler_strategy.crawl(url)
|
||||||
|
t2 = time.time()
|
||||||
|
if verbose:
|
||||||
|
print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
|
||||||
if screenshot:
|
if screenshot:
|
||||||
screenshot_data = self.crawler_strategy.take_screenshot()
|
screenshot_data = self.crawler_strategy.take_screenshot()
|
||||||
|
|
||||||
@@ -190,7 +194,7 @@ class WebCrawler:
|
|||||||
t1 = time.time()
|
t1 = time.time()
|
||||||
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
|
print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
raise ValueError(f"Failed to extract content from the website: {url}")
|
raise ValueError(f"Failed to extract content from the website: {url}")
|
||||||
|
|||||||
Reference in New Issue
Block a user