diff --git a/README.md b/README.md
index e45abd98..191614f4 100644
--- a/README.md
+++ b/README.md
@@ -72,14 +72,12 @@ print(f"Time taken: {end - start}")
 Let's take a look the calculated time for the above code snippet:
 
 ```bash
-[LOG] 🚀 Crawling done, success: True, time taken: 0.05835 seconds
-[LOG] 🔥 Extracting semantic blocks, Strategy: NoExtractionStrategy
-[LOG] 🚀 Extraction, time taken: 0.0588 seconds.
-Time taken: 4.29332
+[LOG] 🚀 Crawling done, success: True, time taken: 1.3623387813568115 seconds
+[LOG] 🚀 Content extracted, success: True, time taken: 0.05715131759643555 seconds
+[LOG] 🚀 Extraction, time taken: 0.05750393867492676 seconds.
+Time taken: 1.439958095550537
 ```
-
-It took around 4.29 seconds to crawl the page, extract the content, and return the result.
-
+Fetching the content from the page took 1.3623 seconds, and extracting the content took 0.0575 seconds. 🚀
 
 ### Extract Structured Data from Web Pages 📊
 
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 57caed50..a33663e8 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -47,7 +47,7 @@ class WebCrawler:
             extraction_strategy= NoExtractionStrategy(),
             bypass_cache=False,
             verbose = False,
-            warmup=True
+            # warmup=True
         )
         self.ready = True
         print("[LOG] 🌞 WebCrawler is ready to crawl")
@@ -160,7 +160,11 @@ class WebCrawler:
         if not cached or not html:
             if user_agent:
                 self.crawler_strategy.update_user_agent(user_agent)
+            t1 = time.time()
             html = self.crawler_strategy.crawl(url)
+            t2 = time.time()
+            if verbose:
+                print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
 
         if screenshot:
             screenshot_data = self.crawler_strategy.take_screenshot()
@@ -190,7 +194,7 @@ class WebCrawler:
         t1 = time.time()
         result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False))
         if verbose:
-            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t1} seconds")
+            print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1} seconds")
 
         if result is None:
             raise ValueError(f"Failed to extract content from the website: {url}")