From 4d283ab3865aff060dd63856ac8bdfcb0632b325 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Mon, 8 Jul 2024 16:33:25 +0800
Subject: [PATCH] =?UTF-8?q?##=20[v0.2.74]=20-=202024-07-08=20A=20slew=20of?=
 =?UTF-8?q?=20exciting=20updates=20to=20improve=20the=20crawler's=20stabil?=
 =?UTF-8?q?ity=20and=20robustness!=20=F0=9F=8E=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding.
- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
---
 CHANGELOG.md                                  |  9 ++++
 README.md                                     |  2 +-
 crawl4ai/chunking_strategy.py                 |  1 +
 crawl4ai/crawler_strategy.py                  | 34 +++++++------
 crawl4ai/database.py                          | 11 +++--
 crawl4ai/extraction_strategy.py               | 18 +++++--
 crawl4ai/utils.py                             | 12 ++++-
 crawl4ai/web_crawler.py                       | 10 ++--
 .../examples/llm_extraction_openai_pricing.py |  2 +-
 docs/examples/quickstart.py                   | 33 +++++++++++--
 docs/examples/summarize_page.py               |  2 +-
 docs/md/changelog.md                          |  8 ++++
 docs/md/examples/hooks_auth.md                | 14 ++++--
 docs/md/examples/llm_extraction.md            |  4 +-
 docs/md/examples/summarization.md             |  2 +-
 docs/md/index.md                              |  2 +-
 docs/md/quickstart.md                         | 48 +++++++------------
 setup.py                                      |  7 ++-
 18 files changed, 142 insertions(+), 77 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90722b04..3db7d01b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## [v0.2.74] - 2024-07-08
+A slew of exciting updates to improve the crawler's stability and robustness! 🎉
+
+- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding.
+- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
+- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
+- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
+
+
 ## [v0.2.73] - 2024-07-03
 
 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
diff --git a/README.md b/README.md
index cf4e4760..a2e784b3 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.73 🕷️🤖
+# Crawl4AI v0.2.74 🕷️🤖
 
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py
index 5fe9b5e1..59006072 100644
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -3,6 +3,7 @@ import re
 from collections import Counter
 import string
 from .model_loader import load_nltk_punkt
+from .utils import *
 
 # Define the abstract base class for chunking strategies
 class ChunkingStrategy(ABC):
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 21de883e..85ba4450 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException, WebDriverException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
+from urllib3.exceptions import MaxRetryError
 
 from .config import *
 import logging, time
@@ -18,7 +19,7 @@ from typing import List, Callable
 import requests
 import os
 from pathlib import Path
-from .utils import wrap_text
+from .utils import *
 
 logger = logging.getLogger('selenium.webdriver.remote.remote_connection')
 logger.setLevel(logging.WARNING)
@@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy):
         response = requests.post("http://crawl4ai.uccode.io/crawl", json=data)
         response = response.json()
         html = response["results"][0]["html"]
-        return html
+        return sanitize_input_encode(html)
 
 class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
     def __init__(self, use_cached_html=False, js_code=None, **kwargs):
@@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
-                    return f.read()
+                    return sanitize_input_encode(f.read())
 
         try:
             self.driver = self.execute_hook('before_get_url', self.driver)
@@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             WebDriverWait(self.driver, 10).until(
                 EC.presence_of_all_elements_located((By.TAG_NAME, "body"))
             )
+            
             self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
-            html = self._ensure_page_load() # self.driver.page_source                
+            
+            self.driver = self.execute_hook('after_get_url', self.driver)
+            html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source                                        
             can_not_be_done_headless = False # Look at my creativity for naming variables
-            # TODO: Very ugly way for now but it works
-            if not kwargs.get('bypass_headless', False) and html == "<html><head></head><body></body></html>":
+            
+            # TODO: Very ugly approach, but promise to change it!
+            if kwargs.get('bypass_headless', False) or html == "<html><head></head><body></body></html>":
                 print("[LOG] 🙌 Page could not be loaded in headless mode. Trying non-headless mode...")
                 can_not_be_done_headless = True
                 options = Options()
@@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                 options.add_argument("--window-size=5,5")
                 driver = webdriver.Chrome(service=self.service, options=options)
                 driver.get(url)
-                html = driver.page_source
+                self.driver = self.execute_hook('after_get_url', driver)
+                html = sanitize_input_encode(driver.page_source)
                 driver.quit()
             
-            self.driver = self.execute_hook('after_get_url', self.driver)
-            
             # Execute JS code if provided
             if self.js_code and type(self.js_code) == str:
                 self.driver.execute_script(self.js_code)
@@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                     )
             
             if not can_not_be_done_headless:
-                html = self.driver.page_source
+                html = sanitize_input_encode(self.driver.page_source)
             self.driver = self.execute_hook('before_return_html', self.driver, html)
             
             # Store in cache
             cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
-            with open(cache_file_path, "w") as f:
+            with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
                 
             if self.verbose:
@@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             return html
         except InvalidArgumentException:
             if not hasattr(e, 'msg'):
-                e.msg = str(e)
+                e.msg = sanitize_input_encode(str(e))
             raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
         except WebDriverException as e:
             # If e does nlt have msg attribute create it and set it to str(e)
             if not hasattr(e, 'msg'):
-                e.msg = str(e)
+                e.msg = sanitize_input_encode(str(e))
             raise WebDriverException(f"Failed to crawl {url}: {e.msg}")  
         except Exception as e:
             if not hasattr(e, 'msg'):
-                e.msg = str(e)
+                e.msg = sanitize_input_encode(str(e))
             raise Exception(f"Failed to crawl {url}: {e.msg}")
 
     def take_screenshot(self) -> str:
@@ -299,7 +303,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             return img_base64
 
         except Exception as e:
-            error_message = f"Failed to take screenshot: {str(e)}"
+            error_message = sanitize_input_encode(f"Failed to take screenshot: {str(e)}")
             print(error_message)
 
             # Generate an image with black background
diff --git a/crawl4ai/database.py b/crawl4ai/database.py
index 47f41748..37d94463 100644
--- a/crawl4ai/database.py
+++ b/crawl4ai/database.py
@@ -20,7 +20,7 @@ def init_db():
             extracted_content TEXT,
             success BOOLEAN,
             media TEXT DEFAULT "{}",
-            link TEXT DEFAULT "{}",
+            links TEXT DEFAULT "{}",
             metadata TEXT DEFAULT "{}",
             screenshot TEXT DEFAULT ""
         )
@@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}"
         print(f"Error updating existing records: {e}")
 
 if __name__ == "__main__":
-    init_db()  # Initialize the database if not already initialized
-    alter_db_add_screenshot("metadata")  # Add the new column to the table
-    update_existing_records("metadata")  # Update existing records to set the new column to an empty string
+    # Delete the existing database file
+    if os.path.exists(DB_PATH):
+        os.remove(DB_PATH)
+    init_db()  
+    # alter_db_add_screenshot("COL_NAME")
+    
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index d4415c88..f889b45c 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
             for block in blocks:
                 block['error'] = False
         except Exception as e:
-            print("Error extracting blocks:", str(e))
             parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
             blocks = parsed
             if unparsed:
@@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             # Sequential processing with a delay
             for ix, section in enumerate(merged_sections):
                 extract_func = partial(self.extract, url)
-                extracted_content.extend(extract_func(ix, section))
+                extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
                 time.sleep(0.5)  # 500 ms delay between each processing
         else:
             # Parallel processing using ThreadPoolExecutor
@@ -202,10 +201,21 @@ class LLMExtractionStrategy(ExtractionStrategy):
             
             with ThreadPoolExecutor(max_workers=4) as executor:
                 extract_func = partial(self.extract, url)
-                futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)]
+                futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)]
                 
                 for future in as_completed(futures):
-                    extracted_content.extend(future.result())
+                    try:
+                        extracted_content.extend(future.result())
+                    except Exception as e:
+                        if self.verbose:
+                            print(f"Error in thread execution: {e}")
+                        # Add error information to extracted_content
+                        extracted_content.append({
+                            "index": 0,
+                            "error": True,
+                            "tags": ["error"],
+                            "content": str(e)
+                        })
 
         
         return extracted_content        
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 474ce395..e7b59d65 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -96,6 +96,16 @@ def sanitize_html(html):
 
     return sanitized_html
 
+def sanitize_input_encode(text: str) -> str:
+    """Return ``text`` with any code points UTF-8 cannot encode (e.g. lone surrogates) removed.
+
+    ``None`` is mapped to ``""`` so optional/cached fields do not raise AttributeError.
+    """
+    if text is None:
+        return ""
+    # errors='ignore' never raises UnicodeEncodeError, so no except/ASCII fallback is needed.
+    return text.encode('utf-8', errors='ignore').decode('utf-8')
+
 def escape_json_string(s):
     """
     Escapes characters in a string to be JSON safe.
@@ -664,7 +674,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
         for block in blocks:
             block['error'] = False
     except Exception as e:
-        print("Error extracting blocks:", str(e))
         parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content)
         blocks = parsed
         # Append all unparsed segments as onr error block and content is list of unparsed segments
@@ -710,7 +719,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
             blocks = json.loads(blocks)
 
         except Exception as e:
-            print("Error extracting blocks:", str(e))
             blocks = [{
                 "index": 0,
                 "tags": ["error"],
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 954e9b84..db0d9856 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -155,8 +155,8 @@ class WebCrawler:
                     return None
                 
                 if cached:
-                    html = cached[1]
-                    extracted_content = cached[4]
+                    html = sanitize_input_encode(cached[1])
+                    extracted_content = sanitize_input_encode(cached[4])
                     if screenshot:
                         screenshot_data = cached[9]
                         if not screenshot_data:
@@ -166,7 +166,7 @@ class WebCrawler:
                     if user_agent:
                         self.crawler_strategy.update_user_agent(user_agent)
                     t1 = time.time()
-                    html = self.crawler_strategy.crawl(url, **kwargs)
+                    html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
                     t2 = time.time()
                     if verbose:
                         print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
@@ -213,8 +213,8 @@ class WebCrawler:
             except InvalidCSSSelectorError as e:
                 raise ValueError(str(e))
             
-            cleaned_html = result.get("cleaned_html", "")
-            markdown = result.get("markdown", "")
+            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+            markdown = sanitize_input_encode(result.get("markdown", ""))
             media = result.get("media", [])
             links = result.get("links", [])
             metadata = result.get("metadata", {})
diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py
index c4c6943e..9330ad31 100644
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content)
 
 print(len(model_fees))
 
-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
     f.write(result.extracted_content)
\ No newline at end of file
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index 24486cc1..89c63139 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -249,15 +249,40 @@ def using_crawler_hooks(crawler):
     
     cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
     
-    crawler.set_hook('on_driver_created', on_driver_created)
-    crawler.set_hook('before_get_url', before_get_url)
-    crawler.set_hook('after_get_url', after_get_url)
-    crawler.set_hook('before_return_html', before_return_html)
+    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+    crawler_strategy.set_hook('on_driver_created', on_driver_created)
+    crawler_strategy.set_hook('before_get_url', before_get_url)
+    crawler_strategy.set_hook('after_get_url', after_get_url)
+    crawler_strategy.set_hook('before_return_html', before_return_html)
     
+    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+    crawler.warmup()    
     result = crawler.run(url="https://example.com")
     
     cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
     print_result(result= result)
+    
+def using_crawler_hooks_dleay_example(crawler):
+    """Demo: crawl with an 'after_get_url' hook that sleeps 5 s; the `crawler` argument is rebound to a fresh WebCrawler."""
+    def delay(driver):
+        # Hook body: announce, block for five seconds, announce again (returns None).
+        print("Delaying for 5 seconds...")
+        time.sleep(5)
+        print("Resuming...")
+        
+    def create_crawler():
+        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+        crawler_strategy.set_hook('after_get_url', delay)
+        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+        crawler.warmup()
+        return crawler
+
+    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]")
+    crawler = create_crawler()
+    result = crawler.run(url="https://google.com", bypass_cache=True)    
+    
+    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
+    print_result(result)
 
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
index 31098e8e..85158999 100644
--- a/docs/examples/summarize_page.py
+++ b/docs/examples/summarize_page.py
@@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content)
 
 print(page_summary)
 
-with open(".data/page_summary.json", "w") as f:
+with open(".data/page_summary.json", "w", encoding="utf-8") as f:
     f.write(result.extracted_content)
diff --git a/docs/md/changelog.md b/docs/md/changelog.md
index 3796d309..b0eb7c0d 100644
--- a/docs/md/changelog.md
+++ b/docs/md/changelog.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## [v0.2.74] - 2024-07-08
+A slew of exciting updates to improve the crawler's stability and robustness! 🎉
+
+- 💻 **UTF encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding.
+- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
+- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy.
+- 🚮 **Database cleanup**: Removed existing database file and initialized a new one.
+
 ## [v0.2.73] - 2024-07-03
 
 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project.
diff --git a/docs/md/examples/hooks_auth.md b/docs/md/examples/hooks_auth.md
index 154300df..2b4c2701 100644
--- a/docs/md/examples/hooks_auth.md
+++ b/docs/md/examples/hooks_auth.md
@@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks! In this example, we'll:
 ### Hook Definitions
 
 ```python
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.crawler_strategy import *
+
 def on_driver_created(driver):
     print("[HOOK] on_driver_created")
     # Example customization: maximize the window
@@ -66,12 +69,13 @@ def before_return_html(driver, html):
 
 ```python
 print("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
-crawler = WebCrawler(verbose=True)
+crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+crawler_strategy.set_hook('on_driver_created', on_driver_created)
+crawler_strategy.set_hook('before_get_url', before_get_url)
+crawler_strategy.set_hook('after_get_url', after_get_url)
+crawler_strategy.set_hook('before_return_html', before_return_html)
+crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
 crawler.warmup()
-crawler.set_hook('on_driver_created', on_driver_created)
-crawler.set_hook('before_get_url', before_get_url)
-crawler.set_hook('after_get_url', after_get_url)
-crawler.set_hook('before_return_html', before_return_html)
 
 result = crawler.run(url="https://example.com")
 
diff --git a/docs/md/examples/llm_extraction.md b/docs/md/examples/llm_extraction.md
index 5336a3cf..b7805726 100644
--- a/docs/md/examples/llm_extraction.md
+++ b/docs/md/examples/llm_extraction.md
@@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content)
 
 print(len(model_fees))
 
-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
     f.write(result.extracted_content)
 ```
 
@@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content)
 
 print(len(model_fees))
 
-with open(".data/data.json", "w") as f:
+with open(".data/data.json", "w", encoding="utf-8") as f:
     f.write(result.extracted_content)
 ```
 
diff --git a/docs/md/examples/summarization.md b/docs/md/examples/summarization.md
index 3210fad6..b817f691 100644
--- a/docs/md/examples/summarization.md
+++ b/docs/md/examples/summarization.md
@@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web
     Save the extracted data to a file for further use.
 
     ```python
-    with open(".data/page_summary.json", "w") as f:
+    with open(".data/page_summary.json", "w", encoding="utf-8") as f:
         f.write(result.extracted_content)
     ```
 
diff --git a/docs/md/index.md b/docs/md/index.md
index b08fdd12..b483234f 100644
--- a/docs/md/index.md
+++ b/docs/md/index.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.73
+# Crawl4AI v0.2.74
 
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
 
diff --git a/docs/md/quickstart.md b/docs/md/quickstart.md
index 9f5bdcd8..a0c1a2c7 100644
--- a/docs/md/quickstart.md
+++ b/docs/md/quickstart.md
@@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}")
 Let's see how we can customize the crawler using hooks!
 
 ```python
-def on_driver_created(driver):
-    print("[HOOK] on_driver_created")
-    driver.maximize_window()
-    driver.get('https://example.com/login')
-    driver.find_element(By.NAME, 'username').send_keys('testuser')
-    driver.find_element(By.NAME, 'password').send_keys('password123')
-    driver.find_element(By.NAME, 'login').click()
-    driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
-    return driver        
+import time
 
-def before_get_url(driver):
-    print("[HOOK] before_get_url")
-    driver.execute_cdp_cmd('Network.enable', {})
-    driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
-    return driver
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.crawler_strategy import *
 
-def after_get_url(driver):
-    print("[HOOK] after_get_url")
-    print(driver.current_url)
-    return driver
+def delay(driver):
+    print("Delaying for 5 seconds...")
+    time.sleep(5)
+    print("Resuming...")
+    
+def create_crawler():
+    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
+    crawler_strategy.set_hook('after_get_url', delay)
+    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
+    crawler.warmup()
+    return crawler
 
-def before_return_html(driver, html):
-    print("[HOOK] before_return_html")
-    print(len(html))
-    return driver
-
-crawler.set_hook('on_driver_created', on_driver_created)
-crawler.set_hook('before_get_url', before_get_url)
-crawler.set_hook('after_get_url', after_get_url)
-crawler.set_hook('before_return_html', before_return_html)
-
-result = crawler.run(url="https://example.com")
-print(f"Crawler Hooks result: {result}")
+crawler = create_crawler()
+result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
 ```
 
+Check [Hooks](examples/hooks_auth.md) for more examples.
+
 ## Congratulations! 🎉
 
 You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️
diff --git a/setup.py b/setup.py
index 468dc56e..4a2c346d 100644
--- a/setup.py
+++ b/setup.py
@@ -5,10 +5,15 @@ import subprocess
 from setuptools.command.install import install
 
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
+# If the folder already exists, remove the stale cache folder (portable: Windows has no `rm`)
 crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+import shutil  # local to this hunk; rmtree replaces the POSIX-only `rm -rf` subprocess call
+shutil.rmtree(os.path.join(crawl4ai_folder, "cache"), ignore_errors=True)
 os.makedirs(crawl4ai_folder, exist_ok=True)
 os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True)
 
+
+
 # Read the requirements from requirements.txt
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
@@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran
 
 setup(
     name="Crawl4AI",
-    version="0.2.73",
+    version="0.2.74",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",