From 144cfa0eda9587bc283b1fa4fda688d5cdeef49e Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 13:00:17 +0800
Subject: [PATCH 1/6] Switch to ChromeDriverManager due to some issues with
 downloading the chrome driver

---
 crawl4ai/crawler_strategy.py | 14 +++++++++++---
 main.py                      |  4 +++-
 requirements.txt             |  1 +
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 9e85d60d..06e386c3 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -6,6 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import InvalidArgumentException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+
 import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         }
 
         # chromedriver_autoinstaller.install()
-        import chromedriver_autoinstaller
-        crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
-        chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
+        # import chromedriver_autoinstaller
+        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
+        # chromedriver_path = chromedriver_autoinstaller.install()
+        # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
         # self.service = Service(chromedriver_autoinstaller.install())
+        
+        
+        chromedriver_path = ChromeDriverManager().install()
         self.service = Service(chromedriver_path)
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
diff --git a/main.py b/main.py
index 45947c5a..a20c13ad 100644
--- a/main.py
+++ b/main.py
@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
 @lru_cache()
 def get_crawler():
     # Initialize and return a WebCrawler instance
-    return WebCrawler(verbose = True)
+    crawler = WebCrawler(verbose = True)
+    crawler.warmup()
+    return crawler
 
 class CrawlRequest(BaseModel):
     urls: List[str]
diff --git a/requirements.txt b/requirements.txt
index ee5be60a..ced41173 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ torch==2.3.1
 onnxruntime==1.18.0
 tokenizers==0.19.1
 pillow==10.3.0
+webdriver-manager==4.0.1
\ No newline at end of file

From 96d1eb0d0d0c66f9ecaf4f5332246c75c5b26fea Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 13:03:03 +0800
Subject: [PATCH 2/6] Some updates in utils.py

---
 crawl4ai/utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 3673fcc9..7699dc7b 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -770,4 +770,6 @@ def wrap_text(draw, text, font, max_width):
 
 def format_html(html_string):
     soup = BeautifulSoup(html_string, 'html.parser')
-    return soup.prettify()
\ No newline at end of file
+    return soup.prettify()
+
+

From 7ba2142363281341500d744e5b74ca62750d5006 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 14:43:09 +0800
Subject: [PATCH 3/6] chore: Refactor get_content_of_website_optimized function
 in utils.py

---
 crawl4ai/utils.py       | 43 +++++++++++++++++++++++------------------
 crawl4ai/web_crawler.py |  6 ++++--
 2 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 7699dc7b..c468c49a 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}
 
-    def process_element(element: element.PageElement) -> None:
+    def process_element(element: element.PageElement) -> bool:
         if isinstance(element, NavigableString):
             if isinstance(element, Comment):
                 element.extract()
-            return
-
-        # if not isinstance(element, element.Tag):
-        #     return
+            return False
 
         if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
             element.decompose()
-            return
+            return False
+
+        keep_element = False
 
         if element.name == 'a' and element.get('href'):
             href = element['href']
@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 links['external'].append(link_data)
             else:
                 links['internal'].append(link_data)
+            keep_element = True
 
         elif element.name == 'img':
             media['images'].append({
@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': 'image'
             })
-            alt_text = element.get('alt')
-            if alt_text:
-                element.replace_with(soup.new_string(alt_text))
-            else:
-                element.decompose()
-            return
+            return True  # Always keep image elements
 
         elif element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': element.name
             })
+            return True  # Always keep video and audio elements
 
         if element.name != 'pre':
             if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
             elif element.name != 'img':
                 element.attrs = {}
 
-        word_count = len(element.get_text(strip=True).split())
-        if word_count < word_count_threshold:
-            element.decompose()
-            return
-
+        # Process children
         for child in list(element.children):
-            process_element(child)
+            if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                if len(child.strip()) > 0:
+                    keep_element = True
+            else:
+                if process_element(child):
+                    keep_element = True
+            
 
-        if not element.contents and not element.get_text(strip=True):
+        # Check word count
+        if not keep_element:
+            word_count = len(element.get_text(strip=True).split())
+            keep_element = word_count >= word_count_threshold
+
+        if not keep_element:
             element.decompose()
 
+        return keep_element
+
     process_element(body)
 
     def flatten_nested_elements(node):
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index a33663e8..8aca6688 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -136,8 +136,10 @@ class WebCrawler:
             if not isinstance(chunking_strategy, ChunkingStrategy):
                 raise ValueError("Unsupported chunking strategy")
             
-            if word_count_threshold < MIN_WORD_THRESHOLD:
-                word_count_threshold = MIN_WORD_THRESHOLD
+            # if word_count_threshold < MIN_WORD_THRESHOLD:
+            #     word_count_threshold = MIN_WORD_THRESHOLD
+                
+            word_count_threshold = max(word_count_threshold, 0)
 
             # Check cache first
             cached = None

From 4756d0a532b5233fb391e66513a66f653cc3dc02 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 15:04:33 +0800
Subject: [PATCH 4/6] Refactor crawler_strategy.py to handle exceptions and
 improve error messages

---
 crawl4ai/crawler_strategy.py | 15 ++++--
 crawl4ai/web_crawler.py      | 88 ++++++++++++++++++++----------------
 2 files changed, 60 insertions(+), 43 deletions(-)

diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 06e386c3..4f6190c9 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import InvalidArgumentException
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
 from selenium.webdriver.chrome.service import Service as ChromeService
 from webdriver_manager.chrome import ChromeDriverManager
 
@@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             
             return html
         except InvalidArgumentException:
-            raise InvalidArgumentException(f"Invalid URL {url}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
+        except WebDriverException as e:
+            # If e does not have a msg attribute, create it and set it to str(e)
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")  
         except Exception as e:
-            raise Exception(f"Failed to crawl {url}: {str(e)}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise Exception(f"Failed to crawl {url}: {e.msg}")
 
     def take_screenshot(self) -> str:
         try:
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index 8aca6688..ef85066e 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -129,49 +129,57 @@ class WebCrawler:
             verbose=True,
             **kwargs,
         ) -> CrawlResult:
-            extraction_strategy = extraction_strategy or NoExtractionStrategy()
-            extraction_strategy.verbose = verbose
-            if not isinstance(extraction_strategy, ExtractionStrategy):
-                raise ValueError("Unsupported extraction strategy")
-            if not isinstance(chunking_strategy, ChunkingStrategy):
-                raise ValueError("Unsupported chunking strategy")
-            
-            # if word_count_threshold < MIN_WORD_THRESHOLD:
-            #     word_count_threshold = MIN_WORD_THRESHOLD
+            try:
+                extraction_strategy = extraction_strategy or NoExtractionStrategy()
+                extraction_strategy.verbose = verbose
+                if not isinstance(extraction_strategy, ExtractionStrategy):
+                    raise ValueError("Unsupported extraction strategy")
+                if not isinstance(chunking_strategy, ChunkingStrategy):
+                    raise ValueError("Unsupported chunking strategy")
                 
-            word_count_threshold = max(word_count_threshold, 0)
+                # if word_count_threshold < MIN_WORD_THRESHOLD:
+                #     word_count_threshold = MIN_WORD_THRESHOLD
+                    
+                word_count_threshold = max(word_count_threshold, 0)
 
-            # Check cache first
-            cached = None
-            screenshot_data = None
-            extracted_content = None
-            if not bypass_cache and not self.always_by_pass_cache:
-                cached = get_cached_url(url)
-            
-            if kwargs.get("warmup", True) and not self.ready:
-                return None
-            
-            if cached:
-                html = cached[1]
-                extracted_content = cached[4]
-                if screenshot:
-                    screenshot_data = cached[9]
-                    if not screenshot_data:
-                        cached = None
-            
-            if not cached or not html:
-                if user_agent:
-                    self.crawler_strategy.update_user_agent(user_agent)
-                t1 = time.time()
-                html = self.crawler_strategy.crawl(url)
-                t2 = time.time()
-                if verbose:
-                    print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
-                if screenshot:
-                    screenshot_data = self.crawler_strategy.take_screenshot()
+                # Check cache first
+                cached = None
+                screenshot_data = None
+                extracted_content = None
+                if not bypass_cache and not self.always_by_pass_cache:
+                    cached = get_cached_url(url)
+                
+                if kwargs.get("warmup", True) and not self.ready:
+                    return None
+                
+                if cached:
+                    html = cached[1]
+                    extracted_content = cached[4]
+                    if screenshot:
+                        screenshot_data = cached[9]
+                        if not screenshot_data:
+                            cached = None
+                
+                if not cached or not html:
+                    if user_agent:
+                        self.crawler_strategy.update_user_agent(user_agent)
+                    t1 = time.time()
+                    html = self.crawler_strategy.crawl(url)
+                    t2 = time.time()
+                    if verbose:
+                        print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
+                    if screenshot:
+                        screenshot_data = self.crawler_strategy.take_screenshot()
 
-            
-            return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+                
+                crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+                crawl_result.success = bool(html)
+                return crawl_result
+            except Exception as e:
+                if not hasattr(e, "msg"):
+                    e.msg = str(e)
+                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")    
+                return CrawlResult(url=url, html="", success=False, error_message=e.msg)
 
     def process_html(
             self,

From 3255c7a3facc3bcb94b5a85bb9b4ba92613a7bd3 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 15:20:34 +0800
Subject: [PATCH 5/6] Update CHANGELOG.md with recent commits

---
 CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d235d2cb..8f675785 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,3 +20,37 @@
 ## [0.2.4] - 2024-06-17
 ### Fixed
 - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
+## Update 2024-06-26
+
+### Commits in the last 3 hours:
+4756d0a - Refactor crawler_strategy.py to handle exceptions and improve error messages
+7ba2142 - chore: Refactor get_content_of_website_optimized function in utils.py
+96d1eb0 - Some updated ins utils.py
+144cfa0 - Switch to ChromeDriverManager due some issues with download the chrome driver
+null
+null
+null
+Here is a rewritten version of the changelog update in a nicer and more condensed way:
+
+**Update 2024-06-26**
+
+We've made some exciting improvements to our codebase! Here are the highlights:
+
+* Refactored our crawler strategy to handle exceptions and provide clearer error messages
+* Optimized our content retrieval function for improved performance
+* Updated internal utilities for better functionality
+* Switched to ChromeDriverManager to resolve issues with downloading Chrome drivers
+
+These updates aim to improve stability, reliability, and overall performance. Thank you for using our tool!
+Here is a rewritten version of the changelog update:
+
+**June 26, 2024**
+
+We've made some improvements to our code to make it more reliable and user-friendly! 
+
+In the last 3 hours, we've committed 4 changes:
+
+* Improved error handling and messaging in [crawler_strategy.py](https://example.com/crawler_strategy.py)
+* Refactored [get_content_of_website_optimized](https://example.com/utils.py) in [utils.py](https://example.com/utils.py)
+* Made updates to [utils.py](https://example.com/utils.py)
+* Switched to [ChromeDriverManager](https://example.com/ChromeDriverManager) to resolve issues with downloading the Chrome driver.

From d11a83c2322e629f625918ade9db11760dd35923 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 26 Jun 2024 15:34:15 +0800
Subject: [PATCH 6/6] =?UTF-8?q?##=20[0.2.71]=202024-06-26=20=E2=80=A2=20Re?=
 =?UTF-8?q?factored=20`crawler=5Fstrategy.py`=20to=20handle=20exceptions?=
 =?UTF-8?q?=20and=20improve=20error=20messages=20=E2=80=A2=20Improved=20`g?=
 =?UTF-8?q?et=5Fcontent=5Fof=5Fwebsite=5Foptimized`=20function=20in=20`uti?=
 =?UTF-8?q?ls.py`=20for=20better=20performance=20=E2=80=A2=20Updated=20`ut?=
 =?UTF-8?q?ils.py`=20with=20latest=20changes=20=E2=80=A2=20Migrated=20to?=
 =?UTF-8?q?=20`ChromeDriverManager`=20for=20resolving=20Chrome=20driver=20?=
 =?UTF-8?q?download=20issues?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore           |  4 +++-
 CHANGELOG.md         | 45 +++++++++++---------------------------------
 README.md            |  2 +-
 docs/md/changelog.md |  8 +++++++-
 docs/md/index.md     |  2 +-
 setup.py             |  2 +-
 6 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/.gitignore b/.gitignore
index a055a455..d91cb941 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,4 +185,6 @@ local/
 
 a.txt
 .lambda_function.py
-ec2*
\ No newline at end of file
+ec2*
+
+update_changelog.sh
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f675785..57bb8614 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## [0.2.71] - 2024-06-26
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+- Updated `utils.py` with latest changes
+- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
+
+## [0.2.7] - 2024-06-25
+### Fixed
+- Speed up twice the extraction function.
+
+
 ## [0.2.6] - 2024-06-22
 ### Fixed
 - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
@@ -20,37 +31,3 @@
 ## [0.2.4] - 2024-06-17
 ### Fixed
 - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs
-## Update 2024-06-26
-
-### Commits in the last 3 hours:
-4756d0a - Refactor crawler_strategy.py to handle exceptions and improve error messages
-7ba2142 - chore: Refactor get_content_of_website_optimized function in utils.py
-96d1eb0 - Some updated ins utils.py
-144cfa0 - Switch to ChromeDriverManager due some issues with download the chrome driver
-null
-null
-null
-Here is a rewritten version of the changelog update in a nicer and more condensed way:
-
-**Update 2024-06-26**
-
-We've made some exciting improvements to our codebase! Here are the highlights:
-
-* Refactored our crawler strategy to handle exceptions and provide clearer error messages
-* Optimized our content retrieval function for improved performance
-* Updated internal utilities for better functionality
-* Switched to ChromeDriverManager to resolve issues with downloading Chrome drivers
-
-These updates aim to improve stability, reliability, and overall performance. Thank you for using our tool!
-Here is a rewritten version of the changelog update:
-
-**June 26, 2024**
-
-We've made some improvements to our code to make it more reliable and user-friendly! 
-
-In the last 3 hours, we've committed 4 changes:
-
-* Improved error handling and messaging in [crawler_strategy.py](https://example.com/crawler_strategy.py)
-* Refactored [get_content_of_website_optimized](https://example.com/utils.py) in [utils.py](https://example.com/utils.py)
-* Made updates to [utils.py](https://example.com/utils.py)
-* Switched to [ChromeDriverManager](https://example.com/ChromeDriverManager) to resolve issues with downloading the Chrome driver.
diff --git a/README.md b/README.md
index 191614f4..f910c829 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7 🕷️🤖
+# Crawl4AI v0.2.71 🕷️🤖
 
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
diff --git a/docs/md/changelog.md b/docs/md/changelog.md
index 7ab9e0cd..6f9ac706 100644
--- a/docs/md/changelog.md
+++ b/docs/md/changelog.md
@@ -1,6 +1,12 @@
 # Changelog
 
-## [0.2.7] - 2024-06-27
+## [0.2.71] - 2024-06-26
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved `get_content_of_website_optimized` function in `utils.py` for better performance
+- Updated `utils.py` with latest changes
+- Migrated to `ChromeDriverManager` for resolving Chrome driver download issues
+
+## [0.2.7] - 2024-06-25
 ### Fixed
 - Speed up twice the extraction function.
 
diff --git a/docs/md/index.md b/docs/md/index.md
index c3610229..f9c25a42 100644
--- a/docs/md/index.md
+++ b/docs/md/index.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7
+# Crawl4AI v0.2.71
 
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
 
diff --git a/setup.py b/setup.py
index be9e5ca0..a11abc2e 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.7",
+    version="0.2.71",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",