From 144cfa0eda9587bc283b1fa4fda688d5cdeef49e Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 13:00:17 +0800 Subject: [PATCH 1/6] Switch to ChromeDriverManager due some issues with download the chrome driver --- crawl4ai/crawler_strategy.py | 14 +++++++++++--- main.py | 4 +++- requirements.txt | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 9e85d60d..06e386c3 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -6,6 +6,9 @@ from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException +from selenium.webdriver.chrome.service import Service as ChromeService +from webdriver_manager.chrome import ChromeDriverManager + import logging import base64 from PIL import Image, ImageDraw, ImageFont @@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): } # chromedriver_autoinstaller.install() - import chromedriver_autoinstaller - crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") - chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False) + # import chromedriver_autoinstaller + # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) + # chromedriver_path = chromedriver_autoinstaller.install() + # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() # self.service = Service(chromedriver_autoinstaller.install()) + + + chromedriver_path = ChromeDriverManager().install() self.service = Service(chromedriver_path) self.service.log_path = "NUL" self.driver = webdriver.Chrome(service=self.service, options=self.options) diff --git a/main.py b/main.py index 45947c5a..a20c13ad 100644 --- 
a/main.py +++ b/main.py @@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages") @lru_cache() def get_crawler(): # Initialize and return a WebCrawler instance - return WebCrawler(verbose = True) + crawler = WebCrawler(verbose = True) + crawler.warmup() + return crawler class CrawlRequest(BaseModel): urls: List[str] diff --git a/requirements.txt b/requirements.txt index ee5be60a..ced41173 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,3 +20,4 @@ torch==2.3.1 onnxruntime==1.18.0 tokenizers==0.19.1 pillow==10.3.0 +webdriver-manager==4.0.1 \ No newline at end of file From 96d1eb0d0d0c66f9ecaf4f5332246c75c5b26fea Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 13:03:03 +0800 Subject: [PATCH 2/6] Some updated ins utils.py --- crawl4ai/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 3673fcc9..7699dc7b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -770,4 +770,6 @@ def wrap_text(draw, text, font, max_width): def format_html(html_string): soup = BeautifulSoup(html_string, 'html.parser') - return soup.prettify() \ No newline at end of file + return soup.prettify() + + From 7ba2142363281341500d744e5b74ca62750d5006 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 14:43:09 +0800 Subject: [PATCH 3/6] chore: Refactor get_content_of_website_optimized function in utils.py --- crawl4ai/utils.py | 43 +++++++++++++++++++++++------------------ crawl4ai/web_crawler.py | 6 ++++-- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 7699dc7b..c468c49a 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: links = {'internal': [], 'external': []} media = {'images': [], 'videos': [], 'audios': []} - def process_element(element: element.PageElement) -> None: + def 
process_element(element: element.PageElement) -> bool: if isinstance(element, NavigableString): if isinstance(element, Comment): element.extract() - return - - # if not isinstance(element, element.Tag): - # return + return False if element.name in ['script', 'style', 'link', 'meta', 'noscript']: element.decompose() - return + return False + + keep_element = False if element.name == 'a' and element.get('href'): href = element['href'] @@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: links['external'].append(link_data) else: links['internal'].append(link_data) + keep_element = True elif element.name == 'img': media['images'].append({ @@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: 'alt': element.get('alt'), 'type': 'image' }) - alt_text = element.get('alt') - if alt_text: - element.replace_with(soup.new_string(alt_text)) - else: - element.decompose() - return + return True # Always keep image elements elif element.name in ['video', 'audio']: media[f"{element.name}s"].append({ @@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: 'alt': element.get('alt'), 'type': element.name }) + return True # Always keep video and audio elements if element.name != 'pre': if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']: @@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: elif element.name != 'img': element.attrs = {} - word_count = len(element.get_text(strip=True).split()) - if word_count < word_count_threshold: - element.decompose() - return - + # Process children for child in list(element.children): - process_element(child) + if isinstance(child, NavigableString) and not isinstance(child, Comment): + if len(child.strip()) > 0: + keep_element = True + else: + if 
process_element(child): + keep_element = True + - if not element.contents and not element.get_text(strip=True): + # Check word count + if not keep_element: + word_count = len(element.get_text(strip=True).split()) + keep_element = word_count >= word_count_threshold + + if not keep_element: element.decompose() + return keep_element + process_element(body) def flatten_nested_elements(node): diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index a33663e8..8aca6688 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -136,8 +136,10 @@ class WebCrawler: if not isinstance(chunking_strategy, ChunkingStrategy): raise ValueError("Unsupported chunking strategy") - if word_count_threshold < MIN_WORD_THRESHOLD: - word_count_threshold = MIN_WORD_THRESHOLD + # if word_count_threshold < MIN_WORD_THRESHOLD: + # word_count_threshold = MIN_WORD_THRESHOLD + + word_count_threshold = max(word_count_threshold, 0) # Check cache first cached = None From 4756d0a532b5233fb391e66513a66f653cc3dc02 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 15:04:33 +0800 Subject: [PATCH 4/6] Refactor crawler_strategy.py to handle exceptions and improve error messages --- crawl4ai/crawler_strategy.py | 15 ++++-- crawl4ai/web_crawler.py | 88 ++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 43 deletions(-) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 06e386c3..4f6190c9 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -5,7 +5,7 @@ from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options -from selenium.common.exceptions import InvalidArgumentException +from selenium.common.exceptions import InvalidArgumentException, WebDriverException from selenium.webdriver.chrome.service import Service as ChromeService from 
webdriver_manager.chrome import ChromeDriverManager @@ -220,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): return html except InvalidArgumentException: - raise InvalidArgumentException(f"Invalid URL {url}") + if not hasattr(e, 'msg'): + e.msg = str(e) + raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") + except WebDriverException as e: + # If e does not have a msg attribute, create it and set it to str(e) + if not hasattr(e, 'msg'): + e.msg = str(e) + raise WebDriverException(f"Failed to crawl {url}: {e.msg}") except Exception as e: - raise Exception(f"Failed to crawl {url}: {str(e)}") + if not hasattr(e, 'msg'): + e.msg = str(e) + raise Exception(f"Failed to crawl {url}: {e.msg}") def take_screenshot(self) -> str: try: diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 8aca6688..ef85066e 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -129,49 +129,57 @@ class WebCrawler: verbose=True, **kwargs, ) -> CrawlResult: - extraction_strategy = extraction_strategy or NoExtractionStrategy() - extraction_strategy.verbose = verbose - if not isinstance(extraction_strategy, ExtractionStrategy): - raise ValueError("Unsupported extraction strategy") - if not isinstance(chunking_strategy, ChunkingStrategy): - raise ValueError("Unsupported chunking strategy") - - # if word_count_threshold < MIN_WORD_THRESHOLD: - # word_count_threshold = MIN_WORD_THRESHOLD + try: + extraction_strategy = extraction_strategy or NoExtractionStrategy() + extraction_strategy.verbose = verbose + if not isinstance(extraction_strategy, ExtractionStrategy): + raise ValueError("Unsupported extraction strategy") + if not isinstance(chunking_strategy, ChunkingStrategy): + raise ValueError("Unsupported chunking strategy") - word_count_threshold = max(word_count_threshold, 0) + # if word_count_threshold < MIN_WORD_THRESHOLD: + # word_count_threshold = MIN_WORD_THRESHOLD + + word_count_threshold = max(word_count_threshold, 0) - # Check cache 
first - cached = None - screenshot_data = None - extracted_content = None - if not bypass_cache and not self.always_by_pass_cache: - cached = get_cached_url(url) - - if kwargs.get("warmup", True) and not self.ready: - return None - - if cached: - html = cached[1] - extracted_content = cached[4] - if screenshot: - screenshot_data = cached[9] - if not screenshot_data: - cached = None - - if not cached or not html: - if user_agent: - self.crawler_strategy.update_user_agent(user_agent) - t1 = time.time() - html = self.crawler_strategy.crawl(url) - t2 = time.time() - if verbose: - print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") - if screenshot: - screenshot_data = self.crawler_strategy.take_screenshot() + # Check cache first + cached = None + screenshot_data = None + extracted_content = None + if not bypass_cache and not self.always_by_pass_cache: + cached = get_cached_url(url) + + if kwargs.get("warmup", True) and not self.ready: + return None + + if cached: + html = cached[1] + extracted_content = cached[4] + if screenshot: + screenshot_data = cached[9] + if not screenshot_data: + cached = None + + if not cached or not html: + if user_agent: + self.crawler_strategy.update_user_agent(user_agent) + t1 = time.time() + html = self.crawler_strategy.crawl(url) + t2 = time.time() + if verbose: + print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") + if screenshot: + screenshot_data = self.crawler_strategy.take_screenshot() - - return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) + + crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) + crawl_result.success = bool(html) + return crawl_result + except Exception as e: + if not 
hasattr(e, "msg"): + e.msg = str(e) + print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}") + return CrawlResult(url=url, html="", success=False, error_message=e.msg) def process_html( self, From 3255c7a3facc3bcb94b5a85bb9b4ba92613a7bd3 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 15:20:34 +0800 Subject: [PATCH 5/6] Update CHANGELOG.md with recent commits --- CHANGELOG.md | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d235d2cb..8f675785 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,3 +20,37 @@ ## [0.2.4] - 2024-06-17 ### Fixed - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs +## Update 2024-06-26 + +### Commits in the last 3 hours: +4756d0a - Refactor crawler_strategy.py to handle exceptions and improve error messages +7ba2142 - chore: Refactor get_content_of_website_optimized function in utils.py +96d1eb0 - Some updated ins utils.py +144cfa0 - Switch to ChromeDriverManager due some issues with download the chrome driver +null +null +null +Here is a rewritten version of the changelog update in a nicer and more condensed way: + +**Update 2024-06-26** + +We've made some exciting improvements to our codebase! Here are the highlights: + +* Refactored our crawler strategy to handle exceptions and provide clearer error messages +* Optimized our content retrieval function for improved performance +* Updated internal utilities for better functionality +* Switched to ChromeDriverManager to resolve issues with downloading Chrome drivers + +These updates aim to improve stability, reliability, and overall performance. Thank you for using our tool! +Here is a rewritten version of the changelog update: + +**June 26, 2024** + +We've made some improvements to our code to make it more reliable and user-friendly! 
+ +In the last 3 hours, we've committed 4 changes: + +* Improved error handling and messaging in [crawler_strategy.py](https://example.com/crawler_strategy.py) +* Refactored [get_content_of_website_optimized](https://example.com/utils.py) in [utils.py](https://example.com/utils.py) +* Made updates to [utils.py](https://example.com/utils.py) +* Switched to [ChromeDriverManager](https://example.com/ChromeDriverManager) to resolve issues with downloading the Chrome driver. From d11a83c2322e629f625918ade9db11760dd35923 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 26 Jun 2024 15:34:15 +0800 Subject: [PATCH 6/6] =?UTF-8?q?##=20[0.2.71]=202024-06-26=20=E2=80=A2=20Re?= =?UTF-8?q?factored=20`crawler=5Fstrategy.py`=20to=20handle=20exceptions?= =?UTF-8?q?=20and=20improve=20error=20messages=20=E2=80=A2=20Improved=20`g?= =?UTF-8?q?et=5Fcontent=5Fof=5Fwebsite=5Foptimized`=20function=20in=20`uti?= =?UTF-8?q?ls.py`=20for=20better=20performance=20=E2=80=A2=20Updated=20`ut?= =?UTF-8?q?ils.py`=20with=20latest=20changes=20=E2=80=A2=20Migrated=20to?= =?UTF-8?q?=20`ChromeDriverManager`=20for=20resolving=20Chrome=20driver=20?= =?UTF-8?q?download=20issues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +++- CHANGELOG.md | 45 +++++++++++--------------------------------- README.md | 2 +- docs/md/changelog.md | 8 +++++++- docs/md/index.md | 2 +- setup.py | 2 +- 6 files changed, 24 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index a055a455..d91cb941 100644 --- a/.gitignore +++ b/.gitignore @@ -185,4 +185,6 @@ local/ a.txt .lambda_function.py -ec2* \ No newline at end of file +ec2* + +update_changelog.sh \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f675785..57bb8614 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [0.2.71] 2024-06-26 +• Refactored `crawler_strategy.py` to handle exceptions and improve error messages +• Improved 
`get_content_of_website_optimized` function in `utils.py` for better performance +• Updated `utils.py` with latest changes +• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues + +## [0.2.71] - 2024-06-25 +### Fixed +- Speed up twice the extraction function. + + ## [0.2.6] - 2024-06-22 ### Fixed - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms. @@ -20,37 +31,3 @@ ## [0.2.4] - 2024-06-17 ### Fixed - Fix issue #22: Use MD5 hash for caching HTML files to handle long URLs -## Update 2024-06-26 - -### Commits in the last 3 hours: -4756d0a - Refactor crawler_strategy.py to handle exceptions and improve error messages -7ba2142 - chore: Refactor get_content_of_website_optimized function in utils.py -96d1eb0 - Some updated ins utils.py -144cfa0 - Switch to ChromeDriverManager due some issues with download the chrome driver -null -null -null -Here is a rewritten version of the changelog update in a nicer and more condensed way: - -**Update 2024-06-26** - -We've made some exciting improvements to our codebase! Here are the highlights: - -* Refactored our crawler strategy to handle exceptions and provide clearer error messages -* Optimized our content retrieval function for improved performance -* Updated internal utilities for better functionality -* Switched to ChromeDriverManager to resolve issues with downloading Chrome drivers - -These updates aim to improve stability, reliability, and overall performance. Thank you for using our tool! -Here is a rewritten version of the changelog update: - -**June 26, 2024** - -We've made some improvements to our code to make it more reliable and user-friendly! 
- -In the last 3 hours, we've committed 4 changes: - -* Improved error handling and messaging in [crawler_strategy.py](https://example.com/crawler_strategy.py) -* Refactored [get_content_of_website_optimized](https://example.com/utils.py) in [utils.py](https://example.com/utils.py) -* Made updates to [utils.py](https://example.com/utils.py) -* Switched to [ChromeDriverManager](https://example.com/ChromeDriverManager) to resolve issues with downloading the Chrome driver. diff --git a/README.md b/README.md index 191614f4..f910c829 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.7 🕷️🤖 +# Crawl4AI v0.2.71 🕷️🤖 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) diff --git a/docs/md/changelog.md b/docs/md/changelog.md index 7ab9e0cd..6f9ac706 100644 --- a/docs/md/changelog.md +++ b/docs/md/changelog.md @@ -1,6 +1,12 @@ # Changelog -## [0.2.7] - 2024-06-27 +## [0.2.71] 2024-06-26 +• Refactored `crawler_strategy.py` to handle exceptions and improve error messages +• Improved `get_content_of_website_optimized` function in `utils.py` for better performance +• Updated `utils.py` with latest changes +• Migrated to `ChromeDriverManager` for resolving Chrome driver download issues + +## [0.2.71] - 2024-06-25 ### Fixed - Speed up twice the extraction function. diff --git a/docs/md/index.md b/docs/md/index.md index c3610229..f9c25a42 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.7 +# Crawl4AI v0.2.71 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. 
diff --git a/setup.py b/setup.py index be9e5ca0..a11abc2e 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ class CustomInstallCommand(install): setup( name="Crawl4AI", - version="0.2.7", + version="0.2.71", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md").read(), long_description_content_type="text/markdown",