diff --git a/.gitignore b/.gitignore
index a055a455..d91cb941 100644
--- a/.gitignore
+++ b/.gitignore
@@ -185,4 +185,6 @@ local/
 a.txt
 .lambda_function.py
-ec2*
\ No newline at end of file
+ec2*
+
+update_changelog.sh
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d235d2cb..57bb8614 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## [0.2.71] - 2024-06-26
+### Changed
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved the `get_content_of_website_optimized` function in `utils.py` for better performance
+- Migrated to `ChromeDriverManager` to resolve Chrome driver download issues
+
+## [0.2.7] - 2024-06-25
+### Fixed
+- Speed up the extraction function by 2x.
+
 ## [0.2.6] - 2024-06-22
 ### Fixed
 - Fix issue #19: Update Dockerfile to ensure compatibility across multiple platforms.
diff --git a/README.md b/README.md
index 191614f4..f910c829 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7 🕷️🤖
+# Crawl4AI v0.2.71 🕷️🤖
 
 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers)
 [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members)
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index 9e85d60d..4f6190c9 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -5,7 +5,10 @@ from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.chrome.options import Options
-from selenium.common.exceptions import InvalidArgumentException
+from selenium.common.exceptions import InvalidArgumentException, WebDriverException
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
+
 import logging
 import base64
 from PIL import Image, ImageDraw, ImageFont
@@ -118,10 +121,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         }
 
         # chromedriver_autoinstaller.install()
-        import chromedriver_autoinstaller
-        crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
-        chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False)
+        # import chromedriver_autoinstaller
+        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
+        # chromedriver_path = chromedriver_autoinstaller.install()
+        # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
         # self.service = Service(chromedriver_autoinstaller.install())
+
+
+        chromedriver_path = ChromeDriverManager().install()
         self.service = Service(chromedriver_path)
         self.service.log_path = "NUL"
         self.driver = webdriver.Chrome(service=self.service, options=self.options)
@@ -212,9 +220,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             return html
 
-        except InvalidArgumentException:
-            raise InvalidArgumentException(f"Invalid URL {url}")
+        except InvalidArgumentException as e:
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}")
+        except WebDriverException as e:
+            # If e does not have a msg attribute, create it and set it to str(e)
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise WebDriverException(f"Failed to crawl {url}: {e.msg}")
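Reviewer note: the change above replaces `chromedriver_autoinstaller` with `webdriver-manager` for driver provisioning. A minimal standalone sketch of the adopted pattern, assuming Selenium 4 and webdriver-manager 4.x; the headless flag and URL are illustrative, not taken from this repo:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # illustrative flag, not from this diff

# ChromeDriverManager().install() downloads a chromedriver binary that matches
# the locally installed Chrome and returns its filesystem path, which is why
# it can replace the chromedriver_autoinstaller download logic removed above.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
driver.get("https://example.com")
driver.quit()
```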
        except Exception as e:
-            raise Exception(f"Failed to crawl {url}: {str(e)}")
+            if not hasattr(e, 'msg'):
+                e.msg = str(e)
+            raise Exception(f"Failed to crawl {url}: {e.msg}")
 
     def take_screenshot(self) -> str:
         try:
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 3673fcc9..c468c49a 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -438,18 +438,17 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
     links = {'internal': [], 'external': []}
     media = {'images': [], 'videos': [], 'audios': []}
 
-    def process_element(element: element.PageElement) -> None:
+    def process_element(element: element.PageElement) -> bool:
         if isinstance(element, NavigableString):
             if isinstance(element, Comment):
                 element.extract()
-            return
-
-        # if not isinstance(element, element.Tag):
-        #     return
+            return False
 
         if element.name in ['script', 'style', 'link', 'meta', 'noscript']:
             element.decompose()
-            return
+            return False
+
+        keep_element = False
 
         if element.name == 'a' and element.get('href'):
             href = element['href']
@@ -459,6 +458,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 links['external'].append(link_data)
             else:
                 links['internal'].append(link_data)
+            keep_element = True
 
         elif element.name == 'img':
             media['images'].append({
@@ -466,12 +466,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': 'image'
             })
-            alt_text = element.get('alt')
-            if alt_text:
-                element.replace_with(soup.new_string(alt_text))
-            else:
-                element.decompose()
-            return
+            return True  # Always keep image elements
 
         elif element.name in ['video', 'audio']:
             media[f"{element.name}s"].append({
@@ -479,6 +474,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
                 'alt': element.get('alt'),
                 'type': element.name
             })
+            return True  # Always keep video and audio elements
 
         if element.name != 'pre':
             if element.name in ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark']:
@@ -489,17 +485,26 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
             elif element.name != 'img':
                 element.attrs = {}
 
-        word_count = len(element.get_text(strip=True).split())
-        if word_count < word_count_threshold:
-            element.decompose()
-            return
-
+        # Process children
         for child in list(element.children):
-            process_element(child)
+            if isinstance(child, NavigableString) and not isinstance(child, Comment):
+                if len(child.strip()) > 0:
+                    keep_element = True
+            else:
+                if process_element(child):
+                    keep_element = True
+
 
-        if not element.contents and not element.get_text(strip=True):
+        # Check word count
+        if not keep_element:
+            word_count = len(element.get_text(strip=True).split())
+            keep_element = word_count >= word_count_threshold
+
+        if not keep_element:
             element.decompose()
+        return keep_element
+
     process_element(body)
 
 def flatten_nested_elements(node):
@@ -770,4 +775,6 @@ def wrap_text(draw, text, font, max_width):
 
 def format_html(html_string):
     soup = BeautifulSoup(html_string, 'html.parser')
-    return soup.prettify()
\ No newline at end of file
+    return soup.prettify()
+
+
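The `process_element` rewrite above changes the contract from unconditionally decomposing elements below the word threshold to a bottom-up keep/prune decision: an element survives if it contributes a link, image, video, or audio entry, has a non-empty direct text child, or, failing both, clears the word-count threshold. A simplified toy sketch of that recursion, not the library code:

```python
from bs4 import BeautifulSoup, Comment, NavigableString

def prune(element, word_count_threshold=2):
    """Return True if `element` should be kept; decompose it otherwise."""
    if isinstance(element, NavigableString):
        # Comments never keep a parent alive; non-empty text does.
        return not isinstance(element, Comment) and bool(element.strip())
    keep = False
    for child in list(element.children):  # copy the list: pruning mutates it
        if prune(child, word_count_threshold):
            keep = True
    if not keep:
        # Fall back to the word-count threshold, as in the diff above.
        keep = len(element.get_text(strip=True).split()) >= word_count_threshold
    if not keep:
        element.decompose()
    return keep

soup = BeautifulSoup("<div><p>   </p><p>some real content here</p></div>", "html.parser")
prune(soup.div)
print(soup)  # <div><p>some real content here</p></div>
```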
diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py
index a33663e8..ef85066e 100644
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -129,47 +129,57 @@ class WebCrawler:
         verbose=True,
         **kwargs,
     ) -> CrawlResult:
-        extraction_strategy = extraction_strategy or NoExtractionStrategy()
-        extraction_strategy.verbose = verbose
-        if not isinstance(extraction_strategy, ExtractionStrategy):
-            raise ValueError("Unsupported extraction strategy")
-        if not isinstance(chunking_strategy, ChunkingStrategy):
-            raise ValueError("Unsupported chunking strategy")
-
-        if word_count_threshold < MIN_WORD_THRESHOLD:
-            word_count_threshold = MIN_WORD_THRESHOLD
+        try:
+            extraction_strategy = extraction_strategy or NoExtractionStrategy()
+            extraction_strategy.verbose = verbose
+            if not isinstance(extraction_strategy, ExtractionStrategy):
+                raise ValueError("Unsupported extraction strategy")
+            if not isinstance(chunking_strategy, ChunkingStrategy):
+                raise ValueError("Unsupported chunking strategy")
+
+            # if word_count_threshold < MIN_WORD_THRESHOLD:
+            #     word_count_threshold = MIN_WORD_THRESHOLD
+
+            word_count_threshold = max(word_count_threshold, 0)
 
-        # Check cache first
-        cached = None
-        screenshot_data = None
-        extracted_content = None
-        if not bypass_cache and not self.always_by_pass_cache:
-            cached = get_cached_url(url)
-
-        if kwargs.get("warmup", True) and not self.ready:
-            return None
-
-        if cached:
-            html = cached[1]
-            extracted_content = cached[4]
-            if screenshot:
-                screenshot_data = cached[9]
-                if not screenshot_data:
-                    cached = None
-
-        if not cached or not html:
-            if user_agent:
-                self.crawler_strategy.update_user_agent(user_agent)
-            t1 = time.time()
-            html = self.crawler_strategy.crawl(url)
-            t2 = time.time()
-            if verbose:
-                print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
-            if screenshot:
-                screenshot_data = self.crawler_strategy.take_screenshot()
+            # Check cache first
+            cached = None
+            screenshot_data = None
+            extracted_content = None
+            if not bypass_cache and not self.always_by_pass_cache:
+                cached = get_cached_url(url)
+
+            if kwargs.get("warmup", True) and not self.ready:
+                return None
+
+            if cached:
+                html = cached[1]
+                extracted_content = cached[4]
+                if screenshot:
+                    screenshot_data = cached[9]
+                    if not screenshot_data:
+                        cached = None
+
+            if not cached or not html:
+                if user_agent:
+                    self.crawler_strategy.update_user_agent(user_agent)
+                t1 = time.time()
+                html = self.crawler_strategy.crawl(url)
+                t2 = time.time()
+                if verbose:
+                    print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds")
+                if screenshot:
+                    screenshot_data = self.crawler_strategy.take_screenshot()
 
-
-        return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+
+            crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
+            crawl_result.success = bool(html)
+            return crawl_result
+        except Exception as e:
+            if not hasattr(e, "msg"):
+                e.msg = str(e)
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", success=False, error_message=e.msg)
 
     def process_html(
         self,
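With this change, `WebCrawler.run()` reports failures through the returned `CrawlResult` (`success=False` plus `error_message`) instead of letting exceptions propagate. A hedged caller-side sketch; the import path and URL are assumptions for illustration:

```python
from crawl4ai.web_crawler import WebCrawler  # assumed import path

crawler = WebCrawler(verbose=True)
crawler.warmup()

result = crawler.run(url="https://example.com")
if result is None:
    # run() still returns None when warmup has not completed
    print("Crawler not ready yet")
elif result.success:
    print(f"Crawled OK: {len(result.html)} characters of HTML")
else:
    # Failures now surface here rather than as raised exceptions
    print(f"Crawl failed: {result.error_message}")
```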
diff --git a/docs/md/changelog.md b/docs/md/changelog.md
index 7ab9e0cd..6f9ac706 100644
--- a/docs/md/changelog.md
+++ b/docs/md/changelog.md
@@ -1,6 +1,12 @@
 # Changelog
 
-## [0.2.7] - 2024-06-27
+## [0.2.71] - 2024-06-26
+### Changed
+- Refactored `crawler_strategy.py` to handle exceptions and improve error messages
+- Improved the `get_content_of_website_optimized` function in `utils.py` for better performance
+- Migrated to `ChromeDriverManager` to resolve Chrome driver download issues
+
+## [0.2.7] - 2024-06-25
 ### Fixed
 - Speed up the extraction function by 2x.
diff --git a/docs/md/index.md b/docs/md/index.md
index c3610229..f9c25a42 100644
--- a/docs/md/index.md
+++ b/docs/md/index.md
@@ -1,4 +1,4 @@
-# Crawl4AI v0.2.7
+# Crawl4AI v0.2.71
 
 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI.
diff --git a/main.py b/main.py
index 45947c5a..a20c13ad 100644
--- a/main.py
+++ b/main.py
@@ -49,7 +49,9 @@ templates = Jinja2Templates(directory=__location__ + "/pages")
 @lru_cache()
 def get_crawler():
     # Initialize and return a WebCrawler instance
-    return WebCrawler(verbose = True)
+    crawler = WebCrawler(verbose = True)
+    crawler.warmup()
+    return crawler
 
 class CrawlRequest(BaseModel):
     urls: List[str]
diff --git a/requirements.txt b/requirements.txt
index ee5be60a..ced41173 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,4 @@ torch==2.3.1
 onnxruntime==1.18.0
 tokenizers==0.19.1
 pillow==10.3.0
+webdriver-manager==4.0.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index be9e5ca0..a11abc2e 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.7",
+    version="0.2.71",
     description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
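A design note on the `main.py` change: because `get_crawler()` is wrapped in `@lru_cache()`, moving `warmup()` into the factory means the one-time driver download and model loading happen on the first request only; every later request reuses the same warmed instance. A small sketch of that caching behavior using a hypothetical stand-in class, not part of the repo:

```python
from functools import lru_cache

class FakeCrawler:
    """Hypothetical stand-in for WebCrawler, used only to show the caching."""
    def warmup(self):
        print("warming up...")  # models the one-time driver/model setup

@lru_cache()
def get_crawler():
    crawler = FakeCrawler()
    crawler.warmup()  # runs exactly once thanks to lru_cache
    return crawler

assert get_crawler() is get_crawler()  # later calls reuse the warmed instance
```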