From 4d283ab3865aff060dd63856ac8bdfcb0632b325 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 8 Jul 2024 16:33:25 +0800 Subject: [PATCH] =?UTF-8?q?##=20[v0.2.74]=20-=202024-07-08=20A=20slew=20of?= =?UTF-8?q?=20exciting=20updates=20to=20improve=20the=20crawler's=20stabil?= =?UTF-8?q?ity=20and=20robustness!=20=F0=9F=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 💻 **UTF-8 encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding. - 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. - 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. - 🚮 **Database cleanup**: Removed existing database file and initialized a new one. --- CHANGELOG.md | 9 ++++ README.md | 2 +- crawl4ai/chunking_strategy.py | 1 + crawl4ai/crawler_strategy.py | 34 +++++++------ crawl4ai/database.py | 11 +++-- crawl4ai/extraction_strategy.py | 18 +++++-- crawl4ai/utils.py | 12 ++++- crawl4ai/web_crawler.py | 10 ++-- .../examples/llm_extraction_openai_pricing.py | 2 +- docs/examples/quickstart.py | 33 +++++++++++-- docs/examples/summarize_page.py | 2 +- docs/md/changelog.md | 8 ++++ docs/md/examples/hooks_auth.md | 14 ++++-- docs/md/examples/llm_extraction.md | 4 +- docs/md/examples/summarization.md | 2 +- docs/md/index.md | 2 +- docs/md/quickstart.md | 48 +++++++------------ setup.py | 7 ++- 18 files changed, 142 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90722b04..3db7d01b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## [v0.2.74] - 2024-07-08 +A slew of exciting updates to improve the crawler's stability and robustness! 🎉 + +- 💻 **UTF-8 encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding. +- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy.
+- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. +- 🚮 **Database cleanup**: Removed existing database file and initialized a new one. + + ## [v0.2.73] - 2024-07-03 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project. diff --git a/README.md b/README.md index cf4e4760..a2e784b3 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.73 🕷️🤖 +# Crawl4AI v0.2.74 🕷️🤖 [![GitHub Stars](https://img.shields.io/github/stars/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/stargazers) [![GitHub Forks](https://img.shields.io/github/forks/unclecode/crawl4ai?style=social)](https://github.com/unclecode/crawl4ai/network/members) diff --git a/crawl4ai/chunking_strategy.py b/crawl4ai/chunking_strategy.py index 5fe9b5e1..59006072 100644 --- a/crawl4ai/chunking_strategy.py +++ b/crawl4ai/chunking_strategy.py @@ -3,6 +3,7 @@ import re from collections import Counter import string from .model_loader import load_nltk_punkt +from .utils import * # Define the abstract base class for chunking strategies class ChunkingStrategy(ABC): diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 21de883e..85ba4450 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -8,6 +8,7 @@ from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import InvalidArgumentException, WebDriverException from selenium.webdriver.chrome.service import Service as ChromeService from webdriver_manager.chrome import ChromeDriverManager +from urllib3.exceptions import MaxRetryError from .config import * import logging, time @@ -18,7 +19,7 @@ from typing import List, Callable import requests import os from pathlib import Path -from .utils import wrap_text +from .utils import * logger =
logging.getLogger('selenium.webdriver.remote.remote_connection') logger.setLevel(logging.WARNING) @@ -73,7 +74,7 @@ class CloudCrawlerStrategy(CrawlerStrategy): response = requests.post("http://crawl4ai.uccode.io/crawl", json=data) response = response.json() html = response["results"][0]["html"] - return html + return sanitize_input_encode(html) class LocalSeleniumCrawlerStrategy(CrawlerStrategy): def __init__(self, use_cached_html=False, js_code=None, **kwargs): @@ -200,7 +201,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: - return f.read() + return sanitize_input_encode(f.read()) try: self.driver = self.execute_hook('before_get_url', self.driver) @@ -214,11 +215,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): WebDriverWait(self.driver, 10).until( EC.presence_of_all_elements_located((By.TAG_NAME, "body")) ) + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") - html = self._ensure_page_load() # self.driver.page_source + + self.driver = self.execute_hook('after_get_url', self.driver) + html = sanitize_input_encode(self._ensure_page_load()) # self.driver.page_source can_not_be_done_headless = False # Look at my creativity for naming variables - # TODO: Very ugly way for now but it works - if not kwargs.get('bypass_headless', False) and html == "": + + # TODO: Very ugly approach, but promise to change it! + if kwargs.get('bypass_headless', False) or html == "": print("[LOG] ๐Ÿ™Œ Page could not be loaded in headless mode. 
Trying non-headless mode...") can_not_be_done_headless = True options = Options() @@ -227,11 +232,10 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): options.add_argument("--window-size=5,5") driver = webdriver.Chrome(service=self.service, options=options) driver.get(url) - html = driver.page_source + self.driver = self.execute_hook('after_get_url', driver) + html = sanitize_input_encode(driver.page_source) driver.quit() - self.driver = self.execute_hook('after_get_url', self.driver) - # Execute JS code if provided if self.js_code and type(self.js_code) == str: self.driver.execute_script(self.js_code) @@ -247,12 +251,12 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): ) if not can_not_be_done_headless: - html = self.driver.page_source + html = sanitize_input_encode(self.driver.page_source) self.driver = self.execute_hook('before_return_html', self.driver, html) # Store in cache cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) - with open(cache_file_path, "w") as f: + with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) if self.verbose: @@ -261,16 +265,16 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): return html except InvalidArgumentException: if not hasattr(e, 'msg'): - e.msg = str(e) + e.msg = sanitize_input_encode(str(e)) raise InvalidArgumentException(f"Failed to crawl {url}: {e.msg}") except WebDriverException as e: # If e does nlt have msg attribute create it and set it to str(e) if not hasattr(e, 'msg'): - e.msg = str(e) + e.msg = sanitize_input_encode(str(e)) raise WebDriverException(f"Failed to crawl {url}: {e.msg}") except Exception as e: if not hasattr(e, 'msg'): - e.msg = str(e) + e.msg = sanitize_input_encode(str(e)) raise Exception(f"Failed to crawl {url}: {e.msg}") def take_screenshot(self) -> str: @@ -299,7 +303,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): return img_base64 except Exception as e: - error_message = f"Failed to take screenshot: {str(e)}" + error_message = 
sanitize_input_encode(f"Failed to take screenshot: {str(e)}") print(error_message) # Generate an image with black background diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 47f41748..37d94463 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -20,7 +20,7 @@ def init_db(): extracted_content TEXT, success BOOLEAN, media TEXT DEFAULT "{}", - link TEXT DEFAULT "{}", + links TEXT DEFAULT "{}", metadata TEXT DEFAULT "{}", screenshot TEXT DEFAULT "" ) @@ -127,6 +127,9 @@ def update_existing_records(new_column: str = "media", default_value: str = "{}" print(f"Error updating existing records: {e}") if __name__ == "__main__": - init_db() # Initialize the database if not already initialized - alter_db_add_screenshot("metadata") # Add the new column to the table - update_existing_records("metadata") # Update existing records to set the new column to an empty string + # Delete the existing database file + if os.path.exists(DB_PATH): + os.remove(DB_PATH) + init_db() + # alter_db_add_screenshot("COL_NAME") + diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index d4415c88..f889b45c 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -116,7 +116,6 @@ class LLMExtractionStrategy(ExtractionStrategy): for block in blocks: block['error'] = False except Exception as e: - print("Error extracting blocks:", str(e)) parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) blocks = parsed if unparsed: @@ -192,7 +191,7 @@ class LLMExtractionStrategy(ExtractionStrategy): # Sequential processing with a delay for ix, section in enumerate(merged_sections): extract_func = partial(self.extract, url) - extracted_content.extend(extract_func(ix, section)) + extracted_content.extend(extract_func(ix, sanitize_input_encode(section))) time.sleep(0.5) # 500 ms delay between each processing else: # Parallel processing using ThreadPoolExecutor @@ -202,10 +201,21 @@ class 
LLMExtractionStrategy(ExtractionStrategy): with ThreadPoolExecutor(max_workers=4) as executor: extract_func = partial(self.extract, url) - futures = [executor.submit(extract_func, ix, section) for ix, section in enumerate(merged_sections)] + futures = [executor.submit(extract_func, ix, sanitize_input_encode(section)) for ix, section in enumerate(merged_sections)] for future in as_completed(futures): - extracted_content.extend(future.result()) + try: + extracted_content.extend(future.result()) + except Exception as e: + if self.verbose: + print(f"Error in thread execution: {e}") + # Add error information to extracted_content + extracted_content.append({ + "index": 0, + "error": True, + "tags": ["error"], + "content": str(e) + }) return extracted_content diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 474ce395..e7b59d65 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -96,6 +96,16 @@ def sanitize_html(html): return sanitized_html +def sanitize_input_encode(text: str) -> str: + """Sanitize input to handle potential encoding issues.""" + try: + # Attempt to encode and decode as UTF-8 to handle potential encoding issues + return text.encode('utf-8', errors='ignore').decode('utf-8') + except UnicodeEncodeError as e: + print(f"Warning: Encoding issue detected. Some characters may be lost. Error: {e}") + # Fall back to ASCII if UTF-8 fails + return text.encode('ascii', errors='ignore').decode('ascii') + def escape_json_string(s): """ Escapes characters in a string to be JSON safe. 
@@ -664,7 +674,6 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None): for block in blocks: block['error'] = False except Exception as e: - print("Error extracting blocks:", str(e)) parsed, unparsed = split_and_parse_json_objects(response.choices[0].message.content) blocks = parsed # Append all unparsed segments as onr error block and content is list of unparsed segments @@ -710,7 +719,6 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke blocks = json.loads(blocks) except Exception as e: - print("Error extracting blocks:", str(e)) blocks = [{ "index": 0, "tags": ["error"], diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 954e9b84..db0d9856 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -155,8 +155,8 @@ class WebCrawler: return None if cached: - html = cached[1] - extracted_content = cached[4] + html = sanitize_input_encode(cached[1]) + extracted_content = sanitize_input_encode(cached[4]) if screenshot: screenshot_data = cached[9] if not screenshot_data: @@ -166,7 +166,7 @@ class WebCrawler: if user_agent: self.crawler_strategy.update_user_agent(user_agent) t1 = time.time() - html = self.crawler_strategy.crawl(url, **kwargs) + html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs)) t2 = time.time() if verbose: print(f"[LOG] ๐Ÿš€ Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1} seconds") @@ -213,8 +213,8 @@ class WebCrawler: except InvalidCSSSelectorError as e: raise ValueError(str(e)) - cleaned_html = result.get("cleaned_html", "") - markdown = result.get("markdown", "") + cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) + markdown = sanitize_input_encode(result.get("markdown", "")) media = result.get("media", []) links = result.get("links", []) metadata = result.get("metadata", {}) diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py index 
c4c6943e..9330ad31 100644 --- a/docs/examples/llm_extraction_openai_pricing.py +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -36,5 +36,5 @@ model_fees = json.loads(result.extracted_content) print(len(model_fees)) -with open(".data/data.json", "w") as f: +with open(".data/data.json", "w", encoding="utf-8") as f: f.write(result.extracted_content) \ No newline at end of file diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 24486cc1..89c63139 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -249,15 +249,40 @@ def using_crawler_hooks(crawler): cprint("\n๐Ÿ”— [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) - crawler.set_hook('on_driver_created', on_driver_created) - crawler.set_hook('before_get_url', before_get_url) - crawler.set_hook('after_get_url', after_get_url) - crawler.set_hook('before_return_html', before_return_html) + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('on_driver_created', on_driver_created) + crawler_strategy.set_hook('before_get_url', before_get_url) + crawler_strategy.set_hook('after_get_url', after_get_url) + crawler_strategy.set_hook('before_return_html', before_return_html) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() result = crawler.run(url="https://example.com") cprint("[LOG] ๐Ÿ“ฆ [bold yellow]Crawler Hooks result:[/bold yellow]") print_result(result= result) + +def using_crawler_hooks_dleay_example(crawler): + def delay(driver): + print("Delaying for 5 seconds...") + time.sleep(5) + print("Resuming...") + + def create_crawler(): + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('after_get_url', delay) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + return crawler + + cprint("\n๐Ÿ”— [bold cyan]Using Crawler Hooks: Let's add a delay after 
fetching the url to make sure entire page is fetched.[/bold cyan]") + crawler = create_crawler() + result = crawler.run(url="https://google.com", bypass_cache=True) + + cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]") + print_result(result) + + def main(): cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]") diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py index 31098e8e..85158999 100644 --- a/docs/examples/summarize_page.py +++ b/docs/examples/summarize_page.py @@ -42,5 +42,5 @@ page_summary = json.loads(result.extracted_content) print(page_summary) -with open(".data/page_summary.json", "w") as f: +with open(".data/page_summary.json", "w", encoding="utf-8") as f: f.write(result.extracted_content) diff --git a/docs/md/changelog.md b/docs/md/changelog.md index 3796d309..b0eb7c0d 100644 --- a/docs/md/changelog.md +++ b/docs/md/changelog.md @@ -1,5 +1,13 @@ # Changelog +## [v0.2.74] - 2024-07-08 +A slew of exciting updates to improve the crawler's stability and robustness! 🎉 + +- 💻 **UTF-8 encoding fix**: Resolved the Windows "charmap" error by adding UTF-8 encoding. +- 🛡️ **Error handling**: Implemented MaxRetryError exception handling in LocalSeleniumCrawlerStrategy. +- 🧹 **Input sanitization**: Improved input sanitization and handled encoding issues in LLMExtractionStrategy. +- 🚮 **Database cleanup**: Removed existing database file and initialized a new one. + ## [v0.2.73] - 2024-07-03 💡 In this release, we've bumped the version to v0.2.73 and refreshed our documentation to ensure you have the best experience with our project. diff --git a/docs/md/examples/hooks_auth.md b/docs/md/examples/hooks_auth.md index 154300df..2b4c2701 100644 --- a/docs/md/examples/hooks_auth.md +++ b/docs/md/examples/hooks_auth.md @@ -14,6 +14,9 @@ Let's see how we can customize the crawler using hooks!
In this example, we'll: ### Hook Definitions ```python +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.crawler_strategy import * + def on_driver_created(driver): print("[HOOK] on_driver_created") # Example customization: maximize the window @@ -66,12 +69,13 @@ def before_return_html(driver, html): ```python print("\n๐Ÿ”— [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True) -crawler = WebCrawler(verbose=True) +crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) +crawler_strategy.set_hook('on_driver_created', on_driver_created) +crawler_strategy.set_hook('before_get_url', before_get_url) +crawler_strategy.set_hook('after_get_url', after_get_url) +crawler_strategy.set_hook('before_return_html', before_return_html) +crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) crawler.warmup() -crawler.set_hook('on_driver_created', on_driver_created) -crawler.set_hook('before_get_url', before_get_url) -crawler.set_hook('after_get_url', after_get_url) -crawler.set_hook('before_return_html', before_return_html) result = crawler.run(url="https://example.com") diff --git a/docs/md/examples/llm_extraction.md b/docs/md/examples/llm_extraction.md index 5336a3cf..b7805726 100644 --- a/docs/md/examples/llm_extraction.md +++ b/docs/md/examples/llm_extraction.md @@ -45,7 +45,7 @@ model_fees = json.loads(result.extracted_content) print(len(model_fees)) -with open(".data/data.json", "w") as f: +with open(".data/data.json", "w", encoding="utf-8") as f: f.write(result.extracted_content) ``` @@ -71,7 +71,7 @@ model_fees = json.loads(result.extracted_content) print(len(model_fees)) -with open(".data/data.json", "w") as f: +with open(".data/data.json", "w", encoding="utf-8") as f: f.write(result.extracted_content) ``` diff --git a/docs/md/examples/summarization.md b/docs/md/examples/summarization.md index 3210fad6..b817f691 100644 --- a/docs/md/examples/summarization.md +++ 
b/docs/md/examples/summarization.md @@ -91,7 +91,7 @@ This example demonstrates how to use `Crawl4AI` to extract a summary from a web Save the extracted data to a file for further use. ```python - with open(".data/page_summary.json", "w") as f: + with open(".data/page_summary.json", "w", encoding="utf-8") as f: f.write(result.extracted_content) ``` diff --git a/docs/md/index.md b/docs/md/index.md index b08fdd12..b483234f 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -1,4 +1,4 @@ -# Crawl4AI v0.2.73 +# Crawl4AI v0.2.74 Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. diff --git a/docs/md/quickstart.md b/docs/md/quickstart.md index 9f5bdcd8..a0c1a2c7 100644 --- a/docs/md/quickstart.md +++ b/docs/md/quickstart.md @@ -176,41 +176,29 @@ print(f"JavaScript Code (Load More button) result: {result}") Let's see how we can customize the crawler using hooks!
```python -def on_driver_created(driver): - print("[HOOK] on_driver_created") - driver.maximize_window() - driver.get('https://example.com/login') - driver.find_element(By.NAME, 'username').send_keys('testuser') - driver.find_element(By.NAME, 'password').send_keys('password123') - driver.find_element(By.NAME, 'login').click() - driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'}) - return driver +import time -def before_get_url(driver): - print("[HOOK] before_get_url") - driver.execute_cdp_cmd('Network.enable', {}) - driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}}) - return driver +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.crawler_strategy import * -def after_get_url(driver): - print("[HOOK] after_get_url") - print(driver.current_url) - return driver +def delay(driver): + print("Delaying for 5 seconds...") + time.sleep(5) + print("Resuming...") + +def create_crawler(): + crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True) + crawler_strategy.set_hook('after_get_url', delay) + crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy) + crawler.warmup() + return crawler -def before_return_html(driver, html): - print("[HOOK] before_return_html") - print(len(html)) - return driver - -crawler.set_hook('on_driver_created', on_driver_created) -crawler.set_hook('before_get_url', before_get_url) -crawler.set_hook('after_get_url', after_get_url) -crawler.set_hook('before_return_html', before_return_html) - -result = crawler.run(url="https://example.com") -print(f"Crawler Hooks result: {result}") +crawler = create_crawler() +result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True) ``` +Check [Hooks](examples/hooks_auth.md) for more examples. + ## Congratulations! 🎉 You've made it through the Crawl4AI Quickstart Guide! Now go forth and crawl the web like a pro!
๐Ÿ•ธ๏ธ diff --git a/setup.py b/setup.py index 468dc56e..4a2c346d 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,15 @@ import subprocess from setuptools.command.install import install # Create the .crawl4ai folder in the user's home directory if it doesn't exist +# If the folder already exists, remove the cache folder crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") +if os.path.exists(f"{crawl4ai_folder}/cache"): + subprocess.run(["rm", "-rf", f"{crawl4ai_folder}/cache"]) os.makedirs(crawl4ai_folder, exist_ok=True) os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True) + + # Read the requirements from requirements.txt with open("requirements.txt") as f: requirements = f.read().splitlines() @@ -20,7 +25,7 @@ transformer_requirements = [req for req in requirements if req.startswith(("tran setup( name="Crawl4AI", - version="0.2.73", + version="0.2.74", description="๐Ÿ”ฅ๐Ÿ•ท๏ธ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown",