From 2879344d9ccc281054587c079a5d5d2a2245b60a Mon Sep 17 00:00:00 2001 From: devatnull Date: Wed, 6 Nov 2024 17:36:46 +0300 Subject: [PATCH 1/2] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 28563762..a0e8b005 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -480,4 +480,4 @@ For a detailed exploration of our vision, challenges, and solutions, please see ## Star History -[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) From 00026b5f8b9aec4ef5f4aa1fde8594c8118de74f Mon Sep 17 00:00:00 2001 From: Mahesh Date: Tue, 12 Nov 2024 14:52:51 -0700 Subject: [PATCH 2/2] feat(config): Adding a configurable way of setting the cache directory for constrained environments --- crawl4ai/async_crawler_strategy.py | 4 ++-- crawl4ai/async_database.py | 2 +- crawl4ai/async_webcrawler.py | 4 ++-- crawl4ai/crawler_strategy.py | 6 +++--- crawl4ai/database.py | 2 +- crawl4ai/model_loader.py | 2 +- crawl4ai/utils.py | 2 +- crawl4ai/web_crawler.py | 2 +- docs/md_v2/api/async-webcrawler.md | 2 +- setup.py | 2 +- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fa50e7b5..9af9f826 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -525,7 +525,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + 
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) if os.path.exists(cache_file_path): html = "" @@ -725,7 +725,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 78931d28..249c4b31 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -10,7 +10,7 @@ import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..38e429ca 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,14 +23,14 @@ class AsyncWebCrawler: self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, always_by_pass_cache: bool = False, - base_directory: str = str(Path.home()), + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( **kwargs ) self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) diff 
--git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 5d6864b5..ce802e49 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # chromedriver_autoinstaller.install() # import chromedriver_autoinstaller - # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) # chromedriver_path = chromedriver_autoinstaller.install() # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() @@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): url_hash = hashlib.md5(url.encode()).hexdigest() if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: return sanitize_input_encode(f.read()) @@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.driver = self.execute_hook('before_return_html', self.driver, html) # Store in cache - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 37d94463..42ad7017 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -3,7 +3,7 @@ from pathlib import Path import sqlite3 from typing import Optional, Tuple -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, 
exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b3a2846..d1872d7e 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -56,7 +56,7 @@ def set_model_device(model): @lru_cache() def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d96f1ded..1f15dea1 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -60,7 +60,7 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..d44de183 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -20,7 +20,7 @@ class WebCrawler: def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False): self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.always_by_pass_cache = always_by_pass_cache - self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) init_db() diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 
25164f6c..be956101 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -13,7 +13,7 @@ AsyncWebCrawler( # Cache Settings always_by_pass_cache: bool = False, # Always bypass cache - base_directory: str = str(Path.home()), # Base directory for cache + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache # Network Settings proxy: str = None, # Simple proxy URL diff --git a/setup.py b/setup.py index 93190291..90063212 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ import sys # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = Path.home() / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" if cache_folder.exists():