From 2879344d9ccc281054587c079a5d5d2a2245b60a Mon Sep 17 00:00:00 2001 From: devatnull Date: Wed, 6 Nov 2024 17:36:46 +0300 Subject: [PATCH 1/2] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 28563762..a0e8b005 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scrapper +# 🔥🕷️ Crawl4AI: LLM Friendly Web Crawler & Scraper unclecode%2Fcrawl4ai | Trendshift @@ -480,4 +480,4 @@ For a detailed exploration of our vision, challenges, and solutions, please see ## Star History -[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) From 00026b5f8b9aec4ef5f4aa1fde8594c8118de74f Mon Sep 17 00:00:00 2001 From: Mahesh Date: Tue, 12 Nov 2024 14:52:51 -0700 Subject: [PATCH 2/2] feat(config): Adding a configurable way of setting the cache directory for constrained environments --- crawl4ai/async_crawler_strategy.py | 4 ++-- crawl4ai/async_database.py | 2 +- crawl4ai/async_webcrawler.py | 4 ++-- crawl4ai/crawler_strategy.py | 6 +++--- crawl4ai/database.py | 2 +- crawl4ai/model_loader.py | 2 +- crawl4ai/utils.py | 2 +- crawl4ai/web_crawler.py | 2 +- docs/md_v2/api/async-webcrawler.md | 2 +- setup.py | 2 +- 10 files changed, 14 insertions(+), 14 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index fa50e7b5..9af9f826 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -525,7 +525,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + 
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) if os.path.exists(cache_file_path): html = "" @@ -725,7 +725,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 78931d28..249c4b31 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -10,7 +10,7 @@ import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ceb9ad28..38e429ca 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,14 +23,14 @@ class AsyncWebCrawler: self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, always_by_pass_cache: bool = False, - base_directory: str = str(Path.home()), + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( **kwargs ) self.always_by_pass_cache = always_by_pass_cache - # self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) diff 
--git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 5d6864b5..ce802e49 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # chromedriver_autoinstaller.install() # import chromedriver_autoinstaller - # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) # chromedriver_path = chromedriver_autoinstaller.install() # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() @@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): url_hash = hashlib.md5(url.encode()).hexdigest() if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: return sanitize_input_encode(f.read()) @@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.driver = self.execute_hook('before_return_html', self.driver, html) # Store in cache - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 37d94463..42ad7017 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -3,7 +3,7 @@ from pathlib import Path import sqlite3 from typing import Optional, Tuple -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, 
exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b3a2846..d1872d7e 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -56,7 +56,7 @@ def set_model_device(model): @lru_cache() def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index d96f1ded..1f15dea1 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -60,7 +60,7 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 20e9b04e..d44de183 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -20,7 +20,7 @@ class WebCrawler: def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False): self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.always_by_pass_cache = always_by_pass_cache - self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) init_db() diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 
25164f6c..be956101 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -13,7 +13,7 @@ AsyncWebCrawler( # Cache Settings always_by_pass_cache: bool = False, # Always bypass cache - base_directory: str = str(Path.home()), # Base directory for cache + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache # Network Settings proxy: str = None, # Simple proxy URL diff --git a/setup.py b/setup.py index 93190291..90063212 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ import sys # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder -crawl4ai_folder = Path.home() / ".crawl4ai" +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" if cache_folder.exists():