diff --git a/README.md b/README.md index f6c8dc08..af0d6610 100644 --- a/README.md +++ b/README.md @@ -517,4 +517,4 @@ For a detailed exploration of our vision, challenges, and solutions, please see ## Star History -[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) \ No newline at end of file +[![Star History Chart](https://api.star-history.com/svg?repos=unclecode/crawl4ai&type=Date)](https://star-history.com/#unclecode/crawl4ai&Date) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a6ba8e50..e7dc9c54 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -736,7 +736,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) if os.path.exists(cache_file_path): html = "" @@ -940,7 +940,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.use_cached_html: cache_file_path = os.path.join( - Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() + os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest() ) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 19160b6e..3c97e7d1 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -17,10 +17,9 @@ from .async_logger import AsyncLogger logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -base_directory = Path.home() -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") 
os.makedirs(DB_PATH, exist_ok=True) -DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") +DB_PATH = os.path.join(base_directory, "crawl4ai.db") class AsyncDatabaseManager: def __init__(self, pool_size: int = 10, max_retries: int = 3): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 5fe7822c..7d1814b6 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -63,7 +63,7 @@ class AsyncWebCrawler: crawler_strategy: Optional[AsyncCrawlerStrategy] = None, always_bypass_cache: bool = False, always_by_pass_cache: Optional[bool] = None, # Deprecated parameter - base_directory: str = str(Path.home()), + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), **kwargs, ): """ @@ -74,19 +74,7 @@ class AsyncWebCrawler: always_bypass_cache: Whether to always bypass cache (new parameter) always_by_pass_cache: Deprecated, use always_bypass_cache instead base_directory: Base directory for storing cache - """ - # init() - # self.log_width = 10 # Width of "[COMPLETE]" - # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".") - # self.log_icons = { - # 'INIT': '→', # Alternative: '▶' or '►' - # 'READY': '✓', # Alternative: '√' - # 'FETCH': '↓', # Alternative: '▼' - # 'SCRAPE': '◆', # Alternative: '♦' - # 'EXTRACT': '■', # Alternative: '□' - # 'COMPLETE': '●', # Alternative: '○' - # 'ERROR': '×' - # } + """ self.verbose = kwargs.get("verbose", False) self.logger = AsyncLogger( log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 5d6864b5..ce802e49 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # chromedriver_autoinstaller.install() # import chromedriver_autoinstaller - # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", 
Path.home()), ".crawl4ai") # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options) # chromedriver_path = chromedriver_autoinstaller.install() # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver() @@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): url_hash = hashlib.md5(url.encode()).hexdigest() if self.use_cached_html: - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) if os.path.exists(cache_file_path): with open(cache_file_path, "r") as f: return sanitize_input_encode(f.read()) @@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.driver = self.execute_hook('before_return_html', self.driver, html) # Store in cache - cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash) + cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash) with open(cache_file_path, "w", encoding="utf-8") as f: f.write(html) diff --git a/crawl4ai/database.py b/crawl4ai/database.py index 37d94463..42ad7017 100644 --- a/crawl4ai/database.py +++ b/crawl4ai/database.py @@ -3,7 +3,7 @@ from pathlib import Path import sqlite3 from typing import Optional, Tuple -DB_PATH = os.path.join(Path.home(), ".crawl4ai") +DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(DB_PATH, exist_ok=True) DB_PATH = os.path.join(DB_PATH, "crawl4ai.db") diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py index 7b3a2846..d1872d7e 100644 --- a/crawl4ai/model_loader.py +++ b/crawl4ai/model_loader.py @@ -56,7 +56,7 @@ def set_model_device(model): @lru_cache() def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), 
".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index a80cf09a..9abc5784 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -63,7 +63,7 @@ def get_system_memory(): raise OSError("Unsupported operating system") def get_home_folder(): - home_folder = os.path.join(Path.home(), ".crawl4ai") + home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai") os.makedirs(home_folder, exist_ok=True) os.makedirs(f"{home_folder}/cache", exist_ok=True) os.makedirs(f"{home_folder}/models", exist_ok=True) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index c97a9cf4..6cfef6f0 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -21,7 +21,7 @@ class WebCrawler: def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False): self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) self.always_by_pass_cache = always_by_pass_cache - self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) init_db() diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 25164f6c..be956101 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -13,7 +13,7 @@ AsyncWebCrawler( # Cache Settings always_by_pass_cache: bool = False, # Always bypass cache - base_directory: str = str(Path.home()), # Base directory for cache + base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), # Base directory for cache # Network Settings proxy: str = None, # Simple 
proxy URL diff --git a/setup.py b/setup.py index bbc03026..f5f3cf2d 100644 --- a/setup.py +++ b/setup.py @@ -7,8 +7,9 @@ import subprocess import sys import asyncio -# Create the .crawl4ai folder structure -crawl4ai_folder = Path.home() / ".crawl4ai" +# Create the .crawl4ai folder structure (base dir overridable via CRAWL4_AI_BASE_DIRECTORY) +# If the folder already exists, remove the cache folder +crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai" cache_folder = crawl4ai_folder / "cache" content_folders = ['html_content', 'cleaned_html', 'markdown_content', 'extracted_content', 'screenshots']