Merge remote-tracking branch 'origin/main' into 0.3.74

UncleCode
2024-11-18 21:15:04 +08:00
11 changed files with 18 additions and 30 deletions
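
The recurring change in every file below is the same one-liner: each hard-coded Path.home() base directory is replaced by an environment-variable override, CRAWL4_AI_BASE_DIRECTORY, with the home directory kept as the fallback. A minimal standalone sketch of the resolution pattern (the example paths are hypothetical):

    import os
    from pathlib import Path

    # Prefer CRAWL4_AI_BASE_DIRECTORY when set; otherwise fall back to home.
    # os.getenv returns the variable's string value, or the default unchanged.
    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
    crawl4ai_dir = os.path.join(base, ".crawl4ai")

    # Unset:                               /home/user/.crawl4ai
    # CRAWL4_AI_BASE_DIRECTORY=/srv/data:  /srv/data/.crawl4ai
    print(crawl4ai_dir)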

View File

@@ -736,7 +736,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
@@ -940,7 +940,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
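
Both hunks above derive the cache file name the same way: the URL is hashed with md5 and the hex digest becomes the file name under the (now configurable) base directory. A self-contained sketch of the key derivation, with a placeholder URL:

    import hashlib
    import os
    from pathlib import Path

    url = "https://example.com/page"  # hypothetical URL for illustration
    cache_file_path = os.path.join(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
        ".crawl4ai",
        "cache",
        hashlib.md5(url.encode()).hexdigest(),  # stable 32-character hex key
    )

The same URL always maps to the same file, so the read path at line 736 and the write path at line 940 agree on the cache location.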

View File

@@ -17,10 +17,9 @@ from .async_logger import AsyncLogger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-base_directory = Path.home()
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
-DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
+DB_PATH = os.path.join(base_directory, "crawl4ai.db")
 
 class AsyncDatabaseManager:
     def __init__(self, pool_size: int = 10, max_retries: int = 3):
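
After this change, base_directory and DB_PATH start out as the same string via the chained assignment, then DB_PATH is re-pointed at the database file while base_directory keeps naming the folder. Tracing the values, assuming the environment variable is unset:

    import os
    from pathlib import Path

    base_directory = DB_PATH = os.path.join(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
    )
    # base_directory == DB_PATH == "/home/user/.crawl4ai"

    os.makedirs(DB_PATH, exist_ok=True)  # ensure the folder exists first
    DB_PATH = os.path.join(base_directory, "crawl4ai.db")
    # DB_PATH == "/home/user/.crawl4ai/crawl4ai.db"; base_directory unchanged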

View File

@@ -63,7 +63,7 @@ class AsyncWebCrawler:
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_bypass_cache: bool = False,
         always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         """
@@ -75,18 +75,6 @@ class AsyncWebCrawler:
             always_by_pass_cache: Deprecated, use always_bypass_cache instead
             base_directory: Base directory for storing cache
         """
-        # init()
-        # self.log_width = 10  # Width of "[COMPLETE]"
-        # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
-        # self.log_icons = {
-        #     'INIT': '→',      # Alternative: '▶' or '►'
-        #     'READY': '✓',     # Alternative: '√'
-        #     'FETCH': '↓',     # Alternative: '▼'
-        #     'SCRAPE': '◆',    # Alternative: '♦'
-        #     'EXTRACT': '■',   # Alternative: '□'
-        #     'COMPLETE': '●',  # Alternative: '○'
-        #     'ERROR': '×'
-        # }
         self.verbose = kwargs.get("verbose", False)
         self.logger = AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),

View File

@@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         url_hash = hashlib.md5(url.encode()).hexdigest()
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
 import sqlite3
 from typing import Optional, Tuple
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

View File

@@ -56,7 +56,7 @@ def set_model_device(model):
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)

View File

@@ -63,7 +63,7 @@ def get_system_memory():
         raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)

View File

@@ -21,7 +21,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()

View File

@@ -13,7 +13,7 @@ AsyncWebCrawler(
     # Cache Settings
     always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
     proxy: str = None,  # Simple proxy URL

View File

@@ -7,8 +7,9 @@ import subprocess
 import sys
 import asyncio
 
-# Create the .crawl4ai folder structure
-crawl4ai_folder = Path.home() / ".crawl4ai"
+# Create the .crawl4ai folder in the user's home directory if it doesn't exist
+# If the folder already exists, remove the cache folder
+crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 content_folders = ['html_content', 'cleaned_html', 'markdown_content',
                    'extracted_content', 'screenshots']
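
A note on the pathlib line above: os.getenv returns a plain str whenever the variable is set (the Path.home() default only survives when it is unset), and str does not support the / path-join operator, so the value must be coerced with Path() before joining. A minimal illustration, with a hypothetical directory:

    import os
    from pathlib import Path

    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/srv/data"  # hypothetical value

    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())  # -> str "/srv/data"
    # base / ".crawl4ai"                        # would raise TypeError on a str
    crawl4ai_folder = Path(base) / ".crawl4ai"  # PosixPath('/srv/data/.crawl4ai')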