Merge remote-tracking branch 'origin/main' into 0.3.74

UncleCode
2024-11-18 21:15:04 +08:00
11 changed files with 18 additions and 30 deletions
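
The recurring change in every file below is the same one-liner: each hard-coded Path.home() base directory is replaced by an environment-variable override, CRAWL4_AI_BASE_DIRECTORY, with the home directory kept as the fallback. A minimal standalone sketch of the resolution pattern (the example paths are hypothetical):

    import os
    from pathlib import Path

    # Prefer CRAWL4_AI_BASE_DIRECTORY when set; otherwise fall back to home.
    # os.getenv returns the variable's string value, or the default unchanged.
    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
    crawl4ai_dir = os.path.join(base, ".crawl4ai")

    # Unset:                               /home/user/.crawl4ai
    # CRAWL4_AI_BASE_DIRECTORY=/srv/data:  /srv/data/.crawl4ai
    print(crawl4ai_dir)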

View File

@@ -736,7 +736,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
@@ -940,7 +940,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
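
Both hunks above derive the cache file name the same way: the URL is hashed with md5 and the hex digest becomes the file name under the (now configurable) base directory. A self-contained sketch of the key derivation, with a placeholder URL:

    import hashlib
    import os
    from pathlib import Path

    url = "https://example.com/page"  # hypothetical URL for illustration
    cache_file_path = os.path.join(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()),
        ".crawl4ai",
        "cache",
        hashlib.md5(url.encode()).hexdigest(),  # stable 32-character hex key
    )

The same URL always maps to the same file, so the read path at line 736 and the write path at line 940 agree on the cache location.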

View File

@@ -17,10 +17,9 @@ from .async_logger import AsyncLogger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-base_directory = Path.home()
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
-DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
+DB_PATH = os.path.join(base_directory, "crawl4ai.db")
 
 class AsyncDatabaseManager:
     def __init__(self, pool_size: int = 10, max_retries: int = 3):
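
After this change, base_directory and DB_PATH start out as the same string via the chained assignment, then DB_PATH is re-pointed at the database file while base_directory keeps naming the folder. Tracing the values, assuming the environment variable is unset:

    import os
    from pathlib import Path

    base_directory = DB_PATH = os.path.join(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
    )
    # base_directory == DB_PATH == "/home/user/.crawl4ai"

    os.makedirs(DB_PATH, exist_ok=True)  # ensure the folder exists first
    DB_PATH = os.path.join(base_directory, "crawl4ai.db")
    # DB_PATH == "/home/user/.crawl4ai/crawl4ai.db"; base_directory unchanged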

View File

@@ -63,7 +63,7 @@ class AsyncWebCrawler:
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_bypass_cache: bool = False,
         always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         """
@@ -75,18 +75,6 @@ class AsyncWebCrawler:
             always_by_pass_cache: Deprecated, use always_bypass_cache instead
             base_directory: Base directory for storing cache
         """
-        # init()
-        # self.log_width = 10  # Width of "[COMPLETE]"
-        # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
-        # self.log_icons = {
-        #     'INIT': '→',      # Alternative: '▶' or '►'
-        #     'READY': '✓',     # Alternative: '√'
-        #     'FETCH': '↓',     # Alternative: '▼'
-        #     'SCRAPE': '◆',    # Alternative: '♦'
-        #     'EXTRACT': '■',   # Alternative: '□'
-        #     'COMPLETE': '●',  # Alternative: '○'
-        #     'ERROR': '×'
-        # }
         self.verbose = kwargs.get("verbose", False)
         self.logger = AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),

View File

@@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         url_hash = hashlib.md5(url.encode()).hexdigest()
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)

View File

@@ -3,7 +3,7 @@ from pathlib import Path
 import sqlite3
 from typing import Optional, Tuple
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")

View File

@@ -56,7 +56,7 @@ def set_model_device(model):
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)

View File

@@ -63,7 +63,7 @@ def get_system_memory():
         raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)

View File

@@ -21,7 +21,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()

View File

@@ -13,7 +13,7 @@ AsyncWebCrawler(
     # Cache Settings
     always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
     proxy: str = None,  # Simple proxy URL

View File

@@ -7,8 +7,9 @@ import subprocess
 import sys
 import asyncio
 
-# Create the .crawl4ai folder structure
-crawl4ai_folder = Path.home() / ".crawl4ai"
+# Create the .crawl4ai folder in the user's home directory if it doesn't exist
+# If the folder already exists, remove the cache folder
+crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 content_folders = ['html_content', 'cleaned_html', 'markdown_content',
                    'extracted_content', 'screenshots']
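
A note on the pathlib line above: os.getenv returns a plain str whenever the variable is set (the Path.home() default only survives when it is unset), and str does not support the / path-join operator, so the value must be coerced with Path() before joining. A minimal illustration, with a hypothetical directory:

    import os
    from pathlib import Path

    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/srv/data"  # hypothetical value

    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())  # -> str "/srv/data"
    # base / ".crawl4ai"                        # would raise TypeError on a str
    crawl4ai_folder = Path(base) / ".crawl4ai"  # PosixPath('/srv/data/.crawl4ai')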