Merge remote-tracking branch 'origin/main' into 0.3.74
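This merge threads a new CRAWL4_AI_BASE_DIRECTORY environment variable through every place that previously hard-coded Path.home() as the root of the .crawl4ai folder: the Playwright and Selenium HTML caches, the SQLite database paths, the crawler log location, the cache/model helper folders, and the folder bootstrap in setup.py. The recurring pattern, extracted as a minimal sketch (variable names here are illustrative):

    import os
    from pathlib import Path

    # The env var wins when set; otherwise fall back to the user's home
    # directory, preserving the old behavior. Note os.getenv returns a str
    # when the variable is set and the Path default otherwise; os.path.join
    # accepts either.
    base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
    crawl4ai_folder = os.path.join(base, ".crawl4ai")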
@@ -517,4 +517,4 @@ For a detailed exploration of our vision, challenges, and solutions, please see
 
 ## Star History
 
 [](https://star-history.com/#unclecode/crawl4ai&Date)
@@ -736,7 +736,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             if os.path.exists(cache_file_path):
                 html = ""
@@ -940,7 +940,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
 
         if self.use_cached_html:
             cache_file_path = os.path.join(
-                Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
             )
             with open(cache_file_path, "w", encoding="utf-8") as f:
                 f.write(html)
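Both hunks above build the same cache location; a minimal sketch of the derivation, assuming a helper name cache_path_for that is not part of the codebase:

    import hashlib
    import os
    from pathlib import Path

    def cache_path_for(url: str) -> str:
        # Cache entries are keyed by the MD5 hex digest of the URL and live
        # under <base>/.crawl4ai/cache/, where <base> honors the env var.
        base = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())
        return os.path.join(base, ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())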
@@ -17,10 +17,9 @@ from .async_logger import AsyncLogger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-base_directory = Path.home()
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+base_directory = DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
-DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
+DB_PATH = os.path.join(base_directory, "crawl4ai.db")
 
 class AsyncDatabaseManager:
     def __init__(self, pool_size: int = 10, max_retries: int = 3):
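The consolidated assignment aliases base_directory and DB_PATH to the same folder string, creates the folder once, and only then appends the database filename. With the env var set, the constants resolve as in this sketch (the /data path is illustrative):

    import os
    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/data"  # assumed value

    # After the module-level code above runs:
    #   base_directory == "/data/.crawl4ai"
    #   DB_PATH        == "/data/.crawl4ai/crawl4ai.db"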
@@ -63,7 +63,7 @@ class AsyncWebCrawler:
         crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
         always_bypass_cache: bool = False,
         always_by_pass_cache: Optional[bool] = None,  # Deprecated parameter
-        base_directory: str = str(Path.home()),
+        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         **kwargs,
     ):
         """
@@ -74,19 +74,7 @@ class AsyncWebCrawler:
             always_bypass_cache: Whether to always bypass cache (new parameter)
             always_by_pass_cache: Deprecated, use always_bypass_cache instead
             base_directory: Base directory for storing cache
         """
-        # init()
-        # self.log_width = 10  # Width of "[COMPLETE]"
-        # self.tag_format = lambda tag: f"[{tag}]".ljust(self.log_width, ".")
-        # self.log_icons = {
-        #     'INIT': '→',      # Alternative: '▶' or '►'
-        #     'READY': '✓',     # Alternative: '√'
-        #     'FETCH': '↓',     # Alternative: '▼'
-        #     'SCRAPE': '◆',    # Alternative: '♦'
-        #     'EXTRACT': '■',   # Alternative: '□'
-        #     'COMPLETE': '●',  # Alternative: '○'
-        #     'ERROR': '×'
-        # }
         self.verbose = kwargs.get("verbose", False)
         self.logger = AsyncLogger(
             log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
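Because the new default is evaluated when the module is imported, an explicit base_directory argument still takes precedence over both the env var and the home-directory fallback; a hedged usage sketch (the path is illustrative):

    from crawl4ai import AsyncWebCrawler

    # Explicit override: crawler.log lands under /tmp/crawl4ai-test/.crawl4ai
    crawler = AsyncWebCrawler(base_directory="/tmp/crawl4ai-test")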
@@ -132,7 +132,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
 
         # chromedriver_autoinstaller.install()
         # import chromedriver_autoinstaller
-        # crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        # crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=self.options)
         # chromedriver_path = chromedriver_autoinstaller.install()
         # chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver()
@@ -205,7 +205,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         url_hash = hashlib.md5(url.encode()).hexdigest()
 
         if self.use_cached_html:
-            cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+            cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
             if os.path.exists(cache_file_path):
                 with open(cache_file_path, "r") as f:
                     return sanitize_input_encode(f.read())
@@ -275,7 +275,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
         self.driver = self.execute_hook('before_return_html', self.driver, html)
 
         # Store in cache
-        cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", url_hash)
+        cache_file_path = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai", "cache", url_hash)
         with open(cache_file_path, "w", encoding="utf-8") as f:
             f.write(html)
 
@@ -3,7 +3,7 @@ from pathlib import Path
 import sqlite3
 from typing import Optional, Tuple
 
-DB_PATH = os.path.join(Path.home(), ".crawl4ai")
+DB_PATH = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
 os.makedirs(DB_PATH, exist_ok=True)
 DB_PATH = os.path.join(DB_PATH, "crawl4ai.db")
 
@@ -56,7 +56,7 @@ def set_model_device(model):
 
 @lru_cache()
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
@@ -63,7 +63,7 @@ def get_system_memory():
     raise OSError("Unsupported operating system")
 
 def get_home_folder():
-    home_folder = os.path.join(Path.home(), ".crawl4ai")
+    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
     os.makedirs(home_folder, exist_ok=True)
     os.makedirs(f"{home_folder}/cache", exist_ok=True)
     os.makedirs(f"{home_folder}/models", exist_ok=True)
@@ -21,7 +21,7 @@ class WebCrawler:
     def __init__(self, crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False):
         self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
         self.always_by_pass_cache = always_by_pass_cache
-        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
+        self.crawl4ai_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai")
         os.makedirs(self.crawl4ai_folder, exist_ok=True)
         os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)
         init_db()
@@ -13,7 +13,7 @@ AsyncWebCrawler(
 
     # Cache Settings
     always_by_pass_cache: bool = False,  # Always bypass cache
-    base_directory: str = str(Path.home()),  # Base directory for cache
+    base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),  # Base directory for cache
 
     # Network Settings
     proxy: str = None,  # Simple proxy URL
setup.py
@@ -7,8 +7,9 @@ import subprocess
 import sys
 import asyncio
 
-# Create the .crawl4ai folder structure
-crawl4ai_folder = Path.home() / ".crawl4ai"
+# Create the .crawl4ai folder in the user's home directory if it doesn't exist
+# If the folder already exists, remove the cache folder
+crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())) / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 content_folders = ['html_content', 'cleaned_html', 'markdown_content',
                    'extracted_content', 'screenshots']
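Wrapping the getenv result in Path() keeps the / join working whether the variable is set (a str) or unset (the Path default). Taken together, the variable is meant to be exported before crawl4ai is imported or installed: module-level values such as DB_PATH and the parameter defaults above are computed at import time, so changing the environment afterwards has no effect on them. An end-to-end usage sketch (the directory is illustrative):

    import os

    # Must happen before importing crawl4ai.
    os.environ["CRAWL4_AI_BASE_DIRECTORY"] = "/srv/crawl4ai-data"

    from crawl4ai import AsyncWebCrawler

    # Database, caches, logs and model folders now resolve under
    # /srv/crawl4ai-data/.crawl4ai instead of ~/.crawl4ai.
    crawler = AsyncWebCrawler()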