From 1d83c493aff8672c9da471c222f60c5c72145b71 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Thu, 28 Nov 2024 19:58:40 +0800
Subject: [PATCH] Enhance setup process and update contributors list

- Acknowledge contributor paulokuong for fixing CRAWL4_AI_BASE_DIRECTORY issue
- Refine base directory handling in `setup.py`: use CRAWL4_AI_BASE_DIRECTORY
  when it is set and non-empty, otherwise fall back to the home directory
- Clarify Playwright installation instructions and improve error handling
---
 CONTRIBUTORS.md |  1 +
 setup.py        | 48 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index deb46a9c..663e5541 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -21,6 +21,7 @@ We would like to thank the following people for their contributions to Crawl4AI:
 - [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
 - [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
 - [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
+- [paulokuong](https://github.com/paulokuong) - fix: CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
 
 ## Other Contributors
 
diff --git a/setup.py b/setup.py
index f5f3cf2d..dbb07410 100644
--- a/setup.py
+++ b/setup.py
@@ -9,10 +9,16 @@ import asyncio
 
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
-crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai"
+crawl4ai_folder = Path(os.getenv("CRAWL4_AI_BASE_DIRECTORY") or Path.home())
+crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
-content_folders = ['html_content', 'cleaned_html', 'markdown_content', 
-                   'extracted_content', 'screenshots']
+content_folders = [
+    "html_content",
+    "cleaned_html",
+    "markdown_content",
+    "extracted_content",
+    "screenshots",
+]
 
 # Clean up old cache if exists
 if cache_folder.exists():
@@ -28,7 +34,7 @@ for folder in content_folders:
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 with open(os.path.join(__location__, "requirements.txt")) as f:
     requirements = f.read().splitlines()
-    
+
 with open("crawl4ai/__version__.py") as f:
     for line in f:
         if line.startswith("__version__"):
@@ -37,11 +43,12 @@ with open("crawl4ai/__version__.py") as f:
 
 # Define requirements
 default_requirements = requirements
-torch_requirements = ["torch", "nltk", "scikit-learn"] 
+torch_requirements = ["torch", "nltk", "scikit-learn"]
 transformer_requirements = ["transformers", "tokenizers"]
-cosine_similarity_requirements = ["torch", "transformers", "nltk" ]
+cosine_similarity_requirements = ["torch", "transformers", "nltk"]
 sync_requirements = ["selenium"]
 
+
 def install_playwright():
     print("Installing Playwright browsers...")
     try:
@@ -49,16 +56,22 @@ def install_playwright():
         print("Playwright installation completed successfully.")
     except subprocess.CalledProcessError as e:
         print(f"Error during Playwright installation: {e}")
-        print("Please run 'python -m playwright install' manually after the installation.")
+        print(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
     except Exception as e:
         print(f"Unexpected error during Playwright installation: {e}")
-        print("Please run 'python -m playwright install' manually after the installation.")
+        print(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
+
 
 def run_migration():
     """Initialize database during installation"""
     try:
         print("Starting database initialization...")
         from crawl4ai.async_database import async_db_manager
+
         asyncio.run(async_db_manager.initialize())
         print("Database initialization completed successfully.")
     except ImportError:
@@ -67,12 +80,14 @@ def run_migration():
         print(f"Warning: Database initialization failed: {e}")
         print("Database will be initialized on first use")
 
+
 class PostInstallCommand(install):
     def run(self):
         install.run(self)
         install_playwright()
         # run_migration()
 
+
 setup(
     name="Crawl4AI",
     version=version,
@@ -84,18 +99,23 @@ setup(
     author_email="unclecode@kidocode.com",
     license="MIT",
     packages=find_packages(),
-    install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles
+    install_requires=default_requirements
+    + ["playwright", "aiofiles"],  # Added aiofiles
     extras_require={
         "torch": torch_requirements,
         "transformer": transformer_requirements,
         "cosine": cosine_similarity_requirements,
         "sync": sync_requirements,
-        "all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
+        "all": default_requirements
+        + torch_requirements
+        + transformer_requirements
+        + cosine_similarity_requirements
+        + sync_requirements,
     },
     entry_points={
-        'console_scripts': [
-            'crawl4ai-download-models=crawl4ai.model_loader:main',
-            'crawl4ai-migrate=crawl4ai.migrations:main',  # Added migration command
+        "console_scripts": [
+            "crawl4ai-download-models=crawl4ai.model_loader:main",
+            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
         ],
     },
     classifiers=[
@@ -110,6 +130,6 @@ setup(
     ],
     python_requires=">=3.7",
     cmdclass={
-        'install': PostInstallCommand,
+        "install": PostInstallCommand,
     },
 )
\ No newline at end of file