Compare commits

...

11 Commits

Author SHA1 Message Date
UncleCode
c8485776fe docs: update README to reflect latest version v0.3.745 2024-11-28 20:04:16 +08:00
UncleCode
aa3e2d0fe6 Merge branch 'main' of https://github.com/unclecode/crawl4ai 2024-11-28 20:03:43 +08:00
UncleCode
98c64f9d5f Merge branch 'next' 2024-11-28 20:03:11 +08:00
UncleCode
7d81c17cca fix: improve handling of CRAWL4_AI_BASE_DIRECTORY environment variable in setup.py 2024-11-28 20:02:39 +08:00
UncleCode
652d396a81 chore: update version to 0.3.745 2024-11-28 20:00:29 +08:00
UncleCode
1d83c493af Enhance setup process and update contributors list
- Acknowledge contributor paulokuong for fixing RAWL4_AI_BASE_DIRECTORY issue
  - Refine base directory handling in `setup.py`
  - Clarify Playwright installation instructions and improve error handling
2024-11-28 19:58:40 +08:00
Paulo Kuong
cf35cbe59e CRAWL4_AI_BASE_DIRECTORY should be Path object instead of string (#298)
Thank you so much for your point. Yes, that's correct. I accept your pull request, and I add your name to a contribution list. Thank you again.
2024-11-28 19:46:36 +08:00
UncleCode
9221c08418 docs: fix link formatting for recent updates section in README 2024-11-28 19:33:36 +08:00
UncleCode
48d43c14b1 docs: fix link formatting for recent updates section in README 2024-11-28 19:33:02 +08:00
UncleCode
776efa74a4 docs: fix link formatting for recent updates section in README 2024-11-28 19:32:32 +08:00
UncleCode
b14e83f499 docs: fix link formatting for recent updates section in README 2024-11-28 19:31:09 +08:00
4 changed files with 39 additions and 17 deletions

View File

@@ -21,6 +21,7 @@ We would like to thank the following people for their contributions to Crawl4AI:
- [nelzomal](https://github.com/nelzomal) - Enhance development installation instructions [#286](https://github.com/unclecode/crawl4ai/pull/286)
- [HamzaFarhan](https://github.com/HamzaFarhan) - Handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined [#293](https://github.com/unclecode/crawl4ai/pull/293)
- [NanmiCoder](https://github.com/NanmiCoder) - fix: crawler strategy exception handling and fixes [#271](https://github.com/unclecode/crawl4ai/pull/271)
- [paulokuong](https://github.com/paulokuong) - fix: RAWL4_AI_BASE_DIRECTORY should be Path object instead of string [#298](https://github.com/unclecode/crawl4ai/pull/298)
## Other Contributors

View File

@@ -11,7 +11,7 @@
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
[✨ Check out what's new in the latest update!](#recent-updates)
[✨ Check out latest update v0.3.745](#-recent-updates)
## 🧐 Why Crawl4AI?

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.3.744"
__version__ = "0.3.745"

View File

@@ -9,10 +9,17 @@ import asyncio
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
crawl4ai_folder = os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()) / ".crawl4ai"
base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
cache_folder = crawl4ai_folder / "cache"
content_folders = ['html_content', 'cleaned_html', 'markdown_content',
'extracted_content', 'screenshots']
content_folders = [
"html_content",
"cleaned_html",
"markdown_content",
"extracted_content",
"screenshots",
]
# Clean up old cache if exists
if cache_folder.exists():
@@ -28,7 +35,7 @@ for folder in content_folders:
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
requirements = f.read().splitlines()
with open("crawl4ai/__version__.py") as f:
for line in f:
if line.startswith("__version__"):
@@ -37,11 +44,12 @@ with open("crawl4ai/__version__.py") as f:
# Define requirements
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk" ]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
def install_playwright():
print("Installing Playwright browsers...")
try:
@@ -49,16 +57,22 @@ def install_playwright():
print("Playwright installation completed successfully.")
except subprocess.CalledProcessError as e:
print(f"Error during Playwright installation: {e}")
print("Please run 'python -m playwright install' manually after the installation.")
print(
"Please run 'python -m playwright install' manually after the installation."
)
except Exception as e:
print(f"Unexpected error during Playwright installation: {e}")
print("Please run 'python -m playwright install' manually after the installation.")
print(
"Please run 'python -m playwright install' manually after the installation."
)
def run_migration():
"""Initialize database during installation"""
try:
print("Starting database initialization...")
from crawl4ai.async_database import async_db_manager
asyncio.run(async_db_manager.initialize())
print("Database initialization completed successfully.")
except ImportError:
@@ -67,12 +81,14 @@ def run_migration():
print(f"Warning: Database initialization failed: {e}")
print("Database will be initialized on first use")
class PostInstallCommand(install):
def run(self):
install.run(self)
install_playwright()
# run_migration()
setup(
name="Crawl4AI",
version=version,
@@ -84,18 +100,23 @@ setup(
author_email="unclecode@kidocode.com",
license="MIT",
packages=find_packages(),
install_requires=default_requirements + ["playwright", "aiofiles"], # Added aiofiles
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={
"torch": torch_requirements,
"transformer": transformer_requirements,
"cosine": cosine_similarity_requirements,
"sync": sync_requirements,
"all": default_requirements + torch_requirements + transformer_requirements + cosine_similarity_requirements + sync_requirements,
"all": default_requirements
+ torch_requirements
+ transformer_requirements
+ cosine_similarity_requirements
+ sync_requirements,
},
entry_points={
'console_scripts': [
'crawl4ai-download-models=crawl4ai.model_loader:main',
'crawl4ai-migrate=crawl4ai.migrations:main', # Added migration command
"console_scripts": [
"crawl4ai-download-models=crawl4ai.model_loader:main",
"crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command
],
},
classifiers=[
@@ -110,6 +131,6 @@ setup(
],
python_requires=">=3.7",
cmdclass={
'install': PostInstallCommand,
"install": PostInstallCommand,
},
)
)