diff --git a/README.md b/README.md index 0a56a397..51bded41 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. -[✨ Check out latest update v0.4.24](#-recent-updates) +[✨ Check out latest update v0.4.24x](#-recent-updates) -🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) +🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) ## 🧐 Why Crawl4AI? @@ -38,14 +38,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash # Install the package -pip install crawl4ai +pip install -U crawl4ai + +# Run post-installation setup crawl4ai-setup -# Install Playwright with system dependencies (recommended) -playwright install --with-deps +# Verify your installation +crawl4ai-doctor +``` -# Or install specific browsers: -playwright install --with-deps chrome # Recommended for Colab/Linux +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chromium ``` 2. Run a simple web crawl: diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 73e5c025..2761f396 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.24" +__version__ = "0.4.243" diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 4a3f5d45..7efb6800 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -2,7 +2,6 @@ import subprocess import sys import asyncio from .async_logger import AsyncLogger, LogLevel -from .docs_manager import DocsManager # Initialize logger logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) @@ -12,24 +11,20 @@ def post_install(): logger.info("Running post-installation setup...", tag="INIT") install_playwright() run_migration() - asyncio.run(setup_docs()) logger.success("Post-installation setup completed!", tag="COMPLETE") def install_playwright(): logger.info("Installing Playwright browsers...", tag="INIT") try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: - logger.error(f"Error during Playwright installation: {e}", tag="ERROR") - logger.warning( - "Please run 'python -m playwright install' manually after the installation." - ) + # logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") except Exception as e: - logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") - logger.warning( - "Please run 'python -m playwright install' manually after the installation." - ) + # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") def run_migration(): """Initialize database during installation""" @@ -45,7 +40,44 @@ def run_migration(): logger.warning(f"Database initialization failed: {e}") logger.warning("Database will be initialized on first use") -async def setup_docs(): - """Download documentation files""" - docs_manager = DocsManager(logger) - await docs_manager.update_docs() \ No newline at end of file +async def run_doctor(): + """Test if Crawl4AI is working properly""" + logger.info("Running Crawl4AI health check...", tag="INIT") + try: + from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + browser_config = BrowserConfig( + headless=True, + browser_type="chromium", + ignore_https_errors=True, + light_mode=True, + viewport_width=1280, + viewport_height=720 + ) + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + logger.info("Testing crawling capabilities...", tag="TEST") + result = await crawler.arun( + url="https://crawl4ai.com", + config=run_config + ) + + if result and result.markdown: + logger.success("✅ Crawling test passed!", tag="COMPLETE") + return True + else: + raise Exception("Failed to get content") + + except Exception as e: + logger.error(f"❌ Test failed: {e}", tag="ERROR") + return False + +def doctor(): + """Entry point for the doctor command""" + import asyncio + return asyncio.run(run_doctor()) diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py index a4e1aaa3..135ac29c 100644 --- a/docs/examples/v0_4_24_walkthrough.py +++ b/docs/examples/v0_4_24_walkthrough.py @@ -169,7 +169,8 @@ async def demo_content_filtering(): ) run_config = CrawlerRunConfig( - markdown_generator=markdown_gen + markdown_generator=markdown_gen, + cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: @@ -436,7 +437,7 @@ async def main(): await demo_ssl_features() await demo_content_filtering() await demo_json_extraction() - await demo_input_formats() + # await demo_input_formats() if __name__ == "__main__": asyncio.run(main()) diff --git a/docs/md_v3/tutorials/getting-started.md b/docs/md_v3/tutorials/getting-started.md index 045590cb..b148e6e1 100644 --- a/docs/md_v3/tutorials/getting-started.md +++ b/docs/md_v3/tutorials/getting-started.md @@ -31,7 +31,14 @@ By the end of this guide, you’ll have installed Crawl4AI, performed a basic cr ```bash pip install crawl4ai crawl4ai-setup -playwright install --with-deps + +# Verify your installation +crawl4ai-doctor +``` + +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chrome chromium ``` - **`crawl4ai-setup`** installs and configures Playwright (Chromium by default). diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..b3247e8a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,78 @@ +[build-system] +requires = ["setuptools>=64.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "MIT"} +authors = [ + {name = "Unclecode", email = "unclecode@kidocode.com"} +] +dependencies = [ + "aiosqlite~=0.20", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", + "pydantic>=2.10", + "pyOpenSSL>=24.3.0", + "psutil>=6.1.1", + "nltk>=3.9.1", + "playwright", + "aiofiles" +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + "transformers", + "tokenizers", + "selenium" +] + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-setup = "crawl4ai.install:post_install" +crawl4ai-doctor = "crawl4ai.install:doctor" +crawl = "crawl4ai.cli:cli" + +[tool.setuptools] +packages = {find = {where = ["."], include = ["crawl4ai*"]}} + +[tool.setuptools.package-data] +crawl4ai = ["js_snippet/*.js"] + +[tool.setuptools.dynamic] +version = {attr = "crawl4ai.__version__.__version__"} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fc616d5b..00ce69d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Note: These requirements are also specified in pyproject.toml +# This file is kept for development environment setup and compatibility aiosqlite~=0.20 lxml~=5.3 litellm>=1.53.1 @@ -14,4 +16,6 @@ aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 pydantic>=2.10 -pyOpenSSL>=24.3.0 \ No newline at end of file +pyOpenSSL>=24.3.0 +psutil>=6.1.1 +nltk>=3.9.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 358088d2..dad3199d 100644 --- a/setup.py +++ b/setup.py @@ -3,6 +3,8 @@ import os from pathlib import Path import shutil +# Note: Most configuration is now in pyproject.toml +# This setup.py is kept for backwards compatibility # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -28,28 +30,20 @@ cache_folder.mkdir(exist_ok=True) for folder in content_folders: (crawl4ai_folder / folder).mkdir(exist_ok=True) -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() - -with open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break - -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] +version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version +try: + with open("crawl4ai/__version__.py") as f: + for line in f: + if line.startswith("__version__"): + version = line.split("=")[1].strip().strip('"') + break +except Exception: + pass # Let pyproject.toml handle version setup( name="Crawl4AI", version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", + description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", url="https://github.com/unclecode/crawl4ai", @@ -58,38 +52,18 @@ setup( license="MIT", packages=find_packages(), package_data={ - 'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure - }, - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", - 'crawl4ai-setup=crawl4ai.install:post_install', - 'crawl=crawl4ai.cli:cli', - ], + 'crawl4ai': ['js_snippet/*.js'] }, classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ], - python_requires=">=3.7", + python_requires=">=3.9", )