From bd66befcf086f024c2063250765af8e349c4dec1 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 31 Dec 2024 21:07:58 +0800 Subject: [PATCH 01/16] Fix issue in 0.4.24 walkthrough --- docs/examples/v0_4_24_walkthrough.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py index c80727dd..a4e1aaa3 100644 --- a/docs/examples/v0_4_24_walkthrough.py +++ b/docs/examples/v0_4_24_walkthrough.py @@ -17,7 +17,8 @@ from crawl4ai import ( BrowserConfig, CrawlerRunConfig, CacheMode, - LLMExtractionStrategy + LLMExtractionStrategy, + JsonCssExtractionStrategy ) from crawl4ai.content_filter_strategy import RelevantContentFilter from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator From 553c97a0c179f854549dcc5c600505d98f8149e6 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:15:14 +0800 Subject: [PATCH 02/16] Fix bug reported in issue https://github.com/unclecode/crawl4ai/issues/396 --- crawl4ai/install.py | 5 ----- requirements.txt | 4 +++- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 4a3f5d45..e54f0850 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -2,7 +2,6 @@ import subprocess import sys import asyncio from .async_logger import AsyncLogger, LogLevel -from .docs_manager import DocsManager # Initialize logger logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) @@ -45,7 +44,3 @@ def run_migration(): logger.warning(f"Database initialization failed: {e}") logger.warning("Database will be initialized on first use") -async def setup_docs(): - """Download documentation files""" - docs_manager = DocsManager(logger) - await docs_manager.update_docs() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fc616d5b..f676100e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,6 @@ aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 
pydantic>=2.10 -pyOpenSSL>=24.3.0 \ No newline at end of file +pyOpenSSL>=24.3.0 +psutil>=6.1.1 +nltk>=3.9.1 \ No newline at end of file From 1acc162c18afe284eef5790ee54ef578f433075b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:16:06 +0800 Subject: [PATCH 03/16] Bump version v0.4.241 --- crawl4ai/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 73e5c025..a378ab80 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.24" +__version__ = "0.4.241" From 704bd66b63f16211537df264de5308b18e63f66c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:23:16 +0800 Subject: [PATCH 04/16] Upgrade playwright installation command to install dependencies --- crawl4ai/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index e54f0850..eb4ede7c 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -17,7 +17,7 @@ def post_install(): def install_playwright(): logger.info("Installing Playwright browsers...", tag="INIT") try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: logger.error(f"Error during Playwright installation: {e}", tag="ERROR") From 304260e4849fcbfbdc27ecc9d3670224de4f71ef Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:33:36 +0800 Subject: [PATCH 05/16] refactor(install): simplify Playwright installation error handling - Remove setup_docs() call from post_install() - Simplify error messages for Playwright installation failures - Use sys.executable for more accurate Python path in error messages - Add --with-deps flag to Playwright install command ---
crawl4ai/install.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index eb4ede7c..2725efbb 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -11,7 +11,6 @@ def post_install(): logger.info("Running post-installation setup...", tag="INIT") install_playwright() run_migration() - asyncio.run(setup_docs()) logger.success("Post-installation setup completed!", tag="COMPLETE") def install_playwright(): @@ -20,15 +19,11 @@ def install_playwright(): subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: - logger.error(f"Error during Playwright installation: {e}", tag="ERROR") - logger.warning( - "Please run 'python -m playwright install' manually after the installation." - ) + # logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") except Exception as e: - logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") - logger.warning( - "Please run 'python -m playwright install' manually after the installation." 
- ) + # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.") def run_migration(): """Initialize database during installation""" From 3f019d34cc77ed32679e87409da3fe2a2cf8fb7c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:39:33 +0800 Subject: [PATCH 06/16] docs: update project description emojis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change project description emojis from 🔥🕷️ to 🚀🤖 - Update emojis consistently in both setup.py and pyproject.toml --- pyproject.toml | 32 ++++++++++++++++++++++++++++++++ setup.py | 25 +++++++++++++++---------- 2 files changed, 47 insertions(+), 10 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..a00d0025 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[build-system] +requires = ["setuptools>=64.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "MIT"} +authors = [ + {name = "Unclecode", email = "unclecode@kidocode.com"} +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] + +[tool.setuptools] +packages = ["crawl4ai"] +package-data = {"crawl4ai" = ["js_snippet/*.js"]} + +[tool.setuptools.dynamic] +version = {attr = "crawl4ai.__version__.__version__"} diff --git a/setup.py b/setup.py 
index 358088d2..d258b182 100644 --- a/setup.py +++ b/setup.py @@ -31,13 +31,17 @@ for folder in content_folders: # Read requirements and version __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() + requirements = [line for line in f.read().splitlines() if line and not line.startswith('#')] -with open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break +version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version +try: + with open("crawl4ai/__version__.py") as f: + for line in f: + if line.startswith("__version__"): + version = line.split("=")[1].strip().strip('"') + break +except Exception: + pass # Let pyproject.toml handle version # Define requirements default_requirements = requirements @@ -49,7 +53,7 @@ sync_requirements = ["selenium"] setup( name="Crawl4AI", version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", + description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", url="https://github.com/unclecode/crawl4ai", @@ -86,10 +90,11 @@ setup( "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ], - python_requires=">=3.7", + python_requires=">=3.9", ) From 78b6ba5cef561c4fba0427840dfc51c6f4790b81 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:45:27 +0800 Subject: 
[PATCH 07/16] build: modernize package configuration with pyproject.toml - Add pyproject.toml for PEP 517 build system support - Configure dependencies, scripts, and metadata in pyproject.toml - Set Python requirement to >=3.9 and add support up to 3.13 - Keep setup.py for backwards compatibility - Move package dependencies and entry points to pyproject.toml --- pyproject.toml | 43 +++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 ++ 2 files changed, 45 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a00d0025..aae932a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,29 @@ license = {text = "MIT"} authors = [ {name = "Unclecode", email = "unclecode@kidocode.com"} ] +dependencies = [ + "aiosqlite~=0.20", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", + "pydantic>=2.10", + "pyOpenSSL>=24.3.0", + "psutil>=6.1.1", + "nltk>=3.9.1", + "playwright", + "aiofiles" +] classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", @@ -24,6 +47,26 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + "transformers", + "tokenizers", + "selenium" +] + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-setup = "crawl4ai.install:post_install" +crawl = "crawl4ai.cli:cli" + [tool.setuptools] packages = ["crawl4ai"] package-data = {"crawl4ai" = ["js_snippet/*.js"]} diff --git a/setup.py b/setup.py index d258b182..7dd1363b 100644 --- a/setup.py +++ 
b/setup.py @@ -3,6 +3,8 @@ import os from pathlib import Path import shutil +# Note: Most configuration is now in pyproject.toml +# This setup.py is kept for backwards compatibility # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder From 67f65f958bdb4752b8b25ee1707b6f8e8e859e65 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 15:52:01 +0800 Subject: [PATCH 08/16] refactor(build): simplify setup.py configuration - Remove dependency management from setup.py - Remove entry points configuration (moved to pyproject.toml) - Keep minimal setup.py for backwards compatibility - Clean up package metadata structure --- docs/examples/v0_4_24_walkthrough.py | 5 ++-- requirements.txt | 2 ++ setup.py | 35 +--------------------------- 3 files changed, 6 insertions(+), 36 deletions(-) diff --git a/docs/examples/v0_4_24_walkthrough.py b/docs/examples/v0_4_24_walkthrough.py index a4e1aaa3..135ac29c 100644 --- a/docs/examples/v0_4_24_walkthrough.py +++ b/docs/examples/v0_4_24_walkthrough.py @@ -169,7 +169,8 @@ async def demo_content_filtering(): ) run_config = CrawlerRunConfig( - markdown_generator=markdown_gen + markdown_generator=markdown_gen, + cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: @@ -436,7 +437,7 @@ async def main(): await demo_ssl_features() await demo_content_filtering() await demo_json_extraction() - await demo_input_formats() + # await demo_input_formats() if __name__ == "__main__": asyncio.run(main()) diff --git a/requirements.txt b/requirements.txt index f676100e..00ce69d6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# Note: These requirements are also specified in pyproject.toml +# This file is kept for development environment setup and compatibility aiosqlite~=0.20 lxml~=5.3 litellm>=1.53.1 diff --git a/setup.py b/setup.py index 7dd1363b..dad3199d 100644 --- a/setup.py +++ b/setup.py @@ -30,11 +30,6 @@ 
cache_folder.mkdir(exist_ok=True) for folder in content_folders: (crawl4ai_folder / folder).mkdir(exist_ok=True) -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = [line for line in f.read().splitlines() if line and not line.startswith('#')] - version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version try: with open("crawl4ai/__version__.py") as f: @@ -45,13 +40,6 @@ try: except Exception: pass # Let pyproject.toml handle version -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] - setup( name="Crawl4AI", version=version, @@ -64,28 +52,7 @@ setup( license="MIT", packages=find_packages(), package_data={ - 'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure - }, - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", - 'crawl4ai-setup=crawl4ai.install:post_install', - 'crawl=crawl4ai.cli:cli', - ], + 'crawl4ai': ['js_snippet/*.js'] }, classifiers=[ "Development Status :: 3 - Alpha", From 74a7c6dbb65ec01cbb2e719f9e8071526163a66d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 16:10:08 +0800 Subject: [PATCH 09/16] feat(install): specify chrome and chromium for 
playwright - Install Chrome and Chromium browsers explicitly - Split browser installation into separate commands --- crawl4ai/install.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 2725efbb..251a3199 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -16,7 +16,8 @@ def post_install(): def install_playwright(): logger.info("Installing Playwright browsers...", tag="INIT") try: - subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chrome"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chromium"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: # logger.error(f"Error during Playwright installation: {e}", tag="ERROR") From dc6a24618eebd91dda64b62e1fab84f784431899 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 16:33:43 +0800 Subject: [PATCH 10/16] feat(install): add doctor command and force browser install - Add --force flag to Playwright browser installation - Add doctor command to test crawling functionality - Install Chrome and Chromium browsers explicitly - Add crawl4ai-doctor entry point in pyproject.toml - Implement simple health check focused on crawling test --- crawl4ai/install.py | 45 +++++++++++++++++++++++++++++++++++++++++++-- pyproject.toml | 1 + 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 251a3199..7f80fd2c 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -16,8 +16,8 @@ def post_install(): def install_playwright(): logger.info("Installing Playwright browsers...", tag="INIT") try: - subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "chrome"]) - subprocess.check_call([sys.executable, "-m", "playwright", 
"install", "--with-deps", "chromium"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"]) + subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: # logger.error(f"Error during Playwright installation: {e}", tag="ERROR") @@ -40,3 +40,44 @@ def run_migration(): logger.warning(f"Database initialization failed: {e}") logger.warning("Database will be initialized on first use") +async def run_doctor(): + """Test if Crawl4AI is working properly""" + logger.info("Running Crawl4AI health check...", tag="INIT") + try: + from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + + browser_config = BrowserConfig( + headless=True, + browser_type="chromium", + ignore_https_errors=True, + light_mode=True, + viewport_width=1280, + viewport_height=720 + ) + + run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + logger.info("Testing crawling capabilities...", tag="TEST") + result = await crawler.arun( + url="https://crawl4ai.com", + config=run_config + ) + + if result and result.markdown: + logger.success("✅ Crawling test passed!", tag="COMPLETE") + return True + else: + raise Exception("Failed to get content") + + except Exception as e: + logger.error(f"❌ Test failed: {e}", tag="ERROR") + return False + +def doctor(): + """Entry point for the doctor command""" + import asyncio + return asyncio.run(run_doctor()) diff --git a/pyproject.toml b/pyproject.toml index aae932a8..2774542a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,6 +65,7 @@ all = [ crawl4ai-download-models = "crawl4ai.model_loader:main" crawl4ai-migrate = "crawl4ai.migrations:main" crawl4ai-setup = "crawl4ai.install:post_install" +crawl4ai-doctor 
= "crawl4ai.install:doctor" crawl = "crawl4ai.cli:cli" [tool.setuptools] From 4a4f613238165bb8c330a044b0df9a146c5041dc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 16:54:03 +0800 Subject: [PATCH 11/16] docs: simplify installation instructions - Add crawl4ai-doctor command to verify installation - Update browser installation instructions in README and docs - Move optional features to documentation - Add manual browser installation steps as fallback - Update getting-started guide with verification step --- README.md | 12 ++++++++---- docs/md_v3/tutorials/getting-started.md | 9 ++++++++- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0a56a397..94b06250 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,17 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant ```bash # Install the package pip install crawl4ai + +# Run post-installation setup crawl4ai-setup -# Install Playwright with system dependencies (recommended) -playwright install --with-deps +# Verify your installation +crawl4ai-doctor +``` -# Or install specific browsers: -playwright install --with-deps chrome # Recommended for Colab/Linux +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chrome chromium ``` 2. 
Run a simple web crawl: diff --git a/docs/md_v3/tutorials/getting-started.md b/docs/md_v3/tutorials/getting-started.md index 045590cb..b148e6e1 100644 --- a/docs/md_v3/tutorials/getting-started.md +++ b/docs/md_v3/tutorials/getting-started.md @@ -31,7 +31,14 @@ By the end of this guide, you’ll have installed Crawl4AI, performed a basic cr ```bash pip install crawl4ai crawl4ai-setup -playwright install --with-deps + +# Verify your installation +crawl4ai-doctor +``` + +If you encounter any browser-related issues, you can install them manually: +```bash +python -m playwright install --with-deps chrome chromium ``` - **`crawl4ai-setup`** installs and configures Playwright (Chromium by default). From d36ef3d424e88d4760a00c262708509d6e22649d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 17:19:54 +0800 Subject: [PATCH 12/16] refactor(install): use chromium as default browser - Remove Chrome installation to reduce setup time - Keep Chromium as default browser for better cross-platform compatibility --- crawl4ai/install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawl4ai/install.py b/crawl4ai/install.py index 7f80fd2c..7efb6800 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -16,7 +16,7 @@ def post_install(): def install_playwright(): logger.info("Installing Playwright browsers...", tag="INIT") try: - subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"]) + # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"]) subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"]) logger.success("Playwright installation completed successfully.", tag="COMPLETE") except subprocess.CalledProcessError as e: From 5313c71a0d9cba0c0e4c8f3390555b8c02f4798a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 17:24:44 +0800 Subject: [PATCH 13/16] docs: update README browser
installation command - Remove Chrome from manual installation command - Keep Chromium as the only default browser in docs --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94b06250..6598ad84 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ crawl4ai-doctor If you encounter any browser-related issues, you can install them manually: ```bash -python -m playwright install --with-deps chrome chromium +python -m playwright install --with-deps chromium ``` 2. Run a simple web crawl: From e9d9a6ffe85132e4aed238b3c2b3228b38fc1849 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 17:38:59 +0800 Subject: [PATCH 14/16] fix: ensure js_snippet files are included in package - Add js_snippet to packages list in pyproject.toml - Verified JS files are properly included in installed package - Bump version to 0.4.242 --- crawl4ai/__version__.py | 2 +- pyproject.toml | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index a378ab80..6a6b93f3 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.241" +__version__ = "0.4.242" diff --git a/pyproject.toml b/pyproject.toml index 2774542a..9a1ee248 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,8 +69,10 @@ crawl4ai-doctor = "crawl4ai.install:doctor" crawl = "crawl4ai.cli:cli" [tool.setuptools] -packages = ["crawl4ai"] -package-data = {"crawl4ai" = ["js_snippet/*.js"]} +packages = ["crawl4ai", "crawl4ai.js_snippet"] + +[tool.setuptools.package-data] +crawl4ai = ["js_snippet/*.js"] [tool.setuptools.dynamic] version = {attr = "crawl4ai.__version__.__version__"} From bfe21b29d47e9d391aa6868b9c27a843155044e4 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 17:53:51 +0800 Subject: [PATCH 15/16] build: streamline package discovery and bump to v0.4.243 - Replace explicit package listing with setuptools.find - 
Include all crawl4ai.* packages automatically - Use `packages = {find = {where = ["."], include = ["crawl4ai*"]}}` syntax - Bump version to 0.4.243 This change simplifies package maintenance by automatically discovering all subpackages under crawl4ai namespace instead of listing them manually. --- crawl4ai/__version__.py | 2 +- pyproject.toml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 6a6b93f3..2761f396 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.242" +__version__ = "0.4.243" diff --git a/pyproject.toml b/pyproject.toml index 9a1ee248..b3247e8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,10 +69,10 @@ crawl4ai-doctor = "crawl4ai.install:doctor" crawl = "crawl4ai.cli:cli" [tool.setuptools] -packages = ["crawl4ai", "crawl4ai.js_snippet"] +packages = {find = {where = ["."], include = ["crawl4ai*"]}} [tool.setuptools.package-data] crawl4ai = ["js_snippet/*.js"] [tool.setuptools.dynamic] -version = {attr = "crawl4ai.__version__.__version__"} +version = {attr = "crawl4ai.__version__.__version__"} \ No newline at end of file From c64979b8dd855123b7b624ae30ab36af3ed56132 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 1 Jan 2025 18:10:38 +0800 Subject: [PATCH 16/16] docs: update README --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6598ad84..51bded41 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,9 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease. 
-[✨ Check out latest update v0.4.24](#-recent-updates) +[✨ Check out latest update v0.4.24x](#-recent-updates) -🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) +🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog) ## 🧐 Why Crawl4AI? @@ -38,7 +38,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash # Install the package -pip install crawl4ai +pip install -U crawl4ai # Run post-installation setup crawl4ai-setup