Merge branch 'v0.4.243'

This commit is contained in:
UncleCode
2025-01-01 18:11:15 +08:00
8 changed files with 170 additions and 70 deletions

View File

@@ -20,9 +20,9 @@
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for LLMs, AI agents, and data pipelines. Open source, flexible, and built for real-time performance, Crawl4AI empowers developers with unmatched speed, precision, and deployment ease.
[✨ Check out latest update v0.4.24](#-recent-updates)
[✨ Check out latest update v0.4.24x](#-recent-updates)
🎉 **Version 0.4.24 is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
🎉 **Version 0.4.24x is out!** Major improvements in extraction strategies with enhanced JSON handling, SSL security, and Amazon product extraction. Plus, a completely revamped content filtering system! [Read the release notes →](https://crawl4ai.com/mkdocs/blog)
## 🧐 Why Crawl4AI?
@@ -38,14 +38,18 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
1. Install Crawl4AI:
```bash
# Install the package
pip install crawl4ai
pip install -U crawl4ai
# Run post-installation setup
crawl4ai-setup
# Install Playwright with system dependencies (recommended)
playwright install --with-deps
# Verify your installation
crawl4ai-doctor
```
# Or install specific browsers:
playwright install --with-deps chrome # Recommended for Colab/Linux
If you encounter any browser-related issues, you can install them manually:
```bash
python -m playwright install --with-deps chromium
```
2. Run a simple web crawl:

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.24"
__version__ = "0.4.243"

View File

@@ -2,7 +2,6 @@ import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
from .docs_manager import DocsManager
# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
@@ -12,24 +11,20 @@ def post_install():
logger.info("Running post-installation setup...", tag="INIT")
install_playwright()
run_migration()
asyncio.run(setup_docs())
logger.success("Post-installation setup completed!", tag="COMPLETE")
def install_playwright():
logger.info("Installing Playwright browsers...", tag="INIT")
try:
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
# subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chromium"])
logger.success("Playwright installation completed successfully.", tag="COMPLETE")
except subprocess.CalledProcessError as e:
logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
logger.warning(
"Please run 'python -m playwright install' manually after the installation."
)
# logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
except Exception as e:
logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
logger.warning(
"Please run 'python -m playwright install' manually after the installation."
)
# logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
logger.warning(f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation.")
def run_migration():
"""Initialize database during installation"""
@@ -45,7 +40,44 @@ def run_migration():
logger.warning(f"Database initialization failed: {e}")
logger.warning("Database will be initialized on first use")
async def setup_docs():
    """Download documentation files.

    Delegates to DocsManager.update_docs() using the module-level async
    logger; called from post_install() via asyncio.run().
    """
    # DocsManager is a project-local helper (imported at module top);
    # presumably it fetches/refreshes docs into the user's ~/.crawl4ai
    # folder — TODO confirm against docs_manager.py.
    docs_manager = DocsManager(logger)
    await docs_manager.update_docs()
async def run_doctor():
    """Test if Crawl4AI is working properly.

    Performs a live health check: launches a headless Chromium browser and
    crawls https://crawl4ai.com, logging progress through the module-level
    async logger.

    Returns:
        bool: True when the test crawl produced markdown content,
        False on any failure (the exception is logged, never re-raised).
    """
    logger.info("Running Crawl4AI health check...", tag="INIT")
    try:
        # Imported lazily so that an import-time failure in the crawler
        # stack is reported as a failed health check instead of crashing
        # the CLI entry point.
        from .async_webcrawler import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

        browser_config = BrowserConfig(
            headless=True,
            browser_type="chromium",
            ignore_https_errors=True,  # tolerate SSL issues during the check
            light_mode=True,
            viewport_width=1280,
            viewport_height=720
        )
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,  # force a real fetch, not a cached result
            screenshot=True,
        )
        async with AsyncWebCrawler(config=browser_config) as crawler:
            logger.info("Testing crawling capabilities...", tag="TEST")
            result = await crawler.arun(
                url="https://crawl4ai.com",
                config=run_config
            )
            # Success requires non-empty markdown output from the crawl.
            if result and result.markdown:
                logger.success("✅ Crawling test passed!", tag="COMPLETE")
                return True
            else:
                # Converted into the False return path by the handler below.
                raise Exception("Failed to get content")
    except Exception as e:
        logger.error(f"❌ Test failed: {e}", tag="ERROR")
        return False
def doctor():
    """Entry point for the doctor command.

    Synchronously drives the async health check and returns its result
    (True on a successful test crawl, False otherwise).
    """
    import asyncio

    health_check = run_doctor()
    return asyncio.run(health_check)

View File

@@ -169,7 +169,8 @@ async def demo_content_filtering():
)
run_config = CrawlerRunConfig(
markdown_generator=markdown_gen
markdown_generator=markdown_gen,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
@@ -436,7 +437,7 @@ async def main():
await demo_ssl_features()
await demo_content_filtering()
await demo_json_extraction()
await demo_input_formats()
# await demo_input_formats()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -31,7 +31,14 @@ By the end of this guide, you'll have installed Crawl4AI, performed a basic cr
```bash
pip install crawl4ai
crawl4ai-setup
playwright install --with-deps
# Verify your installation
crawl4ai-doctor
```
If you encounter any browser-related issues, you can install them manually:
```bash
python -m playwright install --with-deps chromium
```
- **`crawl4ai-setup`** installs and configures Playwright (Chromium by default).

78
pyproject.toml Normal file
View File

@@ -0,0 +1,78 @@
# Packaging metadata for Crawl4AI (PEP 621).
# setup.py is retained only for backwards compatibility; this file is the
# source of truth for dependencies and entry points.
[build-system]
requires = ["setuptools>=64.0.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "Crawl4AI"
# Version is resolved dynamically from crawl4ai/__version__.py
# (see [tool.setuptools.dynamic] below).
dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.9"
license = {text = "MIT"}
# NOTE(review): `license` says MIT but the classifier below says Apache —
# confirm which is correct and make them agree.
authors = [
    {name = "Unclecode", email = "unclecode@kidocode.com"}
]
# Fix: "playwright" and "aiofiles" were listed twice (once unpinned);
# keep only the version-constrained entries.
dependencies = [
    "aiosqlite~=0.20",
    "lxml~=5.3",
    "litellm>=1.53.1",
    "numpy>=1.26.0,<3",
    "pillow~=10.4",
    "playwright>=1.49.0",
    "python-dotenv~=1.0",
    "requests~=2.26",
    "beautifulsoup4~=4.12",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
    "colorama~=0.4",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=24.3.0",
    "psutil>=6.1.1",
    "nltk>=3.9.1"
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
]

[project.optional-dependencies]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
sync = ["selenium"]
all = [
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "selenium"
]

[project.scripts]
crawl4ai-download-models = "crawl4ai.model_loader:main"
crawl4ai-migrate = "crawl4ai.migrations:main"
crawl4ai-setup = "crawl4ai.install:post_install"
crawl4ai-doctor = "crawl4ai.install:doctor"
crawl = "crawl4ai.cli:cli"

[tool.setuptools]
packages = {find = {where = ["."], include = ["crawl4ai*"]}}

[tool.setuptools.package-data]
crawl4ai = ["js_snippet/*.js"]

[tool.setuptools.dynamic]
version = {attr = "crawl4ai.__version__.__version__"}

View File

@@ -1,3 +1,5 @@
# Note: These requirements are also specified in pyproject.toml
# This file is kept for development environment setup and compatibility
aiosqlite~=0.20
lxml~=5.3
litellm>=1.53.1
@@ -14,4 +16,6 @@ aiofiles>=24.1.0
colorama~=0.4
snowballstemmer~=2.2
pydantic>=2.10
pyOpenSSL>=24.3.0
pyOpenSSL>=24.3.0
psutil>=6.1.1
nltk>=3.9.1

View File

@@ -3,6 +3,8 @@ import os
from pathlib import Path
import shutil
# Note: Most configuration is now in pyproject.toml
# This setup.py is kept for backwards compatibility
# Create the .crawl4ai folder in the user's home directory if it doesn't exist
# If the folder already exists, remove the cache folder
@@ -28,28 +30,20 @@ cache_folder.mkdir(exist_ok=True)
for folder in content_folders:
(crawl4ai_folder / folder).mkdir(exist_ok=True)
# Read requirements and version
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
with open(os.path.join(__location__, "requirements.txt")) as f:
requirements = f.read().splitlines()
with open("crawl4ai/__version__.py") as f:
for line in f:
if line.startswith("__version__"):
version = line.split("=")[1].strip().strip('"')
break
# Define requirements
default_requirements = requirements
torch_requirements = ["torch", "nltk", "scikit-learn"]
transformer_requirements = ["transformers", "tokenizers"]
cosine_similarity_requirements = ["torch", "transformers", "nltk"]
sync_requirements = ["selenium"]
version = "0.0.0" # This will be overridden by pyproject.toml's dynamic version
try:
with open("crawl4ai/__version__.py") as f:
for line in f:
if line.startswith("__version__"):
version = line.split("=")[1].strip().strip('"')
break
except Exception:
pass # Let pyproject.toml handle version
setup(
name="Crawl4AI",
version=version,
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
description="🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
url="https://github.com/unclecode/crawl4ai",
@@ -58,38 +52,18 @@ setup(
license="MIT",
packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={
"torch": torch_requirements,
"transformer": transformer_requirements,
"cosine": cosine_similarity_requirements,
"sync": sync_requirements,
"all": default_requirements
+ torch_requirements
+ transformer_requirements
+ cosine_similarity_requirements
+ sync_requirements,
},
entry_points={
"console_scripts": [
"crawl4ai-download-models=crawl4ai.model_loader:main",
"crawl4ai-migrate=crawl4ai.migrations:main",
'crawl4ai-setup=crawl4ai.install:post_install',
'crawl=crawl4ai.cli:cli',
],
'crawl4ai': ['js_snippet/*.js']
},
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
],
python_requires=">=3.7",
python_requires=">=3.9",
)