Enhance installation and migration processes

- Added a post-installation setup script for initialization.
- Updated README with installation notes for Playwright setup.
- Enhanced migration logging for better error visibility.
- Added 'pydantic' to requirements.
- Bumped version to 0.3.746.
2024-11-29 18:48:44 +08:00
parent 12e73d4898
commit d202f3539b
8 changed files with 90 additions and 102 deletions
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 1. Install Crawl4AI:
 ```bash
 pip install crawl4ai
+crawl4ai-setup # Set up the browser
 ```

 2. Run a simple web crawl:
@@ -125,34 +126,6 @@ if __name__ == "__main__":

 ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)

-## Features ✨
-
- 🆓 Completely free and open-source
- 🚀 Blazing fast performance, outperforming many paid services
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
- 🌐 Multi-browser support (Chromium, Firefox, WebKit)
- 🌍 Supports crawling multiple URLs simultaneously
- 🎨 Extracts and returns all media tags (Images, Audio, and Video)
- 🔗 Extracts all external and internal links
- 📚 Extracts metadata from the page
- 🔄 Custom hooks for authentication, headers, and page modifications
- 🕵️ User-agent customization
- 🖼️ Takes screenshots of pages with enhanced error handling
- 📜 Executes multiple custom JavaScripts before crawling
- 📊 Generates structured output without LLM using JsonCssExtractionStrategy
- 📚 Various chunking strategies: topic-based, regex, sentence, and more
- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
- 🎯 CSS selector support for precise data extraction
- 📝 Passes instructions/keywords to refine extraction
- 🔒 Proxy support with authentication for enhanced access
- 🔄 Session management for complex multi-page crawling
- 🌐 Asynchronous architecture for improved performance
- 🖼️ Improved image processing with lazy-loading detection
- 🕰️ Enhanced handling of delayed content loading
- 🔑 Custom headers support for LLM interactions
- 🖼️ iframe content extraction for comprehensive analysis
- ⏱️ Flexible timeout and delayed content retrieval options
-
 ## Installation 🛠️

 Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
@@ -168,11 +141,12 @@ For basic web crawling and scraping tasks:

 ```bash
 pip install crawl4ai
+crawl4ai-setup # Set up the browser
 ```

 By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.

-👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
+👉 **Note**: After you install Crawl4AI, running the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:

 1. Through the command line:

--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode

 from .models import CrawlResult
 from .__version__ import __version__
-# __version__ = "0.3.73"

 __all__ = [
    "AsyncWebCrawler",
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.745"
+__version__ = "0.3.746"
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -0,0 +1,44 @@
+import subprocess
+import sys
+import asyncio
+from .async_logger import AsyncLogger, LogLevel
+
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
+def post_install():
+    """Run all post-installation tasks: install Playwright browsers, then initialize the database."""
+    logger.info("Running post-installation setup...", tag="INIT")
+    install_playwright()
+    run_migration()
+    logger.success("Post-installation setup completed!", tag="COMPLETE")  # reached even if a step above only logged a failure
+    
+def install_playwright():  # best-effort: failures are logged below, not re-raised
+    logger.info("Installing Playwright browsers...", tag="INIT")
+    try:
+        subprocess.check_call([sys.executable, "-m", "playwright", "install"])  # run Playwright's installer with the current interpreter
+        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
+    except subprocess.CalledProcessError as e:  # the installer exited with a non-zero status
+        logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
+        logger.warning(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
+    except Exception as e:  # anything else that went wrong while launching the installer
+        logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
+        logger.warning(
+            "Please run 'python -m playwright install' manually after the installation."
+        )
+
+def run_migration():
+    """Initialize the database for post-install setup; failures are logged as warnings, not raised."""
+    try:
+        logger.info("Starting database initialization...", tag="INIT")
+        from crawl4ai.async_database import async_db_manager  # imported here so a failed import is caught by the ImportError handler below
+
+        asyncio.run(async_db_manager.initialize())
+        logger.success("Database initialization completed successfully.", tag="COMPLETE")
+    except ImportError:
+        logger.warning("Database module not found. Will initialize on first use.")
+    except Exception as e:
+        logger.warning(f"Database initialization failed: {e}")
+        logger.warning("Database will be initialized on first use")
--- a/crawl4ai/migrations.py
+++ b/crawl4ai/migrations.py
@@ -9,9 +9,13 @@ import aiofiles
 import shutil
 import time
 from datetime import datetime
+from .async_logger import AsyncLogger, LogLevel

-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+# Initialize logger
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
+
+# logging.basicConfig(level=logging.INFO)
+# logger = logging.getLogger(__name__)

 class DatabaseMigration:
    def __init__(self, db_path: str):
@@ -55,7 +59,8 @@ class DatabaseMigration:

    async def migrate_database(self):
        """Migrate existing database to file-based storage"""
-        logger.info("Starting database migration...")
+        # logger.info("Starting database migration...")
+        logger.info("Starting database migration...", tag="INIT")
        
        try:
            async with aiosqlite.connect(self.db_path) as db:
@@ -91,19 +96,25 @@ class DatabaseMigration:
                    
                    migrated_count += 1
                    if migrated_count % 100 == 0:
-                        logger.info(f"Migrated {migrated_count} records...")
+                        logger.info(f"Migrated {migrated_count} records...", tag="INIT")
+                        

                await db.commit()
-                logger.info(f"Migration completed. {migrated_count} records processed.")
+                logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")

        except Exception as e:
-            logger.error(f"Migration failed: {e}")
-            raise
+            # logger.error(f"Migration failed: {e}")
+            logger.error(
+                message="Migration failed: {error}",
+                tag="ERROR",
+                params={"error": str(e)}
+            )
+            raise e

 async def backup_database(db_path: str) -> str:
    """Create backup of existing database"""
    if not os.path.exists(db_path):
-        logger.info("No existing database found. Skipping backup.")
+        logger.info("No existing database found. Skipping backup.", tag="INIT")
        return None
        
    # Create backup with timestamp
@@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str:
        
        # Create backup
        shutil.copy2(db_path, backup_path)
-        logger.info(f"Database backup created at: {backup_path}")
+        logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
        return backup_path
    except Exception as e:
-        logger.error(f"Backup failed: {e}")
-        raise
+        # Report this as a backup failure (not a migration failure) before re-raising to the caller.
+        logger.error(
+            message="Backup failed: {error}",
+            tag="ERROR",
+            params={"error": str(e)}
+        )
+        raise e
    
 async def run_migration(db_path: Optional[str] = None):
    """Run database migration"""
@@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None):
        db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
    
    if not os.path.exists(db_path):
-        logger.info("No existing database found. Skipping migration.")
+        logger.info("No existing database found. Skipping migration.", tag="INIT")
        return
        
    # Create backup first
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay():


 async def main():
-    # await simple_crawl()
-    # await simple_example_with_running_js_code()
-    # await simple_example_with_css_selector()
+    await simple_crawl()
+    await simple_example_with_running_js_code()
+    await simple_example_with_css_selector()
    # await use_proxy()
-    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    # await extract_structured_data_using_css_extractor()
+    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    await extract_structured_data_using_css_extractor()

    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
    # await extract_structured_data_using_llm("ollama/llama3.2")    
-    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))

    # You always can pass custom headers to the extraction strategy
    # custom_headers = {
@@ -582,9 +582,9 @@ async def main():
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    
-    # await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
-    # await crawl_dynamic_content_pages_method_3()
+    await crawl_dynamic_content_pages_method_1()
+    await crawl_dynamic_content_pages_method_2()
+    await crawl_dynamic_content_pages_method_3()
    
    await crawl_custom_browser_type()
    
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 aiosqlite~=0.20
-html2text~=2024.2
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
@@ -14,3 +13,4 @@ rank-bm25~=0.2
 aiofiles>=24.1.0
 colorama~=0.4
 snowballstemmer~=2.2
+pydantic>=2.10
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,8 @@
 from setuptools import setup, find_packages
-from setuptools.command.install import install
 import os
 from pathlib import Path
 import shutil
-import subprocess
-import sys
-import asyncio
+

 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
@@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"]
 cosine_similarity_requirements = ["torch", "transformers", "nltk"]
 sync_requirements = ["selenium"]

-
-def install_playwright():
-    print("Installing Playwright browsers...")
-    try:
-        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
-        print("Playwright installation completed successfully.")
-    except subprocess.CalledProcessError as e:
-        print(f"Error during Playwright installation: {e}")
-        print(
-            "Please run 'python -m playwright install' manually after the installation."
-        )
-    except Exception as e:
-        print(f"Unexpected error during Playwright installation: {e}")
-        print(
-            "Please run 'python -m playwright install' manually after the installation."
-        )
-
-
-def run_migration():
-    """Initialize database during installation"""
-    try:
-        print("Starting database initialization...")
-        from crawl4ai.async_database import async_db_manager
-
-        asyncio.run(async_db_manager.initialize())
-        print("Database initialization completed successfully.")
-    except ImportError:
-        print("Warning: Database module not found. Will initialize on first use.")
-    except Exception as e:
-        print(f"Warning: Database initialization failed: {e}")
-        print("Database will be initialized on first use")
-
-
-class PostInstallCommand(install):
-    def run(self):
-        install.run(self)
-        install_playwright()
-        # run_migration()
-
-
 setup(
    name="Crawl4AI",
    version=version,
@@ -116,7 +73,8 @@ setup(
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
-            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
+            "crawl4ai-migrate=crawl4ai.migrations:main",  
+            'crawl4ai-setup=crawl4ai.install:post_install', 
        ],
    },
    classifiers=[
@@ -130,7 +88,4 @@ setup(
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
-    cmdclass={
-        "install": PostInstallCommand,
-    },
 )