Enhance installation and migration processes

- Added a post-installation setup script for initialization.
- Updated README with installation notes for Playwright setup.
- Enhanced migration logging for better error visibility.
- Added 'pydantic' to requirements.
- Bumped version to 0.3.746.
2024-11-29 18:48:44 +08:00
parent 12e73d4898
commit d202f3539b
8 changed files with 90 additions and 102 deletions
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 1. Install Crawl4AI:
 ```bash
 pip install crawl4ai
 crawl4ai-setup # Setup the browser
 ```
 2. Run a simple web crawl:
@@ -125,34 +126,6 @@ if __name__ == "__main__":
 ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/)
 ## Features ✨
 - 🆓 Completely free and open-source
 - 🚀 Blazing fast performance, outperforming many paid services
 - 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
 - 🌐 Multi-browser support (Chromium, Firefox, WebKit)
 - 🌍 Supports crawling multiple URLs simultaneously
 - 🎨 Extracts and returns all media tags (Images, Audio, and Video)
 - 🔗 Extracts all external and internal links
 - 📚 Extracts metadata from the page
 - 🔄 Custom hooks for authentication, headers, and page modifications
 - 🕵️ User-agent customization
 - 🖼️ Takes screenshots of pages with enhanced error handling
 - 📜 Executes multiple custom JavaScripts before crawling
 - 📊 Generates structured output without LLM using JsonCssExtractionStrategy
 - 📚 Various chunking strategies: topic-based, regex, sentence, and more
 - 🧠 Advanced extraction strategies: cosine clustering, LLM, and more
 - 🎯 CSS selector support for precise data extraction
 - 📝 Passes instructions/keywords to refine extraction
 - 🔒 Proxy support with authentication for enhanced access
 - 🔄 Session management for complex multi-page crawling
 - 🌐 Asynchronous architecture for improved performance
 - 🖼️ Improved image processing with lazy-loading detection
 - 🕰️ Enhanced handling of delayed content loading
 - 🔑 Custom headers support for LLM interactions
 - 🖼️ iframe content extraction for comprehensive analysis
 - ⏱️ Flexible timeout and delayed content retrieval options
 ## Installation 🛠️
 Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker.
@@ -168,11 +141,12 @@ For basic web crawling and scraping tasks:
 ```bash
 pip install crawl4ai
 crawl4ai-setup # Setup the browser
 ```
 By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling.
-👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
+👉 **Note**: After you install Crawl4AI, running the `crawl4ai-setup` command should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods:
 1. Through the command line:
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode
 from .models import CrawlResult
 from .__version__ import __version__
 # __version__ = "0.3.73"
 __all__ = [
    "AsyncWebCrawler",
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.3.745"
+__version__ = "0.3.746"
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -0,0 +1,44 @@
 import subprocess
 import sys
 import asyncio
 from .async_logger import AsyncLogger, LogLevel
 # Initialize logger
 logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
 def post_install():
    """Run all post-installation tasks.

    Registered in setup.py as the ``crawl4ai-setup`` console script. Installs
    the Playwright browsers and then initializes the database, logging the
    start and end of the sequence.
    """
    logger.info("Running post-installation setup...", tag="INIT")
    # Both helpers catch their own exceptions and only log them, so a failure
    # in one step does not prevent the next step from running.
    install_playwright()
    run_migration()
    # NOTE(review): this success message is logged unconditionally, even when
    # one of the steps above logged an error or warning.
    logger.success("Post-installation setup completed!", tag="COMPLETE")
 def install_playwright():
    """Install the Playwright browsers on a best-effort basis.

    Runs ``<current interpreter> -m playwright install`` as a subprocess.
    Failures are logged together with a hint to run the command manually;
    they are not re-raised, so the caller continues either way.
    """
    logger.info("Installing Playwright browsers...", tag="INIT")
    try:
        # Use sys.executable so the Playwright module of the interpreter that
        # is running this script is the one invoked.
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        logger.success("Playwright installation completed successfully.", tag="COMPLETE")
    except subprocess.CalledProcessError as e:
        # The install command ran but exited with a non-zero status.
        logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
        logger.warning(
            "Please run 'python -m playwright install' manually after the installation."
        )
    except Exception as e:
        # Any other failure (for example, the subprocess could not be started).
        logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
        logger.warning(
            "Please run 'python -m playwright install' manually after the installation."
        )
 def run_migration():
    """Initialize database during installation.

    Runs ``async_db_manager.initialize()`` to completion via ``asyncio.run``.
    Any failure is downgraded to a warning and not re-raised, so setup is
    never aborted by this step.
    """
    try:
        logger.info("Starting database initialization...", tag="INIT")
        # Imported inside the try block so that a missing database module is
        # caught below as ImportError instead of failing at module import time.
        from crawl4ai.async_database import async_db_manager
        asyncio.run(async_db_manager.initialize())
        logger.success("Database initialization completed successfully.", tag="COMPLETE")
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as e:
        # NOTE(review): the messages promise initialization on first use; that
        # fallback is not visible in this module - confirm in async_database.
        logger.warning(f"Database initialization failed: {e}")
        logger.warning("Database will be initialized on first use")
--- a/crawl4ai/migrations.py
+++ b/crawl4ai/migrations.py
@@ -9,9 +9,13 @@ import aiofiles
 import shutil
 import time
 from datetime import datetime
 from .async_logger import AsyncLogger, LogLevel
-logging.basicConfig(level=logging.INFO)
+# Initialize logger
-logger = logging.getLogger(__name__)
+logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)
 # logging.basicConfig(level=logging.INFO)
 # logger = logging.getLogger(__name__)
 class DatabaseMigration:
    def __init__(self, db_path: str):
@@ -55,7 +59,8 @@ class DatabaseMigration:
    async def migrate_database(self):
        """Migrate existing database to file-based storage"""
-        logger.info("Starting database migration...")
+        # logger.info("Starting database migration...")
        logger.info("Starting database migration...", tag="INIT")
        try:
            async with aiosqlite.connect(self.db_path) as db:
@@ -91,19 +96,25 @@ class DatabaseMigration:
                    migrated_count += 1
                    if migrated_count % 100 == 0:
-                        logger.info(f"Migrated {migrated_count} records...")
+                        logger.info(f"Migrated {migrated_count} records...", tag="INIT")
                await db.commit()
-                logger.info(f"Migration completed. {migrated_count} records processed.")
+                logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE")
        except Exception as e:
-            logger.error(f"Migration failed: {e}")
+            # logger.error(f"Migration failed: {e}")
-            raise
+            logger.error(
                message="Migration failed: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
            raise e
 async def backup_database(db_path: str) -> str:
    """Create backup of existing database"""
    if not os.path.exists(db_path):
-        logger.info("No existing database found. Skipping backup.")
+        logger.info("No existing database found. Skipping backup.", tag="INIT")
        return None
    # Create backup with timestamp
@@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str:
        # Create backup
        shutil.copy2(db_path, backup_path)
-        logger.info(f"Database backup created at: {backup_path}")
+        logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE")
        return backup_path
    except Exception as e:
-        logger.error(f"Backup failed: {e}")
+        # logger.error(f"Backup failed: {e}")
-        raise
+        logger.error(
                message="Migration failed: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
        raise e
 async def run_migration(db_path: Optional[str] = None):
    """Run database migration"""
@@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None):
        db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db")
    if not os.path.exists(db_path):
-        logger.info("No existing database found. Skipping migration.")
+        logger.info("No existing database found. Skipping migration.", tag="INIT")
        return
    # Create backup first
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay():
 async def main():
-    # await simple_crawl()
+    await simple_crawl()
-    # await simple_example_with_running_js_code()
+    await simple_example_with_running_js_code()
-    # await simple_example_with_css_selector()
+    await simple_example_with_css_selector()
    # await use_proxy()
-    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    # await extract_structured_data_using_css_extractor()
+    await extract_structured_data_using_css_extractor()
    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
    # await extract_structured_data_using_llm("ollama/llama3.2")    
-    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    # You always can pass custom headers to the extraction strategy
    # custom_headers = {
@@ -582,9 +582,9 @@ async def main():
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
-    # await crawl_dynamic_content_pages_method_1()
+    await crawl_dynamic_content_pages_method_1()
-    # await crawl_dynamic_content_pages_method_2()
+    await crawl_dynamic_content_pages_method_2()
-    # await crawl_dynamic_content_pages_method_3()
+    await crawl_dynamic_content_pages_method_3()
    await crawl_custom_browser_type()
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 aiosqlite~=0.20
 html2text~=2024.2
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
@@ -13,4 +12,5 @@ xxhash~=3.4
 rank-bm25~=0.2
 aiofiles>=24.1.0
 colorama~=0.4
-snowballstemmer~=2.2
+snowballstemmer~=2.2
 pydantic>=2.10
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,8 @@
 from setuptools import setup, find_packages
 from setuptools.command.install import install
 import os
 from pathlib import Path
 import shutil
-import subprocess
+
 import sys
 import asyncio
 # Create the .crawl4ai folder in the user's home directory if it doesn't exist
 # If the folder already exists, remove the cache folder
@@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"]
 cosine_similarity_requirements = ["torch", "transformers", "nltk"]
 sync_requirements = ["selenium"]
 def install_playwright():
    print("Installing Playwright browsers...")
    try:
        subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error during Playwright installation: {e}")
        print(
            "Please run 'python -m playwright install' manually after the installation."
        )
    except Exception as e:
        print(f"Unexpected error during Playwright installation: {e}")
        print(
            "Please run 'python -m playwright install' manually after the installation."
        )
 def run_migration():
    """Initialize database during installation"""
    try:
        print("Starting database initialization...")
        from crawl4ai.async_database import async_db_manager
        asyncio.run(async_db_manager.initialize())
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as e:
        print(f"Warning: Database initialization failed: {e}")
        print("Database will be initialized on first use")
 class PostInstallCommand(install):
    def run(self):
        install.run(self)
        install_playwright()
        # run_migration()
 setup(
    name="Crawl4AI",
    version=version,
@@ -116,7 +73,8 @@ setup(
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
-            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
+            "crawl4ai-migrate=crawl4ai.migrations:main",  
            'crawl4ai-setup=crawl4ai.install:post_install', 
        ],
    },
    classifiers=[
@@ -130,7 +88,4 @@ setup(
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    cmdclass={
        "install": PostInstallCommand,
    },
 )
`@@ -1,2 +1,2 @@`
	`# crawl4ai/_version.py`	`# crawl4ai/_version.py`
	`__version__ = "0.3.745"`	`__version__ = "0.3.746"`