crawl4ai/crawl4ai/install.py

import subprocess
import sys
import asyncio
from .async_logger import AsyncLogger, LogLevel
from pathlib import Path
import os
import shutil

# Initialize logger
logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True)

def setup_home_directory():
    """Set up the .crawl4ai folder structure in the user's home directory."""
    base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
    crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
    crawl4ai_config = crawl4ai_folder / "global.yml"
    crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
    cache_folder = crawl4ai_folder / "cache"
    content_folders = [
        "html_content",
        "cleaned_html",
        "markdown_content",
        "extracted_content",
        "screenshots",
    ]

    # Clean up old cache if exists
    if cache_folder.exists():
        shutil.rmtree(cache_folder)

    # Create new folder structure
    crawl4ai_folder.mkdir(exist_ok=True)
    cache_folder.mkdir(exist_ok=True)
    for folder in content_folders:
        (crawl4ai_folder / folder).mkdir(exist_ok=True)

    # If config file does not exist, create it
    if not crawl4ai_config.exists():
        with open(crawl4ai_config, "w") as f:
            f.write("")

def post_install():
    """Run all post-installation tasks"""
    logger.info("Running post-installation setup...", tag="INIT")
    setup_home_directory()
    install_playwright()
    run_migration()
    logger.success("Post-installation setup completed!", tag="COMPLETE")


def install_playwright():
    logger.info("Installing Playwright browsers...", tag="INIT")
    try:
        # subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps", "--force", "chrome"])
        subprocess.check_call(
            [
                sys.executable,
                "-m",
                "playwright",
                "install",
                "--with-deps",
                "--force",
                "chromium",
            ]
        )
        logger.success(
            "Playwright installation completed successfully.", tag="COMPLETE"
        )
    except subprocess.CalledProcessError:
        # logger.error(f"Error during Playwright installation: {e}", tag="ERROR")
        logger.warning(
            f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
        )
    except Exception:
        # logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR")
        logger.warning(
            f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
        )


def run_migration():
    """Initialize database during installation"""
    try:
        logger.info("Starting database initialization...", tag="INIT")
        from crawl4ai.async_database import async_db_manager

        asyncio.run(async_db_manager.initialize())
        logger.success(
            "Database initialization completed successfully.", tag="COMPLETE"
        )
    except ImportError:
        logger.warning("Database module not found. Will initialize on first use.")
    except Exception as e:
        logger.warning(f"Database initialization failed: {e}")
        logger.warning("Database will be initialized on first use")


async def run_doctor():
    """Test if Crawl4AI is working properly"""
    logger.info("Running Crawl4AI health check...", tag="INIT")
    try:
        from .async_webcrawler import (
            AsyncWebCrawler,
            BrowserConfig,
            CrawlerRunConfig,
            CacheMode,
        )

        browser_config = BrowserConfig(
            headless=True,
            browser_type="chromium",
            ignore_https_errors=True,
            light_mode=True,
            viewport_width=1280,
            viewport_height=720,
        )

        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            screenshot=True,
        )

        async with AsyncWebCrawler(config=browser_config) as crawler:
            logger.info("Testing crawling capabilities...", tag="TEST")
            result = await crawler.arun(url="https://crawl4ai.com", config=run_config)

            if result and result.markdown:
                logger.success("✅ Crawling test passed!", tag="COMPLETE")
                return True
            else:
                raise Exception("Failed to get content")

    except Exception as e:
        logger.error(f"❌ Test failed: {e}", tag="ERROR")
        return False


def doctor():
    """Entry point for the doctor command"""
    import asyncio

    asyncio.run(run_doctor())
    sys.exit(0)