crawl4ai/tests/check_dependencies.py

#!/usr/bin/env python3
"""
Dependency checker for Crawl4AI
Analyzes imports in the codebase and shows which files use them
"""

import ast
import os
import sys
from pathlib import Path
from typing import Set, Dict, List, Tuple
from collections import defaultdict
import re
import toml

# Standard library modules to ignore.
#
# The hand-maintained literal below is the baseline and is also what older
# interpreters rely on. It intentionally keeps names of modules that were
# removed from recent Python versions, since scanned code may still import them.
STDLIB_MODULES = {
    'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib',
    'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 'glob', 'hashlib',
    'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes',
    'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random',
    're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys',
    'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib',
    'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets',
    'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser',
    'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc',
    'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource',
    'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2',
    'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit',
    'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib',
    'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache',
    'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter',
    '__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref',
    'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions',
    'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput',
    'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil',
    'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip',
    'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib',
    'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse',
    'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes',
    'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue',
    'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap',
    'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu',
    'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http',
    'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib',
    'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau',
    'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext',
    'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest',
    'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp',
    'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace',
    'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi',
    'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse',
    'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib',
    'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast'
}

# The literal above is incomplete (e.g. it lacks 'calendar', 'difflib' and
# 'unicodedata'), which would make those modules show up as undeclared
# third-party imports. When the running interpreter exposes its own list of
# standard library module names (Python 3.10+), merge it in.
STDLIB_MODULES |= set(getattr(sys, 'stdlib_module_names', ()))

# Import name -> distribution (package) name, for modules whose import name
# cannot be turned into the declared package name by simple normalization.
# Entries whose key equals their value are still used by the report, which
# prints a "Package name" hint for every import found in this table.
PACKAGE_MAPPINGS = dict([
    ('bs4', 'beautifulsoup4'),
    ('PIL', 'pillow'),
    ('cv2', 'opencv-python'),
    ('sklearn', 'scikit-learn'),
    ('yaml', 'PyYAML'),
    ('OpenSSL', 'pyOpenSSL'),
    ('sqlalchemy', 'SQLAlchemy'),
    ('playwright', 'playwright'),
    ('patchright', 'patchright'),
    ('dotenv', 'python-dotenv'),
    ('fake_useragent', 'fake-useragent'),
    ('playwright_stealth', 'tf-playwright-stealth'),
    ('sentence_transformers', 'sentence-transformers'),
    ('rank_bm25', 'rank-bm25'),
    ('snowballstemmer', 'snowballstemmer'),
    ('pypdf', 'pypdf'),
    ('pdf2image', 'pdf2image'),
])


class ImportVisitor(ast.NodeVisitor):
    """Collect top-level module names, with line numbers, from import statements.

    After visiting a tree, ``imports`` holds the modules named by
    ``import ...`` statements and ``from_imports`` those named by absolute
    ``from ... import ...`` statements. Both map the first dotted component
    of the module name to the list of line numbers where it was seen.
    """

    def __init__(self):
        self.imports = {}
        self.from_imports = {}

    def visit_Import(self, node):
        """Record every module listed in an ``import a.b, c`` statement."""
        for alias in node.names:
            top_level, _, _ = alias.name.partition('.')
            self.imports.setdefault(top_level, []).append(node.lineno)

    def visit_ImportFrom(self, node):
        """Record the source module of an absolute ``from x import y``."""
        # Relative imports (level > 0, or no module name at all as in
        # ``from . import x``) are not recorded.
        if not node.module or node.level != 0:
            return
        top_level = node.module.partition('.')[0]
        self.from_imports.setdefault(top_level, []).append(node.lineno)


def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]:
    """Extract all imports from a Python file with line numbers.

    Args:
        filepath: Path of the Python source file to analyze.

    Returns:
        A dict mapping each imported top-level module name to the line
        numbers of the statements importing it (plain ``import`` statements
        first, then absolute ``from ... import`` statements). An empty dict
        is returned when the file cannot be read or parsed.
    """
    all_imports: Dict[str, List[int]] = {}

    try:
        # Pass raw bytes to the parser so that a UTF-8 BOM or a PEP 263
        # encoding declaration in the file is honored instead of assuming
        # plain UTF-8 text.
        with open(filepath, 'rb') as f:
            source = f.read()

        tree = ast.parse(source, filename=str(filepath))
        visitor = ImportVisitor()
        visitor.visit(tree)
    except (OSError, SyntaxError, ValueError, RecursionError) as exc:
        # Best effort: an unreadable or unparsable file must not abort the
        # scan, but say so on stderr so that the report is not silently
        # incomplete.
        print(f"Warning: skipping {filepath}: {exc}", file=sys.stderr)
        return all_imports

    # Merge plain imports and from-imports into a single mapping
    for found in (visitor.imports, visitor.from_imports):
        for module, lines in found.items():
            all_imports.setdefault(module, []).extend(lines)

    return all_imports


def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]:
    """Get all imports from the crawl4ai library and docs folders.

    Args:
        root_dir: Directory containing the ``crawl4ai`` and ``docs`` folders.
            A folder that does not exist is skipped.

    Returns:
        A dict mapping each imported top-level module name to a list of
        ``(path relative to root_dir, sorted line numbers)`` tuples, one per
        file importing it. Files are listed in sorted path order within each
        folder, ``crawl4ai`` before ``docs``.
    """
    import_to_files = defaultdict(list)

    # Files that are never scanned, wherever they are found
    skipped_names = {'setup.py', 'setup.cfg', 'conf.py'}

    # Only scan crawl4ai library folder and docs folder
    for target_dir in (root_dir / 'crawl4ai', root_dir / 'docs'):
        if not target_dir.exists():
            continue

        # rglob() yields files in filesystem order; sort them so that the
        # result (and the "first few files" shown in the report) is the same
        # on every run and every machine.
        for py_file in sorted(target_dir.rglob('*.py')):
            if '__pycache__' in py_file.parts or py_file.name in skipped_names:
                continue

            imports = extract_imports_from_file(py_file)
            if not imports:
                continue

            # Same for every import of this file, so compute it only once
            relative_path = str(py_file.relative_to(root_dir))
            for imp, line_numbers in imports.items():
                import_to_files[imp].append((relative_path, sorted(line_numbers)))

    return dict(import_to_files)


# Project name at the start of a requirement string (PEP 508): letters and
# digits, with '.', '_' and '-' allowed in between.
_REQUIREMENT_NAME_RE = re.compile(r'^([A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?)')


def _requirement_name(requirement):
    """Return the lowercased project name of a requirement string, or None.

    Version specifiers, extras and environment markers are ignored, e.g.
    "numpy>=1.26.0,<3" gives "numpy". None is returned for anything that does
    not start with a project name, such as pip options ("-r other.txt",
    "-e .", "--index-url ...") or relative paths.
    """
    match = _REQUIREMENT_NAME_RE.match(requirement.strip())
    return match.group(1).lower() if match else None


def get_declared_dependencies() -> Set[str]:
    """Get declared dependencies from pyproject.toml and requirements.txt.

    Both files are looked up in the current working directory; a missing file
    is ignored. From pyproject.toml, ``[project] dependencies`` and every
    group of ``[project.optional-dependencies]`` are read.

    Returns:
        The set of lowercased project names declared in either file.
    """
    requirements = []

    # Read from pyproject.toml
    if Path('pyproject.toml').exists():
        with open('pyproject.toml', 'r', encoding='utf-8') as f:
            project = toml.load(f).get('project', {})

        # Main dependencies, then every optional dependency group
        requirements.extend(project.get('dependencies', []))
        for group_deps in project.get('optional-dependencies', {}).values():
            requirements.extend(group_deps)

    # Also check requirements.txt as backup
    if Path('requirements.txt').exists():
        with open('requirements.txt', 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and comments; option lines are rejected by
                # _requirement_name because they start with '-'
                if line and not line.startswith('#'):
                    requirements.append(line)

    names = (_requirement_name(requirement) for requirement in requirements)
    return {name for name in names if name}


def normalize_package_name(name: str) -> str:
    """Normalize an import or package name for comparison.

    Names listed in PACKAGE_MAPPINGS are first replaced by their package
    name. The result is then lowercased and every run of '-', '_' and '.' is
    collapsed into a single '-' (the PEP 503 normalization), so that for
    example "Rank_BM25", "rank-bm25" and "rank.bm25" all compare equal.

    Args:
        name: Import name or declared package name.

    Returns:
        The normalized name.
    """
    # Known mappings take precedence over the generic normalization
    package_name = PACKAGE_MAPPINGS.get(name, name)
    return re.sub(r'[-_.]+', '-', package_name).lower()


# Declared packages that are commonly not imported directly
_INDIRECT_DEPS = frozenset(
    {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'}
)

# Maximum number of files, and of line numbers per file, listed per import
_MAX_LISTED = 3


def _is_declared(normalized_imp, normalized_declared):
    """Return True when an import is covered by a declared dependency.

    An import is covered when its normalized name equals a declared name,
    starts with a declared name followed by '.', or is a prefix of a
    declared name.
    """
    return any(
        normalized_imp == declared
        or normalized_imp.startswith(declared + '.')
        or declared.startswith(normalized_imp)
        for declared in normalized_declared
    )


def _print_undeclared_imports(not_declared):
    """Print every undeclared import with up to three files that use it."""
    print("❌ External imports NOT declared in pyproject.toml or requirements.txt:\n")

    for imp in sorted(not_declared):
        file_info = not_declared[imp]
        print(f"  📦 {imp}")
        if imp in PACKAGE_MAPPINGS:
            print(f"     → Package name: {PACKAGE_MAPPINGS[imp]}")

        for file_path, line_numbers in file_info[:_MAX_LISTED]:
            if len(line_numbers) == 1:
                # "path:line" form, usually clickable in terminals and IDEs
                print(f"     - {file_path}:{line_numbers[0]}")
                continue

            # Several imports in the same file: show only the first few lines
            line_str = ','.join(str(ln) for ln in line_numbers[:_MAX_LISTED])
            if len(line_numbers) > _MAX_LISTED:
                line_str += f"... ({len(line_numbers)} imports)"
            print(f"     - {file_path}: lines {line_str}")

        if len(file_info) > _MAX_LISTED:
            print(f"     ... and {len(file_info) - _MAX_LISTED} more files")
        print()


def _find_unused_dependencies(declared_deps, used_packages):
    """Return the declared dependencies that no external import seems to use.

    A dependency counts as used when its normalized name equals, starts with,
    or is a prefix of the normalized name of any external import. Packages in
    _INDIRECT_DEPS are never reported.
    """
    unused = []
    for dep in declared_deps:
        normalized_dep = normalize_package_name(dep)
        is_used = any(
            used == normalized_dep
            or used.startswith(normalized_dep)
            or normalized_dep.startswith(used)
            for used in used_packages
        )
        if not is_used and normalized_dep not in _INDIRECT_DEPS:
            unused.append(dep)
    return unused


def check_missing_dependencies():
    """Main function to check for missing dependencies.

    Scans the ``crawl4ai`` and ``docs`` folders of the current working
    directory, compares the external imports found there with the
    dependencies declared in pyproject.toml / requirements.txt, and prints a
    report of undeclared imports and of declared dependencies for which no
    import was found. Nothing is returned.
    """
    print("🔍 Analyzing crawl4ai library and docs folders...\n")

    # Get all imports with their file locations
    root_dir = Path('.')
    import_to_files = get_codebase_imports_with_files(root_dir)

    # Get declared dependencies, raw and normalized
    declared_deps = get_declared_dependencies()
    normalized_declared = {normalize_package_name(dep) for dep in declared_deps}

    # Known local packages: any import whose name starts with one of these
    # is treated as part of the project itself
    local_prefixes = ('crawl4ai',)

    # External imports are those that are neither standard library nor local
    external_imports = {
        imp: file_info
        for imp, file_info in import_to_files.items()
        if imp not in STDLIB_MODULES and not imp.startswith(local_prefixes)
    }

    # External imports not covered by any declared dependency
    not_declared = {
        imp: file_info
        for imp, file_info in external_imports.items()
        if not _is_declared(normalize_package_name(imp), normalized_declared)
    }

    # Print results
    print("📊 Summary:")
    print(f"  - Total unique imports: {len(import_to_files)}")
    print(f"  - External imports: {len(external_imports)}")
    print(f"  - Declared dependencies: {len(declared_deps)}")
    print(f"  - External imports NOT in dependencies: {len(not_declared)}\n")

    if not_declared:
        _print_undeclared_imports(not_declared)

    # Check for potentially unused dependencies
    print("\n🔎 Checking declared dependencies usage...\n")

    used_packages = {normalize_package_name(imp) for imp in external_imports}
    unused = _find_unused_dependencies(declared_deps, used_packages)

    if unused:
        print("⚠️  Declared dependencies with NO imports found:")
        for dep in sorted(unused):
            print(f"  - {dep}")
        print("\n  Note: These might be used indirectly or by other dependencies")
    else:
        print("✅ All declared dependencies have corresponding imports")

    print("\n" + "="*60)
    print("💡 How to use this report:")
    print("  1. Check each ❌ import to see if it's legitimate")
    print("  2. If legitimate, add the package to pyproject.toml")
    print("  3. If it's an internal module or typo, fix the import")
    print("  4. Review unused dependencies - remove if truly not needed")
    print("="*60)


# Print the dependency report when this file is executed as a script
# (nothing runs when it is merely imported).
if __name__ == '__main__':
    check_missing_dependencies()