feat: add stealth mode and enhance undetected browser support

- Add playwright-stealth integration with enable_stealth parameter in BrowserConfig - Merge undetected browser strategy into main async_crawler_strategy.py using adapter pattern - Add browser adapters (BrowserAdapter, PlaywrightAdapter, UndetectedAdapter) for flexible browser switching - Update install.py to install both playwright and patchright browsers automatically - Add comprehensive documentation for anti-bot features (stealth mode + undetected browser) - Create examples demonstrating stealth mode usage and comparison tests - Update pyproject.toml and requirements.txt with patchright>=1.49.0 and other dependencies - Remove duplicate/unused dependencies (alphashape, cssselect, pyperclip, shapely, selenium) - Add dependency checker tool in tests/check_dependencies.py Breaking changes: None - all existing functionality preserved 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-17 16:59:10 +08:00
parent 5c33cbcca2
commit 6a728cbe5b
27 changed files with 2833 additions and 460 deletions
--- a/tests/check_dependencies.py
+++ b/tests/check_dependencies.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+"""
+Dependency checker for Crawl4AI
+Analyzes imports in the codebase and shows which files use them
+"""
+
+import ast
+import os
+import sys
+from pathlib import Path
+from typing import Set, Dict, List, Tuple
+from collections import defaultdict
+import re
+import toml
+
+# Standard library modules to ignore
+STDLIB_MODULES = {
+    'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib',
+    'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 'glob', 'hashlib',
+    'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes',
+    'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random',
+    're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys',
+    'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib',
+    'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets',
+    'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser',
+    'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc',
+    'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource',
+    'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2',
+    'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit',
+    'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib',
+    'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache',
+    'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter',
+    '__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref',
+    'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions',
+    'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput',
+    'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil',
+    'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip',
+    'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib',
+    'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse',
+    'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes',
+    'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue',
+    'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap',
+    'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu',
+    'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http',
+    'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib',
+    'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau',
+    'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext',
+    'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest',
+    'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp',
+    'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace',
+    'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi', 
+    'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse',
+    'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib',
+    'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast'
+}
+
+# Known package name mappings (import name -> package name)
+PACKAGE_MAPPINGS = {
+    'bs4': 'beautifulsoup4',
+    'PIL': 'pillow',
+    'cv2': 'opencv-python',
+    'sklearn': 'scikit-learn',
+    'yaml': 'PyYAML',
+    'OpenSSL': 'pyOpenSSL',
+    'sqlalchemy': 'SQLAlchemy',
+    'playwright': 'playwright',
+    'patchright': 'patchright',
+    'dotenv': 'python-dotenv',
+    'fake_useragent': 'fake-useragent',
+    'playwright_stealth': 'tf-playwright-stealth',
+    'sentence_transformers': 'sentence-transformers',
+    'rank_bm25': 'rank-bm25',
+    'snowballstemmer': 'snowballstemmer',
+    'PyPDF2': 'PyPDF2',
+    'pdf2image': 'pdf2image',
+}
+
+
+class ImportVisitor(ast.NodeVisitor):
+    """AST visitor to extract imports from Python files"""
+    
+    def __init__(self):
+        self.imports = {}  # Changed to dict to store line numbers
+        self.from_imports = {}
+    
+    def visit_Import(self, node):
+        for alias in node.names:
+            module_name = alias.name.split('.')[0]
+            if module_name not in self.imports:
+                self.imports[module_name] = []
+            self.imports[module_name].append(node.lineno)
+    
+    def visit_ImportFrom(self, node):
+        if node.module and node.level == 0:  # absolute imports only
+            module_name = node.module.split('.')[0]
+            if module_name not in self.from_imports:
+                self.from_imports[module_name] = []
+            self.from_imports[module_name].append(node.lineno)
+
+
+def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]:
+    """Extract all imports from a Python file with line numbers"""
+    all_imports = {}
+    
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            content = f.read()
+        
+        tree = ast.parse(content)
+        visitor = ImportVisitor()
+        visitor.visit(tree)
+        
+        # Merge imports and from_imports
+        for module, lines in visitor.imports.items():
+            if module not in all_imports:
+                all_imports[module] = []
+            all_imports[module].extend(lines)
+            
+        for module, lines in visitor.from_imports.items():
+            if module not in all_imports:
+                all_imports[module] = []
+            all_imports[module].extend(lines)
+        
+    except Exception as e:
+        # Silently skip files that can't be parsed
+        pass
+    
+    return all_imports
+
+
+def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]:
+    """Get all imports from the crawl4ai library and docs folders with file locations and line numbers"""
+    import_to_files = defaultdict(list)
+    
+    # Only scan crawl4ai library folder and docs folder
+    target_dirs = [
+        root_dir / 'crawl4ai',
+        root_dir / 'docs'
+    ]
+    
+    for target_dir in target_dirs:
+        if not target_dir.exists():
+            continue
+            
+        for py_file in target_dir.rglob('*.py'):
+            # Skip __pycache__ directories
+            if '__pycache__' in py_file.parts:
+                continue
+            
+            # Skip setup.py and similar files
+            if py_file.name in ['setup.py', 'setup.cfg', 'conf.py']:
+                continue
+                
+            imports = extract_imports_from_file(py_file)
+            
+            # Map each import to the file and line numbers
+            for imp, line_numbers in imports.items():
+                relative_path = py_file.relative_to(root_dir)
+                import_to_files[imp].append((str(relative_path), sorted(line_numbers)))
+    
+    return dict(import_to_files)
+
+
+def get_declared_dependencies() -> Set[str]:
+    """Get declared dependencies from pyproject.toml and requirements.txt"""
+    declared = set()
+    
+    # Read from pyproject.toml
+    if Path('pyproject.toml').exists():
+        with open('pyproject.toml', 'r') as f:
+            data = toml.load(f)
+        
+        # Get main dependencies
+        deps = data.get('project', {}).get('dependencies', [])
+        for dep in deps:
+            # Parse dependency string (e.g., "numpy>=1.26.0,<3")
+            match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
+            if match:
+                pkg_name = match.group(1).lower()
+                declared.add(pkg_name)
+        
+        # Get optional dependencies
+        optional = data.get('project', {}).get('optional-dependencies', {})
+        for group, deps in optional.items():
+            for dep in deps:
+                match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
+                if match:
+                    pkg_name = match.group(1).lower()
+                    declared.add(pkg_name)
+    
+    # Also check requirements.txt as backup
+    if Path('requirements.txt').exists():
+        with open('requirements.txt', 'r') as f:
+            for line in f:
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    match = re.match(r'^([a-zA-Z0-9_-]+)', line)
+                    if match:
+                        pkg_name = match.group(1).lower()
+                        declared.add(pkg_name)
+    
+    return declared
+
+
+def normalize_package_name(name: str) -> str:
+    """Normalize package name for comparison"""
+    # Handle known mappings first
+    if name in PACKAGE_MAPPINGS:
+        return PACKAGE_MAPPINGS[name].lower()
+    
+    # Basic normalization
+    return name.lower().replace('_', '-')
+
+
+def check_missing_dependencies():
+    """Main function to check for missing dependencies"""
+    print("🔍 Analyzing crawl4ai library and docs folders...\n")
+    
+    # Get all imports with their file locations
+    root_dir = Path('.')
+    import_to_files = get_codebase_imports_with_files(root_dir)
+    
+    # Get declared dependencies
+    declared_deps = get_declared_dependencies()
+    
+    # Normalize declared dependencies
+    normalized_declared = {normalize_package_name(dep) for dep in declared_deps}
+    
+    # Categorize imports
+    external_imports = {}
+    local_imports = {}
+    
+    # Known local packages
+    local_packages = {'crawl4ai'}
+    
+    for imp, file_info in import_to_files.items():
+        # Skip standard library
+        if imp in STDLIB_MODULES:
+            continue
+            
+        # Check if it's a local import
+        if any(imp.startswith(local) for local in local_packages):
+            local_imports[imp] = file_info
+        else:
+            external_imports[imp] = file_info
+    
+    # Check which external imports are not declared
+    not_declared = {}
+    declared_imports = {}
+    
+    for imp, file_info in external_imports.items():
+        normalized_imp = normalize_package_name(imp)
+        
+        # Check if import is covered by declared dependencies
+        found = False
+        for declared in normalized_declared:
+            if normalized_imp == declared or normalized_imp.startswith(declared + '.') or declared.startswith(normalized_imp):
+                found = True
+                break
+        
+        if found:
+            declared_imports[imp] = file_info
+        else:
+            not_declared[imp] = file_info
+    
+    # Print results
+    print(f"📊 Summary:")
+    print(f"  - Total unique imports: {len(import_to_files)}")
+    print(f"  - External imports: {len(external_imports)}")
+    print(f"  - Declared dependencies: {len(declared_deps)}")
+    print(f"  - External imports NOT in dependencies: {len(not_declared)}\n")
+    
+    if not_declared:
+        print("❌ External imports NOT declared in pyproject.toml or requirements.txt:\n")
+        
+        # Sort by import name
+        for imp in sorted(not_declared.keys()):
+            file_info = not_declared[imp]
+            print(f"  📦 {imp}")
+            if imp in PACKAGE_MAPPINGS:
+                print(f"     → Package name: {PACKAGE_MAPPINGS[imp]}")
+            
+            # Show up to 3 files that use this import
+            for i, (file_path, line_numbers) in enumerate(file_info[:3]):
+                # Format line numbers for clickable output
+                if len(line_numbers) == 1:
+                    print(f"     - {file_path}:{line_numbers[0]}")
+                else:
+                    # Show first few line numbers
+                    line_str = ','.join(str(ln) for ln in line_numbers[:3])
+                    if len(line_numbers) > 3:
+                        line_str += f"... ({len(line_numbers)} imports)"
+                    print(f"     - {file_path}: lines {line_str}")
+            
+            if len(file_info) > 3:
+                print(f"     ... and {len(file_info) - 3} more files")
+            print()
+    
+    # Check for potentially unused dependencies
+    print("\n🔎 Checking declared dependencies usage...\n")
+    
+    # Get all used external packages
+    used_packages = set()
+    for imp in external_imports.keys():
+        normalized = normalize_package_name(imp)
+        used_packages.add(normalized)
+    
+    # Find unused
+    unused = []
+    for dep in declared_deps:
+        normalized_dep = normalize_package_name(dep)
+        
+        # Check if any import uses this dependency
+        found_usage = False
+        for used in used_packages:
+            if used == normalized_dep or used.startswith(normalized_dep) or normalized_dep.startswith(used):
+                found_usage = True
+                break
+        
+        if not found_usage:
+            # Some packages are commonly unused directly
+            indirect_deps = {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'}
+            if normalized_dep not in indirect_deps:
+                unused.append(dep)
+    
+    if unused:
+        print("⚠️  Declared dependencies with NO imports found:")
+        for dep in sorted(unused):
+            print(f"  - {dep}")
+        print("\n  Note: These might be used indirectly or by other dependencies")
+    else:
+        print("✅ All declared dependencies have corresponding imports")
+    
+    print("\n" + "="*60)
+    print("💡 How to use this report:")
+    print("  1. Check each ❌ import to see if it's legitimate")
+    print("  2. If legitimate, add the package to pyproject.toml")
+    print("  3. If it's an internal module or typo, fix the import")
+    print("  4. Review unused dependencies - remove if truly not needed")
+    print("="*60)
+
+
+if __name__ == '__main__':
+    check_missing_dependencies()