Add all 5 deployments solution for testing

This commit is contained in:
UncleCode
2025-03-10 18:57:14 +08:00
parent 9547bada3a
commit 3ea3c0520d
38 changed files with 6431 additions and 0 deletions

104
deploy/lambda/Dockerfile Normal file
View File

@@ -0,0 +1,104 @@
# ---- Stage 1: dependency resolution (poetry export) -----------------------
FROM python:3.12-bookworm AS python-builder

RUN pip install poetry

ENV POETRY_NO_INTERACTION=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

COPY pyproject.toml poetry.lock ./
# Export the locked dependency set to pip's requirements format; the poetry
# cache is mounted as a BuildKit cache so repeated builds stay fast.
# NOTE(review): the requirements.txt produced here is never COPY'd into the
# runtime stage below — this builder stage appears to be dead; confirm intent.
RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry export -f requirements.txt -o requirements.txt

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    python3-dev \
    python3-setuptools \
    python3-wheel \
    python3-pip \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Install specific dependencies that have build issues
# NOTE(review): cchardet is known not to build on Python 3.12 — presumably
# why faust-cchardet (a maintained fork) is installed in the runtime stage
# below; confirm this line actually succeeds.
RUN pip install --no-cache-dir cchardet

# ---- Stage 2: Lambda runtime image ----------------------------------------
FROM python:3.12-bookworm

# Install AWS Lambda Runtime Interface Client
RUN python3 -m pip install --no-cache-dir awslambdaric

# General build/runtime tooling used by the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    wget \
    gnupg \
    git \
    cmake \
    pkg-config \
    python3-dev \
    libjpeg-dev \
    redis-server \
    supervisor \
    && rm -rf /var/lib/apt/lists/*

# Shared libraries required to run Chromium headless (normally pulled in by
# `playwright install --with-deps`; installed explicitly here instead).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Install build essentials for any compilations needed
# NOTE(review): build-essential and python3-dev were already installed above;
# this layer is redundant.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    python3-dev \
    && rm -rf /var/lib/apt/lists/*

# Set up function directory and browser path
ARG FUNCTION_DIR="/function"
RUN mkdir -p "${FUNCTION_DIR}/pw-browsers"
# /tmp is the only writable path inside a running Lambda.
RUN mkdir -p "/tmp/.crawl4ai"

# Set critical environment variables: browsers live in the (read-only at
# runtime) image under /function, while HOME and crawl4ai's base directory
# must point at writable /tmp.
ENV PLAYWRIGHT_BROWSERS_PATH="${FUNCTION_DIR}/pw-browsers" \
    HOME="/tmp" \
    CRAWL4_AI_BASE_DIRECTORY="/tmp/.crawl4ai"

# Create Crawl4ai base directory
RUN mkdir -p ${CRAWL4_AI_BASE_DIRECTORY}

# faust-cchardet: maintained cchardet fork that builds on modern Pythons.
RUN pip install --no-cache-dir faust-cchardet

# Install Crawl4ai and dependencies
RUN pip install --no-cache-dir git+https://github.com/unclecode/crawl4ai.git@next

# Install Chromium only (no deps flag) — system libs were installed above.
RUN playwright install chromium

# Copy function code
COPY lambda_function.py ${FUNCTION_DIR}/

# Set working directory
WORKDIR ${FUNCTION_DIR}

# Run the Runtime Interface Client with our handler as the entry point.
ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
CMD [ "lambda_function.handler" ]

1081
deploy/lambda/deploy.py Normal file

File diff suppressed because it is too large Load Diff

345
deploy/lambda/guide.md Normal file
View File

@@ -0,0 +1,345 @@
# Deploying Crawl4ai on AWS Lambda
This guide walks you through deploying Crawl4ai as an AWS Lambda function with API Gateway integration. You'll learn how to set up, test, and clean up your deployment.
## Prerequisites
Before you begin, ensure you have:
- AWS CLI installed and configured (`aws configure`)
- Docker installed and running
- Python 3.8+ installed
- Basic familiarity with AWS services
## Project Files
Your project directory should contain:
- `Dockerfile`: Container configuration for Lambda
- `lambda_function.py`: Lambda handler code
- `deploy.py`: Our deployment script
## Step 1: Install Required Python Packages
Install the Python packages needed for our deployment script:
```bash
pip install typer rich
```
## Step 2: Run the Deployment Script
Our Python script automates the entire deployment process:
```bash
python deploy.py
```
The script will guide you through:
1. Configuration setup (AWS region, function name, memory allocation)
2. Docker image building
3. ECR repository creation
4. Lambda function deployment
5. API Gateway setup
6. Provisioned concurrency configuration (optional)
Follow the prompts and confirm each step by pressing Enter.
## Step 3: Manual Deployment (Alternative to the Script)
If you prefer to deploy manually or understand what the script does, follow these steps:
### Building and Pushing the Docker Image
```bash
# Build the Docker image
docker build -t crawl4ai-lambda .
# Create an ECR repository (if it doesn't exist)
aws ecr create-repository --repository-name crawl4ai-lambda
# Get ECR login password and login
aws ecr get-login-password | docker login --username AWS --password-stdin $(aws sts get-caller-identity --query Account --output text).dkr.ecr.us-east-1.amazonaws.com
# Tag the image
ECR_URI=$(aws ecr describe-repositories --repository-names crawl4ai-lambda --query 'repositories[0].repositoryUri' --output text)
docker tag crawl4ai-lambda:latest $ECR_URI:latest
# Push the image to ECR
docker push $ECR_URI:latest
```
### Creating the Lambda Function
```bash
# Get IAM role ARN (create it if needed)
ROLE_ARN=$(aws iam get-role --role-name lambda-execution-role --query 'Role.Arn' --output text)
# Create Lambda function
aws lambda create-function \
--function-name crawl4ai-function \
--package-type Image \
--code ImageUri=$ECR_URI:latest \
--role $ROLE_ARN \
--timeout 300 \
--memory-size 4096 \
--ephemeral-storage Size=10240 \
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
```
If you're updating an existing function:
```bash
# Update function code
aws lambda update-function-code \
--function-name crawl4ai-function \
--image-uri $ECR_URI:latest
# Update function configuration
aws lambda update-function-configuration \
--function-name crawl4ai-function \
--timeout 300 \
--memory-size 4096 \
--ephemeral-storage Size=10240 \
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
```
### Setting Up API Gateway
```bash
# Create API Gateway
API_ID=$(aws apigateway create-rest-api --name crawl4ai-api --query 'id' --output text)
# Get root resource ID
PARENT_ID=$(aws apigateway get-resources --rest-api-id $API_ID --query 'items[?path==`/`].id' --output text)
# Create resource
RESOURCE_ID=$(aws apigateway create-resource --rest-api-id $API_ID --parent-id $PARENT_ID --path-part "crawl" --query 'id' --output text)
# Create POST method
aws apigateway put-method --rest-api-id $API_ID --resource-id $RESOURCE_ID --http-method POST --authorization-type NONE
# Get Lambda function ARN
LAMBDA_ARN=$(aws lambda get-function --function-name crawl4ai-function --query 'Configuration.FunctionArn' --output text)
# Set Lambda integration
aws apigateway put-integration \
--rest-api-id $API_ID \
--resource-id $RESOURCE_ID \
--http-method POST \
--type AWS_PROXY \
--integration-http-method POST \
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ARN/invocations
# Deploy API
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
# Set Lambda permission
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
aws lambda add-permission \
--function-name crawl4ai-function \
--statement-id apigateway \
--action lambda:InvokeFunction \
--principal apigateway.amazonaws.com \
--source-arn "arn:aws:execute-api:us-east-1:$ACCOUNT_ID:$API_ID/*/POST/crawl"
```
### Setting Up Provisioned Concurrency (Optional)
This reduces cold starts:
```bash
# Publish a version
VERSION=$(aws lambda publish-version --function-name crawl4ai-function --query 'Version' --output text)
# Create alias
aws lambda create-alias \
--function-name crawl4ai-function \
--name prod \
--function-version $VERSION
# Configure provisioned concurrency
aws lambda put-provisioned-concurrency-config \
--function-name crawl4ai-function \
--qualifier prod \
--provisioned-concurrent-executions 2
# Update API Gateway to use alias
LAMBDA_ALIAS_ARN="arn:aws:lambda:us-east-1:$ACCOUNT_ID:function:crawl4ai-function:prod"
aws apigateway put-integration \
--rest-api-id $API_ID \
--resource-id $RESOURCE_ID \
--http-method POST \
--type AWS_PROXY \
--integration-http-method POST \
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ALIAS_ARN/invocations
# Redeploy API Gateway
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
```
## Step 4: Testing the Deployment
Once deployed, test your function with:
```bash
ENDPOINT_URL="https://$API_ID.execute-api.us-east-1.amazonaws.com/prod/crawl"
# Test with curl
curl -X POST $ENDPOINT_URL \
-H "Content-Type: application/json" \
-d '{"url":"https://example.com"}'
```
Or using Python:
```python
import requests
import json
url = "https://your-api-id.execute-api.us-east-1.amazonaws.com/prod/crawl"
payload = {
"url": "https://example.com",
"browser_config": {
"headless": True,
"verbose": False
},
"crawler_config": {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
}
response = requests.post(url, json=payload)
result = response.json()
print(json.dumps(result, indent=2))
```
## Step 5: Cleaning Up Resources
To remove all AWS resources created for this deployment:
```bash
python deploy.py cleanup
```
Or manually:
```bash
# Delete API Gateway
aws apigateway delete-rest-api --rest-api-id $API_ID
# Remove provisioned concurrency (if configured)
aws lambda delete-provisioned-concurrency-config \
--function-name crawl4ai-function \
--qualifier prod
# Delete alias (if created)
aws lambda delete-alias \
--function-name crawl4ai-function \
--name prod
# Delete Lambda function
aws lambda delete-function --function-name crawl4ai-function
# Delete ECR repository
aws ecr delete-repository --repository-name crawl4ai-lambda --force
```
## Troubleshooting
### Cold Start Issues
If experiencing long cold starts:
- Enable provisioned concurrency
- Increase memory allocation (4096 MB recommended)
- Ensure the Lambda function has enough ephemeral storage
### Permission Errors
If you encounter permission errors:
- Check the IAM role has the necessary permissions
- Ensure API Gateway has permission to invoke the Lambda function
### Container Size Issues
If your container is too large:
- Optimize the Dockerfile
- Use multi-stage builds
- Consider removing unnecessary dependencies
## Performance Considerations
- Lambda memory affects CPU allocation - higher memory means faster execution
- Provisioned concurrency eliminates cold starts but costs more
- Optimize the Playwright setup for faster browser initialization
## Security Best Practices
- Use the principle of least privilege for IAM roles
- Implement API Gateway authentication for production deployments
- Consider using AWS KMS for storing sensitive configuration
## Useful AWS Console Links
Here are quick links to access important AWS console pages for monitoring and managing your deployment:
| Resource | Console Link |
|----------|-------------|
| Lambda Functions | [AWS Lambda Console](https://console.aws.amazon.com/lambda/home#/functions) |
| Lambda Function Logs | [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups) |
| API Gateway | [API Gateway Console](https://console.aws.amazon.com/apigateway/home) |
| ECR Repositories | [ECR Console](https://console.aws.amazon.com/ecr/repositories) |
| IAM Roles | [IAM Console](https://console.aws.amazon.com/iamv2/home#/roles) |
| CloudWatch Metrics | [CloudWatch Metrics](https://console.aws.amazon.com/cloudwatch/home#metricsV2) |
### Monitoring Lambda Execution
To monitor your Lambda function:
1. Go to the [Lambda function console](https://console.aws.amazon.com/lambda/home#/functions)
2. Select your function (`crawl4ai-function`)
3. Click the "Monitor" tab to see:
- Invocation metrics
- Success/failure rates
- Duration statistics
### Viewing Lambda Logs
To see detailed execution logs:
1. Go to [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups)
2. Find the log group named `/aws/lambda/crawl4ai-function`
3. Click to see the latest log streams
4. Each stream contains logs from a function execution
### Checking API Gateway Traffic
To monitor API requests:
1. Go to the [API Gateway console](https://console.aws.amazon.com/apigateway/home)
2. Select your API (`crawl4ai-api`)
3. Click "Dashboard" to see:
- API calls
- Latency
- Error rates
## Conclusion
You now have Crawl4ai running as a serverless function on AWS Lambda! This setup allows you to crawl websites on-demand without maintaining infrastructure, while paying only for the compute time you use.

deploy/lambda/lambda_function.py Normal file
View File

@@ -0,0 +1,107 @@
import json
import asyncio
import os

# Ensure environment variables and directories are set.
# Lambda's filesystem is read-only except /tmp, so everything writable must be
# redirected there BEFORE crawl4ai is imported — presumably it reads these at
# import time; confirm against crawl4ai internals.
os.environ['CRAWL4_AI_BASE_DIRECTORY'] = '/tmp/.crawl4ai'
os.environ['HOME'] = '/tmp'
# Create directory if it doesn't exist
os.makedirs('/tmp/.crawl4ai', exist_ok=True)

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode
)
def handler(event, context):
    """AWS Lambda entry point: crawl a URL supplied in the request body.

    Expects an API Gateway proxy event whose ``body`` is a JSON object:
    ``{"url": ..., "browser_config": {...}, "crawler_config": {...}}``.

    Returns an API Gateway proxy response dict:
      * 200 with the serialized crawl result on success,
      * 400 when no URL is provided,
      * 500 (with traceback, for this testing deployment) on any other error.
    """
    try:
        # API Gateway sends the payload as a JSON string, but it is None for
        # an empty request, and direct invocations may pass an already-parsed
        # dict — handle all three instead of crashing on json.loads(None).
        raw_body = event.get('body') or '{}'
        body = raw_body if isinstance(raw_body, dict) else json.loads(raw_body)

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'URL is required'})
            }

        # Get optional configurations or use defaults
        browser_config_dict = body.get('browser_config', {})
        crawler_config_dict = body.get('crawler_config', {})

        # Lambda handlers are synchronous; drive the async crawl to completion.
        result = asyncio.run(crawl(url, browser_config_dict, crawler_config_dict))

        # Return successful response
        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json'
            },
            'body': json.dumps(result)
        }
    except Exception as e:
        # Surface the error and traceback to the caller; acceptable while this
        # deployment is for testing, but strip the traceback for production.
        import traceback
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': str(e),
                'traceback': traceback.format_exc()
            })
        }
async def crawl(url, browser_config_dict, crawler_config_dict):
    """Crawl ``url`` with Lambda-safe settings layered over the user config.

    The caller-supplied browser config is parsed (so malformed input fails
    fast) but its values are superseded by hardened Lambda defaults. From the
    crawler config, only the external-link exclusion flag and the markdown
    generator are honoured; caching is always bypassed.
    """
    # Parse the user's browser config up-front for validation only — load()
    # raises on malformed input, but the values are not used further.
    if browser_config_dict:
        user_browser_config = BrowserConfig.load(browser_config_dict)
    else:
        user_browser_config = BrowserConfig()

    # Chromium flags needed to run headless inside the Lambda sandbox.
    sandbox_args = [
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-setuid-sandbox",
        "--remote-allow-origins=*",
        "--autoplay-policy=user-gesture-required",
        "--single-process",
    ]
    lambda_browser_config = BrowserConfig(
        verbose=True,
        browser_type="chromium",
        headless=True,
        user_agent_mode="random",
        light_mode=True,
        use_managed_browser=False,
        extra_args=sandbox_args,
    )

    if crawler_config_dict:
        user_crawler_config = CrawlerRunConfig.load(crawler_config_dict)
    else:
        user_crawler_config = CrawlerRunConfig()

    # Force Lambda-specific run settings, carrying over selected user options.
    run_config = CrawlerRunConfig(
        exclude_external_links=user_crawler_config.exclude_external_links,
        remove_overlay_elements=True,
        magic=True,
        cache_mode=CacheMode.BYPASS,
        markdown_generator=user_crawler_config.markdown_generator,
    )

    # Perform the crawl and hand back a JSON-serializable dict.
    async with AsyncWebCrawler(config=lambda_browser_config) as crawler:
        page_result = await crawler.arun(url=url, config=run_config)
        return page_result.model_dump()