Add all 5 deployments solution for testing
This commit is contained in:
104
deploy/lambda/Dockerfile
Normal file
104
deploy/lambda/Dockerfile
Normal file
@@ -0,0 +1,104 @@
|
||||
# syntax=docker/dockerfile:1
# Builder stage: export a pinned requirements.txt from the Poetry lockfile and
# pre-build packages with native build issues.
FROM python:3.12-bookworm AS python-builder

RUN pip install --no-cache-dir poetry

ENV POETRY_NO_INTERACTION=1 \
    POETRY_CACHE_DIR=/tmp/poetry_cache

WORKDIR /app

# Copy only the manifests so this layer stays cached until they change.
COPY pyproject.toml poetry.lock ./
RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry export -f requirements.txt -o requirements.txt

# Install build dependencies for packages that compile native extensions
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    g++ \
    gcc \
    python3-dev \
    python3-pip \
    python3-setuptools \
    python3-wheel \
    && rm -rf /var/lib/apt/lists/*

# Install specific dependencies that have build issues
RUN pip install --no-cache-dir cchardet


# Runtime stage
FROM python:3.12-bookworm

# Install AWS Lambda Runtime Interface Client
RUN python3 -m pip install --no-cache-dir awslambdaric

# Build/runtime tooling.
# NOTE(review): redis-server and supervisor are installed but nothing in this
# image starts them (the entrypoint runs awslambdaric only) — confirm whether
# they are actually needed.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    cmake \
    curl \
    git \
    gnupg \
    libjpeg-dev \
    pkg-config \
    python3-dev \
    redis-server \
    supervisor \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Shared libraries required by the Playwright Chromium build (the
# `playwright install` step below deliberately omits --with-deps).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libasound2 \
    libatk-bridge2.0-0 \
    libatk1.0-0 \
    libatspi2.0-0 \
    libcairo2 \
    libcups2 \
    libdbus-1-3 \
    libdrm2 \
    libgbm1 \
    libglib2.0-0 \
    libnspr4 \
    libnss3 \
    libpango-1.0-0 \
    libx11-6 \
    libxcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxkbcommon0 \
    libxrandr2 \
    && rm -rf /var/lib/apt/lists/*

# Set up function directory and browser path
ARG FUNCTION_DIR="/function"
RUN mkdir -p "${FUNCTION_DIR}/pw-browsers"

# Set critical environment variables: Lambda's filesystem is read-only except
# /tmp, so HOME and crawl4ai's base directory must live there.
ENV PLAYWRIGHT_BROWSERS_PATH="${FUNCTION_DIR}/pw-browsers" \
    HOME="/tmp" \
    CRAWL4_AI_BASE_DIRECTORY="/tmp/.crawl4ai"

# Create Crawl4ai base directory (note: /tmp is wiped between Lambda sandbox
# starts; lambda_function.py re-creates this at runtime as well)
RUN mkdir -p ${CRAWL4_AI_BASE_DIRECTORY}

# Install the project's locked dependencies exported by the builder stage.
# (Fix: previously the exported requirements.txt was never consumed, which
# made the whole builder stage dead code.)
COPY --from=python-builder /app/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

RUN pip install --no-cache-dir faust-cchardet

# Install Crawl4ai and dependencies
RUN pip install --no-cache-dir git+https://github.com/unclecode/crawl4ai.git@next

# Install Chromium only (system libraries were installed explicitly above, so
# --with-deps is not needed)
RUN playwright install chromium

# Copy function code
COPY lambda_function.py ${FUNCTION_DIR}/

# Set working directory
WORKDIR ${FUNCTION_DIR}

ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
CMD [ "lambda_function.handler" ]
|
||||
1081
deploy/lambda/deploy.py
Normal file
1081
deploy/lambda/deploy.py
Normal file
File diff suppressed because it is too large
Load Diff
345
deploy/lambda/guide.md
Normal file
345
deploy/lambda/guide.md
Normal file
@@ -0,0 +1,345 @@
|
||||
# Deploying Crawl4ai on AWS Lambda
|
||||
|
||||
This guide walks you through deploying Crawl4ai as an AWS Lambda function with API Gateway integration. You'll learn how to set up, test, and clean up your deployment.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure you have:
|
||||
|
||||
- AWS CLI installed and configured (`aws configure`)
|
||||
- Docker installed and running
|
||||
- Python 3.8+ installed
|
||||
- Basic familiarity with AWS services
|
||||
|
||||
## Project Files
|
||||
|
||||
Your project directory should contain:
|
||||
|
||||
- `Dockerfile`: Container configuration for Lambda
|
||||
- `lambda_function.py`: Lambda handler code
|
||||
- `deploy.py`: Our deployment script
|
||||
|
||||
## Step 1: Install Required Python Packages
|
||||
|
||||
Install the Python packages needed for our deployment script:
|
||||
|
||||
```bash
|
||||
pip install typer rich
|
||||
```
|
||||
|
||||
## Step 2: Run the Deployment Script
|
||||
|
||||
Our Python script automates the entire deployment process:
|
||||
|
||||
```bash
|
||||
python deploy.py
|
||||
```
|
||||
|
||||
The script will guide you through:
|
||||
|
||||
1. Configuration setup (AWS region, function name, memory allocation)
|
||||
2. Docker image building
|
||||
3. ECR repository creation
|
||||
4. Lambda function deployment
|
||||
5. API Gateway setup
|
||||
6. Provisioned concurrency configuration (optional)
|
||||
|
||||
Follow the prompts and confirm each step by pressing Enter.
|
||||
|
||||
## Step 3: Manual Deployment (Alternative to the Script)
|
||||
|
||||
If you prefer to deploy manually or understand what the script does, follow these steps:
|
||||
|
||||
### Building and Pushing the Docker Image
|
||||
|
||||
```bash
|
||||
# Build the Docker image
|
||||
docker build -t crawl4ai-lambda .
|
||||
|
||||
# Create an ECR repository (if it doesn't exist)
|
||||
aws ecr create-repository --repository-name crawl4ai-lambda
|
||||
|
||||
# Get ECR login password and login
|
||||
aws ecr get-login-password | docker login --username AWS --password-stdin $(aws sts get-caller-identity --query Account --output text).dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# Tag the image
|
||||
ECR_URI=$(aws ecr describe-repositories --repository-names crawl4ai-lambda --query 'repositories[0].repositoryUri' --output text)
|
||||
docker tag crawl4ai-lambda:latest $ECR_URI:latest
|
||||
|
||||
# Push the image to ECR
|
||||
docker push $ECR_URI:latest
|
||||
```
|
||||
|
||||
### Creating the Lambda Function
|
||||
|
||||
```bash
|
||||
# Get IAM role ARN (create it if needed)
|
||||
ROLE_ARN=$(aws iam get-role --role-name lambda-execution-role --query 'Role.Arn' --output text)
|
||||
|
||||
# Create Lambda function
|
||||
aws lambda create-function \
|
||||
--function-name crawl4ai-function \
|
||||
--package-type Image \
|
||||
--code ImageUri=$ECR_URI:latest \
|
||||
--role $ROLE_ARN \
|
||||
--timeout 300 \
|
||||
--memory-size 4096 \
|
||||
--ephemeral-storage Size=10240 \
|
||||
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
|
||||
```
|
||||
|
||||
If you're updating an existing function:
|
||||
|
||||
```bash
|
||||
# Update function code
|
||||
aws lambda update-function-code \
|
||||
--function-name crawl4ai-function \
|
||||
--image-uri $ECR_URI:latest
|
||||
|
||||
# Update function configuration
|
||||
aws lambda update-function-configuration \
|
||||
--function-name crawl4ai-function \
|
||||
--timeout 300 \
|
||||
--memory-size 4096 \
|
||||
--ephemeral-storage Size=10240 \
|
||||
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
|
||||
```
|
||||
|
||||
### Setting Up API Gateway
|
||||
|
||||
```bash
|
||||
# Create API Gateway
|
||||
API_ID=$(aws apigateway create-rest-api --name crawl4ai-api --query 'id' --output text)
|
||||
|
||||
# Get root resource ID
|
||||
PARENT_ID=$(aws apigateway get-resources --rest-api-id $API_ID --query 'items[?path==`/`].id' --output text)
|
||||
|
||||
# Create resource
|
||||
RESOURCE_ID=$(aws apigateway create-resource --rest-api-id $API_ID --parent-id $PARENT_ID --path-part "crawl" --query 'id' --output text)
|
||||
|
||||
# Create POST method
|
||||
aws apigateway put-method --rest-api-id $API_ID --resource-id $RESOURCE_ID --http-method POST --authorization-type NONE
|
||||
|
||||
# Get Lambda function ARN
|
||||
LAMBDA_ARN=$(aws lambda get-function --function-name crawl4ai-function --query 'Configuration.FunctionArn' --output text)
|
||||
|
||||
# Set Lambda integration
|
||||
aws apigateway put-integration \
|
||||
--rest-api-id $API_ID \
|
||||
--resource-id $RESOURCE_ID \
|
||||
--http-method POST \
|
||||
--type AWS_PROXY \
|
||||
--integration-http-method POST \
|
||||
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ARN/invocations
|
||||
|
||||
# Deploy API
|
||||
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
|
||||
|
||||
# Set Lambda permission
|
||||
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
|
||||
aws lambda add-permission \
|
||||
--function-name crawl4ai-function \
|
||||
--statement-id apigateway \
|
||||
--action lambda:InvokeFunction \
|
||||
--principal apigateway.amazonaws.com \
|
||||
--source-arn "arn:aws:execute-api:us-east-1:$ACCOUNT_ID:$API_ID/*/POST/crawl"
|
||||
```
|
||||
|
||||
### Setting Up Provisioned Concurrency (Optional)
|
||||
|
||||
This reduces cold starts:
|
||||
|
||||
```bash
|
||||
# Publish a version
|
||||
VERSION=$(aws lambda publish-version --function-name crawl4ai-function --query 'Version' --output text)
|
||||
|
||||
# Create alias
|
||||
aws lambda create-alias \
|
||||
--function-name crawl4ai-function \
|
||||
--name prod \
|
||||
--function-version $VERSION
|
||||
|
||||
# Configure provisioned concurrency
|
||||
aws lambda put-provisioned-concurrency-config \
|
||||
--function-name crawl4ai-function \
|
||||
--qualifier prod \
|
||||
--provisioned-concurrent-executions 2
|
||||
|
||||
# Update API Gateway to use alias
|
||||
LAMBDA_ALIAS_ARN="arn:aws:lambda:us-east-1:$ACCOUNT_ID:function:crawl4ai-function:prod"
|
||||
aws apigateway put-integration \
|
||||
--rest-api-id $API_ID \
|
||||
--resource-id $RESOURCE_ID \
|
||||
--http-method POST \
|
||||
--type AWS_PROXY \
|
||||
--integration-http-method POST \
|
||||
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ALIAS_ARN/invocations
|
||||
|
||||
# Redeploy API Gateway
|
||||
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
|
||||
```
|
||||
|
||||
## Step 4: Testing the Deployment
|
||||
|
||||
Once deployed, test your function with:
|
||||
|
||||
```bash
|
||||
ENDPOINT_URL="https://$API_ID.execute-api.us-east-1.amazonaws.com/prod/crawl"
|
||||
|
||||
# Test with curl
|
||||
curl -X POST $ENDPOINT_URL \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url":"https://example.com"}'
|
||||
```
|
||||
|
||||
Or using Python:
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
url = "https://your-api-id.execute-api.us-east-1.amazonaws.com/prod/crawl"
|
||||
payload = {
|
||||
"url": "https://example.com",
|
||||
"browser_config": {
|
||||
"headless": True,
|
||||
"verbose": False
|
||||
},
|
||||
"crawler_config": {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
result = response.json()
|
||||
print(json.dumps(result, indent=2))
|
||||
```
|
||||
|
||||
## Step 5: Cleaning Up Resources
|
||||
|
||||
To remove all AWS resources created for this deployment:
|
||||
|
||||
```bash
|
||||
python deploy.py cleanup
|
||||
```
|
||||
|
||||
Or manually:
|
||||
|
||||
```bash
|
||||
# Delete API Gateway
|
||||
aws apigateway delete-rest-api --rest-api-id $API_ID
|
||||
|
||||
# Remove provisioned concurrency (if configured)
|
||||
aws lambda delete-provisioned-concurrency-config \
|
||||
--function-name crawl4ai-function \
|
||||
--qualifier prod
|
||||
|
||||
# Delete alias (if created)
|
||||
aws lambda delete-alias \
|
||||
--function-name crawl4ai-function \
|
||||
--name prod
|
||||
|
||||
# Delete Lambda function
|
||||
aws lambda delete-function --function-name crawl4ai-function
|
||||
|
||||
# Delete ECR repository
|
||||
aws ecr delete-repository --repository-name crawl4ai-lambda --force
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Cold Start Issues
|
||||
|
||||
If experiencing long cold starts:
|
||||
- Enable provisioned concurrency
|
||||
- Increase memory allocation (4096 MB recommended)
|
||||
- Ensure the Lambda function has enough ephemeral storage
|
||||
|
||||
### Permission Errors
|
||||
|
||||
If you encounter permission errors:
|
||||
- Check the IAM role has the necessary permissions
|
||||
- Ensure API Gateway has permission to invoke the Lambda function
|
||||
|
||||
### Container Size Issues
|
||||
|
||||
If your container is too large:
|
||||
- Optimize the Dockerfile
|
||||
- Use multi-stage builds
|
||||
- Consider removing unnecessary dependencies
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- Lambda memory affects CPU allocation - higher memory means faster execution
|
||||
- Provisioned concurrency eliminates cold starts but costs more
|
||||
- Optimize the Playwright setup for faster browser initialization
|
||||
|
||||
## Security Best Practices
|
||||
|
||||
- Use the principle of least privilege for IAM roles
|
||||
- Implement API Gateway authentication for production deployments
|
||||
- Consider using AWS KMS for storing sensitive configuration
|
||||
|
||||
## Useful AWS Console Links
|
||||
|
||||
Here are quick links to access important AWS console pages for monitoring and managing your deployment:
|
||||
|
||||
| Resource | Console Link |
|
||||
|----------|-------------|
|
||||
| Lambda Functions | [AWS Lambda Console](https://console.aws.amazon.com/lambda/home#/functions) |
|
||||
| Lambda Function Logs | [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups) |
|
||||
| API Gateway | [API Gateway Console](https://console.aws.amazon.com/apigateway/home) |
|
||||
| ECR Repositories | [ECR Console](https://console.aws.amazon.com/ecr/repositories) |
|
||||
| IAM Roles | [IAM Console](https://console.aws.amazon.com/iamv2/home#/roles) |
|
||||
| CloudWatch Metrics | [CloudWatch Metrics](https://console.aws.amazon.com/cloudwatch/home#metricsV2) |
|
||||
|
||||
### Monitoring Lambda Execution
|
||||
|
||||
To monitor your Lambda function:
|
||||
|
||||
1. Go to the [Lambda function console](https://console.aws.amazon.com/lambda/home#/functions)
|
||||
2. Select your function (`crawl4ai-function`)
|
||||
3. Click the "Monitor" tab to see:
|
||||
- Invocation metrics
|
||||
- Success/failure rates
|
||||
- Duration statistics
|
||||
|
||||
### Viewing Lambda Logs
|
||||
|
||||
To see detailed execution logs:
|
||||
|
||||
1. Go to [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups)
|
||||
2. Find the log group named `/aws/lambda/crawl4ai-function`
|
||||
3. Click to see the latest log streams
|
||||
4. Each stream contains logs from a function execution
|
||||
|
||||
### Checking API Gateway Traffic
|
||||
|
||||
To monitor API requests:
|
||||
|
||||
1. Go to the [API Gateway console](https://console.aws.amazon.com/apigateway/home)
|
||||
2. Select your API (`crawl4ai-api`)
|
||||
3. Click "Dashboard" to see:
|
||||
- API calls
|
||||
- Latency
|
||||
- Error rates
|
||||
|
||||
## Conclusion
|
||||
|
||||
You now have Crawl4ai running as a serverless function on AWS Lambda! This setup allows you to crawl websites on-demand without maintaining infrastructure, while paying only for the compute time you use.
|
||||
107
deploy/lambda/lambda_function.py
Normal file
107
deploy/lambda/lambda_function.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
# Ensure environment variables and directories are set.
# Lambda's filesystem is read-only except /tmp, so crawl4ai's base directory
# and HOME must both point there. This must run BEFORE the `crawl4ai` import
# below — presumably the library reads these variables at import time; verify
# against the crawl4ai source.
os.environ['CRAWL4_AI_BASE_DIRECTORY'] = '/tmp/.crawl4ai'
os.environ['HOME'] = '/tmp'

# Create directory if it doesn't exist (/tmp is ephemeral between cold starts,
# so the mkdir done at image-build time cannot be relied on)
os.makedirs('/tmp/.crawl4ai', exist_ok=True)
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode
|
||||
)
|
||||
|
||||
|
||||
def handler(event, context):
    """
    AWS Lambda entry point for crawl requests (API Gateway proxy integration).

    Expects a JSON request body with at least a "url" key, plus optional
    "browser_config" and "crawler_config" dicts that are forwarded to crawl().

    Returns:
        An API Gateway proxy response dict:
          - 200 with the JSON-encoded crawl result,
          - 400 when no URL was supplied,
          - 500 with error details for any other failure.
    """
    # Parse the incoming event (API Gateway request)
    try:
        # API Gateway delivers the payload as a JSON *string* in
        # event['body'], and sends an explicit None for an empty request
        # body; direct invocations / console test events may pass an
        # already-parsed dict. The original json.loads(event.get('body',
        # '{}')) raised TypeError (-> 500) in both of those cases.
        raw_body = event.get('body') or '{}'
        body = raw_body if isinstance(raw_body, dict) else json.loads(raw_body)

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'URL is required'})
            }

        # Get optional configurations or use defaults
        browser_config_dict = body.get('browser_config', {})
        crawler_config_dict = body.get('crawler_config', {})

        # Run the crawler. The handler itself is synchronous, so drive the
        # async crawl with a fresh event loop per invocation.
        result = asyncio.run(crawl(url, browser_config_dict, crawler_config_dict))

        # Return successful response
        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json'
            },
            'body': json.dumps(result)
        }

    except Exception as e:
        # Surface the error (and traceback) to the caller.
        # NOTE(review): returning tracebacks is handy while testing but leaks
        # implementation details — consider logging instead for production.
        import traceback
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': str(e),
                'traceback': traceback.format_exc()
            })
        }
|
||||
|
||||
async def crawl(url, browser_config_dict, crawler_config_dict):
    """
    Run the crawler with the provided configurations, with Lambda-specific settings.

    Args:
        url: The page URL to crawl.
        browser_config_dict: Serialized browser configuration from the request
            body — presumably the {"type": ..., "params": ...} shape accepted
            by BrowserConfig.load; verify against the crawl4ai docs.
        crawler_config_dict: Serialized crawler-run configuration, same format,
            accepted by CrawlerRunConfig.load.

    Returns:
        The crawl result as a plain dict (via model_dump) so the handler can
        JSON-encode it.
    """
    # Start with user-provided config but override with Lambda-required settings.
    # NOTE(review): base_browser_config is only referenced by the commented-out
    # merge below, so at present the user's browser settings are parsed (and
    # thereby validated — load() errors propagate to the handler's except) but
    # otherwise discarded. Either re-enable the merge or drop this line.
    base_browser_config = BrowserConfig.load(browser_config_dict) if browser_config_dict else BrowserConfig()

    # Apply Lambda-specific browser configurations
    browser_config = BrowserConfig(
        verbose=True,
        browser_type="chromium",
        headless=True,  # no display server in the Lambda sandbox
        user_agent_mode="random",
        light_mode=True,
        use_managed_browser=False,
        extra_args=[
            "--headless=new",
            "--no-sandbox",  # Chromium's sandbox needs privileges Lambda doesn't grant
            "--disable-dev-shm-usage",  # /dev/shm is very small in Lambda containers
            "--disable-setuid-sandbox",
            "--remote-allow-origins=*",
            "--autoplay-policy=user-gesture-required",
            "--single-process",
        ],
        # # Carry over any other settings from user config that aren't overridden
        # **{k: v for k, v in base_browser_config.model_dump().items()
        # if k not in ['verbose', 'browser_type', 'headless', 'user_agent_mode',
        # 'light_mode', 'use_managed_browser', 'extra_args']}
    )

    # Start with user-provided crawler config but ensure cache is bypassed
    base_crawler_config = CrawlerRunConfig.load(crawler_config_dict) if crawler_config_dict else CrawlerRunConfig()

    # Apply Lambda-specific crawler configurations.
    # NOTE(review): only exclude_external_links and markdown_generator are
    # carried over from the user's config; every other user-supplied run
    # setting is dropped — confirm that is intentional.
    crawler_config = CrawlerRunConfig(
        exclude_external_links=base_crawler_config.exclude_external_links,
        remove_overlay_elements=True,
        magic=True,
        cache_mode=CacheMode.BYPASS,  # /tmp is ephemeral, so a cache wouldn't persist anyway
        # Carry over markdown generator and other settings
        markdown_generator=base_crawler_config.markdown_generator
    )

    # Perform the crawl with Lambda-optimized settings
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        # Return serializable results
        return result.model_dump()
|
||||
Reference in New Issue
Block a user