Compare commits
1 Commits
bug/proxy_
...
deploy
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3ea3c0520d |
137
deploy/aws/Dockerfile
Normal file
137
deploy/aws/Dockerfile
Normal file
@@ -0,0 +1,137 @@
|
||||
# Image for the Crawl4AI service: Python app, Redis, and Playwright Chromium
# all in one container, supervised by supervisord.
FROM python:3.10-slim

# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
ARG GITHUB_BRANCH=next
ARG USE_LOCAL=False
ARG CONFIG_PATH=""

# Python/pip hygiene defaults plus the in-container Redis connection settings.
ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100 \
    DEBIAN_FRONTEND=noninteractive \
    REDIS_HOST=localhost \
    REDIS_PORT=6379

# Build-time knobs: optional extras set, GPU toolchain, and buildx target arch.
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
ARG TARGETARCH

LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"

# Base build tooling plus redis-server and supervisor (both run in this image).
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    wget \
    gnupg \
    git \
    cmake \
    pkg-config \
    python3-dev \
    libjpeg-dev \
    redis-server \
    supervisor \
    && rm -rf /var/lib/apt/lists/*

# Shared libraries needed by headless Chromium (Playwright runtime deps).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Optional CUDA toolkit — only when GPU is requested AND the target is amd64.
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
    apt-get update && apt-get install -y --no-install-recommends \
    nvidia-cuda-toolkit \
    && rm -rf /var/lib/apt/lists/* ; \
    else \
    echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
    fi

# Per-architecture math libraries: OpenBLAS on arm64, OpenMP on amd64.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
    echo "🦾 Installing ARM-specific optimizations"; \
    apt-get update && apt-get install -y --no-install-recommends \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*; \
    elif [ "$TARGETARCH" = "amd64" ]; then \
    echo "🖥️ Installing AMD64-specific optimizations"; \
    apt-get update && apt-get install -y --no-install-recommends \
    libomp-dev \
    && rm -rf /var/lib/apt/lists/*; \
    else \
    echo "Skipping platform-specific optimizations (unsupported platform)"; \
    fi

WORKDIR ${APP_HOME}

# Fetch application source at the requested branch into a temp location.
RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai

COPY docker/supervisord.conf .
COPY docker/requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Install crawl4ai with the chosen extras; the heavier variants also
# pre-download NLTK data and model weights so first startup is fast.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
    pip install "/tmp/crawl4ai/[all]" && \
    python -m nltk.downloader punkt stopwords && \
    python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
    pip install "/tmp/crawl4ai/[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
    pip install "/tmp/crawl4ai/[transformer]" && \
    python -m crawl4ai.model_loader ; \
    else \
    pip install "/tmp/crawl4ai" ; \
    fi

# Build-time sanity check: both crawl4ai and Playwright must import cleanly.
RUN pip install --no-cache-dir --upgrade pip && \
    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"

RUN playwright install --with-deps chromium

COPY docker/* ${APP_HOME}/
# Overlay a custom config only when CONFIG_PATH names an existing file.
RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \
    echo "Using custom config from $CONFIG_PATH" && \
    cp $CONFIG_PATH /app/config.yml; \
    fi

# Unhealthy when memory is under 2GB, Redis does not answer PING, or the
# API's /health endpoint fails.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD bash -c '\
    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
    if [ $MEM -lt 2048 ]; then \
    echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
    exit 1; \
    fi && \
    redis-cli ping > /dev/null && \
    curl -f http://localhost:8000/health || exit 1'

# EXPOSE 6379

# supervisord starts Redis and the API processes listed in supervisord.conf.
CMD ["supervisord", "-c", "supervisord.conf"]
|
||||
|
||||
3
deploy/aws/deploy-config.yml
Executable file
3
deploy/aws/deploy-config.yml
Executable file
@@ -0,0 +1,3 @@
|
||||
# Deployment configuration template consumed by deploy.py.
# The UPPERCASE placeholders are replaced with real values by the `init` command.
project_name: PROJECT_NAME
domain_name: DOMAIN_NAME
aws_region: AWS_REGION
|
||||
729
deploy/aws/deploy.py
Executable file
729
deploy/aws/deploy.py
Executable file
@@ -0,0 +1,729 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import json
|
||||
import yaml
|
||||
import requests
|
||||
import os
|
||||
|
||||
# Steps for deployment
|
||||
# Ordered deployment pipeline. Each entry names a function defined below;
# the index of the last completed step is persisted as state["last_step"]
# so an interrupted run resumes where it left off.
STEPS = [
    "refresh_aws_auth",
    "fetch_or_create_vpc_and_subnets",
    "create_ecr_repositories",
    "create_iam_role",
    "create_security_groups",
    "request_acm_certificate",
    "build_and_push_docker",
    "create_task_definition",
    "setup_alb",
    "deploy_ecs_service",
    "configure_custom_domain",
    "test_endpoints"
]
|
||||
|
||||
# Utility function to prompt user for confirmation
|
||||
def confirm_step(step_name):
    """Prompt until the user types 'yes' or 'no'; True means proceed."""
    while True:
        answer = input(f"Proceed with {step_name}? (yes/no): ").strip().lower()
        if answer == "yes":
            return True
        if answer == "no":
            return False
        print("Please enter 'yes' or 'no'.")
|
||||
|
||||
# Utility function to run AWS CLI or shell commands and handle errors
|
||||
def run_command(command, error_message, additional_diagnostics=None, cwd="."):
    """Run a CLI command, returning the CompletedProcess on success.

    On a non-zero exit, dumps the command, exit code, stdout/stderr — plus the
    output of any additional diagnostic commands — to error_context.md, then
    raises with the stderr attached.
    """
    try:
        return subprocess.run(command, capture_output=True, text=True, check=True, cwd=cwd)
    except subprocess.CalledProcessError as e:
        with open("error_context.md", "w") as f:
            f.write(f"{error_message}:\n")
            f.write(f"Command: {' '.join(command)}\n")
            f.write(f"Exit Code: {e.returncode}\n")
            f.write(f"Stdout: {e.stdout}\n")
            f.write(f"Stderr: {e.stderr}\n")
            # Diagnostics are best-effort: their own failures are just recorded.
            for diag_cmd in additional_diagnostics or []:
                diag = subprocess.run(diag_cmd, capture_output=True, text=True)
                f.write(f"\nDiagnostic command: {' '.join(diag_cmd)}\n")
                f.write(f"Stdout: {diag.stdout}\n")
                f.write(f"Stderr: {diag.stderr}\n")
        raise Exception(f"{error_message}: {e.stderr}")
|
||||
|
||||
# Utility function to load or initialize state
|
||||
def load_state(project_name):
    """Load the saved deployment state, or a fresh one if no state file exists."""
    state_file = f"{project_name}-state.json"
    if not os.path.exists(state_file):
        # -1 means "no step completed yet".
        return {"last_step": -1}
    with open(state_file, "r") as f:
        return json.load(f)
|
||||
|
||||
# Utility function to save state
|
||||
def save_state(project_name, state):
    """Persist deployment state as pretty-printed JSON in the working directory."""
    with open(f"{project_name}-state.json", "w") as f:
        json.dump(state, f, indent=4)
|
||||
|
||||
# DNS Check Function
|
||||
def check_dns_propagation(domain, alb_dns):
    """Return True once `dig` shows `domain` resolving to the ALB DNS name."""
    try:
        answer = subprocess.run(["dig", "+short", domain], capture_output=True, text=True)
        return alb_dns in answer.stdout
    except Exception as e:
        # dig missing or failing counts as "not propagated yet".
        print(f"Failed to check DNS: {e}")
        return False
|
||||
|
||||
# Step Functions
|
||||
def refresh_aws_auth(project_name, state, config):
    """Step 0: verify that the AWS CLI has working credentials."""
    if state["last_step"] >= 0:
        print("Skipping refresh_aws_auth (already completed)")
        return
    if not confirm_step("Refresh AWS authentication"):
        sys.exit("User aborted.")
    # STS get-caller-identity fails fast when credentials are absent/expired.
    run_command(["aws", "sts", "get-caller-identity"], "Failed to verify AWS credentials")
    print("AWS authentication verified.")
    state["last_step"] = 0
    save_state(project_name, state)
|
||||
|
||||
def fetch_or_create_vpc_and_subnets(project_name, state, config):
    """Step 1: locate the default VPC (creating one if absent) and two subnets.

    Also ensures an internet gateway is attached to the VPC. Records
    state["vpc_id"] and state["public_subnets"] and returns (vpc_id, subnet_ids).

    NOTE(review): when resources are created here, no route-table route to the
    internet gateway is added, so the new subnets may not actually have
    internet egress — verify against the intended network design.
    """
    if state["last_step"] >= 1:
        print("Skipping fetch_or_create_vpc_and_subnets (already completed)")
        return state["vpc_id"], state["public_subnets"]
    if not confirm_step("Fetch or Create VPC and Subnets"):
        sys.exit("User aborted.")

    # Fetch AWS account ID
    result = run_command(
        ["aws", "sts", "get-caller-identity"],
        "Failed to get AWS account ID"
    )
    account_id = json.loads(result.stdout)["Account"]  # NOTE(review): unused below

    # Fetch default VPC; create a 10.0.0.0/16 VPC only when none exists.
    result = run_command(
        ["aws", "ec2", "describe-vpcs", "--filters", "Name=isDefault,Values=true", "--region", config["aws_region"]],
        "Failed to describe VPCs"
    )
    vpcs = json.loads(result.stdout).get("Vpcs", [])
    if not vpcs:
        result = run_command(
            ["aws", "ec2", "create-vpc", "--cidr-block", "10.0.0.0/16", "--region", config["aws_region"]],
            "Failed to create VPC"
        )
        vpc_id = json.loads(result.stdout)["Vpc"]["VpcId"]
        # DNS hostnames are required for ALB/ECS name resolution in the VPC.
        run_command(
            ["aws", "ec2", "modify-vpc-attribute", "--vpc-id", vpc_id, "--enable-dns-hostnames", "--region", config["aws_region"]],
            "Failed to enable DNS hostnames"
        )
    else:
        vpc_id = vpcs[0]["VpcId"]

    # Fetch or create subnets: the ALB needs at least two (distinct AZs).
    result = run_command(
        ["aws", "ec2", "describe-subnets", "--filters", f"Name=vpc-id,Values={vpc_id}", "--region", config["aws_region"]],
        "Failed to describe subnets"
    )
    subnets = json.loads(result.stdout).get("Subnets", [])
    if len(subnets) < 2:
        # Create one /24 per AZ in the first two availability zones.
        azs = json.loads(run_command(
            ["aws", "ec2", "describe-availability-zones", "--region", config["aws_region"]],
            "Failed to describe availability zones"
        ).stdout)["AvailabilityZones"][:2]
        subnet_ids = []
        for i, az in enumerate(azs):
            az_name = az["ZoneName"]
            result = run_command(
                ["aws", "ec2", "create-subnet", "--vpc-id", vpc_id, "--cidr-block", f"10.0.{i}.0/24", "--availability-zone", az_name, "--region", config["aws_region"]],
                f"Failed to create subnet in {az_name}"
            )
            subnet_id = json.loads(result.stdout)["Subnet"]["SubnetId"]
            subnet_ids.append(subnet_id)
            # Auto-assign public IPs so Fargate tasks are reachable/egress-capable.
            run_command(
                ["aws", "ec2", "modify-subnet-attribute", "--subnet-id", subnet_id, "--map-public-ip-on-launch", "--region", config["aws_region"]],
                f"Failed to make subnet {subnet_id} public"
            )
    else:
        subnet_ids = [s["SubnetId"] for s in subnets[:2]]

    # Ensure an internet gateway is attached to the VPC.
    result = run_command(
        ["aws", "ec2", "describe-internet-gateways", "--filters", f"Name=attachment.vpc-id,Values={vpc_id}", "--region", config["aws_region"]],
        "Failed to describe internet gateways"
    )
    igws = json.loads(result.stdout).get("InternetGateways", [])
    if not igws:
        result = run_command(
            ["aws", "ec2", "create-internet-gateway", "--region", config["aws_region"]],
            "Failed to create internet gateway"
        )
        igw_id = json.loads(result.stdout)["InternetGateway"]["InternetGatewayId"]
        run_command(
            ["aws", "ec2", "attach-internet-gateway", "--vpc-id", vpc_id, "--internet-gateway-id", igw_id, "--region", config["aws_region"]],
            "Failed to attach internet gateway"
        )

    state["vpc_id"] = vpc_id
    state["public_subnets"] = subnet_ids
    state["last_step"] = 1
    save_state(project_name, state)
    print(f"VPC ID: {vpc_id}, Subnets: {subnet_ids}")
    return vpc_id, subnet_ids
|
||||
|
||||
def create_ecr_repositories(project_name, state, config):
    """Step 2: ensure ECR repositories exist for the app and nginx images."""
    if state["last_step"] >= 2:
        print("Skipping create_ecr_repositories (already completed)")
        return
    if not confirm_step("Create ECR Repositories"):
        sys.exit("User aborted.")

    account_id = json.loads(run_command(
        ["aws", "sts", "get-caller-identity"],
        "Failed to get AWS account ID"
    ).stdout)["Account"]
    # describe-repositories exits non-zero for a missing repo, so probe
    # without check and create only on failure.
    for repo in (project_name, f"{project_name}-nginx"):
        probe = subprocess.run(
            ["aws", "ecr", "describe-repositories", "--repository-names", repo, "--region", config["aws_region"]],
            capture_output=True, text=True
        )
        if probe.returncode != 0:
            run_command(
                ["aws", "ecr", "create-repository", "--repository-name", repo, "--region", config["aws_region"]],
                f"Failed to create ECR repository {repo}"
            )
        print(f"ECR repository {repo} is ready.")
    state["last_step"] = 2
    save_state(project_name, state)
|
||||
|
||||
def create_iam_role(project_name, state, config):
    """Step 3: ensure the shared ecsTaskExecutionRole exists.

    Creates the role with an ECS-tasks trust policy and attaches the
    AWS-managed task execution policy, then records the role ARN in
    state["execution_role_arn"].
    """
    if state["last_step"] >= 3:
        print("Skipping create_iam_role (already completed)")
        return
    if not confirm_step("Create IAM Role"):
        sys.exit("User aborted.")

    account_id = json.loads(run_command(
        ["aws", "sts", "get-caller-identity"],
        "Failed to get AWS account ID"
    ).stdout)["Account"]
    role_name = "ecsTaskExecutionRole"
    # Trust policy allowing the ECS tasks service to assume this role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": [
            {
                "Effect": "Allow",
                "Principal": {"Service": "ecs-tasks.amazonaws.com"},
                "Action": "sts:AssumeRole"
            }
        ]
    }
    # Written to a temp file because the CLI takes a file:// policy document.
    with open("trust_policy.json", "w") as f:
        json.dump(trust_policy, f)

    # get-role exits non-zero when the role is missing; probe without check.
    result = subprocess.run(
        ["aws", "iam", "get-role", "--role-name", role_name],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        run_command(
            ["aws", "iam", "create-role", "--role-name", role_name, "--assume-role-policy-document", "file://trust_policy.json"],
            f"Failed to create IAM role {role_name}"
        )
        run_command(
            ["aws", "iam", "attach-role-policy", "--role-name", role_name, "--policy-arn", "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"],
            "Failed to attach ECS task execution policy"
        )
    os.remove("trust_policy.json")
    state["execution_role_arn"] = f"arn:aws:iam::{account_id}:role/{role_name}"
    state["last_step"] = 3
    save_state(project_name, state)
    print(f"IAM role {role_name} configured.")
|
||||
|
||||
def create_security_groups(project_name, state, config):
    """Step 4: ensure the ALB and ECS security groups exist.

    The ALB group accepts HTTP/HTTPS from anywhere; the ECS group accepts
    port-80 traffic only from the ALB group. Records both IDs in state and
    returns (alb_sg_id, ecs_sg_id).
    """
    if state["last_step"] >= 4:
        print("Skipping create_security_groups (already completed)")
        return state["alb_sg_id"], state["ecs_sg_id"]
    if not confirm_step("Create Security Groups"):
        sys.exit("User aborted.")

    vpc_id = state["vpc_id"]

    # ALB security group: world-facing HTTP (80) and HTTPS (443).
    alb_sg_name = f"{project_name}-alb-sg"
    result = run_command(
        ["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={alb_sg_name}", "--region", config["aws_region"]],
        "Failed to describe ALB security group"
    )
    if not json.loads(result.stdout).get("SecurityGroups"):
        result = run_command(
            ["aws", "ec2", "create-security-group", "--group-name", alb_sg_name, "--description", "Security group for ALB", "--vpc-id", vpc_id, "--region", config["aws_region"]],
            "Failed to create ALB security group"
        )
        alb_sg_id = json.loads(result.stdout)["GroupId"]
        run_command(
            ["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "80", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]],
            "Failed to authorize HTTP ingress"
        )
        run_command(
            ["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "443", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]],
            "Failed to authorize HTTPS ingress"
        )
    else:
        alb_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"]

    # ECS security group: port 80 reachable only from the ALB group.
    ecs_sg_name = f"{project_name}-ecs-sg"
    result = run_command(
        ["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={ecs_sg_name}", "--region", config["aws_region"]],
        "Failed to describe ECS security group"
    )
    if not json.loads(result.stdout).get("SecurityGroups"):
        result = run_command(
            ["aws", "ec2", "create-security-group", "--group-name", ecs_sg_name, "--description", "Security group for ECS tasks", "--vpc-id", vpc_id, "--region", config["aws_region"]],
            "Failed to create ECS security group"
        )
        ecs_sg_id = json.loads(result.stdout)["GroupId"]
        run_command(
            ["aws", "ec2", "authorize-security-group-ingress", "--group-id", ecs_sg_id, "--protocol", "tcp", "--port", "80", "--source-group", alb_sg_id, "--region", config["aws_region"]],
            "Failed to authorize ECS ingress"
        )
    else:
        ecs_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"]

    state["alb_sg_id"] = alb_sg_id
    state["ecs_sg_id"] = ecs_sg_id
    state["last_step"] = 4
    save_state(project_name, state)
    print("Security groups configured.")
    return alb_sg_id, ecs_sg_id
|
||||
|
||||
def request_acm_certificate(project_name, state, config):
    """Step 5: ensure an issued ACM certificate exists for the configured domain.

    Reuses an already-issued certificate matching the domain; otherwise
    requests a DNS-validated certificate, prints the validation record for
    the user to create, and polls until the certificate is ISSUED (exiting
    on a terminal FAILED/REVOKED/INACTIVE status).

    Returns the certificate ARN and records it in state["cert_arn"].
    """
    if state["last_step"] >= 5:
        print("Skipping request_acm_certificate (already completed)")
        return state["cert_arn"]
    if not confirm_step("Request ACM Certificate"):
        sys.exit("User aborted.")

    domain_name = config["domain_name"]
    # BUG FIX: ACM has no `describe-certificates` subcommand; listing existing
    # certificates is `list-certificates`, which returns the
    # "CertificateSummaryList" key already parsed below.
    result = run_command(
        ["aws", "acm", "list-certificates", "--certificate-statuses", "ISSUED", "--region", config["aws_region"]],
        "Failed to describe certificates"
    )
    certificates = json.loads(result.stdout).get("CertificateSummaryList", [])
    cert_arn = next((c["CertificateArn"] for c in certificates if c["DomainName"] == domain_name), None)

    if not cert_arn:
        result = run_command(
            ["aws", "acm", "request-certificate", "--domain-name", domain_name, "--validation-method", "DNS", "--region", config["aws_region"]],
            "Failed to request ACM certificate"
        )
        cert_arn = json.loads(result.stdout)["CertificateArn"]

        # Give ACM a moment to populate the DNS validation record.
        time.sleep(10)
        result = run_command(
            ["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]],
            "Failed to describe certificate"
        )
        cert_details = json.loads(result.stdout)["Certificate"]
        dns_validations = cert_details.get("DomainValidationOptions", [])
        for validation in dns_validations:
            if validation["ValidationMethod"] == "DNS" and "ResourceRecord" in validation:
                record = validation["ResourceRecord"]
                print(f"Please add this DNS record to validate the certificate for {domain_name}:")
                print(f"Name: {record['Name']}")
                print(f"Type: {record['Type']}")
                print(f"Value: {record['Value']}")
                print("Press Enter after adding the DNS record...")
                input()

        # Poll until ACM reports the certificate issued or terminally failed.
        while True:
            result = run_command(
                ["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]],
                "Failed to check certificate status"
            )
            status = json.loads(result.stdout)["Certificate"]["Status"]
            if status == "ISSUED":
                break
            elif status in ["FAILED", "REVOKED", "INACTIVE"]:
                print("Certificate issuance failed.")
                sys.exit(1)
            time.sleep(10)

    state["cert_arn"] = cert_arn
    state["last_step"] = 5
    save_state(project_name, state)
    print(f"Certificate ARN: {cert_arn}")
    return cert_arn
|
||||
|
||||
def build_and_push_docker(project_name, state, config):
    """Step 6: build the FastAPI and nginx images and push them to ECR.

    The image tag comes from ./version.txt. Records both image URIs in state
    and returns (fastapi_image, nginx_image).
    """
    if state["last_step"] >= 6:
        print("Skipping build_and_push_docker (already completed)")
        return state["fastapi_image"], state["nginx_image"]
    if not confirm_step("Build and Push Docker Images"):
        sys.exit("User aborted.")

    with open("./version.txt", "r") as f:
        version = f.read().strip()

    account_id = json.loads(run_command(
        ["aws", "sts", "get-caller-identity"],
        "Failed to get AWS account ID"
    ).stdout)["Account"]
    region = config["aws_region"]
    registry = f"{account_id}.dkr.ecr.{region}.amazonaws.com"

    # Authenticate the local Docker daemon against the account's ECR registry.
    login_password = run_command(
        ["aws", "ecr", "get-login-password", "--region", region],
        "Failed to get ECR login password"
    ).stdout.strip()
    run_command(
        ["docker", "login", "--username", "AWS", "--password", login_password, registry],
        "Failed to authenticate Docker to ECR"
    )

    # FastAPI app image, built from the repo root.
    fastapi_image = f"{registry}/{project_name}:{version}"
    run_command(
        ["docker", "build", "-f", "Dockerfile", "-t", fastapi_image, "."],
        "Failed to build FastAPI Docker image"
    )
    run_command(["docker", "push", fastapi_image], "Failed to push FastAPI image")

    # Nginx image, built from the ./nginx directory.
    nginx_image = f"{registry}/{project_name}-nginx:{version}"
    run_command(
        ["docker", "build", "-f", "Dockerfile", "-t", nginx_image, "."],
        "Failed to build Nginx Docker image",
        cwd="./nginx"
    )
    run_command(["docker", "push", nginx_image], "Failed to push Nginx image")

    state["fastapi_image"] = fastapi_image
    state["nginx_image"] = nginx_image
    state["last_step"] = 6
    save_state(project_name, state)
    print("Docker images built and pushed.")
    return fastapi_image, nginx_image
|
||||
|
||||
def create_task_definition(project_name, state, config):
    """Step 7: register the Fargate task definition (fastapi + nginx containers).

    Ensures the CloudWatch log group exists first, then registers the task
    definition, stores its ARN in state["task_def_arn"], and returns the ARN.
    """
    if state["last_step"] >= 7:
        print("Skipping create_task_definition (already completed)")
        return state["task_def_arn"]
    if not confirm_step("Create Task Definition"):
        sys.exit("User aborted.")

    # Create the log group unless an exact-name match already exists
    # (describe-log-groups matches by prefix, hence the equality re-check).
    log_group = f"/ecs/{project_name}-logs"
    result = run_command(
        ["aws", "logs", "describe-log-groups", "--log-group-name-prefix", log_group, "--region", config["aws_region"]],
        "Failed to describe log groups"
    )
    if not any(lg["logGroupName"] == log_group for lg in json.loads(result.stdout).get("logGroups", [])):
        run_command(
            ["aws", "logs", "create-log-group", "--log-group-name", log_group, "--region", config["aws_region"]],
            f"Failed to create log group {log_group}"
        )

    # One task, two containers: the FastAPI app on 8000 and nginx on 80,
    # both logging to the shared CloudWatch group with distinct prefixes.
    task_definition = {
        "family": f"{project_name}-taskdef",
        "networkMode": "awsvpc",
        "requiresCompatibilities": ["FARGATE"],
        "cpu": "512",
        "memory": "2048",
        "executionRoleArn": state["execution_role_arn"],
        "containerDefinitions": [
            {
                "name": "fastapi",
                "image": state["fastapi_image"],
                "portMappings": [{"containerPort": 8000, "hostPort": 8000, "protocol": "tcp"}],
                "logConfiguration": {
                    "logDriver": "awslogs",
                    "options": {
                        "awslogs-group": log_group,
                        "awslogs-region": config["aws_region"],
                        "awslogs-stream-prefix": "fastapi"
                    }
                }
            },
            {
                "name": "nginx",
                "image": state["nginx_image"],
                "portMappings": [{"containerPort": 80, "hostPort": 80, "protocol": "tcp"}],
                "logConfiguration": {
                    "logDriver": "awslogs",
                    "options": {
                        "awslogs-group": log_group,
                        "awslogs-region": config["aws_region"],
                        "awslogs-stream-prefix": "nginx"
                    }
                }
            }
        ]
    }

    # Registered via a temp JSON file (removed afterwards) to avoid CLI quoting issues.
    with open("task_def.json", "w") as f:
        json.dump(task_definition, f)
    result = run_command(
        ["aws", "ecs", "register-task-definition", "--cli-input-json", "file://task_def.json", "--region", config["aws_region"]],
        "Failed to register task definition"
    )
    task_def_arn = json.loads(result.stdout)["taskDefinition"]["taskDefinitionArn"]
    os.remove("task_def.json")

    state["task_def_arn"] = task_def_arn
    state["last_step"] = 7
    save_state(project_name, state)
    print("Task definition created.")
    return task_def_arn
|
||||
|
||||
def setup_alb(project_name, state, config):
    """Step 8: ensure the ALB, target group, and HTTP/HTTPS listeners exist.

    Port 80 redirects permanently to 443; port 443 terminates TLS with the
    ACM certificate from step 5 and forwards to the target group. Records
    and returns (alb_arn, tg_arn, alb_dns).
    """
    if state["last_step"] >= 8:
        print("Skipping setup_alb (already completed)")
        return state["alb_arn"], state["tg_arn"], state["alb_dns"]
    if not confirm_step("Set Up ALB"):
        sys.exit("User aborted.")

    vpc_id = state["vpc_id"]
    public_subnets = state["public_subnets"]
    alb_name = f"{project_name}-alb"

    # describe-load-balancers exits non-zero for a missing name, so probe
    # without check and create only on failure.
    result = subprocess.run(
        ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        run_command(
            ["aws", "elbv2", "create-load-balancer", "--name", alb_name, "--subnets"] + public_subnets + ["--security-groups", state["alb_sg_id"], "--region", config["aws_region"]],
            "Failed to create ALB"
        )
    alb_arn = json.loads(run_command(
        ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
        "Failed to describe ALB"
    ).stdout)["LoadBalancers"][0]["LoadBalancerArn"]
    alb_dns = json.loads(run_command(
        ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
        "Failed to get ALB DNS name"
    ).stdout)["LoadBalancers"][0]["DNSName"]

    # Target group fronting the nginx container (HTTP on port 80).
    tg_name = f"{project_name}-tg"
    result = subprocess.run(
        ["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]],
        capture_output=True, text=True
    )
    if result.returncode != 0:
        run_command(
            ["aws", "elbv2", "create-target-group", "--name", tg_name, "--protocol", "HTTP", "--port", "80", "--vpc-id", vpc_id, "--region", config["aws_region"]],
            "Failed to create target group"
        )
    tg_arn = json.loads(run_command(
        ["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]],
        "Failed to describe target group"
    ).stdout)["TargetGroups"][0]["TargetGroupArn"]

    # Listeners: create each one only if a listener on that port is absent.
    result = run_command(
        ["aws", "elbv2", "describe-listeners", "--load-balancer-arn", alb_arn, "--region", config["aws_region"]],
        "Failed to describe listeners"
    )
    listeners = json.loads(result.stdout).get("Listeners", [])
    if not any(l["Port"] == 80 for l in listeners):
        run_command(
            ["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTP", "--port", "80", "--default-actions", "Type=redirect,RedirectConfig={Protocol=HTTPS,Port=443,StatusCode=HTTP_301}", "--region", config["aws_region"]],
            "Failed to create HTTP listener"
        )
    if not any(l["Port"] == 443 for l in listeners):
        run_command(
            ["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTPS", "--port", "443", "--certificates", f"CertificateArn={state['cert_arn']}", "--default-actions", f"Type=forward,TargetGroupArn={tg_arn}", "--region", config["aws_region"]],
            "Failed to create HTTPS listener"
        )

    state["alb_arn"] = alb_arn
    state["tg_arn"] = tg_arn
    state["alb_dns"] = alb_dns
    state["last_step"] = 8
    save_state(project_name, state)
    print("ALB configured.")
    return alb_arn, tg_arn, alb_dns
|
||||
|
||||
def deploy_ecs_service(project_name, state, config):
    """Step 9: ensure the ECS cluster exists and create or update the service."""
    if state["last_step"] >= 9:
        print("Skipping deploy_ecs_service (already completed)")
        return
    if not confirm_step("Deploy ECS Service"):
        sys.exit("User aborted.")

    cluster_name = f"{project_name}-cluster"
    result = run_command(
        ["aws", "ecs", "describe-clusters", "--clusters", cluster_name, "--region", config["aws_region"]],
        "Failed to describe clusters"
    )
    if not json.loads(result.stdout).get("clusters"):
        run_command(
            ["aws", "ecs", "create-cluster", "--cluster-name", cluster_name, "--region", config["aws_region"]],
            "Failed to create ECS cluster"
        )

    service_name = f"{project_name}-service"
    # On failure, run_command also captures list-tasks output as a diagnostic.
    result = run_command(
        ["aws", "ecs", "describe-services", "--cluster", cluster_name, "--services", service_name, "--region", config["aws_region"]],
        "Failed to describe services",
        additional_diagnostics=[["aws", "ecs", "list-tasks", "--cluster", cluster_name, "--service-name", service_name, "--region", config["aws_region"]]]
    )
    services = json.loads(result.stdout).get("services", [])
    if not services or services[0]["status"] == "INACTIVE":
        # NOTE(review): the network-configuration shorthand below embeds
        # json.dumps(...) output (quoted JSON list) inside CLI shorthand
        # syntax for `subnets` — verify the AWS CLI accepts this mixed form;
        # pure JSON via --cli-input-json would be unambiguous.
        run_command(
            ["aws", "ecs", "create-service", "--cluster", cluster_name, "--service-name", service_name, "--task-definition", state["task_def_arn"], "--desired-count", "1", "--launch-type", "FARGATE", "--network-configuration", f"awsvpcConfiguration={{subnets={json.dumps(state['public_subnets'])},securityGroups=[{state['ecs_sg_id']}],assignPublicIp=ENABLED}}", "--load-balancers", f"targetGroupArn={state['tg_arn']},containerName=nginx,containerPort=80", "--region", config["aws_region"]],
            "Failed to create ECS service"
        )
    else:
        # Service already exists: roll it onto the new task definition revision.
        run_command(
            ["aws", "ecs", "update-service", "--cluster", cluster_name, "--service", service_name, "--task-definition", state["task_def_arn"], "--region", config["aws_region"]],
            "Failed to update ECS service"
        )

    state["last_step"] = 9
    save_state(project_name, state)
    print("ECS service deployed.")
|
||||
|
||||
def configure_custom_domain(project_name, state, config):
    """Step 10: have the user point a CNAME at the ALB and wait for propagation.

    Blocks indefinitely (polling every 30 seconds via `dig`) until the domain
    resolves to the ALB DNS name.
    """
    if state["last_step"] >= 10:
        print("Skipping configure_custom_domain (already completed)")
        return
    if not confirm_step("Configure Custom Domain"):
        sys.exit("User aborted.")

    domain_name = config["domain_name"]
    alb_dns = state["alb_dns"]
    print(f"Please add a CNAME record for {domain_name} pointing to {alb_dns} in your DNS provider.")
    print("Press Enter after updating the DNS record...")
    input()

    # No upper bound on retries: the operator can Ctrl-C if DNS never updates.
    while not check_dns_propagation(domain_name, alb_dns):
        print("DNS propagation not complete. Waiting 30 seconds before retrying...")
        time.sleep(30)
    print("DNS propagation confirmed.")

    state["last_step"] = 10
    save_state(project_name, state)
    print("Custom domain configured.")
|
||||
|
||||
def test_endpoints(project_name, state, config):
|
||||
if state["last_step"] >= 11:
|
||||
print("Skipping test_endpoints (already completed)")
|
||||
return
|
||||
if not confirm_step("Test Endpoints"):
|
||||
sys.exit("User aborted.")
|
||||
|
||||
domain = config["domain_name"]
|
||||
time.sleep(30) # Wait for service to stabilize
|
||||
|
||||
response = requests.get(f"https://{domain}/health", verify=False)
|
||||
if response.status_code != 200:
|
||||
with open("error_context.md", "w") as f:
|
||||
f.write("Health endpoint test failed:\n")
|
||||
f.write(f"Status Code: {response.status_code}\n")
|
||||
f.write(f"Response: {response.text}\n")
|
||||
sys.exit(1)
|
||||
print("Health endpoint test passed.")
|
||||
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {"stream": False}
|
||||
}
|
||||
response = requests.post(f"https://{domain}/crawl", json=payload, verify=False)
|
||||
if response.status_code != 200:
|
||||
with open("error_context.md", "w") as f:
|
||||
f.write("Crawl endpoint test failed:\n")
|
||||
f.write(f"Status Code: {response.status_code}\n")
|
||||
f.write(f"Response: {response.text}\n")
|
||||
sys.exit(1)
|
||||
print("Crawl endpoint test passed.")
|
||||
|
||||
state["last_step"] = 11
|
||||
save_state(project_name, state)
|
||||
print("Endpoints tested successfully.")
|
||||
|
||||
# Main Deployment Function
|
||||
def deploy(project_name, force=False):
    """Run all deployment steps for *project_name*, resuming from saved state.

    Loads the project's YAML config, then executes each step in STEPS in
    order. Steps whose index is already recorded in the state file are
    skipped unless *force* is True, which restarts from the beginning.

    Args:
        project_name: Project identifier; "<project_name>-config.yml" must exist.
        force: If True, ignore saved progress and redo every step.

    Raises:
        SystemExit: If the configuration file is missing.
    """
    config_file = f"{project_name}-config.yml"
    if not os.path.exists(config_file):
        print(f"Configuration file {config_file} not found. Run 'init' first.")
        sys.exit(1)

    with open(config_file, "r") as f:
        config = yaml.safe_load(f)

    state = load_state(project_name)
    if force:
        state = {"last_step": -1}

    last_step = state.get("last_step", -1)

    for step_idx, step_name in enumerate(STEPS):
        if step_idx <= last_step:
            print(f"Skipping {step_name} (already completed)")
            continue
        print(f"Executing step: {step_name}")
        # Every step function takes (project_name, state, config). Anything a
        # step needs to persist is written into the shared state dict, so the
        # individual return values can be safely discarded here.
        func = globals()[step_name]
        func(project_name, state, config)
|
||||
|
||||
# Init Command
|
||||
def init(project_name, domain_name, aws_region):
    """Create the YAML configuration file for a new deployment project.

    Writes "<project_name>-config.yml" in the current directory containing
    the project name, domain name, and AWS region.
    """
    settings = {
        "project_name": project_name,
        "domain_name": domain_name,
        "aws_region": aws_region,
    }
    path = f"{project_name}-config.yml"
    with open(path, "w") as fh:
        yaml.dump(settings, fh)
    print(f"Configuration file {path} created.")
|
||||
|
||||
# Argument Parser
|
||||
parser = argparse.ArgumentParser(description="Crawl4AI Deployment Script")
|
||||
subparsers = parser.add_subparsers(dest="command")
|
||||
|
||||
# Init Parser
|
||||
init_parser = subparsers.add_parser("init", help="Initialize configuration")
|
||||
init_parser.add_argument("--project", required=True, help="Project name")
|
||||
init_parser.add_argument("--domain", required=True, help="Domain name")
|
||||
init_parser.add_argument("--region", required=True, help="AWS region")
|
||||
|
||||
# Deploy Parser
|
||||
deploy_parser = subparsers.add_parser("deploy", help="Deploy the project")
|
||||
deploy_parser.add_argument("--project", required=True, help="Project name")
|
||||
deploy_parser.add_argument("--force", action="store_true", help="Force redeployment from start")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.command == "init":
|
||||
init(args.project, args.domain, args.region)
|
||||
elif args.command == "deploy":
|
||||
deploy(args.project, args.force)
|
||||
else:
|
||||
parser.print_help()
|
||||
31
deploy/aws/docker/.dockerignore
Normal file
31
deploy/aws/docker/.dockerignore
Normal file
@@ -0,0 +1,31 @@
|
||||
# .dockerignore
|
||||
*
|
||||
|
||||
# Allow specific files and directories when using local installation
|
||||
!crawl4ai/
|
||||
!docs/
|
||||
!deploy/docker/
|
||||
!setup.py
|
||||
!pyproject.toml
|
||||
!README.md
|
||||
!LICENSE
|
||||
!MANIFEST.in
|
||||
!setup.cfg
|
||||
!mkdocs.yml
|
||||
|
||||
.git/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
.DS_Store
|
||||
.env
|
||||
.venv
|
||||
venv/
|
||||
tests/
|
||||
coverage.xml
|
||||
*.log
|
||||
*.swp
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
8
deploy/aws/docker/.llm.env.example
Normal file
8
deploy/aws/docker/.llm.env.example
Normal file
@@ -0,0 +1,8 @@
|
||||
# LLM Provider Keys
|
||||
OPENAI_API_KEY=your_openai_key_here
|
||||
DEEPSEEK_API_KEY=your_deepseek_key_here
|
||||
ANTHROPIC_API_KEY=your_anthropic_key_here
|
||||
GROQ_API_KEY=your_groq_key_here
|
||||
TOGETHER_API_KEY=your_together_key_here
|
||||
MISTRAL_API_KEY=your_mistral_key_here
|
||||
GEMINI_API_TOKEN=your_gemini_key_here
|
||||
847
deploy/aws/docker/README.md
Normal file
847
deploy/aws/docker/README.md
Normal file
@@ -0,0 +1,847 @@
|
||||
# Crawl4AI Docker Guide 🐳
|
||||
|
||||
## Table of Contents
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Local Build](#local-build)
|
||||
- [Docker Hub](#docker-hub)
|
||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||
- [Using the API](#using-the-api)
|
||||
- [Understanding Request Schema](#understanding-request-schema)
|
||||
- [REST API Examples](#rest-api-examples)
|
||||
- [Python SDK](#python-sdk)
|
||||
- [Metrics & Monitoring](#metrics--monitoring)
|
||||
- [Deployment Scenarios](#deployment-scenarios)
|
||||
- [Complete Examples](#complete-examples)
|
||||
- [Getting Help](#getting-help)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before we dive in, make sure you have:
|
||||
- Docker installed and running (version 20.10.0 or higher)
|
||||
- At least 4GB of RAM available for the container
|
||||
- Python 3.10+ (if using the Python SDK)
|
||||
- Node.js 16+ (if using the Node.js examples)
|
||||
|
||||
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
|
||||
|
||||
## Installation
|
||||
|
||||
### Local Build
|
||||
|
||||
Let's get your local environment set up step by step!
|
||||
|
||||
#### 1. Building the Image
|
||||
|
||||
First, clone the repository and build the Docker image:
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai/deploy
|
||||
|
||||
# Build the Docker image
|
||||
docker build --platform=linux/amd64 --no-cache -t crawl4ai .
|
||||
|
||||
# Or build for arm64
|
||||
docker build --platform=linux/arm64 --no-cache -t crawl4ai .
|
||||
```
|
||||
|
||||
#### 2. Environment Setup
|
||||
|
||||
If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file:
|
||||
|
||||
```env
|
||||
# OpenAI
|
||||
OPENAI_API_KEY=sk-your-key
|
||||
|
||||
# Anthropic
|
||||
ANTHROPIC_API_KEY=your-anthropic-key
|
||||
|
||||
# DeepSeek
|
||||
DEEPSEEK_API_KEY=your-deepseek-key
|
||||
|
||||
# Check out https://docs.litellm.ai/docs/providers for more providers!
|
||||
```
|
||||
|
||||
> 🔑 **Note**: Keep your API keys secure! Never commit them to version control.
|
||||
|
||||
#### 3. Running the Container
|
||||
|
||||
You have several options for running the container:
|
||||
|
||||
Basic run (no LLM support):
|
||||
```bash
|
||||
docker run -d -p 8000:8000 --name crawl4ai crawl4ai
|
||||
```
|
||||
|
||||
With LLM support:
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
--env-file .llm.env \
|
||||
--name crawl4ai \
|
||||
crawl4ai
|
||||
```
|
||||
|
||||
Using host environment variables (Not a good practice, but works for local testing):
|
||||
```bash
|
||||
docker run -d -p 8000:8000 \
|
||||
--env-file .llm.env \
|
||||
--env "$(env)" \
|
||||
--name crawl4ai \
|
||||
crawl4ai
|
||||
```
|
||||
|
||||
#### Multi-Platform Build
|
||||
For distributing your image across different architectures, use `buildx`:
|
||||
|
||||
```bash
|
||||
# Set up buildx builder
|
||||
docker buildx create --use
|
||||
|
||||
# Build for multiple platforms
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
-t crawl4ai \
|
||||
--push \
|
||||
.
|
||||
```
|
||||
|
||||
> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry.
|
||||
|
||||
#### Development Build
|
||||
For development, you might want to enable all features:
|
||||
|
||||
```bash
|
||||
docker build -t crawl4ai \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
--build-arg PYTHON_VERSION=3.10 \
|
||||
--build-arg ENABLE_GPU=true \
|
||||
.
|
||||
```
|
||||
|
||||
#### GPU-Enabled Build
|
||||
If you plan to use GPU acceleration:
|
||||
|
||||
```bash
|
||||
docker build -t crawl4ai \
|
||||
--build-arg ENABLE_GPU=true \
|
||||
deploy/docker/
|
||||
```
|
||||
|
||||
### Build Arguments Explained
|
||||
|
||||
| Argument | Description | Default | Options |
|
||||
|----------|-------------|---------|----------|
|
||||
| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
|
||||
| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
|
||||
| ENABLE_GPU | GPU support | false | true, false |
|
||||
| APP_HOME | Install path | /app | any valid path |
|
||||
|
||||
### Build Best Practices
|
||||
|
||||
1. **Choose the Right Install Type**
|
||||
- `default`: Basic installation, smallest image, to be honest, I use this most of the time.
|
||||
- `all`: Full features, larger image (include transformer, and nltk, make sure you really need them)
|
||||
|
||||
2. **Platform Considerations**
|
||||
- Let Docker auto-detect platform unless you need cross-compilation
|
||||
- Use --platform for specific architecture requirements
|
||||
- Consider buildx for multi-architecture distribution
|
||||
|
||||
3. **Performance Optimization**
|
||||
- The image automatically includes platform-specific optimizations
|
||||
- AMD64 gets OpenMP optimizations
|
||||
- ARM64 gets OpenBLAS optimizations
|
||||
|
||||
### Docker Hub
|
||||
|
||||
> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned!
|
||||
|
||||
## Using the API
|
||||
|
||||
In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by using the JSON structure and passing it to all the URLs, which I will explain in detail.
|
||||
|
||||
### Python SDK
|
||||
|
||||
The SDK makes things easier! Here's how to use it:
|
||||
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
|
||||
# If JWT is enabled, you can authenticate like this: (more on this later)
|
||||
# await client.authenticate("test@example.com")
|
||||
|
||||
# Non-streaming crawl
|
||||
results = await client.crawl(
|
||||
["https://example.com", "https://python.org"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig()
|
||||
)
|
||||
print(f"Non-streaming results: {results}")
|
||||
|
||||
# Streaming crawl
|
||||
crawler_config = CrawlerRunConfig(stream=True)
|
||||
async for result in await client.crawl(
|
||||
["https://example.com", "https://python.org"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=crawler_config
|
||||
):
|
||||
print(f"Streamed result: {result}")
|
||||
|
||||
# Get schema
|
||||
schema = await client.get_schema()
|
||||
print(f"Schema: {schema}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
|
||||
|
||||
- `base_url` (str): Base URL of the Crawl4AI Docker server
|
||||
- `timeout` (float): Default timeout for requests in seconds
|
||||
- `verify_ssl` (bool): Whether to verify SSL certificates
|
||||
- `verbose` (bool): Whether to show logging output
|
||||
- `log_file` (str, optional): Path to log file if file logging is desired
|
||||
|
||||
This client SDK generates a properly structured JSON request for the server's HTTP API.
|
||||
|
||||
## Second Approach: Direct API Calls
|
||||
|
||||
This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
|
||||
|
||||
### Understanding Configuration Structure
|
||||
|
||||
Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity.
|
||||
|
||||
#### The Basic Pattern
|
||||
|
||||
Try this in Python to understand the structure:
|
||||
```python
|
||||
from crawl4ai import BrowserConfig
|
||||
|
||||
# Create a config and see its structure
|
||||
config = BrowserConfig(headless=True)
|
||||
print(config.dump())
|
||||
```
|
||||
|
||||
This outputs:
|
||||
```json
|
||||
{
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": true
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Simple vs Complex Values
|
||||
|
||||
The structure follows these rules:
|
||||
- Simple values (strings, numbers, booleans, lists) are passed directly
|
||||
- Complex values (classes, dictionaries) use the type-params pattern
|
||||
|
||||
For example, with dictionaries:
|
||||
```json
|
||||
{
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": true, // Simple boolean - direct value
|
||||
"viewport": { // Complex dictionary - needs type-params
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"width": 1200,
|
||||
"height": 800
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Strategy Pattern and Nesting
|
||||
|
||||
Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration:
|
||||
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"chunking_strategy": {
|
||||
"type": "RegexChunking", // Strategy implementation
|
||||
"params": {
|
||||
"patterns": ["\n\n", "\\.\\s+"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy.
|
||||
|
||||
#### Complex Nested Example
|
||||
|
||||
Let's look at a more complex example with content filtering:
|
||||
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This shows how deeply configurations can nest while maintaining a consistent structure.
|
||||
|
||||
#### Quick Grammar Overview
|
||||
```
|
||||
config := {
|
||||
"type": string,
|
||||
"params": {
|
||||
key: simple_value | complex_value
|
||||
}
|
||||
}
|
||||
|
||||
simple_value := string | number | boolean | [simple_value]
|
||||
complex_value := config | dict_value
|
||||
|
||||
dict_value := {
|
||||
"type": "dict",
|
||||
"value": object
|
||||
}
|
||||
```
|
||||
|
||||
#### Important Rules 🚨
|
||||
|
||||
- Always use the type-params pattern for class instances
|
||||
- Use direct values for primitives (numbers, strings, booleans)
|
||||
- Wrap dictionaries with {"type": "dict", "value": {...}}
|
||||
- Arrays/lists are passed directly without type-params
|
||||
- All parameters are optional unless specifically required
|
||||
|
||||
#### Pro Tip 💡
|
||||
|
||||
The easiest way to get the correct structure is to:
|
||||
1. Create configuration objects in Python
|
||||
2. Use the `dump()` method to see their JSON representation
|
||||
3. Use that JSON in your API calls
|
||||
|
||||
Example:
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, PruningContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
)
|
||||
print(config.dump()) # Use this JSON in your API calls
|
||||
```
|
||||
|
||||
|
||||
#### More Examples
|
||||
|
||||
**Advanced Crawler Configuration**
|
||||
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": "bypass",
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed",
|
||||
"min_word_threshold": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Extraction Strategy**:
|
||||
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"baseSelector": "article.post",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**LLM Extraction Strategy**
|
||||
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract article title, author, publication date and main content",
|
||||
"provider": "openai/gpt-4",
|
||||
"api_token": "your-api-token",
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"title": "Article Schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "The article's headline"
|
||||
},
|
||||
"author": {
|
||||
"type": "string",
|
||||
"description": "The author's name"
|
||||
},
|
||||
"published_date": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Publication date and time"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "The main article content"
|
||||
}
|
||||
},
|
||||
"required": ["title", "content"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Deep Crawler Example**
|
||||
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": 3,
|
||||
"max_pages": 100,
|
||||
"filter_chain": {
|
||||
"type": "FastFilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "FastContentTypeFilter",
|
||||
"params": {
|
||||
"allowed_types": ["text/html", "application/xhtml+xml"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastDomainFilter",
|
||||
"params": {
|
||||
"allowed_domains": ["blog.*", "docs.*"],
|
||||
"blocked_domains": ["ads.*", "analytics.*"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastURLPatternFilter",
|
||||
"params": {
|
||||
"allowed_patterns": ["^/blog/", "^/docs/"],
|
||||
"blocked_patterns": [".*/ads/", ".*/sponsored/"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"url_scorer": {
|
||||
"type": "FastCompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{
|
||||
"type": "FastKeywordRelevanceScorer",
|
||||
"params": {
|
||||
"keywords": ["tutorial", "guide", "documentation"],
|
||||
"weight": 1.0
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastPathDepthScorer",
|
||||
"params": {
|
||||
"weight": 0.5,
|
||||
"preferred_depth": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastFreshnessScorer",
|
||||
"params": {
|
||||
"weight": 0.8,
|
||||
"max_age_days": 365
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### REST API Examples
|
||||
|
||||
Let's look at some practical examples:
|
||||
|
||||
#### Simple Crawl
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
crawl_payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": True},
|
||||
"crawler_config": {"stream": False}
|
||||
}
|
||||
response = requests.post(
|
||||
"http://localhost:8000/crawl",
|
||||
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later
|
||||
json=crawl_payload
|
||||
)
|
||||
print(response.json()) # Print the response for debugging
|
||||
```
|
||||
|
||||
#### Streaming Results
|
||||
|
||||
```python
|
||||
async def test_stream_crawl(session, token: str):
|
||||
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||
url = "http://localhost:8000/crawl/stream"
|
||||
payload = {
|
||||
"urls": [
|
||||
"https://example.com",
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page3",
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
}
|
||||
|
||||
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
|
||||
|
||||
try:
|
||||
    async with session.post(url, json=payload) as response:  # pass headers=headers if JWT is enabled
|
||||
status = response.status
|
||||
print(f"Status: {status} (Expected: 200)")
|
||||
assert status == 200, f"Expected 200, got {status}"
|
||||
|
||||
# Read streaming response line-by-line (NDJSON)
|
||||
async for line in response.content:
|
||||
if line:
|
||||
data = json.loads(line.decode('utf-8').strip())
|
||||
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||
except Exception as e:
|
||||
print(f"Error in streaming crawl test: {str(e)}")
|
||||
```
|
||||
|
||||
## Metrics & Monitoring
|
||||
|
||||
Keep an eye on your crawler with these endpoints:
|
||||
|
||||
- `/health` - Quick health check
|
||||
- `/metrics` - Detailed Prometheus metrics
|
||||
- `/schema` - Full API schema
|
||||
|
||||
Example health check:
|
||||
```bash
|
||||
curl http://localhost:8000/health
|
||||
```
|
||||
|
||||
## Deployment Scenarios
|
||||
|
||||
> 🚧 Coming soon! We'll cover:
|
||||
> - Kubernetes deployment
|
||||
> - Cloud provider setups (AWS, GCP, Azure)
|
||||
> - High-availability configurations
|
||||
> - Load balancing strategies
|
||||
|
||||
## Complete Examples
|
||||
|
||||
Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
|
||||
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py)
|
||||
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py)
|
||||
|
||||
## Server Configuration
|
||||
|
||||
The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security.
|
||||
|
||||
### Understanding config.yml
|
||||
|
||||
The configuration file is located at `deploy/docker/config.yml`. You can either modify this file before building the image or mount a custom configuration when running the container.
|
||||
|
||||
Here's a detailed breakdown of the configuration options:
|
||||
|
||||
```yaml
|
||||
# Application Configuration
|
||||
app:
|
||||
title: "Crawl4AI API" # Server title in OpenAPI docs
|
||||
version: "1.0.0" # API version
|
||||
host: "0.0.0.0" # Listen on all interfaces
|
||||
port: 8000 # Server port
|
||||
reload: True # Enable hot reloading (development only)
|
||||
timeout_keep_alive: 300 # Keep-alive timeout in seconds
|
||||
|
||||
# Rate Limiting Configuration
|
||||
rate_limiting:
|
||||
enabled: True # Enable/disable rate limiting
|
||||
default_limit: "100/minute" # Rate limit format: "number/timeunit"
|
||||
trusted_proxies: [] # List of trusted proxy IPs
|
||||
storage_uri: "memory://" # Use "redis://localhost:6379" for production
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: false # Master toggle for security features
|
||||
jwt_enabled: true # Enable JWT authentication
|
||||
https_redirect: True # Force HTTPS
|
||||
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
|
||||
headers: # Security headers
|
||||
x_content_type_options: "nosniff"
|
||||
x_frame_options: "DENY"
|
||||
content_security_policy: "default-src 'self'"
|
||||
strict_transport_security: "max-age=63072000; includeSubDomains"
|
||||
|
||||
# Crawler Configuration
|
||||
crawler:
|
||||
memory_threshold_percent: 95.0 # Memory usage threshold
|
||||
rate_limiter:
|
||||
base_delay: [1.0, 2.0] # Min and max delay between requests
|
||||
timeouts:
|
||||
stream_init: 30.0 # Stream initialization timeout
|
||||
batch_process: 300.0 # Batch processing timeout
|
||||
|
||||
# Logging Configuration
|
||||
logging:
|
||||
level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR)
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
|
||||
# Observability Configuration
|
||||
observability:
|
||||
prometheus:
|
||||
enabled: True # Enable Prometheus metrics
|
||||
endpoint: "/metrics" # Metrics endpoint
|
||||
health_check:
|
||||
endpoint: "/health" # Health check endpoint
|
||||
```
|
||||
|
||||
### JWT Authentication
|
||||
|
||||
When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works:
|
||||
|
||||
#### Getting a Token
|
||||
```python
|
||||
POST /token
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"email": "user@example.com"
|
||||
}
|
||||
```
|
||||
|
||||
The endpoint returns:
|
||||
```json
|
||||
{
|
||||
"email": "user@example.com",
|
||||
"access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...",
|
||||
"token_type": "bearer"
|
||||
}
|
||||
```
|
||||
|
||||
#### Using the Token
|
||||
Add the token to your requests:
|
||||
```bash
|
||||
curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl
|
||||
```
|
||||
|
||||
Using the Python SDK:
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
async with Crawl4aiDockerClient() as client:
|
||||
# Authenticate first
|
||||
await client.authenticate("user@example.com")
|
||||
|
||||
# Now all requests will include the token automatically
|
||||
result = await client.crawl(urls=["https://example.com"])
|
||||
```
|
||||
|
||||
#### Production Considerations 💡
|
||||
The default implementation uses a simple email verification. For production use, consider:
|
||||
- Email verification via OTP/magic links
|
||||
- OAuth2 integration
|
||||
- Rate limiting token generation
|
||||
- Token expiration and refresh mechanisms
|
||||
- IP-based restrictions
|
||||
|
||||
### Configuration Tips and Best Practices
|
||||
|
||||
1. **Production Settings** 🏭
|
||||
|
||||
```yaml
|
||||
app:
|
||||
reload: False # Disable reload in production
|
||||
timeout_keep_alive: 120 # Lower timeout for better resource management
|
||||
|
||||
rate_limiting:
|
||||
storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting
|
||||
default_limit: "50/minute" # More conservative rate limit
|
||||
|
||||
security:
|
||||
enabled: true # Enable all security features
|
||||
trusted_hosts: ["your-domain.com"] # Restrict to your domain
|
||||
```
|
||||
|
||||
2. **Development Settings** 🛠️
|
||||
|
||||
```yaml
|
||||
app:
|
||||
reload: True # Enable hot reloading
|
||||
timeout_keep_alive: 300 # Longer timeout for debugging
|
||||
|
||||
logging:
|
||||
level: "DEBUG" # More verbose logging
|
||||
```
|
||||
|
||||
3. **High-Traffic Settings** 🚦
|
||||
|
||||
```yaml
|
||||
crawler:
|
||||
memory_threshold_percent: 85.0 # More conservative memory limit
|
||||
rate_limiter:
|
||||
base_delay: [2.0, 4.0] # More aggressive rate limiting
|
||||
```
|
||||
|
||||
### Customizing Your Configuration
|
||||
|
||||
#### Method 1: Pre-build Configuration
|
||||
|
||||
```bash
|
||||
# Copy and modify config before building
|
||||
cd crawl4ai/deploy
|
||||
vim custom-config.yml # Or use any editor
|
||||
|
||||
# Build with custom config
|
||||
docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest .
|
||||
```
|
||||
|
||||
#### Method 2: Build-time Configuration
|
||||
|
||||
Use a custom config during build:
|
||||
|
||||
```bash
|
||||
# Build with custom config
|
||||
docker build --platform=linux/amd64 --no-cache \
|
||||
--build-arg CONFIG_PATH=/path/to/custom-config.yml \
|
||||
-t crawl4ai:latest .
|
||||
```
|
||||
|
||||
#### Method 3: Runtime Configuration
|
||||
```bash
|
||||
# Mount custom config at runtime
|
||||
docker run -d -p 8000:8000 \
|
||||
-v $(pwd)/custom-config.yml:/app/config.yml \
|
||||
crawl4ai-server:prod
|
||||
```
|
||||
|
||||
> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to deploy directory.
|
||||
> 💡 Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config.
|
||||
|
||||
### Configuration Recommendations
|
||||
|
||||
1. **Security First** 🔒
|
||||
- Always enable security in production
|
||||
- Use specific trusted_hosts instead of wildcards
|
||||
- Set up proper rate limiting to protect your server
|
||||
- Consider your environment before enabling HTTPS redirect
|
||||
|
||||
2. **Resource Management** 💻
|
||||
- Adjust memory_threshold_percent based on available RAM
|
||||
- Set timeouts according to your content size and network conditions
|
||||
- Use Redis for rate limiting in multi-container setups
|
||||
|
||||
3. **Monitoring** 📊
|
||||
- Enable Prometheus if you need metrics
|
||||
- Set DEBUG logging in development, INFO in production
|
||||
- Regular health check monitoring is crucial
|
||||
|
||||
4. **Performance Tuning** ⚡
|
||||
- Start with conservative rate limiter delays
|
||||
- Increase batch_process timeout for large content
|
||||
- Adjust stream_init timeout based on initial response times
|
||||
|
||||
## Getting Help
|
||||
|
||||
We're here to help you succeed with Crawl4AI! Here's how to get support:
|
||||
|
||||
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
|
||||
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
|
||||
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
|
||||
- ⭐ Star us on GitHub to show support!
|
||||
|
||||
## Summary
|
||||
|
||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||
- Building and running the Docker container
|
||||
- Configuring the environment
|
||||
- Making API requests with proper typing
|
||||
- Using the Python SDK
|
||||
- Monitoring your deployment
|
||||
|
||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
||||
|
||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||
|
||||
Happy crawling! 🕷️
|
||||
442
deploy/aws/docker/api.py
Normal file
442
deploy/aws/docker/api.py
Normal file
@@ -0,0 +1,442 @@
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
from typing import List, Tuple
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
from urllib.parse import unquote
|
||||
from fastapi import HTTPException, Request, status
|
||||
from fastapi.background import BackgroundTasks
|
||||
from fastapi.responses import JSONResponse
|
||||
from redis import asyncio as aioredis
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
LLMExtractionStrategy,
|
||||
CacheMode,
|
||||
BrowserConfig,
|
||||
MemoryAdaptiveDispatcher,
|
||||
RateLimiter
|
||||
)
|
||||
from crawl4ai.utils import perform_completion_with_backoff
|
||||
from crawl4ai.content_filter_strategy import (
|
||||
PruningContentFilter,
|
||||
BM25ContentFilter,
|
||||
LLMContentFilter
|
||||
)
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
|
||||
from utils import (
|
||||
TaskStatus,
|
||||
FilterType,
|
||||
get_base_url,
|
||||
is_task_id,
|
||||
should_cleanup_task,
|
||||
decode_redis_hash
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def handle_llm_qa(
    url: str,
    query: str,
    config: dict
) -> str:
    """Answer *query* with an LLM, using the crawled page at *url* as context.

    Raises HTTPException(500) if the crawl fails or the completion errors.
    """
    try:
        # Drop a trailing '?q=...' segment so only the page URL is crawled.
        q_marker = url.rfind('?q=')
        if q_marker >= 0:
            url = url[:q_marker]

        # Crawl the page; its filtered markdown becomes the LLM context.
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url)
            if not result.success:
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=result.error_message
                )
            content = result.markdown_v2.fit_markdown

        prompt = f"""Use the following content as context to answer the question.
Content:
{content}

Question: {query}

Answer:"""

        response = perform_completion_with_backoff(
            provider=config["llm"]["provider"],
            prompt_with_variables=prompt,
            api_token=os.environ.get(config["llm"].get("api_key_env", ""))
        )

        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"QA processing error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
|
||||
|
||||
async def process_llm_extraction(
    redis: aioredis.Redis,
    config: dict,
    task_id: str,
    url: str,
    instruction: str,
    schema: Optional[str] = None,
    cache: str = "0"
) -> None:
    """Process LLM extraction in the background.

    Crawls *url* with an LLM extraction strategy and records the outcome on
    the Redis task hash: COMPLETED with a JSON result, or FAILED with an
    error message. Never raises — all failures are written to the task.
    """
    try:
        # A directly-configured api_key takes precedence over api_key_env.
        if "api_key" in config["llm"]:
            api_key = config["llm"]["api_key"]
        else:
            env_name = config["llm"].get("api_key_env")
            # Guard: os.environ.get(None) raises TypeError, so only consult
            # the environment when an env-var name is actually configured.
            api_key = os.environ.get(env_name, "") if env_name else ""

        llm_strategy = LLMExtractionStrategy(
            provider=config["llm"]["provider"],
            api_token=api_key,
            instruction=instruction,
            schema=json.loads(schema) if schema else None,
        )

        # cache == "1" enables read+write caching; otherwise write-only.
        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=url,
                config=CrawlerRunConfig(
                    extraction_strategy=llm_strategy,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )

            if not result.success:
                await redis.hset(f"task:{task_id}", mapping={
                    "status": TaskStatus.FAILED,
                    "error": result.error_message
                })
                return

            try:
                content = json.loads(result.extracted_content)
            except json.JSONDecodeError:
                # Keep the raw payload when the strategy returns non-JSON text.
                content = result.extracted_content
            await redis.hset(f"task:{task_id}", mapping={
                "status": TaskStatus.COMPLETED,
                "result": json.dumps(content)
            })

    except Exception as e:
        logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
        await redis.hset(f"task:{task_id}", mapping={
            "status": TaskStatus.FAILED,
            "error": str(e)
        })
|
||||
|
||||
async def handle_markdown_request(
    url: str,
    filter_type: FilterType,
    query: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> str:
    """Handle markdown generation requests.

    Crawls *url* and returns its markdown, post-processed by the content
    filter selected via *filter_type* (RAW returns the unfiltered markdown).

    Raises HTTPException(500) on crawl or processing failure.
    """
    try:
        decoded_url = unquote(url)
        if not decoded_url.startswith(('http://', 'https://')):
            decoded_url = 'https://' + decoded_url

        # Build only the filter that was requested. The previous dict-based
        # dispatch instantiated every filter eagerly, which both wasted work
        # and dereferenced config["llm"] (crashing on config=None) even for
        # non-LLM filter types.
        if filter_type == FilterType.RAW:
            md_generator = DefaultMarkdownGenerator()
        else:
            if filter_type == FilterType.FIT:
                content_filter = PruningContentFilter()
            elif filter_type == FilterType.BM25:
                content_filter = BM25ContentFilter(user_query=query or "")
            else:  # FilterType.LLM
                env_name = config["llm"].get("api_key_env")
                content_filter = LLMContentFilter(
                    provider=config["llm"]["provider"],
                    # os.environ.get(None) raises TypeError; only consult the
                    # environment when an env-var name is configured.
                    api_token=os.environ.get(env_name, "") if env_name else "",
                    instruction=query or "Extract main content"
                )
            md_generator = DefaultMarkdownGenerator(content_filter=content_filter)

        cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY

        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=decoded_url,
                config=CrawlerRunConfig(
                    markdown_generator=md_generator,
                    scraping_strategy=LXMLWebScrapingStrategy(),
                    cache_mode=cache_mode
                )
            )

            if not result.success:
                raise HTTPException(
                    status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                    detail=result.error_message
                )

            return (result.markdown_v2.raw_markdown
                    if filter_type == FilterType.RAW
                    else result.markdown_v2.fit_markdown)

    except Exception as e:
        logger.error(f"Markdown error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
|
||||
|
||||
async def handle_llm_request(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    request: Request,
    input_path: str,
    query: Optional[str] = None,
    schema: Optional[str] = None,
    cache: str = "0",
    config: Optional[dict] = None
) -> JSONResponse:
    """Handle LLM extraction requests.

    Dispatches to a status lookup when *input_path* is a task id; otherwise
    creates a new background extraction task for the given URL.
    """
    base_url = get_base_url(request)

    try:
        # A task id means the caller is polling an existing job.
        if is_task_id(input_path):
            return await handle_task_status(redis, input_path, base_url)

        # Without an instruction there is nothing to extract; offer an example.
        if not query:
            return JSONResponse({
                "message": "Please provide an instruction",
                "_links": {
                    "example": {
                        "href": f"{base_url}/llm/{input_path}?q=Extract+main+content",
                        "title": "Try this example"
                    }
                }
            })

        return await create_new_task(
            redis,
            background_tasks,
            input_path,
            query,
            schema,
            cache,
            base_url,
            config
        )

    except Exception as e:
        logger.error(f"LLM endpoint error: {str(e)}", exc_info=True)
        return JSONResponse({
            "error": str(e),
            "_links": {
                "retry": {"href": str(request.url)}
            }
        }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
|
||||
|
||||
async def handle_task_status(
    redis: aioredis.Redis,
    task_id: str,
    base_url: str
) -> JSONResponse:
    """Return the current state of a background task (404 if unknown).

    Terminal tasks older than the cleanup TTL are deleted after the
    response payload has been built.
    """
    raw = await redis.hgetall(f"task:{task_id}")
    if not raw:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Task not found"
        )

    task = decode_redis_hash(raw)
    response = create_task_response(task, task_id, base_url)

    is_terminal = task["status"] in (TaskStatus.COMPLETED, TaskStatus.FAILED)
    if is_terminal and should_cleanup_task(task["created_at"]):
        await redis.delete(f"task:{task_id}")

    return JSONResponse(response)
|
||||
|
||||
async def create_new_task(
    redis: aioredis.Redis,
    background_tasks: BackgroundTasks,
    input_path: str,
    query: str,
    schema: Optional[str],
    cache: str,
    base_url: str,
    config: dict
) -> JSONResponse:
    """Create and initialize a new LLM-extraction task.

    Normalizes *input_path* to a full URL, records the task in Redis, and
    schedules the extraction to run in the background. Returns the task id
    and links for polling its status.
    """
    from datetime import datetime
    from uuid import uuid4

    decoded_url = unquote(input_path)
    if not decoded_url.startswith(('http://', 'https://')):
        decoded_url = 'https://' + decoded_url

    # uuid4 suffix avoids collisions between tasks created in the same second
    # (the previous id(background_tasks)-based suffix could repeat across
    # requests). Keeps the "llm_..." prefix that is_task_id() relies on.
    task_id = f"llm_{int(datetime.now().timestamp())}_{uuid4().hex[:8]}"

    await redis.hset(f"task:{task_id}", mapping={
        "status": TaskStatus.PROCESSING,
        "created_at": datetime.now().isoformat(),
        "url": decoded_url
    })

    background_tasks.add_task(
        process_llm_extraction,
        redis,
        config,
        task_id,
        decoded_url,
        query,
        schema,
        cache
    )

    return JSONResponse({
        "task_id": task_id,
        "status": TaskStatus.PROCESSING,
        "url": decoded_url,
        "_links": {
            "self": {"href": f"{base_url}/llm/{task_id}"},
            "status": {"href": f"{base_url}/llm/{task_id}"}
        }
    })
|
||||
|
||||
def create_task_response(task: dict, task_id: str, base_url: str) -> dict:
    """Build the JSON body for a task status check.

    Adds "result" (decoded JSON) for completed tasks and "error" for
    failed ones.
    """
    poll_href = f"{base_url}/llm/{task_id}"
    response = {
        "task_id": task_id,
        "status": task["status"],
        "created_at": task["created_at"],
        "url": task["url"],
        "_links": {
            "self": {"href": poll_href},
            "refresh": {"href": poll_href}
        }
    }

    task_status = task["status"]
    if task_status == TaskStatus.COMPLETED:
        # The result was stored as a JSON string; return structured data.
        response["result"] = json.loads(task["result"])
    elif task_status == TaskStatus.FAILED:
        response["error"] = task["error"]

    return response
|
||||
|
||||
async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
    """Stream crawl results as newline-delimited JSON (NDJSON).

    Serialization failures for a single result are reported inline instead of
    aborting the stream; a {"status": "completed"} marker terminates it. The
    crawler is always closed when streaming ends, even on client disconnect.
    """
    import json
    from utils import datetime_handler

    try:
        async for result in results_gen:
            try:
                result_dict = result.model_dump()
                logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
                data = json.dumps(result_dict, default=datetime_handler) + "\n"
                yield data.encode('utf-8')
            except Exception as e:
                logger.error(f"Serialization error: {e}")
                error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')}
                yield (json.dumps(error_response) + "\n").encode('utf-8')

        # Fix: the completion marker previously lacked a trailing newline,
        # which broke NDJSON framing for the final line.
        yield (json.dumps({"status": "completed"}) + "\n").encode('utf-8')

    except asyncio.CancelledError:
        logger.warning("Client disconnected during streaming")
    finally:
        try:
            await crawler.close()
        except Exception as e:
            logger.error(f"Crawler cleanup error: {e}")
|
||||
|
||||
async def handle_crawl_request(
    urls: List[str],
    browser_config: dict,
    crawler_config: dict,
    config: dict
) -> dict:
    """Crawl *urls* in one batch and return all results.

    Raises HTTPException(500) on any failure.
    """
    try:
        browser_cfg = BrowserConfig.load(browser_config)
        crawler_cfg = CrawlerRunConfig.load(crawler_config)

        # Memory-aware dispatch with a rate limiter configured from app config.
        rate_limiter = RateLimiter(
            base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
        )
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
            rate_limiter=rate_limiter
        )

        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            results = await crawler.arun_many(
                urls=urls,
                config=crawler_cfg,
                dispatcher=dispatcher
            )

            return {
                "success": True,
                "results": [result.model_dump() for result in results]
            }

    except Exception as e:
        logger.error(f"Crawl error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
|
||||
|
||||
async def handle_stream_crawl_request(
    urls: List[str],
    browser_config: dict,
    crawler_config: dict,
    config: dict
) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
    """Start a streaming crawl of *urls*.

    Returns the started crawler together with the async result generator;
    the caller owns the crawler and must close it when streaming finishes.
    """
    crawler = None
    try:
        browser_cfg = BrowserConfig.load(browser_config)
        browser_cfg.verbose = True
        crawler_cfg = CrawlerRunConfig.load(crawler_config)
        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()

        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
            rate_limiter=RateLimiter(
                base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
            )
        )

        crawler = AsyncWebCrawler(config=browser_cfg)
        await crawler.start()

        results_gen = await crawler.arun_many(
            urls=urls,
            config=crawler_cfg,
            dispatcher=dispatcher
        )

        return crawler, results_gen

    except Exception as e:
        # Don't leak a started browser when setup fails after crawler.start().
        if crawler is not None:
            await crawler.close()
        logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e)
        )
|
||||
46
deploy/aws/docker/auth.py
Normal file
46
deploy/aws/docker/auth.py
Normal file
@@ -0,0 +1,46 @@
|
||||
import os
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Dict, Optional
|
||||
from jwt import JWT, jwk_from_dict
|
||||
from jwt.utils import get_int_from_datetime
|
||||
from fastapi import Depends, HTTPException
|
||||
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
||||
from pydantic import EmailStr
|
||||
from pydantic.main import BaseModel
|
||||
import base64
|
||||
|
||||
instance = JWT()
|
||||
security = HTTPBearer()
|
||||
SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret")
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
||||
|
||||
def get_jwk_from_secret(secret: str):
    """Convert a secret string into a symmetric ("oct") JWK object."""
    # JWK "oct" keys carry the secret as unpadded base64url text.
    encoded = base64.urlsafe_b64encode(secret.encode('utf-8'))
    b64_secret = encoded.rstrip(b'=').decode('utf-8')
    return jwk_from_dict({"kty": "oct", "k": b64_secret})
|
||||
|
||||
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
    """Create an HS256 JWT carrying *data* plus an "exp" claim.

    Expiry defaults to ACCESS_TOKEN_EXPIRE_MINUTES when *expires_delta*
    is not supplied.
    """
    lifetime = expires_delta or timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    expire = datetime.now(timezone.utc) + lifetime
    payload = data.copy()
    payload.update({"exp": get_int_from_datetime(expire)})
    signing_key = get_jwk_from_secret(SECRET_KEY)
    return instance.encode(payload, signing_key, alg='HS256')
|
||||
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
    """Verify the bearer JWT from the Authorization header.

    Returns the decoded payload, or raises HTTPException(401) when the
    token is invalid or expired.
    """
    verifying_key = get_jwk_from_secret(SECRET_KEY)
    try:
        # do_time_check enforces the "exp" claim set at token creation.
        return instance.decode(
            credentials.credentials, verifying_key, do_time_check=True, algorithms='HS256'
        )
    except Exception:
        raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||
|
||||
def get_token_dependency(config: Dict):
    """Return the JWT verification dependency when enabled in *config*, else None."""
    security_cfg = config.get("security", {})
    if security_cfg.get("jwt_enabled", False):
        return verify_token
    return None
|
||||
|
||||
class TokenRequest(BaseModel):
    """Request body for the /token endpoint."""
    # Email address whose domain is MX-checked before a token is issued.
    email: EmailStr
|
||||
71
deploy/aws/docker/config.yml
Normal file
71
deploy/aws/docker/config.yml
Normal file
@@ -0,0 +1,71 @@
|
||||
# Application Configuration
|
||||
app:
|
||||
title: "Crawl4AI API"
|
||||
version: "1.0.0"
|
||||
host: "0.0.0.0"
|
||||
port: 8000
|
||||
reload: True
|
||||
timeout_keep_alive: 300
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
# Redis Configuration
|
||||
redis:
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
db: 0
|
||||
password: ""
|
||||
ssl: False
|
||||
ssl_cert_reqs: None
|
||||
ssl_ca_certs: None
|
||||
ssl_certfile: None
|
||||
ssl_keyfile: None
|
||||
|
||||
# Rate Limiting Configuration
|
||||
rate_limiting:
|
||||
enabled: True
|
||||
default_limit: "1000/minute"
|
||||
trusted_proxies: []
|
||||
storage_uri: "memory://" # Use "redis://localhost:6379" for production
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: true
|
||||
jwt_enabled: true
|
||||
https_redirect: false
|
||||
trusted_hosts: ["*"]
|
||||
headers:
|
||||
x_content_type_options: "nosniff"
|
||||
x_frame_options: "DENY"
|
||||
content_security_policy: "default-src 'self'"
|
||||
strict_transport_security: "max-age=63072000; includeSubDomains"
|
||||
|
||||
# Crawler Configuration
|
||||
crawler:
|
||||
memory_threshold_percent: 95.0
|
||||
rate_limiter:
|
||||
base_delay: [1.0, 2.0]
|
||||
timeouts:
|
||||
stream_init: 30.0 # Timeout for stream initialization
|
||||
batch_process: 300.0 # Timeout for batch processing
|
||||
|
||||
# Logging Configuration
|
||||
logging:
|
||||
level: "INFO"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
|
||||
# Observability Configuration
|
||||
observability:
|
||||
prometheus:
|
||||
enabled: True
|
||||
endpoint: "/metrics"
|
||||
health_check:
|
||||
endpoint: "/health"
|
||||
10
deploy/aws/docker/requirements.txt
Normal file
10
deploy/aws/docker/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
crawl4ai
|
||||
fastapi
|
||||
uvicorn
|
||||
gunicorn>=23.0.0
|
||||
slowapi>=0.1.9
|
||||
prometheus-fastapi-instrumentator>=7.0.2
|
||||
redis>=5.2.1
|
||||
jwt>=1.3.1
|
||||
dnspython>=2.7.0
|
||||
email-validator>=2.2.0
|
||||
181
deploy/aws/docker/server.py
Normal file
181
deploy/aws/docker/server.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Optional, Dict
|
||||
from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends
|
||||
from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
|
||||
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
|
||||
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
||||
from pydantic import BaseModel, Field
|
||||
from slowapi import Limiter
|
||||
from slowapi.util import get_remote_address
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
from redis import asyncio as aioredis
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
|
||||
from utils import FilterType, load_config, setup_logging, verify_email_domain
|
||||
from api import (
|
||||
handle_markdown_request,
|
||||
handle_llm_qa,
|
||||
handle_stream_crawl_request,
|
||||
handle_crawl_request,
|
||||
stream_results
|
||||
)
|
||||
from auth import create_access_token, get_token_dependency, TokenRequest # Import from auth.py
|
||||
|
||||
__version__ = "0.2.6"
|
||||
|
||||
class CrawlRequest(BaseModel):
    """Request body for the /crawl and /crawl/stream endpoints."""
    # 1..100 URLs to crawl in one request.
    urls: List[str] = Field(min_length=1, max_length=100)
    # Optional overrides for BrowserConfig / CrawlerRunConfig (dict form).
    browser_config: Optional[Dict] = Field(default_factory=dict)
    crawler_config: Optional[Dict] = Field(default_factory=dict)
|
||||
|
||||
# Load configuration and setup
|
||||
config = load_config()
|
||||
setup_logging(config)
|
||||
|
||||
# Initialize Redis
|
||||
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
|
||||
|
||||
# Initialize rate limiter
|
||||
limiter = Limiter(
|
||||
key_func=get_remote_address,
|
||||
default_limits=[config["rate_limiting"]["default_limit"]],
|
||||
storage_uri=config["rate_limiting"]["storage_uri"]
|
||||
)
|
||||
|
||||
app = FastAPI(
|
||||
title=config["app"]["title"],
|
||||
version=config["app"]["version"]
|
||||
)
|
||||
|
||||
# Configure middleware
|
||||
def setup_security_middleware(app, config):
    """Attach HTTPS-redirect and trusted-host middleware per the security config."""
    sec_config = config.get("security", {})
    if not sec_config.get("enabled", False):
        return
    if sec_config.get("https_redirect", False):
        app.add_middleware(HTTPSRedirectMiddleware)
    # A bare wildcard host list means "accept anything" — skip the middleware.
    if sec_config.get("trusted_hosts", []) != ["*"]:
        app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"])
|
||||
|
||||
setup_security_middleware(app, config)
|
||||
|
||||
# Prometheus instrumentation
|
||||
if config["observability"]["prometheus"]["enabled"]:
|
||||
Instrumentator().instrument(app).expose(app)
|
||||
|
||||
# Get token dependency based on config
|
||||
token_dependency = get_token_dependency(config)
|
||||
|
||||
# Middleware for security headers
|
||||
@app.middleware("http")
|
||||
async def add_security_headers(request: Request, call_next):
|
||||
response = await call_next(request)
|
||||
if config["security"]["enabled"]:
|
||||
response.headers.update(config["security"]["headers"])
|
||||
return response
|
||||
|
||||
# Token endpoint (always available, but usage depends on config)
|
||||
@app.post("/token")
|
||||
async def get_token(request_data: TokenRequest):
|
||||
if not verify_email_domain(request_data.email):
|
||||
raise HTTPException(status_code=400, detail="Invalid email domain")
|
||||
token = create_access_token({"sub": request_data.email})
|
||||
return {"email": request_data.email, "access_token": token, "token_type": "bearer"}
|
||||
|
||||
# Endpoints with conditional auth
|
||||
@app.get("/md/{url:path}")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def get_markdown(
|
||||
request: Request,
|
||||
url: str,
|
||||
f: FilterType = FilterType.FIT,
|
||||
q: Optional[str] = None,
|
||||
c: Optional[str] = "0",
|
||||
token_data: Optional[Dict] = Depends(token_dependency)
|
||||
):
|
||||
result = await handle_markdown_request(url, f, q, c, config)
|
||||
return PlainTextResponse(result)
|
||||
|
||||
@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
|
||||
async def llm_endpoint(
|
||||
request: Request,
|
||||
url: str = Path(...),
|
||||
q: Optional[str] = Query(None),
|
||||
token_data: Optional[Dict] = Depends(token_dependency)
|
||||
):
|
||||
if not q:
|
||||
raise HTTPException(status_code=400, detail="Query parameter 'q' is required")
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
url = 'https://' + url
|
||||
try:
|
||||
answer = await handle_llm_qa(url, q, config)
|
||||
return JSONResponse({"answer": answer})
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/schema")
|
||||
async def get_schema():
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
|
||||
|
||||
@app.get(config["observability"]["health_check"]["endpoint"])
|
||||
async def health():
|
||||
return {"status": "ok", "timestamp": time.time(), "version": __version__}
|
||||
|
||||
@app.get(config["observability"]["prometheus"]["endpoint"])
|
||||
async def metrics():
|
||||
return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])
|
||||
|
||||
@app.post("/crawl")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def crawl(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
token_data: Optional[Dict] = Depends(token_dependency)
|
||||
):
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(status_code=400, detail="At least one URL required")
|
||||
|
||||
results = await handle_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config
|
||||
)
|
||||
|
||||
return JSONResponse(results)
|
||||
|
||||
|
||||
@app.post("/crawl/stream")
|
||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||
async def crawl_stream(
|
||||
request: Request,
|
||||
crawl_request: CrawlRequest,
|
||||
token_data: Optional[Dict] = Depends(token_dependency)
|
||||
):
|
||||
if not crawl_request.urls:
|
||||
raise HTTPException(status_code=400, detail="At least one URL required")
|
||||
|
||||
crawler, results_gen = await handle_stream_crawl_request(
|
||||
urls=crawl_request.urls,
|
||||
browser_config=crawl_request.browser_config,
|
||||
crawler_config=crawl_request.crawler_config,
|
||||
config=config
|
||||
)
|
||||
|
||||
return StreamingResponse(
|
||||
stream_results(crawler, results_gen),
|
||||
media_type='application/x-ndjson',
|
||||
headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(
|
||||
"server:app",
|
||||
host=config["app"]["host"],
|
||||
port=config["app"]["port"],
|
||||
reload=config["app"]["reload"],
|
||||
timeout_keep_alive=config["app"]["timeout_keep_alive"]
|
||||
)
|
||||
12
deploy/aws/docker/supervisord.conf
Normal file
12
deploy/aws/docker/supervisord.conf
Normal file
@@ -0,0 +1,12 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
|
||||
[program:redis]
|
||||
command=redis-server
|
||||
autorestart=true
|
||||
priority=10
|
||||
|
||||
[program:gunicorn]
|
||||
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
|
||||
autorestart=true
|
||||
priority=20
|
||||
66
deploy/aws/docker/utils.py
Normal file
66
deploy/aws/docker/utils.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import dns.resolver
|
||||
import logging
|
||||
import yaml
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from fastapi import Request
|
||||
from typing import Dict, Optional
|
||||
|
||||
class TaskStatus(str, Enum):
    """Lifecycle states for a background LLM-extraction task."""
    PROCESSING = "processing"
    FAILED = "failed"
    COMPLETED = "completed"
|
||||
|
||||
class FilterType(str, Enum):
    """Markdown content-filter selector used by the /md endpoint."""
    RAW = "raw"
    FIT = "fit"
    BM25 = "bm25"
    LLM = "llm"
|
||||
|
||||
def load_config() -> Dict:
    """Load config.yml (next to this module) and return it as a dict."""
    config_path = Path(__file__).parent / "config.yml"
    with config_path.open("r") as config_file:
        return yaml.safe_load(config_file)
|
||||
|
||||
def setup_logging(config: Dict) -> None:
    """Configure root logging from the "logging" section of *config*."""
    log_cfg = config["logging"]
    logging.basicConfig(level=log_cfg["level"], format=log_cfg["format"])
|
||||
|
||||
def get_base_url(request: Request) -> str:
    """Return '<scheme>://<host[:port]>' for *request* (no path or query)."""
    url = request.url
    return f"{url.scheme}://{url.netloc}"
|
||||
|
||||
def is_task_id(value: str) -> bool:
    """Check if the value matches the task ID pattern.

    Task ids are generated as "llm_<timestamp>_<suffix>".
    """
    has_prefix = value.startswith("llm_")
    return has_prefix and "_" in value
|
||||
|
||||
def datetime_handler(obj: any) -> Optional[str]:
    """JSON `default=` hook: serialize datetime-like objects via isoformat().

    Raises TypeError for anything without an isoformat() attribute, matching
    json.dumps' contract for unserializable objects.
    """
    iso = getattr(obj, 'isoformat', None)
    if iso is not None:
        return iso()
    raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
|
||||
|
||||
def should_cleanup_task(created_at: str, ttl_seconds: float = 3600) -> bool:
    """Check if a task is older than *ttl_seconds* (default one hour).

    *created_at* is an ISO-8601 timestamp as stored on the task hash.
    The TTL was previously a hard-coded 3600; it is now a parameter with
    the same default, so existing callers are unaffected.
    """
    created = datetime.fromisoformat(created_at)
    age_seconds = (datetime.now() - created).total_seconds()
    return age_seconds > ttl_seconds
|
||||
|
||||
def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
    """Decode Redis hash data from bytes to strings.

    Keys/values that are already str (e.g. from a client created with
    decode_responses=True) pass through unchanged instead of raising
    AttributeError.
    """
    def _to_str(item):
        return item.decode('utf-8') if isinstance(item, bytes) else item

    return {_to_str(k): _to_str(v) for k, v in hash_data.items()}
|
||||
|
||||
|
||||
|
||||
def verify_email_domain(email: str) -> bool:
    """Best-effort check that the email's domain publishes at least one MX record.

    Any failure (malformed address, NXDOMAIN, DNS timeout) counts as invalid.
    """
    try:
        domain = email.split('@')[1]
        # A resolvable MX record is taken as proof the domain accepts mail.
        records = dns.resolver.resolve(domain, 'MX')
        return True if records else False
    except Exception:
        return False
|
||||
77
deploy/aws/howto.md
Normal file
77
deploy/aws/howto.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Crawl4AI API Quickstart
|
||||
|
||||
This document shows how to generate an API token and use it to call the `/crawl` and `/md` endpoints.
|
||||
|
||||
---
|
||||
|
||||
## 1. Crawl Example
|
||||
|
||||
Send a POST request to `/crawl` with the following JSON payload:
|
||||
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": { "headless": true, "verbose": true },
|
||||
"crawler_config": { "stream": false, "cache_mode": "enabled" }
|
||||
}
|
||||
```
|
||||
|
||||
**cURL Command:**
|
||||
|
||||
```bash
|
||||
curl -X POST "https://api.crawl4ai.com/crawl" \
|
||||
-H "Authorization: Bearer YOUR_API_TOKEN" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": true, "verbose": true},
|
||||
"crawler_config": {"stream": false, "cache_mode": "enabled"}
|
||||
}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 2. Markdown Retrieval Example
|
||||
|
||||
To retrieve markdown from a given URL (e.g., `https://example.com`), use:
|
||||
|
||||
```bash
|
||||
curl -X GET "https://api.crawl4ai.com/md/example.com" \
|
||||
-H "Authorization: Bearer YOUR_API_TOKEN"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Python Code Example (Using `requests`)
|
||||
|
||||
Below is a sample Python script that demonstrates using the `requests` library to call the API endpoints:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
BASE_URL = "https://api.crawl4ai.com"
|
||||
TOKEN = "YOUR_API_TOKEN" # Replace with your actual token
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {TOKEN}",
|
||||
"Content-Type": "application/json"
|
||||
}
|
||||
|
||||
# Crawl endpoint example
|
||||
crawl_payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"headless": True, "verbose": True},
|
||||
"crawler_config": {"stream": False, "cache_mode": "enabled"}
|
||||
}
|
||||
|
||||
crawl_response = requests.post(f"{BASE_URL}/crawl", json=crawl_payload, headers=headers)
|
||||
print("Crawl Response:", crawl_response.json())
|
||||
|
||||
# /md endpoint example
|
||||
md_response = requests.get(f"{BASE_URL}/md/example.com", headers=headers)
|
||||
print("Markdown Content:", md_response.text)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
Happy crawling!
|
||||
2
deploy/aws/nginx/Dockerfile
Normal file
2
deploy/aws/nginx/Dockerfile
Normal file
@@ -0,0 +1,2 @@
|
||||
FROM nginx:alpine
|
||||
COPY nginx.conf /etc/nginx/conf.d/default.conf
|
||||
55
deploy/aws/nginx/nginx.conf
Normal file
55
deploy/aws/nginx/nginx.conf
Normal file
@@ -0,0 +1,55 @@
|
||||
server {
|
||||
listen 80;
|
||||
server_name api.crawl4ai.com;
|
||||
|
||||
# Main logging settings
|
||||
error_log /var/log/nginx/error.log debug;
|
||||
access_log /var/log/nginx/access.log combined buffer=512k flush=1m;
|
||||
|
||||
# Timeout and buffering settings
|
||||
proxy_connect_timeout 300;
|
||||
proxy_send_timeout 300;
|
||||
proxy_read_timeout 300;
|
||||
send_timeout 300;
|
||||
proxy_buffer_size 128k;
|
||||
proxy_buffers 4 256k;
|
||||
proxy_busy_buffers_size 256k;
|
||||
|
||||
# Health check location
|
||||
location /health {
|
||||
proxy_pass http://127.0.0.1:8000/health;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
# Main proxy for application endpoints
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:8000;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
add_header X-Debug-Info $request_uri;
|
||||
proxy_request_buffering off;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Connection "";
|
||||
proxy_buffering off;
|
||||
}
|
||||
|
||||
# New endpoint: serve Nginx error log
|
||||
location /nginx/error {
|
||||
# Using "alias" to serve the error log file
|
||||
alias /var/log/nginx/error.log;
|
||||
# Optionally, you might restrict access with "allow" and "deny" directives.
|
||||
}
|
||||
|
||||
# New endpoint: serve Nginx access log
|
||||
location /nginx/access {
|
||||
alias /var/log/nginx/access.log;
|
||||
}
|
||||
|
||||
client_max_body_size 10M;
|
||||
client_body_buffer_size 128k;
|
||||
}
|
||||
1
deploy/aws/version.txt
Normal file
1
deploy/aws/version.txt
Normal file
@@ -0,0 +1 @@
|
||||
v0.1.0
|
||||
63
deploy/gcloud-function/Dockerfile
Normal file
63
deploy/gcloud-function/Dockerfile
Normal file
@@ -0,0 +1,63 @@
|
||||
FROM --platform=linux/amd64 python:3.10-slim
|
||||
|
||||
# Install system dependencies required for Chromium and Git
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3-dev \
|
||||
pkg-config \
|
||||
libjpeg-dev \
|
||||
gcc \
|
||||
build-essential \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libxkbcommon0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libasound2 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
procps \
|
||||
git \
|
||||
socat \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Make a directory for crawl4ai call it crawl4ai_repo
|
||||
# RUN mkdir crawl4ai_repo
|
||||
|
||||
# # Clone Crawl4ai from the next branch and install it
|
||||
# RUN git clone --branch next https://github.com/unclecode/crawl4ai.git ./crawl4ai_repo \
|
||||
# && cd crawl4ai_repo \
|
||||
# && pip install . \
|
||||
# && cd .. \
|
||||
# && rm -rf crawl4ai_repo
|
||||
|
||||
RUN python3 -m venv /app/venv
|
||||
ENV PATH="/app/venv/bin:$PATH"
|
||||
# RUN pip install git+https://github.com/unclecode/crawl4ai.git@next
|
||||
|
||||
# Copy requirements and install remaining dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
# Copy application files
|
||||
COPY resources /app/resources
|
||||
COPY main.py .
|
||||
COPY start.sh .
|
||||
|
||||
# Set permissions for Chrome binary and start script
|
||||
RUN chmod +x /app/resources/chrome/headless_shell && \
|
||||
chmod -R 755 /app/resources/chrome && \
|
||||
chmod +x start.sh
|
||||
|
||||
ENV FUNCTION_TARGET=crawl
|
||||
EXPOSE 8080 9223
|
||||
|
||||
CMD ["/app/start.sh"]
|
||||
8
deploy/gcloud-function/config.yml
Normal file
8
deploy/gcloud-function/config.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
project_id: PROJECT_ID
|
||||
region: REGION_NAME
|
||||
artifact_repo: ARTIFACT_REPO_NAME
|
||||
function_name: FUNCTION_NAME
|
||||
memory: "2048MB"
|
||||
timeout: "540s"
|
||||
local_image: "gcr.io/ARTIFACT_REPO_NAME/crawl4ai:latest"
|
||||
test_query_url: "https://example.com"
|
||||
187
deploy/gcloud-function/deploy.py
Normal file
187
deploy/gcloud-function/deploy.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import yaml
|
||||
import requests
|
||||
|
||||
def run_command(cmd, explanation, require_confirm=True, allow_already_exists=False):
    """Execute a shell command, narrating progress to stdout.

    Prints a banner with *explanation*, optionally waits for the user to
    confirm, then runs *cmd* through the shell. On failure the script
    exits — unless allow_already_exists is True and the stderr contains
    ALREADY_EXISTS, which is treated as a benign no-op (returns "").

    Returns the command's stripped stdout.
    """
    print(f"\n=== {explanation} ===")
    if require_confirm:
        input(f"Press Enter to run: [{cmd}]\n")
    print(f"Running: {cmd}")
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if completed.returncode != 0:
        # Idempotent setup steps may legitimately hit an existing resource.
        if allow_already_exists and "ALREADY_EXISTS" in completed.stderr:
            print("Repository already exists, skipping creation.")
            return ""
        print(f"Error:\n{completed.stderr}")
        sys.exit(1)
    output = completed.stdout.strip()
    if output:
        print(f"Output:\n{output}")
    return output
|
||||
|
||||
def load_config():
    """Read config.yml from the working directory and validate it.

    Exits the process if the file cannot be read/parsed or if any of the
    mandatory deployment keys is absent or empty. Returns the parsed dict.
    """
    try:
        with open("config.yml", "r") as fh:
            cfg = yaml.safe_load(fh)
    except Exception as exc:
        print(f"Failed to load config.yml: {exc}")
        sys.exit(1)
    # These keys have no sensible defaults — refuse to continue without them.
    for key in ("project_id", "region", "artifact_repo", "function_name", "local_image"):
        if not cfg.get(key):
            print(f"Missing required config parameter: {key}")
            sys.exit(1)
    return cfg
|
||||
|
||||
def deploy_function(config):
    """Build, push, and deploy the Cloud Function, then smoke-test it.

    Steps: create the Artifact Registry repository (idempotent), tag and
    push the local Docker image, deploy the Gen2 function from that image,
    open IAM to allUsers, retrieve the deployed URL, and issue one test
    request. Exits via run_command() on any command failure.
    """
    project_id = config["project_id"]
    region = config["region"]
    artifact_repo = config["artifact_repo"]
    function_name = config["function_name"]
    memory = config.get("memory", "2048MB")
    timeout = config.get("timeout", "540s")
    local_image = config["local_image"]
    test_query_url = config.get("test_query_url", "https://example.com")

    # Repository image format: "<region>-docker.pkg.dev/<project_id>/<artifact_repo>/<function_name>:latest"
    repo_image = f"{region}-docker.pkg.dev/{project_id}/{artifact_repo}/{function_name}:latest"

    # 1. Create Artifact Registry repository (skip if exists)
    cmd = f"gcloud artifacts repositories create {artifact_repo} --repository-format=docker --location={region} --project={project_id}"
    run_command(cmd, "Creating Artifact Registry repository (if it doesn't exist)", allow_already_exists=True)

    # 2. Tag the local Docker image with the repository image name
    cmd = f"docker tag {local_image} {repo_image}"
    run_command(cmd, "Tagging Docker image for Artifact Registry")

    # 3. Authenticate Docker to Artifact Registry
    cmd = f"gcloud auth configure-docker {region}-docker.pkg.dev"
    run_command(cmd, "Authenticating Docker to Artifact Registry")

    # 4. Push the tagged Docker image to Artifact Registry
    cmd = f"docker push {repo_image}"
    run_command(cmd, "Pushing Docker image to Artifact Registry")

    # 5. Deploy the Cloud Function using the custom container
    cmd = (
        f"gcloud beta functions deploy {function_name} "
        f"--gen2 "
        f"--runtime=python310 "
        f"--entry-point=crawl "
        f"--region={region} "
        f"--docker-repository={region}-docker.pkg.dev/{project_id}/{artifact_repo} "
        f"--trigger-http "
        f"--memory={memory} "
        f"--timeout={timeout} "
        f"--project={project_id}"
    )
    run_command(cmd, "Deploying Cloud Function using custom container")

    # 6. Set the Cloud Function to allow public (unauthenticated) invocations
    cmd = (
        f"gcloud functions add-iam-policy-binding {function_name} "
        f"--region={region} "
        f"--member='allUsers' "
        f"--role='roles/cloudfunctions.invoker' "
        # BUG FIX: trailing space added — adjacent f-strings previously
        # concatenated into "--project=<id>--quiet", which gcloud rejects.
        f"--project={project_id} "
        f"--quiet"
    )
    run_command(cmd, "Setting Cloud Function IAM to allow public invocations")

    # 7. Retrieve the deployed Cloud Function URL
    cmd = (
        f"gcloud functions describe {function_name} "
        f"--region={region} "
        f"--project={project_id} "
        f"--format='value(serviceConfig.uri)'"
    )
    deployed_url = run_command(cmd, "Extracting deployed Cloud Function URL", require_confirm=False)
    print("\nDeployed URL: {}\n".format(deployed_url))

    # 8. Test the deployed function with a single GET request
    test_url = f"{deployed_url}?url={test_query_url}"
    print("Testing function with: {}".format(test_url))
    try:
        response = requests.get(test_url)
        print("Response status: {}".format(response.status_code))
        print("Response body:\n{}".format(response.text))
        if response.status_code == 200:
            print("Test successful!")
        else:
            print("Non-200 response; check function logs.")
    except Exception as e:
        print("Test request error: {}".format(e))
        sys.exit(1)

    # 9. Final usage help
    print("\nDeployment complete!")
    print("Invoke your function with:")
    print(f"curl '{deployed_url}?url={test_query_url}'")
    print("For further instructions, refer to your documentation.")
|
||||
|
||||
def delete_function(config):
    """Remove the deployed Cloud Function (non-interactive via --quiet)."""
    cmd = (
        f"gcloud functions delete {config['function_name']} "
        f"--region={config['region']} "
        f"--project={config['project_id']} --quiet"
    )
    run_command(cmd, "Deleting Cloud Function")
|
||||
|
||||
def describe_function(config):
    """Look up and print the URL of the deployed Cloud Function."""
    cmd = (
        f"gcloud functions describe {config['function_name']} "
        f"--region={config['region']} "
        f"--project={config['project_id']} "
        f"--format='value(serviceConfig.uri)'"
    )
    url = run_command(cmd, "Describing Cloud Function to extract URL", require_confirm=False)
    print(f"\nCloud Function URL: {url}\n")
|
||||
|
||||
def clear_all(config):
    """Tear down both the Cloud Function and its Artifact Registry repo.

    Asks for explicit confirmation first; anything other than 'y' aborts.
    """
    print("\n=== CLEAR ALL RESOURCES ===")
    answer = input("WARNING: This will DELETE the Cloud Function and the Artifact Registry repository. Are you sure? (y/N): ")
    if answer.lower() != "y":
        print("Aborting clear operation.")
        sys.exit(0)

    # Remove the function first, then its container repository.
    delete_function(config)
    cmd = (
        f"gcloud artifacts repositories delete {config['artifact_repo']} "
        f"--location={config['region']} --project={config['project_id']} --quiet"
    )
    run_command(cmd, "Deleting Artifact Registry repository", require_confirm=False)
    print("All resources cleared.")
|
||||
|
||||
def main():
    """CLI entry point: dispatch deploy/delete/describe/clear subcommands."""
    parser = argparse.ArgumentParser(
        description="Deploy, delete, describe, or clear Cloud Function resources using config.yml"
    )
    sub = parser.add_subparsers(dest="command", required=True)
    sub.add_parser("deploy", help="Deploy the Cloud Function")
    sub.add_parser("delete", help="Delete the deployed Cloud Function")
    sub.add_parser("describe", help="Describe the Cloud Function and return its URL")
    sub.add_parser("clear", help="Delete the Cloud Function and Artifact Registry repository")

    args = parser.parse_args()
    config = load_config()

    # Table-driven dispatch; unknown commands fall back to the help text.
    handlers = {
        "deploy": deploy_function,
        "delete": delete_function,
        "describe": describe_function,
        "clear": clear_all,
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
    else:
        handler(config)


if __name__ == "__main__":
    main()
|
||||
204
deploy/gcloud-function/guide.md
Normal file
204
deploy/gcloud-function/guide.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# Deploying Crawl4ai on Google Cloud Functions
|
||||
|
||||
This guide explains how to deploy **Crawl4ai**—an open‑source web crawler library—on Google Cloud Functions Gen2 using a custom container. We assume your project folder already includes:
|
||||
|
||||
- **Dockerfile:** Builds your container image (which installs Crawl4ai from its Git repository).
|
||||
- **start.sh:** Activates your virtual environment and starts the function (using the Functions Framework).
|
||||
- **main.py:** Contains your function logic with the entry point `crawl` (and imports Crawl4ai).
|
||||
|
||||
The guide is divided into two parts:
|
||||
1. Manual deployment steps (using CLI commands)
|
||||
2. Automated deployment using a Python script (`deploy.py`)
|
||||
|
||||
---
|
||||
|
||||
## Part 1: Manual Deployment Process
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **Google Cloud Project:** Ensure your project is active and billing is enabled.
|
||||
- **Google Cloud CLI & Docker:** Installed and configured on your local machine.
|
||||
- **Permissions:** You must have rights to create Cloud Functions and Artifact Registry repositories.
|
||||
- **Files:** Your Dockerfile, start.sh, and main.py should be in the same directory.
|
||||
|
||||
### Step 1: Build Your Docker Image
|
||||
|
||||
Your Dockerfile packages Crawl4ai along with all its dependencies. Build your image with:
|
||||
|
||||
```bash
|
||||
docker build -t gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest .
|
||||
```
|
||||
|
||||
Replace `<PROJECT_ID>` with your Google Cloud project ID and `<FUNCTION_NAME>` with your chosen function name (for example, `crawl4ai-t1`).
|
||||
|
||||
### Step 2: Create an Artifact Registry Repository
|
||||
|
||||
Cloud Functions Gen2 requires your custom container image to reside in an Artifact Registry repository. Create one by running:
|
||||
|
||||
```bash
|
||||
gcloud artifacts repositories create <ARTIFACT_REPO> \
|
||||
--repository-format=docker \
|
||||
--location=<REGION> \
|
||||
--project=<PROJECT_ID>
|
||||
```
|
||||
|
||||
Replace `<ARTIFACT_REPO>` (for example, `crawl4ai`) and `<REGION>` (for example, `asia-east1`).
|
||||
> **Note:** If you receive an `ALREADY_EXISTS` error, the repository is already created; simply proceed to the next step.
|
||||
|
||||
### Step 3: Tag Your Docker Image
|
||||
|
||||
Tag your locally built Docker image so it matches the Artifact Registry format:
|
||||
|
||||
```bash
|
||||
docker tag gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
|
||||
```
|
||||
|
||||
This step “renames” the image so you can push it to your repository.
|
||||
|
||||
### Step 4: Authenticate Docker to Artifact Registry
|
||||
|
||||
Configure Docker authentication to the Artifact Registry:
|
||||
|
||||
```bash
|
||||
gcloud auth configure-docker <REGION>-docker.pkg.dev
|
||||
```
|
||||
|
||||
This ensures Docker can securely push images to your registry using your Cloud credentials.
|
||||
|
||||
### Step 5: Push the Docker Image
|
||||
|
||||
Push the tagged image to Artifact Registry:
|
||||
|
||||
```bash
|
||||
docker push <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
|
||||
```
|
||||
|
||||
Once complete, your container image (with Crawl4ai installed) is hosted in Artifact Registry.
|
||||
|
||||
### Step 6: Deploy the Cloud Function
|
||||
|
||||
Deploy your function using the custom container image. Run:
|
||||
|
||||
```bash
|
||||
gcloud beta functions deploy <FUNCTION_NAME> \
|
||||
--gen2 \
|
||||
--region=<REGION> \
|
||||
--docker-repository=<REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO> \
|
||||
--trigger-http \
|
||||
--memory=2048MB \
|
||||
--timeout=540s \
|
||||
--project=<PROJECT_ID>
|
||||
```
|
||||
|
||||
This command tells Cloud Functions Gen2 to pull your container image from Artifact Registry and deploy it. Make sure your main.py defines the `crawl` entry point.
|
||||
|
||||
### Step 7: Make the Function Public
|
||||
|
||||
To allow external (unauthenticated) access, update the function’s IAM policy:
|
||||
|
||||
```bash
|
||||
gcloud functions add-iam-policy-binding <FUNCTION_NAME> \
|
||||
--region=<REGION> \
|
||||
--member="allUsers" \
|
||||
--role="roles/cloudfunctions.invoker" \
|
||||
--project=<PROJECT_ID> \
|
||||
--quiet
|
||||
```
|
||||
|
||||
Using the `--quiet` flag ensures the command runs non‑interactively so the policy is applied immediately.
|
||||
|
||||
### Step 8: Retrieve and Test Your Function URL
|
||||
|
||||
Get the URL for your deployed function:
|
||||
|
||||
```bash
|
||||
gcloud functions describe <FUNCTION_NAME> \
|
||||
--region=<REGION> \
|
||||
--project=<PROJECT_ID> \
|
||||
--format='value(serviceConfig.uri)'
|
||||
```
|
||||
|
||||
Test your deployment with a sample GET request (using curl or your browser):
|
||||
|
||||
```bash
|
||||
curl "<FUNCTION_URL>?url=https://example.com"
|
||||
```
|
||||
|
||||
Replace `<FUNCTION_URL>` with the output URL from the previous command. A successful test (HTTP status 200) means Crawl4ai is running on Cloud Functions.
|
||||
|
||||
---
|
||||
|
||||
## Part 2: Automated Deployment with deploy.py
|
||||
|
||||
For a more streamlined process, use the provided `deploy.py` script. This Python script automates the manual steps, prompting you to confirm key actions and providing detailed logs throughout the process.
|
||||
|
||||
### What deploy.py Does:
|
||||
|
||||
- **Reads Parameters:** It loads a `config.yml` file containing all necessary parameters such as `project_id`, `region`, `artifact_repo`, `function_name`, `local_image`, etc.
|
||||
- **Creates/Skips Repository:** It creates the Artifact Registry repository (or skips if it already exists).
|
||||
- **Tags & Pushes:** It tags your local Docker image and pushes it to the Artifact Registry.
|
||||
- **Deploys the Function:** It deploys the Cloud Function with your custom container.
|
||||
- **Updates IAM:** It sets the IAM policy to allow public access (using the `--quiet` flag).
|
||||
- **Tests the Deployment:** It extracts the deployed URL and performs a test request.
|
||||
- **Additional Commands:** You can also use subcommands in the script to delete or describe the deployed function, or even clear all resources.
|
||||
|
||||
### Example config.yml
|
||||
|
||||
Create a `config.yml` file in the same folder as your Dockerfile. An example configuration:
|
||||
|
||||
```yaml
|
||||
project_id: your-project-id
|
||||
region: asia-east1
|
||||
artifact_repo: crawl4ai
|
||||
function_name: crawl4ai-t1
|
||||
memory: "2048MB"
|
||||
timeout: "540s"
|
||||
local_image: "gcr.io/your-project-id/crawl4ai-t1:latest"
|
||||
test_query_url: "https://example.com"
|
||||
```
|
||||
|
||||
### How to Use deploy.py
|
||||
|
||||
- **Deploy the Function:**
|
||||
|
||||
```bash
|
||||
python deploy.py deploy
|
||||
```
|
||||
|
||||
The script will guide you through each step, display the output, and ask for confirmation before executing critical commands.
|
||||
|
||||
- **Describe the Function:**
|
||||
|
||||
If you forget the function URL and want to retrieve it later:
|
||||
|
||||
```bash
|
||||
python deploy.py describe
|
||||
```
|
||||
|
||||
- **Delete the Function:**
|
||||
|
||||
To remove just the Cloud Function:
|
||||
|
||||
```bash
|
||||
python deploy.py delete
|
||||
```
|
||||
|
||||
- **Clear All Resources:**
|
||||
|
||||
To delete both the Cloud Function and the Artifact Registry repository:
|
||||
|
||||
```bash
|
||||
python deploy.py clear
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
This guide has walked you through two deployment methods for Crawl4ai on Google Cloud Functions Gen2:
|
||||
|
||||
1. **Manual Deployment:** Building your Docker image, pushing it to Artifact Registry, deploying the Cloud Function, and setting up IAM.
|
||||
2. **Automated Deployment:** Using `deploy.py` with a configuration file to handle the entire process interactively.
|
||||
|
||||
By following these instructions, you can deploy, test, and manage your Crawl4ai-based Cloud Function with ease. Enjoy using Crawl4ai in your cloud environment!
|
||||
|
||||
158
deploy/gcloud-function/main.py
Normal file
158
deploy/gcloud-function/main.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# Cleanup Chrome process on module unload
|
||||
import atexit
|
||||
import asyncio
|
||||
import logging
|
||||
import functions_framework
|
||||
from flask import jsonify, Request
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
import signal
|
||||
import requests
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logger.info(f"Python version: {sys.version}")
|
||||
logger.info(f"Python path: {sys.path}")
|
||||
|
||||
# Try to find where crawl4ai is coming from
|
||||
try:
|
||||
import crawl4ai
|
||||
logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
|
||||
logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")
|
||||
except ImportError:
|
||||
logger.error("Crawl4AI module not found")
|
||||
|
||||
# Now attempt the import
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Paths and constants
|
||||
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
|
||||
CDP_PORT = 9222
|
||||
|
||||
def start_chrome():
    """Start the bundled headless Chrome and wait for its CDP endpoint.

    Launches headless_shell in its own process group (so cleanup() can
    kill the whole group), then polls the DevTools ``/json/version``
    endpoint with exponential backoff until it answers.

    Returns the ``subprocess.Popen`` handle on success; raises Exception
    if the CDP endpoint never becomes reachable.
    """
    logger.debug("Starting Chrome process...")
    chrome_args = [
        CHROME_BINARY,
        f"--remote-debugging-port={CDP_PORT}",
        "--remote-debugging-address=0.0.0.0",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--headless=new",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-zygote",
        "--single-process",
        "--disable-features=site-per-process",
        "--no-first-run",
        "--disable-extensions"
    ]

    # New session/process group so the whole browser tree can be SIGTERMed.
    process = subprocess.Popen(
        chrome_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        preexec_fn=os.setsid
    )

    logger.debug(f"Chrome process started with PID: {process.pid}")

    # Wait for CDP endpoint with exponential backoff.
    wait_time = 1        # Start with 1 second
    max_wait_time = 16   # Cap at 16 seconds per retry
    max_attempts = 10    # Total attempts
    for attempt in range(max_attempts):
        try:
            response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
            if response.status_code == 200:
                # Get ws URL from response
                ws_url = response.json()['webSocketDebuggerUrl']
                logger.debug("Chrome CDP is ready")
                logger.debug(f"CDP URL: {ws_url}")
                return process
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # BUG FIX: the original caught only ConnectionError, so a
            # ConnectTimeout/ReadTimeout from the 2s request timeout escaped
            # the retry loop and aborted startup. Both now fall through to
            # the backoff below.
            pass
        logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
        time.sleep(wait_time)
        wait_time = min(wait_time * 2, max_wait_time)  # Double wait time, up to max

    # If we get here, all retries failed — surface Chrome's own output.
    stdout, stderr = process.communicate()
    logger.error(f"Chrome stdout: {stdout.decode()}")
    logger.error(f"Chrome stderr: {stderr.decode()}")
    raise Exception("Chrome CDP endpoint failed to start after retries")
|
||||
|
||||
async def fetch_with_crawl4ai(url: str) -> dict:
    """Crawl *url* through the already-running Chrome and return the result.

    Discovers the CDP websocket URL from the local DevTools endpoint,
    attaches Crawl4ai to that externally managed browser, runs a single
    cache-bypassing crawl, and returns the result as a plain dict.
    """
    # Ask the local DevTools endpoint for its websocket debugger URL.
    version_info = requests.get(f'http://localhost:{CDP_PORT}/json/version').json()
    cdp_url = version_info['webSocketDebuggerUrl']

    # Attach to the managed browser instead of launching a new one.
    browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        result: CrawlResult = await crawler.arun(url=url, config=run_config)
    return result.model_dump()  # Pydantic model -> JSON-serializable dict
|
||||
|
||||
# Start Chrome when the module loads
|
||||
logger.info("Starting Chrome process on module load")
|
||||
chrome_process = start_chrome()
|
||||
|
||||
@functions_framework.http
def crawl(request: Request):
    """HTTP Cloud Function to fetch web content using Crawl4ai.

    Expects a ``url`` query parameter; returns the crawl result as JSON.
    Responds 400 when the parameter is missing and 500 (with diagnostic
    details) on any unexpected failure.
    """
    try:
        target = request.args.get('url')
        if not target:
            return jsonify({'error': 'URL parameter is required', 'status': 400}), 400

        # Functions Framework calls us synchronously, so drive the async
        # crawl on a fresh event loop and always close it afterwards.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            data = loop.run_until_complete(
                asyncio.wait_for(fetch_with_crawl4ai(target), timeout=10.0)
            )
        finally:
            loop.close()
        return jsonify({
            'status': 200,
            'data': data
        })

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return jsonify({
            'error': error_msg,
            'status': 500,
            'details': {
                'error_type': type(e).__name__,
                'stack_trace': str(e),
                'chrome_running': chrome_process.poll() is None if chrome_process else False
            }
        }), 500
|
||||
|
||||
|
||||
@atexit.register
def cleanup():
    """Cleanup Chrome process on shutdown."""
    still_running = chrome_process and chrome_process.poll() is None
    if not still_running:
        return
    try:
        # SIGTERM the whole process group created via os.setsid in start_chrome.
        os.killpg(os.getpgid(chrome_process.pid), signal.SIGTERM)
        logger.info("Chrome process terminated")
    except Exception as e:
        logger.error(f"Failed to terminate Chrome process: {e}")
|
||||
5
deploy/gcloud-function/requirements.txt
Normal file
5
deploy/gcloud-function/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
functions-framework==3.*
|
||||
flask==2.3.3
|
||||
requests==2.31.0
|
||||
websockets==12.0
|
||||
git+https://github.com/unclecode/crawl4ai.git@next
|
||||
10
deploy/gcloud-function/resources/chrome/fonts.conf
Executable file
10
deploy/gcloud-function/resources/chrome/fonts.conf
Executable file
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" ?>
|
||||
<!DOCTYPE fontconfig SYSTEM "fonts.dtd">
|
||||
<fontconfig>
|
||||
<dir>/var/task/.fonts</dir>
|
||||
<dir>/var/task/fonts</dir>
|
||||
<dir>/opt/fonts</dir>
|
||||
<dir>/tmp/fonts</dir>
|
||||
<cachedir>/tmp/fonts-cache/</cachedir>
|
||||
<config></config>
|
||||
</fontconfig>
|
||||
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/libvulkan.so.1
Executable file
BIN
deploy/gcloud-function/resources/chrome/libvulkan.so.1
Executable file
Binary file not shown.
@@ -0,0 +1 @@
|
||||
{"file_format_version": "1.0.0", "ICD": {"library_path": "./libvk_swiftshader.so", "api_version": "1.0.5"}}
|
||||
104
deploy/lambda/Dockerfile
Normal file
104
deploy/lambda/Dockerfile
Normal file
@@ -0,0 +1,104 @@
|
||||
FROM python:3.12-bookworm AS python-builder
|
||||
|
||||
RUN pip install poetry
|
||||
|
||||
ENV POETRY_NO_INTERACTION=1 \
|
||||
POETRY_CACHE_DIR=/tmp/poetry_cache
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY pyproject.toml poetry.lock ./
|
||||
RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry export -f requirements.txt -o requirements.txt
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
python3-dev \
|
||||
python3-setuptools \
|
||||
python3-wheel \
|
||||
python3-pip \
|
||||
gcc \
|
||||
g++ \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install specific dependencies that have build issues
|
||||
RUN pip install --no-cache-dir cchardet
|
||||
|
||||
FROM python:3.12-bookworm
|
||||
|
||||
# Install AWS Lambda Runtime Interface Client
|
||||
RUN python3 -m pip install --no-cache-dir awslambdaric
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
wget \
|
||||
gnupg \
|
||||
git \
|
||||
cmake \
|
||||
pkg-config \
|
||||
python3-dev \
|
||||
libjpeg-dev \
|
||||
redis-server \
|
||||
supervisor \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libglib2.0-0 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libdbus-1-3 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
libx11-6 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libatspi2.0-0 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install build essentials for any compilations needed
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
python3-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Set up function directory and browser path
|
||||
ARG FUNCTION_DIR="/function"
|
||||
RUN mkdir -p "${FUNCTION_DIR}/pw-browsers"
|
||||
RUN mkdir -p "/tmp/.crawl4ai"
|
||||
|
||||
# Set critical environment variables
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH="${FUNCTION_DIR}/pw-browsers" \
|
||||
HOME="/tmp" \
|
||||
CRAWL4_AI_BASE_DIRECTORY="/tmp/.crawl4ai"
|
||||
|
||||
# Create Crawl4ai base directory
|
||||
RUN mkdir -p ${CRAWL4_AI_BASE_DIRECTORY}
|
||||
|
||||
RUN pip install --no-cache-dir faust-cchardet
|
||||
|
||||
# Install Crawl4ai and dependencies
|
||||
RUN pip install --no-cache-dir git+https://github.com/unclecode/crawl4ai.git@next
|
||||
|
||||
# Install Chromium only (no deps flag)
|
||||
RUN playwright install chromium
|
||||
|
||||
# Copy function code
|
||||
COPY lambda_function.py ${FUNCTION_DIR}/
|
||||
|
||||
# Set working directory
|
||||
WORKDIR ${FUNCTION_DIR}
|
||||
|
||||
ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ]
|
||||
CMD [ "lambda_function.handler" ]
|
||||
1081
deploy/lambda/deploy.py
Normal file
1081
deploy/lambda/deploy.py
Normal file
File diff suppressed because it is too large
Load Diff
345
deploy/lambda/guide.md
Normal file
345
deploy/lambda/guide.md
Normal file
@@ -0,0 +1,345 @@
|
||||
# Deploying Crawl4ai on AWS Lambda
|
||||
|
||||
This guide walks you through deploying Crawl4ai as an AWS Lambda function with API Gateway integration. You'll learn how to set up, test, and clean up your deployment.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before you begin, ensure you have:
|
||||
|
||||
- AWS CLI installed and configured (`aws configure`)
|
||||
- Docker installed and running
|
||||
- Python 3.8+ installed
|
||||
- Basic familiarity with AWS services
|
||||
|
||||
## Project Files
|
||||
|
||||
Your project directory should contain:
|
||||
|
||||
- `Dockerfile`: Container configuration for Lambda
|
||||
- `lambda_function.py`: Lambda handler code
|
||||
- `deploy.py`: Our deployment script
|
||||
|
||||
## Step 1: Install Required Python Packages
|
||||
|
||||
Install the Python packages needed for our deployment script:
|
||||
|
||||
```bash
|
||||
pip install typer rich
|
||||
```
|
||||
|
||||
## Step 2: Run the Deployment Script
|
||||
|
||||
Our Python script automates the entire deployment process:
|
||||
|
||||
```bash
|
||||
python deploy.py
|
||||
```
|
||||
|
||||
The script will guide you through:
|
||||
|
||||
1. Configuration setup (AWS region, function name, memory allocation)
|
||||
2. Docker image building
|
||||
3. ECR repository creation
|
||||
4. Lambda function deployment
|
||||
5. API Gateway setup
|
||||
6. Provisioned concurrency configuration (optional)
|
||||
|
||||
Follow the prompts and confirm each step by pressing Enter.
|
||||
|
||||
## Step 3: Manual Deployment (Alternative to the Script)
|
||||
|
||||
If you prefer to deploy manually or understand what the script does, follow these steps:
|
||||
|
||||
### Building and Pushing the Docker Image
|
||||
|
||||
```bash
|
||||
# Build the Docker image
|
||||
docker build -t crawl4ai-lambda .
|
||||
|
||||
# Create an ECR repository (if it doesn't exist)
|
||||
aws ecr create-repository --repository-name crawl4ai-lambda
|
||||
|
||||
# Get ECR login password and login
|
||||
aws ecr get-login-password | docker login --username AWS --password-stdin $(aws sts get-caller-identity --query Account --output text).dkr.ecr.us-east-1.amazonaws.com
|
||||
|
||||
# Tag the image
|
||||
ECR_URI=$(aws ecr describe-repositories --repository-names crawl4ai-lambda --query 'repositories[0].repositoryUri' --output text)
|
||||
docker tag crawl4ai-lambda:latest $ECR_URI:latest
|
||||
|
||||
# Push the image to ECR
|
||||
docker push $ECR_URI:latest
|
||||
```
|
||||
|
||||
### Creating the Lambda Function
|
||||
|
||||
```bash
|
||||
# Get IAM role ARN (create it if needed)
|
||||
ROLE_ARN=$(aws iam get-role --role-name lambda-execution-role --query 'Role.Arn' --output text)
|
||||
|
||||
# Create Lambda function
|
||||
aws lambda create-function \
|
||||
--function-name crawl4ai-function \
|
||||
--package-type Image \
|
||||
--code ImageUri=$ECR_URI:latest \
|
||||
--role $ROLE_ARN \
|
||||
--timeout 300 \
|
||||
--memory-size 4096 \
|
||||
--ephemeral-storage Size=10240 \
|
||||
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
|
||||
```
|
||||
|
||||
If you're updating an existing function:
|
||||
|
||||
```bash
|
||||
# Update function code
|
||||
aws lambda update-function-code \
|
||||
--function-name crawl4ai-function \
|
||||
--image-uri $ECR_URI:latest
|
||||
|
||||
# Update function configuration
|
||||
aws lambda update-function-configuration \
|
||||
--function-name crawl4ai-function \
|
||||
--timeout 300 \
|
||||
--memory-size 4096 \
|
||||
--ephemeral-storage Size=10240 \
|
||||
--environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}"
|
||||
```
|
||||
|
||||
### Setting Up API Gateway
|
||||
|
||||
```bash
|
||||
# Create API Gateway
|
||||
API_ID=$(aws apigateway create-rest-api --name crawl4ai-api --query 'id' --output text)
|
||||
|
||||
# Get root resource ID
|
||||
PARENT_ID=$(aws apigateway get-resources --rest-api-id $API_ID --query 'items[?path==`/`].id' --output text)
|
||||
|
||||
# Create resource
|
||||
RESOURCE_ID=$(aws apigateway create-resource --rest-api-id $API_ID --parent-id $PARENT_ID --path-part "crawl" --query 'id' --output text)
|
||||
|
||||
# Create POST method
|
||||
aws apigateway put-method --rest-api-id $API_ID --resource-id $RESOURCE_ID --http-method POST --authorization-type NONE
|
||||
|
||||
# Get Lambda function ARN
|
||||
LAMBDA_ARN=$(aws lambda get-function --function-name crawl4ai-function --query 'Configuration.FunctionArn' --output text)
|
||||
|
||||
# Set Lambda integration
|
||||
aws apigateway put-integration \
|
||||
--rest-api-id $API_ID \
|
||||
--resource-id $RESOURCE_ID \
|
||||
--http-method POST \
|
||||
--type AWS_PROXY \
|
||||
--integration-http-method POST \
|
||||
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ARN/invocations
|
||||
|
||||
# Deploy API
|
||||
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
|
||||
|
||||
# Set Lambda permission
|
||||
ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
|
||||
aws lambda add-permission \
|
||||
--function-name crawl4ai-function \
|
||||
--statement-id apigateway \
|
||||
--action lambda:InvokeFunction \
|
||||
--principal apigateway.amazonaws.com \
|
||||
--source-arn "arn:aws:execute-api:us-east-1:$ACCOUNT_ID:$API_ID/*/POST/crawl"
|
||||
```
|
||||
|
||||
### Setting Up Provisioned Concurrency (Optional)
|
||||
|
||||
This reduces cold starts:
|
||||
|
||||
```bash
|
||||
# Publish a version
|
||||
VERSION=$(aws lambda publish-version --function-name crawl4ai-function --query 'Version' --output text)
|
||||
|
||||
# Create alias
|
||||
aws lambda create-alias \
|
||||
--function-name crawl4ai-function \
|
||||
--name prod \
|
||||
--function-version $VERSION
|
||||
|
||||
# Configure provisioned concurrency
|
||||
aws lambda put-provisioned-concurrency-config \
|
||||
--function-name crawl4ai-function \
|
||||
--qualifier prod \
|
||||
--provisioned-concurrent-executions 2
|
||||
|
||||
# Update API Gateway to use alias
|
||||
LAMBDA_ALIAS_ARN="arn:aws:lambda:us-east-1:$ACCOUNT_ID:function:crawl4ai-function:prod"
|
||||
aws apigateway put-integration \
|
||||
--rest-api-id $API_ID \
|
||||
--resource-id $RESOURCE_ID \
|
||||
--http-method POST \
|
||||
--type AWS_PROXY \
|
||||
--integration-http-method POST \
|
||||
--uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ALIAS_ARN/invocations
|
||||
|
||||
# Redeploy API Gateway
|
||||
aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod
|
||||
```
|
||||
|
||||
## Step 4: Testing the Deployment
|
||||
|
||||
Once deployed, test your function with:
|
||||
|
||||
```bash
|
||||
ENDPOINT_URL="https://$API_ID.execute-api.us-east-1.amazonaws.com/prod/crawl"
|
||||
|
||||
# Test with curl
|
||||
curl -X POST $ENDPOINT_URL \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url":"https://example.com"}'
|
||||
```
|
||||
|
||||
Or using Python:
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
|
||||
url = "https://your-api-id.execute-api.us-east-1.amazonaws.com/prod/crawl"
|
||||
payload = {
|
||||
"url": "https://example.com",
|
||||
"browser_config": {
|
||||
"headless": True,
|
||||
"verbose": False
|
||||
},
|
||||
"crawler_config": {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post(url, json=payload)
|
||||
result = response.json()
|
||||
print(json.dumps(result, indent=2))
|
||||
```
|
||||
|
||||
## Step 5: Cleaning Up Resources
|
||||
|
||||
To remove all AWS resources created for this deployment:
|
||||
|
||||
```bash
|
||||
python deploy.py cleanup
|
||||
```
|
||||
|
||||
Or manually:
|
||||
|
||||
```bash
|
||||
# Delete API Gateway
|
||||
aws apigateway delete-rest-api --rest-api-id $API_ID
|
||||
|
||||
# Remove provisioned concurrency (if configured)
|
||||
aws lambda delete-provisioned-concurrency-config \
|
||||
--function-name crawl4ai-function \
|
||||
--qualifier prod
|
||||
|
||||
# Delete alias (if created)
|
||||
aws lambda delete-alias \
|
||||
--function-name crawl4ai-function \
|
||||
--name prod
|
||||
|
||||
# Delete Lambda function
|
||||
aws lambda delete-function --function-name crawl4ai-function
|
||||
|
||||
# Delete ECR repository
|
||||
aws ecr delete-repository --repository-name crawl4ai-lambda --force
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Cold Start Issues
|
||||
|
||||
If experiencing long cold starts:
|
||||
- Enable provisioned concurrency
|
||||
- Increase memory allocation (4096 MB recommended)
|
||||
- Ensure the Lambda function has enough ephemeral storage
|
||||
|
||||
### Permission Errors
|
||||
|
||||
If you encounter permission errors:
|
||||
- Check the IAM role has the necessary permissions
|
||||
- Ensure API Gateway has permission to invoke the Lambda function
|
||||
|
||||
### Container Size Issues
|
||||
|
||||
If your container is too large:
|
||||
- Optimize the Dockerfile
|
||||
- Use multi-stage builds
|
||||
- Consider removing unnecessary dependencies
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- Lambda memory affects CPU allocation - higher memory means faster execution
|
||||
- Provisioned concurrency eliminates cold starts but costs more
|
||||
- Optimize the Playwright setup for faster browser initialization
|
||||
|
||||
## Security Best Practices
|
||||
|
||||
- Use the principle of least privilege for IAM roles
|
||||
- Implement API Gateway authentication for production deployments
|
||||
- Consider using AWS KMS for storing sensitive configuration
|
||||
|
||||
## Useful AWS Console Links
|
||||
|
||||
Here are quick links to access important AWS console pages for monitoring and managing your deployment:
|
||||
|
||||
| Resource | Console Link |
|
||||
|----------|-------------|
|
||||
| Lambda Functions | [AWS Lambda Console](https://console.aws.amazon.com/lambda/home#/functions) |
|
||||
| Lambda Function Logs | [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups) |
|
||||
| API Gateway | [API Gateway Console](https://console.aws.amazon.com/apigateway/home) |
|
||||
| ECR Repositories | [ECR Console](https://console.aws.amazon.com/ecr/repositories) |
|
||||
| IAM Roles | [IAM Console](https://console.aws.amazon.com/iamv2/home#/roles) |
|
||||
| CloudWatch Metrics | [CloudWatch Metrics](https://console.aws.amazon.com/cloudwatch/home#metricsV2) |
|
||||
|
||||
### Monitoring Lambda Execution
|
||||
|
||||
To monitor your Lambda function:
|
||||
|
||||
1. Go to the [Lambda function console](https://console.aws.amazon.com/lambda/home#/functions)
|
||||
2. Select your function (`crawl4ai-function`)
|
||||
3. Click the "Monitor" tab to see:
|
||||
- Invocation metrics
|
||||
- Success/failure rates
|
||||
- Duration statistics
|
||||
|
||||
### Viewing Lambda Logs
|
||||
|
||||
To see detailed execution logs:
|
||||
|
||||
1. Go to [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups)
|
||||
2. Find the log group named `/aws/lambda/crawl4ai-function`
|
||||
3. Click to see the latest log streams
|
||||
4. Each stream contains logs from a function execution
|
||||
|
||||
### Checking API Gateway Traffic
|
||||
|
||||
To monitor API requests:
|
||||
|
||||
1. Go to the [API Gateway console](https://console.aws.amazon.com/apigateway/home)
|
||||
2. Select your API (`crawl4ai-api`)
|
||||
3. Click "Dashboard" to see:
|
||||
- API calls
|
||||
- Latency
|
||||
- Error rates
|
||||
|
||||
## Conclusion
|
||||
|
||||
You now have Crawl4ai running as a serverless function on AWS Lambda! This setup allows you to crawl websites on-demand without maintaining infrastructure, while paying only for the compute time you use.
|
||||
107
deploy/lambda/lambda_function.py
Normal file
107
deploy/lambda/lambda_function.py
Normal file
@@ -0,0 +1,107 @@
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
# Ensure environment variables and directories are set.
# On AWS Lambda only /tmp is writable; Crawl4ai resolves its cache/profile
# locations from HOME and CRAWL4_AI_BASE_DIRECTORY, so both must be pointed
# at /tmp BEFORE the crawl4ai import below runs.
os.environ['CRAWL4_AI_BASE_DIRECTORY'] = '/tmp/.crawl4ai'
os.environ['HOME'] = '/tmp'

# Create directory if it doesn't exist (exist_ok makes warm-start invocations safe)
os.makedirs('/tmp/.crawl4ai', exist_ok=True)
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode
|
||||
)
|
||||
|
||||
|
||||
def handler(event, context):
    """AWS Lambda entry point.

    Parses an API Gateway proxy event (JSON string under ``body``) or a
    direct-invocation event (``body`` may be a dict, ``None``, or absent),
    runs the crawler on the requested URL, and returns an API Gateway
    proxy response dict.

    Returns:
        dict with ``statusCode`` and JSON ``body``:
        200 with the serialized crawl result, 400 when no URL is supplied,
        500 (with traceback) on any unexpected failure.
    """
    try:
        # API Gateway proxy integration delivers the payload as a JSON
        # string under 'body'; direct invocations may pass a dict or set
        # 'body' to None.  `or '{}'` covers both a missing key and an
        # explicit None (the old `event.get('body', '{}')` crashed on None,
        # turning a missing-URL 400 into a 500).
        raw_body = event.get('body') or '{}'
        body = raw_body if isinstance(raw_body, dict) else json.loads(raw_body)

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'URL is required'})
            }

        # Get optional configurations or use defaults
        browser_config_dict = body.get('browser_config', {})
        crawler_config_dict = body.get('crawler_config', {})

        # Lambda handlers are synchronous; drive the async crawler to
        # completion on a fresh event loop.
        result = asyncio.run(crawl(url, browser_config_dict, crawler_config_dict))

        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json'
            },
            'body': json.dumps(result)
        }

    except Exception as e:
        # Surface the traceback in the response body to ease debugging;
        # acceptable here because the API is not public-facing by default.
        import traceback
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': str(e),
                'traceback': traceback.format_exc()
            })
        }
|
||||
|
||||
async def crawl(url, browser_config_dict, crawler_config_dict):
    """
    Run the crawler with the provided configurations, with Lambda-specific settings.

    Args:
        url: Target URL to crawl.
        browser_config_dict: Serialized BrowserConfig dict from the request body.
        crawler_config_dict: Serialized CrawlerRunConfig dict from the request body.

    Returns:
        The CrawlResult serialized via model_dump() (a JSON-friendly dict).
    """
    # Start with user-provided config but override with Lambda-required settings.
    # NOTE(review): base_browser_config is currently unused — the carry-over of
    # user-supplied browser settings below is commented out, so the caller's
    # browser_config is effectively ignored.  Confirm whether that is intended.
    base_browser_config = BrowserConfig.load(browser_config_dict) if browser_config_dict else BrowserConfig()

    # Apply Lambda-specific browser configurations
    browser_config = BrowserConfig(
        verbose=True,
        browser_type="chromium",
        headless=True,
        user_agent_mode="random",
        light_mode=True,
        use_managed_browser=False,
        # Flags needed for Chromium inside the Lambda sandbox (no SUID
        # sandbox, tiny /dev/shm, single process to fit the runtime model).
        extra_args=[
            "--headless=new",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-setuid-sandbox",
            "--remote-allow-origins=*",
            "--autoplay-policy=user-gesture-required",
            "--single-process",
        ],
        # # Carry over any other settings from user config that aren't overridden
        # **{k: v for k, v in base_browser_config.model_dump().items()
        #    if k not in ['verbose', 'browser_type', 'headless', 'user_agent_mode',
        #                 'light_mode', 'use_managed_browser', 'extra_args']}
    )

    # Start with user-provided crawler config but ensure cache is bypassed
    base_crawler_config = CrawlerRunConfig.load(crawler_config_dict) if crawler_config_dict else CrawlerRunConfig()

    # Apply Lambda-specific crawler configurations
    crawler_config = CrawlerRunConfig(
        exclude_external_links=base_crawler_config.exclude_external_links,
        remove_overlay_elements=True,
        magic=True,
        # Always re-fetch: /tmp is ephemeral, so a cache hit across cold
        # starts is impossible anyway.
        cache_mode=CacheMode.BYPASS,
        # Carry over markdown generator and other settings
        markdown_generator=base_crawler_config.markdown_generator
    )

    # Perform the crawl with Lambda-optimized settings
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        # Return serializable results
        return result.model_dump()
|
||||
543
deploy/modal/crawl4ai_api_service.py
Normal file
543
deploy/modal/crawl4ai_api_service.py
Normal file
@@ -0,0 +1,543 @@
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
import modal
|
||||
from modal import Image, App, Volume, Secret, web_endpoint, function
|
||||
|
||||
# Configuration
|
||||
APP_NAME = "crawl4ai-api"
|
||||
CRAWL4AI_VERSION = "next" # Using the 'next' branch
|
||||
PYTHON_VERSION = "3.10" # Compatible with playwright
|
||||
DEFAULT_CREDITS = 1000
|
||||
|
||||
# Create a custom image with Crawl4ai and its dependencies
|
||||
image = Image.debian_slim(python_version=PYTHON_VERSION).pip_install(
|
||||
["fastapi[standard]", "pymongo", "pydantic"]
|
||||
).run_commands(
|
||||
"apt-get update",
|
||||
"apt-get install -y software-properties-common",
|
||||
"apt-get install -y git",
|
||||
"apt-add-repository non-free",
|
||||
"apt-add-repository contrib",
|
||||
# Install crawl4ai from the next branch
|
||||
f"pip install -U git+https://github.com/unclecode/crawl4ai.git@{CRAWL4AI_VERSION}",
|
||||
"pip install -U fastapi[standard]",
|
||||
"pip install -U pydantic",
|
||||
# Install playwright and browsers
|
||||
"crawl4ai-setup",
|
||||
)
|
||||
|
||||
# Create persistent volume for user database
|
||||
user_db = Volume.from_name("crawl4ai-users", create_if_missing=True)
|
||||
|
||||
# Create admin secret for secure operations
|
||||
admin_secret = Secret.from_name("admin-secret", create_if_missing=True)
|
||||
|
||||
# Define the app
|
||||
app = App(APP_NAME, image=image)
|
||||
|
||||
# Default configurations
|
||||
DEFAULT_BROWSER_CONFIG = {
|
||||
"headless": True,
|
||||
"verbose": False,
|
||||
}
|
||||
|
||||
DEFAULT_CRAWLER_CONFIG = {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Database operations
|
||||
@app.function(volumes={"/data": user_db})
def init_db() -> None:
    """Create the MongoDB indexes the service depends on (idempotent)."""
    from pymongo import MongoClient, ASCENDING

    db = MongoClient("mongodb://localhost:27017").crawl4ai_db

    # Unique single-field indexes backing token auth and email registration.
    db.users.create_index([("api_token", ASCENDING)], unique=True)
    db.users.create_index([("email", ASCENDING)], unique=True)

    # Compound index for per-user, time-ordered usage queries.
    db.usage_stats.create_index([("user_id", ASCENDING), ("timestamp", ASCENDING)])

    print("Database initialized with required indexes")
|
||||
|
||||
@app.function(volumes={"/data": user_db})
def get_user_by_token(api_token: str) -> Optional[Dict[str, Any]]:
    """Fetch the user document matching *api_token*, or None if absent."""
    from pymongo import MongoClient
    from bson.objectid import ObjectId

    db = MongoClient("mongodb://localhost:27017").crawl4ai_db

    record = db.users.find_one({"api_token": api_token})
    if record is None:
        return None

    # ObjectId is not JSON-serializable; expose it as a plain string.
    record["_id"] = str(record["_id"])
    return record
|
||||
|
||||
@app.function(volumes={"/data": user_db})
def create_user(email: str, name: str) -> Dict[str, Any]:
    """Register a new user with a fresh API token and DEFAULT_CREDITS.

    Returns the created user document (with string ``_id``), or
    ``{"error": ...}`` when the email is already registered.
    """
    from pymongo import MongoClient
    from bson.objectid import ObjectId

    db = MongoClient("mongodb://localhost:27017").crawl4ai_db

    doc = {
        "email": email,
        "name": name,
        "api_token": str(uuid.uuid4()),  # opaque bearer credential
        "credits": DEFAULT_CREDITS,
        "created_at": datetime.utcnow(),
        "updated_at": datetime.utcnow(),
        "is_active": True,
    }

    try:
        inserted = db.users.insert_one(doc)
        doc["_id"] = str(inserted.inserted_id)
        return doc
    except Exception as exc:
        # The unique index on email surfaces duplicates as a write error.
        if "duplicate key error" in str(exc):
            return {"error": "User with this email already exists"}
        raise
|
||||
|
||||
@app.function(volumes={"/data": user_db})
def update_user_credits(api_token: str, amount: int) -> Dict[str, Any]:
    """Atomically add (positive) or deduct (negative) credits for a user.

    Args:
        api_token: The user's API token.
        amount: Credit delta; negative values deduct.

    Returns:
        ``{"success": True, "credits": <new balance>}`` on success, or
        ``{"success": False, "error": ...}`` when the user does not exist
        or has insufficient credits for a deduction.
    """
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client.crawl4ai_db

    # Fold the balance check into the update filter so the check and the
    # $inc happen in ONE document operation.  The previous read-then-update
    # had a TOCTOU race: two concurrent deductions could both pass the
    # check and drive credits negative.
    query: Dict[str, Any] = {"api_token": api_token}
    if amount < 0:
        query["credits"] = {"$gte": -amount}

    result = db.users.update_one(
        query,
        {
            "$inc": {"credits": amount},
            "$set": {"updated_at": datetime.utcnow()}
        }
    )

    if result.modified_count == 1:
        # Re-read to report the post-update balance.
        updated_user = db.users.find_one({"api_token": api_token})
        return {
            "success": True,
            "credits": updated_user["credits"]
        }

    # Nothing matched: distinguish "no such user" from "not enough credits".
    if db.users.find_one({"api_token": api_token}) is None:
        return {"success": False, "error": "User not found"}
    if amount < 0:
        return {"success": False, "error": "Insufficient credits"}
    return {"success": False, "error": "Failed to update credits"}
|
||||
|
||||
@app.function(volumes={"/data": user_db})
def log_usage(user_id: str, url: str, success: bool, error: Optional[str] = None) -> None:
    """Append a single crawl-attempt record to the usage_stats collection."""
    from pymongo import MongoClient
    from bson.objectid import ObjectId

    db = MongoClient("mongodb://localhost:27017").crawl4ai_db

    db.usage_stats.insert_one({
        "user_id": user_id,
        "url": url,
        "timestamp": datetime.utcnow(),
        "success": success,
        "error": error,
    })
|
||||
|
||||
# Main crawling function
|
||||
@app.function(timeout=300)  # 5 minute timeout
async def crawl(
    url: str,
    browser_config: Optional[Dict[str, Any]] = None,
    crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Crawl a given URL using Crawl4ai.

    Args:
        url: The URL to crawl
        browser_config: Optional browser configuration to override defaults
        crawler_config: Optional crawler configuration to override defaults

    Returns:
        A dictionary containing the crawl results
    """
    # Imported inside the function: crawl4ai is only installed in the
    # remote Modal image, not in the local driver environment.
    from crawl4ai import (
        AsyncWebCrawler,
        BrowserConfig,
        CrawlerRunConfig,
        CrawlResult
    )

    # Prepare browser config using the loader method
    if browser_config is None:
        browser_config = DEFAULT_BROWSER_CONFIG
    browser_config_obj = BrowserConfig.load(browser_config)

    # Prepare crawler config using the loader method
    if crawler_config is None:
        crawler_config = DEFAULT_CRAWLER_CONFIG
    crawler_config_obj = CrawlerRunConfig.load(crawler_config)

    # Perform the crawl
    async with AsyncWebCrawler(config=browser_config_obj) as crawler:
        result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)

        # Return serializable results — try Pydantic v2 first, then v1,
        # then a hand-rolled subset of fields.
        try:
            # Try newer Pydantic v2 method
            return result.model_dump()
        except AttributeError:
            try:
                # Try older Pydantic v1 method
                return result.dict()
            except AttributeError:
                # Fallback to manual conversion.
                # NOTE(review): assumes CrawlResult exposes .title/.status/
                # .url without guards — verify these attribute names against
                # the installed crawl4ai version; the other fields are
                # hasattr-guarded but these three are not.
                return {
                    "url": result.url,
                    "title": result.title,
                    "status": result.status,
                    "content": str(result.content) if hasattr(result, "content") else None,
                    "links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [],
                    "markdown_v2": {
                        "raw_markdown": result.markdown_v2.raw_markdown if hasattr(result, "markdown_v2") else None
                    }
                }
|
||||
|
||||
# API endpoints
|
||||
@app.function()
@web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Web endpoint that accepts POST requests with JSON data containing:
    - api_token: User's API token
    - url: The URL to crawl
    - browser_config: Optional browser configuration
    - crawler_config: Optional crawler configuration

    Returns the crawl results and remaining credits.

    Status codes are carried inside the JSON body (``status_code`` key);
    the HTTP response itself is whatever the web_endpoint wrapper emits.
    """
    # Extract and validate API token
    api_token = data.get("api_token")
    if not api_token:
        return {
            "success": False,
            "error": "API token is required",
            "status_code": 401
        }

    # Verify user (remote call to the DB-mounted function)
    user = get_user_by_token.remote(api_token)
    if not user:
        return {
            "success": False,
            "error": "Invalid API token",
            "status_code": 401
        }

    if not user.get("is_active", False):
        return {
            "success": False,
            "error": "Account is inactive",
            "status_code": 403
        }

    # Validate URL
    url = data.get("url")
    if not url:
        return {
            "success": False,
            "error": "URL is required",
            "status_code": 400
        }

    # Check credits
    if user.get("credits", 0) <= 0:
        return {
            "success": False,
            "error": "Insufficient credits",
            "status_code": 403
        }

    # Deduct credit first (1 credit per call).
    # NOTE(review): the credit is NOT refunded when the crawl below fails —
    # failed attempts are billed.  Confirm this is the intended policy.
    credit_result = update_user_credits.remote(api_token, -1)
    if not credit_result.get("success", False):
        return {
            "success": False,
            "error": credit_result.get("error", "Failed to process credits"),
            "status_code": 500
        }

    # Extract configs (None lets crawl() fall back to its defaults)
    browser_config = data.get("browser_config")
    crawler_config = data.get("crawler_config")

    # Perform crawl
    try:
        start_time = time.time()
        result = crawl.remote(url, browser_config, crawler_config)
        execution_time = time.time() - start_time

        # Log successful usage (fire-and-forget via .spawn)
        log_usage.spawn(user["_id"], url, True)

        return {
            "success": True,
            "data": result,
            "credits_remaining": credit_result.get("credits"),
            "execution_time_seconds": round(execution_time, 2),
            "status_code": 200
        }
    except Exception as e:
        # Log failed usage (fire-and-forget)
        log_usage.spawn(user["_id"], url, False, str(e))

        # Return error
        return {
            "success": False,
            "error": f"Crawling error: {str(e)}",
            "credits_remaining": credit_result.get("credits"),
            "status_code": 500
        }
|
||||
|
||||
# Admin endpoints
|
||||
@app.function(secrets=[admin_secret])
@web_endpoint(method="POST")
def admin_create_user(data: Dict[str, Any]) -> Dict[str, Any]:
    """Admin-only: register a new user and return their credentials."""
    # Reject anything without the shared admin secret.
    if data.get("admin_token") != os.environ.get("ADMIN_TOKEN"):
        return {"success": False, "error": "Invalid admin token", "status_code": 401}

    email = data.get("email")
    name = data.get("name")
    if not email or not name:
        return {"success": False, "error": "Email and name are required", "status_code": 400}

    # Delegate the actual insert to the DB-mounted function.
    user = create_user.remote(email, name)
    if "error" in user:
        return {"success": False, "error": user["error"], "status_code": 400}

    created_at = user["created_at"]
    if isinstance(created_at, datetime):
        created_at = created_at.isoformat()

    return {
        "success": True,
        "data": {
            "user_id": user["_id"],
            "email": user["email"],
            "name": user["name"],
            "api_token": user["api_token"],
            "credits": user["credits"],
            "created_at": created_at,
        },
        "status_code": 201,
    }
|
||||
|
||||
@app.function(secrets=[admin_secret])
@web_endpoint(method="POST")
def admin_update_credits(data: Dict[str, Any]) -> Dict[str, Any]:
    """Admin-only: add or subtract credits on a user's account."""
    # Reject anything without the shared admin secret.
    if data.get("admin_token") != os.environ.get("ADMIN_TOKEN"):
        return {"success": False, "error": "Invalid admin token", "status_code": 401}

    api_token = data.get("api_token")
    if not api_token:
        return {"success": False, "error": "API token is required", "status_code": 400}

    amount = data.get("amount")
    if not isinstance(amount, int):
        return {"success": False, "error": "Amount must be an integer", "status_code": 400}

    # Delegate the balance change to the DB-mounted function.
    outcome = update_user_credits.remote(api_token, amount)
    if not outcome.get("success", False):
        return {
            "success": False,
            "error": outcome.get("error", "Failed to update credits"),
            "status_code": 400,
        }

    return {"success": True, "data": {"credits": outcome["credits"]}, "status_code": 200}
|
||||
|
||||
@app.function(secrets=[admin_secret])
@web_endpoint(method="GET")
def admin_get_users(admin_token: str) -> Dict[str, Any]:
    """Admin-only: return the full list of registered users."""
    if admin_token != os.environ.get("ADMIN_TOKEN"):
        return {"success": False, "error": "Invalid admin token", "status_code": 401}

    return {
        "success": True,
        "data": get_all_users.remote(),
        "status_code": 200,
    }
|
||||
|
||||
@app.function(volumes={"/data": user_db})
def get_all_users() -> List[Dict[str, Any]]:
    """Return every user document in JSON-serializable form (admin use)."""
    from pymongo import MongoClient

    db = MongoClient("mongodb://localhost:27017").crawl4ai_db

    serialized = []
    for doc in db.users.find():
        doc["_id"] = str(doc["_id"])  # ObjectId -> string
        # datetime -> ISO-8601 string for the serializable payload.
        for key in ("created_at", "updated_at"):
            if isinstance(doc.get(key), datetime):
                doc[key] = doc[key].isoformat()
        serialized.append(doc)

    return serialized
|
||||
|
||||
# Public endpoints
|
||||
@app.function()
@web_endpoint(method="GET")
def health_check() -> Dict[str, Any]:
    """Liveness probe: report service identity and current UTC time."""
    return {
        "status": "online",
        "service": APP_NAME,
        "version": CRAWL4AI_VERSION,
        "timestamp": datetime.utcnow().isoformat(),
    }
|
||||
|
||||
@app.function()
@web_endpoint(method="GET")
def check_credits(api_token: str) -> Dict[str, Any]:
    """Return the credit balance and identity for a valid API token."""
    if not api_token:
        return {"success": False, "error": "API token is required", "status_code": 401}

    account = get_user_by_token.remote(api_token)
    if not account:
        return {"success": False, "error": "Invalid API token", "status_code": 401}

    return {
        "success": True,
        "data": {
            "credits": account["credits"],
            "email": account["email"],
            "name": account["name"],
        },
        "status_code": 200,
    }
|
||||
|
||||
# Local entrypoint for testing
|
||||
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
    """Command line entrypoint for local testing.

    Initializes the database indexes, crawls *url* remotely, and prints a
    short summary of the returned result dict.
    """
    print("Initializing database...")
    init_db.remote()

    print(f"Testing crawl on URL: {url}")
    result = crawl.remote(url)

    # Print sample of result — keys are optional because crawl() may fall
    # back to different serialization paths (see its fallback chain).
    print("\nCrawl Result Sample:")
    if "title" in result:
        print(f"Title: {result['title']}")
    if "status" in result:
        print(f"Status: {result['status']}")
    if "links" in result:
        print(f"Links found: {len(result['links'])}")
    if "markdown_v2" in result and result["markdown_v2"] and "raw_markdown" in result["markdown_v2"]:
        print("\nMarkdown Preview (first 300 chars):")
        print(result["markdown_v2"]["raw_markdown"][:300] + "...")
|
||||
127
deploy/modal/entry.py
Normal file
127
deploy/modal/entry.py
Normal file
@@ -0,0 +1,127 @@
|
||||
import modal
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
# Build a custom Modal image with Crawl4ai and its dependencies:
# Debian slim + Python 3.10, system packages, then Crawl4ai from the
# GitHub `next` branch (crawl4ai-setup pulls Playwright + Chromium).
image = modal.Image.debian_slim(python_version="3.10").pip_install(["fastapi[standard]"]).run_commands(
    "apt-get update",
    "apt-get install -y software-properties-common",
    "apt-get install -y git",
    "apt-add-repository non-free",
    "apt-add-repository contrib",
    "pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
    "pip install -U fastapi[standard]",
    "pip install -U pydantic",
    "crawl4ai-setup",  # This installs playwright and downloads chromium
    # Print fastapi version (build-time sanity check)
    "python -m fastapi --version",
)

# Define the app
app = modal.App("crawl4ai", image=image)
||||
|
||||
# Define default configurations used when the caller supplies none.
DEFAULT_BROWSER_CONFIG = {
    "headless": True,   # no visible window — required inside a container
    "verbose": False,   # keep browser logging quiet
}

# Serialized CrawlerRunConfig consumed by CrawlerRunConfig.load() in crawl().
# The nested {"type": ..., "params": ...} shape mirrors crawl4ai's loader format.
DEFAULT_CRAWLER_CONFIG = {
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {
                            # Content scoring below this fixed threshold is pruned.
                            "threshold": 0.48,
                            "threshold_type": "fixed"
                        }
                    }
                }
            }
        }
    }
}
||||
|
||||
@app.function(timeout=300)  # 5 minute timeout
async def crawl(
    url: str,
    browser_config: Optional[Dict[str, Any]] = None,
    crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Crawl a given URL using Crawl4ai.

    Args:
        url: The URL to crawl
        browser_config: Optional browser configuration to override defaults
        crawler_config: Optional crawler configuration to override defaults

    Returns:
        A dictionary containing the crawl results
    """
    from crawl4ai import (
        AsyncWebCrawler,
        BrowserConfig,
        CrawlerRunConfig,
        CrawlResult
    )

    # Prepare browser config using the loader method
    if browser_config is None:
        browser_config = DEFAULT_BROWSER_CONFIG
    browser_config_obj = BrowserConfig.load(browser_config)

    # Prepare crawler config using the loader method
    if crawler_config is None:
        crawler_config = DEFAULT_CRAWLER_CONFIG
    crawler_config_obj = CrawlerRunConfig.load(crawler_config)

    # Perform the crawl
    async with AsyncWebCrawler(config=browser_config_obj) as crawler:
        result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)

    # Return serializable results
    try:
        # Pydantic v2 serialization
        return result.model_dump()
    except AttributeError:
        try:
            # Pydantic v1 serialization. Fixed from `result.__dict__`, which
            # bypasses field export and can leak non-serializable internals.
            return result.dict()
        except AttributeError:
            # Fallback to returning the raw result
            return result
|
||||
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Web endpoint that accepts POST requests with JSON data containing:
    - url: The URL to crawl
    - browser_config: Optional browser configuration
    - crawler_config: Optional crawler configuration

    Returns the crawl results.
    """
    target_url = data.get("url")
    if not target_url:
        return {"error": "URL is required"}

    return crawl.remote(
        target_url,
        data.get("browser_config"),
        data.get("crawler_config"),
    )
|
||||
|
||||
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
    """
    Command line entrypoint for local testing.
    """
    crawl_result = crawl.remote(url)
    print(crawl_result)
|
||||
453
deploy/modal/guide.md
Normal file
453
deploy/modal/guide.md
Normal file
@@ -0,0 +1,453 @@
|
||||
# Deploying Crawl4ai with Modal: A Comprehensive Tutorial
|
||||
|
||||
Hey there! UncleCode here. I'm excited to show you how to deploy Crawl4ai using Modal - a fantastic serverless platform that makes deployment super simple and scalable.
|
||||
|
||||
In this tutorial, I'll walk you through deploying your own Crawl4ai instance on Modal's infrastructure. This will give you a powerful, scalable web crawling solution without having to worry about infrastructure management.
|
||||
|
||||
## What is Modal?
|
||||
|
||||
Modal is a serverless platform that allows you to run Python functions in the cloud without managing servers. It's perfect for deploying Crawl4ai because:
|
||||
|
||||
1. It handles all the infrastructure for you
|
||||
2. It scales automatically based on demand
|
||||
3. It makes deployment incredibly simple
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before we get started, you'll need:
|
||||
|
||||
- A Modal account (sign up at [modal.com](https://modal.com))
|
||||
- Python 3.10 or later installed on your local machine
|
||||
- Basic familiarity with Python and command-line operations
|
||||
|
||||
## Step 1: Setting Up Your Modal Account
|
||||
|
||||
First, sign up for a Modal account at [modal.com](https://modal.com) if you haven't already. Modal offers a generous free tier that's perfect for getting started.
|
||||
|
||||
After signing up, install the Modal CLI and authenticate:
|
||||
|
||||
```bash
|
||||
pip install modal
|
||||
modal token new
|
||||
```
|
||||
|
||||
This will open a browser window where you can authenticate and generate a token for the CLI.
|
||||
|
||||
## Step 2: Creating Your Crawl4ai Deployment
|
||||
|
||||
Now, let's create a Python file called `crawl4ai_modal.py` with our deployment code:
|
||||
|
||||
```python
|
||||
import modal
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
# Create a custom image with Crawl4ai and its dependencies
|
||||
image = modal.Image.debian_slim(python_version="3.10").pip_install(
|
||||
["fastapi[standard]"]
|
||||
).run_commands(
|
||||
"apt-get update",
|
||||
"apt-get install -y software-properties-common",
|
||||
"apt-get install -y git",
|
||||
"apt-add-repository non-free",
|
||||
"apt-add-repository contrib",
|
||||
"pip install -U crawl4ai",
|
||||
"pip install -U fastapi[standard]",
|
||||
"pip install -U pydantic",
|
||||
"crawl4ai-setup", # This installs playwright and downloads chromium
|
||||
)
|
||||
|
||||
# Define the app
|
||||
app = modal.App("crawl4ai", image=image)
|
||||
|
||||
# Define default configurations
|
||||
DEFAULT_BROWSER_CONFIG = {
|
||||
"headless": True,
|
||||
"verbose": False,
|
||||
}
|
||||
|
||||
DEFAULT_CRAWLER_CONFIG = {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@app.function(timeout=300) # 5 minute timeout
|
||||
async def crawl(
|
||||
url: str,
|
||||
browser_config: Optional[Dict[str, Any]] = None,
|
||||
crawler_config: Optional[Dict[str, Any]] = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Crawl a given URL using Crawl4ai.
|
||||
|
||||
Args:
|
||||
url: The URL to crawl
|
||||
browser_config: Optional browser configuration to override defaults
|
||||
crawler_config: Optional crawler configuration to override defaults
|
||||
|
||||
Returns:
|
||||
A dictionary containing the crawl results
|
||||
"""
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
# Prepare browser config using the loader method
|
||||
if browser_config is None:
|
||||
browser_config = DEFAULT_BROWSER_CONFIG
|
||||
browser_config_obj = BrowserConfig.load(browser_config)
|
||||
|
||||
# Prepare crawler config using the loader method
|
||||
if crawler_config is None:
|
||||
crawler_config = DEFAULT_CRAWLER_CONFIG
|
||||
crawler_config_obj = CrawlerRunConfig.load(crawler_config)
|
||||
|
||||
# Perform the crawl
|
||||
async with AsyncWebCrawler(config=browser_config_obj) as crawler:
|
||||
result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)
|
||||
|
||||
# Return serializable results
|
||||
try:
|
||||
# Try newer Pydantic v2 method
|
||||
return result.model_dump()
|
||||
except AttributeError:
|
||||
try:
|
||||
# Try older Pydantic v1 method
|
||||
return result.dict()
|
||||
except AttributeError:
|
||||
# Fallback to manual conversion
|
||||
return {
|
||||
"url": result.url,
|
||||
"title": result.title,
|
||||
"status": result.status,
|
||||
"content": str(result.content) if hasattr(result, "content") else None,
|
||||
"links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [],
|
||||
"markdown_v2": {
|
||||
"raw_markdown": result.markdown_v2.raw_markdown if hasattr(result, "markdown_v2") else None
|
||||
}
|
||||
}
|
||||
|
||||
@app.function()
|
||||
@modal.web_endpoint(method="POST")
|
||||
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Web endpoint that accepts POST requests with JSON data containing:
|
||||
- url: The URL to crawl
|
||||
- browser_config: Optional browser configuration
|
||||
- crawler_config: Optional crawler configuration
|
||||
|
||||
Returns the crawl results.
|
||||
"""
|
||||
url = data.get("url")
|
||||
if not url:
|
||||
return {"error": "URL is required"}
|
||||
|
||||
browser_config = data.get("browser_config")
|
||||
crawler_config = data.get("crawler_config")
|
||||
|
||||
return crawl.remote(url, browser_config, crawler_config)
|
||||
|
||||
@app.local_entrypoint()
|
||||
def main(url: str = "https://www.modal.com"):
|
||||
"""
|
||||
Command line entrypoint for local testing.
|
||||
"""
|
||||
result = crawl.remote(url)
|
||||
print(result)
|
||||
```
|
||||
|
||||
## Step 3: Understanding the Code Components
|
||||
|
||||
Let's break down what's happening in this code:
|
||||
|
||||
### 1. Image Definition
|
||||
|
||||
```python
|
||||
image = modal.Image.debian_slim(python_version="3.10").pip_install(
|
||||
["fastapi[standard]"]
|
||||
).run_commands(
|
||||
"apt-get update",
|
||||
"apt-get install -y software-properties-common",
|
||||
"apt-get install -y git",
|
||||
"apt-add-repository non-free",
|
||||
"apt-add-repository contrib",
|
||||
"pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
|
||||
"pip install -U fastapi[standard]",
|
||||
"pip install -U pydantic",
|
||||
"crawl4ai-setup", # This installs playwright and downloads chromium
|
||||
)
|
||||
```
|
||||
|
||||
This section defines the container image that Modal will use to run your code. It:
|
||||
- Starts with a Debian Slim base image with Python 3.10
|
||||
- Installs FastAPI
|
||||
- Updates the system packages
|
||||
- Installs Git and other dependencies
|
||||
- Installs Crawl4ai from the GitHub repository
|
||||
- Runs the Crawl4ai setup to install Playwright and download Chromium
|
||||
|
||||
### 2. Modal App Definition
|
||||
|
||||
```python
|
||||
app = modal.App("crawl4ai", image=image)
|
||||
```
|
||||
|
||||
This creates a Modal application named "crawl4ai" that uses the image we defined above.
|
||||
|
||||
### 3. Default Configurations
|
||||
|
||||
```python
|
||||
DEFAULT_BROWSER_CONFIG = {
|
||||
"headless": True,
|
||||
"verbose": False,
|
||||
}
|
||||
|
||||
DEFAULT_CRAWLER_CONFIG = {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.48,
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
These define the default configurations for the browser and crawler. You can customize these settings based on your specific needs.
|
||||
|
||||
### 4. The Crawl Function
|
||||
|
||||
```python
|
||||
@app.function(timeout=300)
|
||||
async def crawl(url, browser_config, crawler_config):
|
||||
# Function implementation
|
||||
```
|
||||
|
||||
This is the main function that performs the crawling. It:
|
||||
- Takes a URL and optional configurations
|
||||
- Sets up the browser and crawler with those configurations
|
||||
- Performs the crawl
|
||||
- Returns the results in a serializable format
|
||||
|
||||
The `@app.function(timeout=300)` decorator tells Modal to run this function in the cloud with a 5-minute timeout.
|
||||
|
||||
### 5. The Web Endpoint
|
||||
|
||||
```python
|
||||
@app.function()
|
||||
@modal.web_endpoint(method="POST")
|
||||
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
# Function implementation
|
||||
```
|
||||
|
||||
This creates a web endpoint that accepts POST requests. It:
|
||||
- Extracts the URL and configurations from the request
|
||||
- Calls the crawl function with those parameters
|
||||
- Returns the results
|
||||
|
||||
### 6. Local Entrypoint
|
||||
|
||||
```python
|
||||
@app.local_entrypoint()
|
||||
def main(url: str = "https://www.modal.com"):
|
||||
# Function implementation
|
||||
```
|
||||
|
||||
This provides a way to test the application from the command line.
|
||||
|
||||
## Step 4: Testing Locally
|
||||
|
||||
Before deploying, let's test our application locally:
|
||||
|
||||
```bash
|
||||
modal run crawl4ai_modal.py --url "https://example.com"
|
||||
```
|
||||
|
||||
This command will:
|
||||
1. Upload your code to Modal
|
||||
2. Create the necessary containers
|
||||
3. Run the `main` function with the specified URL
|
||||
4. Return the results
|
||||
|
||||
Modal will handle all the infrastructure setup for you. You should see the crawling results printed to your console.
|
||||
|
||||
## Step 5: Deploying Your Application
|
||||
|
||||
Once you're satisfied with the local testing, it's time to deploy:
|
||||
|
||||
```bash
|
||||
modal deploy crawl4ai_modal.py
|
||||
```
|
||||
|
||||
This will deploy your application to Modal's cloud. The deployment process will output URLs for your web endpoints.
|
||||
|
||||
You should see output similar to:
|
||||
|
||||
```
|
||||
✓ Deployed crawl4ai.
|
||||
URLs:
|
||||
crawl_endpoint => https://your-username--crawl-endpoint.modal.run
|
||||
```
|
||||
|
||||
Save this URL - you'll need it to make requests to your deployment.
|
||||
|
||||
## Step 6: Using Your Deployment
|
||||
|
||||
Now that your application is deployed, you can use it by sending POST requests to the endpoint URL:
|
||||
|
||||
```bash
|
||||
curl -X POST https://your-username--crawl-endpoint.modal.run \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"url": "https://example.com"}'
|
||||
```
|
||||
|
||||
Or in Python:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
response = requests.post(
|
||||
"https://your-username--crawl-endpoint.modal.run",
|
||||
json={"url": "https://example.com"}
|
||||
)
|
||||
|
||||
result = response.json()
|
||||
print(result)
|
||||
```
|
||||
|
||||
You can also customize the browser and crawler configurations:
|
||||
|
||||
```python
|
||||
requests.post(
|
||||
"https://your-username--crawl-endpoint.modal.run",
|
||||
json={
|
||||
"url": "https://example.com",
|
||||
"browser_config": {
|
||||
"headless": False,
|
||||
"verbose": True
|
||||
},
|
||||
"crawler_config": {
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.6, # Adjusted threshold
|
||||
"threshold_type": "fixed"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
## Step 7: Calling Your Deployment from Another Python Script
|
||||
|
||||
You can also call your deployed function directly from another Python script:
|
||||
|
||||
```python
|
||||
import modal
|
||||
|
||||
# Get a reference to the deployed function
|
||||
crawl_function = modal.Function.from_name("crawl4ai", "crawl")
|
||||
|
||||
# Call the function
|
||||
result = crawl_function.remote("https://example.com")
|
||||
print(result)
|
||||
```
|
||||
|
||||
## Understanding Modal's Execution Flow
|
||||
|
||||
To understand how Modal works, it's important to know:
|
||||
|
||||
1. **Local vs. Remote Execution**: When you call a function with `.remote()`, it runs in Modal's cloud, not on your local machine.
|
||||
|
||||
2. **Container Lifecycle**: Modal creates containers on-demand and destroys them when they're not needed.
|
||||
|
||||
3. **Caching**: Modal caches your container images to speed up subsequent runs.
|
||||
|
||||
4. **Serverless Scaling**: Modal automatically scales your application based on demand.
|
||||
|
||||
## Customizing Your Deployment
|
||||
|
||||
You can customize your deployment in several ways:
|
||||
|
||||
### Changing the Crawl4ai Version
|
||||
|
||||
To use a different version of Crawl4ai, update the installation command in the image definition:
|
||||
|
||||
```python
|
||||
"pip install -U git+https://github.com/unclecode/crawl4ai.git@main", # Use main branch
|
||||
```
|
||||
|
||||
### Adjusting Resource Limits
|
||||
|
||||
You can change the resources allocated to your functions:
|
||||
|
||||
```python
|
||||
@app.function(timeout=600, cpu=2, memory=4096) # 10 minute timeout, 2 CPUs, 4GB RAM
|
||||
async def crawl(...):
|
||||
# Function implementation
|
||||
```
|
||||
|
||||
### Keeping Containers Warm
|
||||
|
||||
To reduce cold start times, you can keep containers warm:
|
||||
|
||||
```python
|
||||
@app.function(keep_warm=1) # Keep 1 container warm
|
||||
async def crawl(...):
|
||||
# Function implementation
|
||||
```
|
||||
|
||||
## Conclusion
|
||||
|
||||
That's it! You've successfully deployed Crawl4ai on Modal. You now have a scalable web crawling solution that can handle as many requests as you need without requiring any infrastructure management.
|
||||
|
||||
The beauty of this setup is its simplicity - Modal handles all the hard parts, letting you focus on using Crawl4ai to extract the data you need.
|
||||
|
||||
Feel free to reach out if you have any questions or need help with your deployment!
|
||||
|
||||
Happy crawling!
|
||||
- UncleCode
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- [Modal Documentation](https://modal.com/docs)
|
||||
- [Crawl4ai GitHub Repository](https://github.com/unclecode/crawl4ai)
|
||||
- [Crawl4ai Documentation](https://docs.crawl4ai.com)
|
||||
317
deploy/modal/test_modal.py
Normal file
317
deploy/modal/test_modal.py
Normal file
@@ -0,0 +1,317 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Crawl4ai API Testing Script
|
||||
|
||||
This script tests all endpoints of the Crawl4ai API service and demonstrates their usage.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
# Colors for terminal output
|
||||
class Colors:
    """ANSI escape sequences for colorized terminal output."""
    HEADER = '\033[95m'     # bright magenta
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    ENDC = '\033[0m'        # reset all attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
||||
|
||||
def print_header(text: str) -> None:
    """Print a formatted header."""
    bar = f"{Colors.HEADER}{Colors.BOLD}{'=' * 80}{Colors.ENDC}"
    print(f"\n{bar}")
    print(f"{Colors.HEADER}{Colors.BOLD}{text.center(80)}{Colors.ENDC}")
    print(f"{bar}\n")
||||
|
||||
def print_step(text: str) -> None:
    """Print a formatted step description."""
    prefix = f"{Colors.BLUE}{Colors.BOLD}"
    print(f"{prefix}>> {text}{Colors.ENDC}")
||||
|
||||
def print_success(text: str) -> None:
    """Print a success message."""
    print(Colors.GREEN + f"✓ {text}" + Colors.ENDC)
|
||||
|
||||
def print_warning(text: str) -> None:
    """Print a warning message."""
    print(Colors.YELLOW + f"⚠ {text}" + Colors.ENDC)
||||
|
||||
def print_error(text: str) -> None:
    """Print an error message."""
    print(Colors.RED + f"✗ {text}" + Colors.ENDC)
||||
|
||||
def print_json(data: Dict[str, Any]) -> None:
|
||||
"""Pretty print JSON data."""
|
||||
print(json.dumps(data, indent=2))
|
||||
|
||||
def make_request(method: str, url: str, params: Optional[Dict[str, Any]] = None,
                 json_data: Optional[Dict[str, Any]] = None,
                 expected_status: int = 200) -> Dict[str, Any]:
    """Make an HTTP request and handle errors."""
    print_step(f"Making {method.upper()} request to {url}")

    if params:
        print(f"  Parameters: {params}")
    if json_data:
        print(f"  JSON Data: {json_data}")

    try:
        response = requests.request(
            method=method,
            url=url,
            params=params,
            json=json_data,
            timeout=300,  # 5 minute timeout for crawling operations
        )
    except requests.RequestException as exc:
        print_error(f"Request failed: {str(exc)}")
        return {"error": str(exc)}

    status_code = response.status_code
    print(f"  Status Code: {status_code}")

    try:
        payload = response.json()
    except ValueError:
        print_error("Response is not valid JSON")
        print(response.text)
        return {"error": "Invalid JSON response"}

    print("  Response:")
    print_json(payload)

    if status_code != expected_status:
        print_error(f"Expected status code {expected_status}, got {status_code}")
        return payload

    print_success("Request successful")
    return payload
|
||||
|
||||
def test_health_check(base_url: str) -> bool:
    """Test the health check endpoint."""
    print_header("Testing Health Check Endpoint")

    response = make_request("GET", f"{base_url}/health_check")

    healthy = response.get("status") == "online"
    if healthy:
        print_success("Health check passed")
    else:
        print_error("Health check failed")
    return healthy
|
||||
|
||||
def test_admin_create_user(base_url: str, admin_token: str, email: str, name: str) -> Optional[str]:
    """Test creating a new user; return the new user's API token on success."""
    print_header("Testing Admin User Creation")

    payload = {
        "admin_token": admin_token,
        "email": email,
        "name": name,
    }
    response = make_request(
        "POST",
        f"{base_url}/admin_create_user",
        json_data=payload,
        expected_status=201,
    )

    if response.get("success") and "data" in response:
        token = response["data"].get("api_token")
        if token:
            print_success(f"User created successfully with API token: {token}")
            return token

    print_error("Failed to create user")
    return None
|
||||
|
||||
def test_check_credits(base_url: str, api_token: str) -> Optional[int]:
    """Test checking user credits; return the balance or None on failure."""
    print_header("Testing Check Credits Endpoint")

    response = make_request(
        "GET",
        f"{base_url}/check_credits",
        params={"api_token": api_token},
    )

    if response.get("success") and "data" in response:
        balance = response["data"].get("credits")
        if balance is not None:
            print_success(f"User has {balance} credits")
            return balance

    print_error("Failed to check credits")
    return None
|
||||
|
||||
def test_crawl_endpoint(base_url: str, api_token: str, url: str) -> bool:
    """Test the crawl endpoint and display a summary of the crawl result."""
    print_header("Testing Crawl Endpoint")

    response = make_request(
        "POST",
        f"{base_url}/crawl_endpoint",
        json_data={"api_token": api_token, "url": url},
    )

    if not (response.get("success") and "data" in response):
        print_error("Crawl failed")
        return False

    print_success("Crawl completed successfully")

    # Display some crawl result data
    data = response["data"]
    if "title" in data:
        print(f"Page Title: {data['title']}")
    if "status" in data:
        print(f"Status: {data['status']}")
    if "links" in data:
        print(f"Links found: {len(data['links'])}")
    if "markdown_v2" in data and data["markdown_v2"] and "raw_markdown" in data["markdown_v2"]:
        print("Markdown Preview (first 200 chars):")
        print(data["markdown_v2"]["raw_markdown"][:200] + "...")

    credits_remaining = response.get("credits_remaining")
    if credits_remaining is not None:
        print(f"Credits remaining: {credits_remaining}")

    return True
|
||||
|
||||
def test_admin_update_credits(base_url: str, admin_token: str, api_token: str, amount: int) -> bool:
    """Test updating user credits."""
    print_header("Testing Admin Update Credits")

    body = {
        "admin_token": admin_token,
        "api_token": api_token,
        "amount": amount,
    }
    response = make_request("POST", f"{base_url}/admin_update_credits", json_data=body)

    if response.get("success") and "data" in response:
        print_success(f"Credits updated successfully, new balance: {response['data'].get('credits')}")
        return True

    print_error("Failed to update credits")
    return False
|
||||
|
||||
def test_admin_get_users(base_url: str, admin_token: str) -> List[Dict[str, Any]]:
    """Test getting all users; return the user list ([] on failure)."""
    print_header("Testing Admin Get All Users")

    response = make_request(
        "GET",
        f"{base_url}/admin_get_users",
        params={"admin_token": admin_token},
    )

    if response.get("success") and "data" in response:
        user_list = response["data"]
        print_success(f"Retrieved {len(user_list)} users")
        return user_list

    print_error("Failed to get users")
    return []
|
||||
|
||||
def run_full_test(base_url: str, admin_token: str) -> None:
    """Run all tests in sequence.

    Exits the process (status 1) if the health check, user creation, or the
    initial credit check fails; later failures only warn and continue.
    """
    # Remove trailing slash if present
    base_url = base_url.rstrip('/')

    # Test 1: Health Check
    if not test_health_check(base_url):
        print_error("Health check failed, aborting tests")
        sys.exit(1)

    # Test 2: Create a test user (timestamped email keeps runs unique)
    email = f"test-user-{int(time.time())}@example.com"
    name = "Test User"
    api_token = test_admin_create_user(base_url, admin_token, email, name)

    if not api_token:
        print_error("User creation failed, aborting tests")
        sys.exit(1)

    # Test 3: Check initial credits
    initial_credits = test_check_credits(base_url, api_token)

    if initial_credits is None:
        print_error("Credit check failed, aborting tests")
        sys.exit(1)

    # Test 4: Perform a crawl
    test_url = "https://news.ycombinator.com"
    crawl_success = test_crawl_endpoint(base_url, api_token, test_url)

    if not crawl_success:
        print_warning("Crawl test failed, but continuing with other tests")

    # Test 5: Check credits after crawl — a successful crawl is expected to
    # cost exactly one credit.
    post_crawl_credits = test_check_credits(base_url, api_token)

    if post_crawl_credits is not None and initial_credits is not None:
        if post_crawl_credits == initial_credits - 1:
            print_success("Credit deduction verified")
        else:
            print_warning(f"Unexpected credit change: {initial_credits} -> {post_crawl_credits}")

    # Test 6: Add credits
    add_credits_amount = 50
    if test_admin_update_credits(base_url, admin_token, api_token, add_credits_amount):
        print_success(f"Added {add_credits_amount} credits")

    # Test 7: Check credits after addition
    post_addition_credits = test_check_credits(base_url, api_token)

    if post_addition_credits is not None and post_crawl_credits is not None:
        if post_addition_credits == post_crawl_credits + add_credits_amount:
            print_success("Credit addition verified")
        else:
            print_warning(f"Unexpected credit change: {post_crawl_credits} -> {post_addition_credits}")

    # Test 8: Get all users
    users = test_admin_get_users(base_url, admin_token)

    if users:
        # Check if our test user is in the list
        test_user = next((user for user in users if user.get("email") == email), None)
        if test_user:
            print_success("Test user found in users list")
        else:
            print_warning("Test user not found in users list")

    # Final report
    print_header("Test Summary")

    print_success("All endpoints tested successfully")
    print(f"Test user created with email: {email}")
    print(f"API token: {api_token}")
    print(f"Final credit balance: {post_addition_credits}")
|
||||
|
||||
def main():
    """Parse CLI arguments and run the full API test suite."""
    parser = argparse.ArgumentParser(description="Test Crawl4ai API endpoints")
    parser.add_argument(
        "--base-url",
        required=True,
        help="Base URL of the Crawl4ai API (e.g., https://username--crawl4ai-api.modal.run)",
    )
    parser.add_argument("--admin-token", required=True, help="Admin token for authentication")
    args = parser.parse_args()

    print_header("Crawl4ai API Test Script")
    print(f"Testing API at: {args.base_url}")

    run_full_test(args.base_url, args.admin_token)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user