Add all 5 deployment solutions for testing

UncleCode
2025-03-10 18:57:14 +08:00
parent 9547bada3a
commit 3ea3c0520d
38 changed files with 6431 additions and 0 deletions

deploy/aws/Dockerfile Normal file

@@ -0,0 +1,137 @@
FROM python:3.10-slim
# Set build arguments
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
ARG GITHUB_BRANCH=next
ARG USE_LOCAL=False
ARG CONFIG_PATH=""
ENV PYTHONFAULTHANDLER=1 \
PYTHONHASHSEED=random \
PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
PYTHONDONTWRITEBYTECODE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_DEFAULT_TIMEOUT=100 \
DEBIAN_FRONTEND=noninteractive \
REDIS_HOST=localhost \
REDIS_PORT=6379
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
ARG TARGETARCH
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
wget \
gnupg \
git \
cmake \
pkg-config \
python3-dev \
libjpeg-dev \
redis-server \
supervisor \
&& rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \
libglib2.0-0 \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libdbus-1-3 \
libxcb1 \
libxkbcommon0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxext6 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* ; \
else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \
&& rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \
&& rm -rf /var/lib/apt/lists/*; \
else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi
WORKDIR ${APP_HOME}
RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai
COPY docker/supervisord.conf .
COPY docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install "/tmp/crawl4ai/[all]" && \
python -m nltk.downloader punkt stopwords && \
python -m crawl4ai.model_loader ; \
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
pip install "/tmp/crawl4ai/[torch]" ; \
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
pip install "/tmp/crawl4ai/[transformer]" && \
python -m crawl4ai.model_loader ; \
else \
pip install "/tmp/crawl4ai" ; \
fi
RUN pip install --no-cache-dir --upgrade pip && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
COPY docker/* ${APP_HOME}/
RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \
echo "Using custom config from $CONFIG_PATH" && \
cp $CONFIG_PATH /app/config.yml; \
fi
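# Health check: fail if the container sees less than 2GB of total RAM, Redis doesn't answer ping, or the API /health endpoint is down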
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
if [ $MEM -lt 2048 ]; then \
echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
exit 1; \
fi && \
redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1'
# EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"]
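A sketch of building this image with the arguments defined above (the tag and build context here are assumptions, not part of this commit):

docker build \
  --build-arg GITHUB_BRANCH=next \
  --build-arg INSTALL_TYPE=default \
  --build-arg ENABLE_GPU=false \
  -t crawl4ai-aws deploy/aws/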

deploy/aws/deploy-config.yml Executable file

@@ -0,0 +1,3 @@
project_name: PROJECT_NAME
domain_name: DOMAIN_NAME
aws_region: AWS_REGION

deploy/aws/deploy.py Executable file

@@ -0,0 +1,729 @@
#!/usr/bin/env python3
import argparse
import subprocess
import sys
import time
import json
import yaml
import requests
import os
# Steps for deployment
STEPS = [
"refresh_aws_auth",
"fetch_or_create_vpc_and_subnets",
"create_ecr_repositories",
"create_iam_role",
"create_security_groups",
"request_acm_certificate",
"build_and_push_docker",
"create_task_definition",
"setup_alb",
"deploy_ecs_service",
"configure_custom_domain",
"test_endpoints"
]
# Utility function to prompt user for confirmation
def confirm_step(step_name):
while True:
response = input(f"Proceed with {step_name}? (yes/no): ").strip().lower()
if response in ["yes", "no"]:
return response == "yes"
print("Please enter 'yes' or 'no'.")
# Utility function to run AWS CLI or shell commands and handle errors
def run_command(command, error_message, additional_diagnostics=None, cwd="."):
try:
result = subprocess.run(command, capture_output=True, text=True, check=True, cwd=cwd)
return result
except subprocess.CalledProcessError as e:
with open("error_context.md", "w") as f:
f.write(f"{error_message}:\n")
f.write(f"Command: {' '.join(command)}\n")
f.write(f"Exit Code: {e.returncode}\n")
f.write(f"Stdout: {e.stdout}\n")
f.write(f"Stderr: {e.stderr}\n")
if additional_diagnostics:
for diag_cmd in additional_diagnostics:
diag_result = subprocess.run(diag_cmd, capture_output=True, text=True)
f.write(f"\nDiagnostic command: {' '.join(diag_cmd)}\n")
f.write(f"Stdout: {diag_result.stdout}\n")
f.write(f"Stderr: {diag_result.stderr}\n")
raise Exception(f"{error_message}: {e.stderr}")
# Utility function to load or initialize state
def load_state(project_name):
state_file = f"{project_name}-state.json"
if os.path.exists(state_file):
with open(state_file, "r") as f:
return json.load(f)
return {"last_step": -1}
# Utility function to save state
def save_state(project_name, state):
state_file = f"{project_name}-state.json"
with open(state_file, "w") as f:
json.dump(state, f, indent=4)
# DNS Check Function
def check_dns_propagation(domain, alb_dns):
try:
result = subprocess.run(["dig", "+short", domain], capture_output=True, text=True)
if alb_dns in result.stdout:
return True
return False
except Exception as e:
print(f"Failed to check DNS: {e}")
return False
# Step Functions
def refresh_aws_auth(project_name, state, config):
if state["last_step"] >= 0:
print("Skipping refresh_aws_auth (already completed)")
return
if not confirm_step("Refresh AWS authentication"):
sys.exit("User aborted.")
run_command(
["aws", "sts", "get-caller-identity"],
"Failed to verify AWS credentials"
)
print("AWS authentication verified.")
state["last_step"] = 0
save_state(project_name, state)
def fetch_or_create_vpc_and_subnets(project_name, state, config):
if state["last_step"] >= 1:
print("Skipping fetch_or_create_vpc_and_subnets (already completed)")
return state["vpc_id"], state["public_subnets"]
if not confirm_step("Fetch or Create VPC and Subnets"):
sys.exit("User aborted.")
# Fetch AWS account ID
result = run_command(
["aws", "sts", "get-caller-identity"],
"Failed to get AWS account ID"
)
account_id = json.loads(result.stdout)["Account"]
# Fetch default VPC
result = run_command(
["aws", "ec2", "describe-vpcs", "--filters", "Name=isDefault,Values=true", "--region", config["aws_region"]],
"Failed to describe VPCs"
)
vpcs = json.loads(result.stdout).get("Vpcs", [])
if not vpcs:
result = run_command(
["aws", "ec2", "create-vpc", "--cidr-block", "10.0.0.0/16", "--region", config["aws_region"]],
"Failed to create VPC"
)
vpc_id = json.loads(result.stdout)["Vpc"]["VpcId"]
run_command(
["aws", "ec2", "modify-vpc-attribute", "--vpc-id", vpc_id, "--enable-dns-hostnames", "--region", config["aws_region"]],
"Failed to enable DNS hostnames"
)
else:
vpc_id = vpcs[0]["VpcId"]
# Fetch or create subnets
result = run_command(
["aws", "ec2", "describe-subnets", "--filters", f"Name=vpc-id,Values={vpc_id}", "--region", config["aws_region"]],
"Failed to describe subnets"
)
subnets = json.loads(result.stdout).get("Subnets", [])
if len(subnets) < 2:
azs = json.loads(run_command(
["aws", "ec2", "describe-availability-zones", "--region", config["aws_region"]],
"Failed to describe availability zones"
).stdout)["AvailabilityZones"][:2]
subnet_ids = []
for i, az in enumerate(azs):
az_name = az["ZoneName"]
result = run_command(
["aws", "ec2", "create-subnet", "--vpc-id", vpc_id, "--cidr-block", f"10.0.{i}.0/24", "--availability-zone", az_name, "--region", config["aws_region"]],
f"Failed to create subnet in {az_name}"
)
subnet_id = json.loads(result.stdout)["Subnet"]["SubnetId"]
subnet_ids.append(subnet_id)
run_command(
["aws", "ec2", "modify-subnet-attribute", "--subnet-id", subnet_id, "--map-public-ip-on-launch", "--region", config["aws_region"]],
f"Failed to make subnet {subnet_id} public"
)
else:
subnet_ids = [s["SubnetId"] for s in subnets[:2]]
# Ensure internet gateway
result = run_command(
["aws", "ec2", "describe-internet-gateways", "--filters", f"Name=attachment.vpc-id,Values={vpc_id}", "--region", config["aws_region"]],
"Failed to describe internet gateways"
)
igws = json.loads(result.stdout).get("InternetGateways", [])
if not igws:
result = run_command(
["aws", "ec2", "create-internet-gateway", "--region", config["aws_region"]],
"Failed to create internet gateway"
)
igw_id = json.loads(result.stdout)["InternetGateway"]["InternetGatewayId"]
run_command(
["aws", "ec2", "attach-internet-gateway", "--vpc-id", vpc_id, "--internet-gateway-id", igw_id, "--region", config["aws_region"]],
"Failed to attach internet gateway"
)
state["vpc_id"] = vpc_id
state["public_subnets"] = subnet_ids
state["last_step"] = 1
save_state(project_name, state)
print(f"VPC ID: {vpc_id}, Subnets: {subnet_ids}")
return vpc_id, subnet_ids
def create_ecr_repositories(project_name, state, config):
if state["last_step"] >= 2:
print("Skipping create_ecr_repositories (already completed)")
return
if not confirm_step("Create ECR Repositories"):
sys.exit("User aborted.")
account_id = json.loads(run_command(
["aws", "sts", "get-caller-identity"],
"Failed to get AWS account ID"
).stdout)["Account"]
repos = [project_name, f"{project_name}-nginx"]
for repo in repos:
result = subprocess.run(
["aws", "ecr", "describe-repositories", "--repository-names", repo, "--region", config["aws_region"]],
capture_output=True, text=True
)
if result.returncode != 0:
run_command(
["aws", "ecr", "create-repository", "--repository-name", repo, "--region", config["aws_region"]],
f"Failed to create ECR repository {repo}"
)
print(f"ECR repository {repo} is ready.")
state["last_step"] = 2
save_state(project_name, state)
def create_iam_role(project_name, state, config):
if state["last_step"] >= 3:
print("Skipping create_iam_role (already completed)")
return
if not confirm_step("Create IAM Role"):
sys.exit("User aborted.")
account_id = json.loads(run_command(
["aws", "sts", "get-caller-identity"],
"Failed to get AWS account ID"
).stdout)["Account"]
role_name = "ecsTaskExecutionRole"
trust_policy = {
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {"Service": "ecs-tasks.amazonaws.com"},
"Action": "sts:AssumeRole"
}
]
}
with open("trust_policy.json", "w") as f:
json.dump(trust_policy, f)
result = subprocess.run(
["aws", "iam", "get-role", "--role-name", role_name],
capture_output=True, text=True
)
if result.returncode != 0:
run_command(
["aws", "iam", "create-role", "--role-name", role_name, "--assume-role-policy-document", "file://trust_policy.json"],
f"Failed to create IAM role {role_name}"
)
run_command(
["aws", "iam", "attach-role-policy", "--role-name", role_name, "--policy-arn", "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"],
"Failed to attach ECS task execution policy"
)
os.remove("trust_policy.json")
state["execution_role_arn"] = f"arn:aws:iam::{account_id}:role/{role_name}"
state["last_step"] = 3
save_state(project_name, state)
print(f"IAM role {role_name} configured.")
def create_security_groups(project_name, state, config):
if state["last_step"] >= 4:
print("Skipping create_security_groups (already completed)")
return state["alb_sg_id"], state["ecs_sg_id"]
if not confirm_step("Create Security Groups"):
sys.exit("User aborted.")
vpc_id = state["vpc_id"]
alb_sg_name = f"{project_name}-alb-sg"
result = run_command(
["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={alb_sg_name}", "--region", config["aws_region"]],
"Failed to describe ALB security group"
)
if not json.loads(result.stdout).get("SecurityGroups"):
result = run_command(
["aws", "ec2", "create-security-group", "--group-name", alb_sg_name, "--description", "Security group for ALB", "--vpc-id", vpc_id, "--region", config["aws_region"]],
"Failed to create ALB security group"
)
alb_sg_id = json.loads(result.stdout)["GroupId"]
run_command(
["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "80", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]],
"Failed to authorize HTTP ingress"
)
run_command(
["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "443", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]],
"Failed to authorize HTTPS ingress"
)
else:
alb_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"]
ecs_sg_name = f"{project_name}-ecs-sg"
result = run_command(
["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={ecs_sg_name}", "--region", config["aws_region"]],
"Failed to describe ECS security group"
)
if not json.loads(result.stdout).get("SecurityGroups"):
result = run_command(
["aws", "ec2", "create-security-group", "--group-name", ecs_sg_name, "--description", "Security group for ECS tasks", "--vpc-id", vpc_id, "--region", config["aws_region"]],
"Failed to create ECS security group"
)
ecs_sg_id = json.loads(result.stdout)["GroupId"]
run_command(
["aws", "ec2", "authorize-security-group-ingress", "--group-id", ecs_sg_id, "--protocol", "tcp", "--port", "80", "--source-group", alb_sg_id, "--region", config["aws_region"]],
"Failed to authorize ECS ingress"
)
else:
ecs_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"]
state["alb_sg_id"] = alb_sg_id
state["ecs_sg_id"] = ecs_sg_id
state["last_step"] = 4
save_state(project_name, state)
print("Security groups configured.")
return alb_sg_id, ecs_sg_id
def request_acm_certificate(project_name, state, config):
if state["last_step"] >= 5:
print("Skipping request_acm_certificate (already completed)")
return state["cert_arn"]
if not confirm_step("Request ACM Certificate"):
sys.exit("User aborted.")
domain_name = config["domain_name"]
    result = run_command(
        ["aws", "acm", "list-certificates", "--certificate-statuses", "ISSUED", "--region", config["aws_region"]],
        "Failed to list certificates"
    )
certificates = json.loads(result.stdout).get("CertificateSummaryList", [])
cert_arn = next((c["CertificateArn"] for c in certificates if c["DomainName"] == domain_name), None)
if not cert_arn:
result = run_command(
["aws", "acm", "request-certificate", "--domain-name", domain_name, "--validation-method", "DNS", "--region", config["aws_region"]],
"Failed to request ACM certificate"
)
cert_arn = json.loads(result.stdout)["CertificateArn"]
time.sleep(10)
result = run_command(
["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]],
"Failed to describe certificate"
)
cert_details = json.loads(result.stdout)["Certificate"]
dns_validations = cert_details.get("DomainValidationOptions", [])
for validation in dns_validations:
if validation["ValidationMethod"] == "DNS" and "ResourceRecord" in validation:
record = validation["ResourceRecord"]
print(f"Please add this DNS record to validate the certificate for {domain_name}:")
print(f"Name: {record['Name']}")
print(f"Type: {record['Type']}")
print(f"Value: {record['Value']}")
print("Press Enter after adding the DNS record...")
input()
while True:
result = run_command(
["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]],
"Failed to check certificate status"
)
status = json.loads(result.stdout)["Certificate"]["Status"]
if status == "ISSUED":
break
elif status in ["FAILED", "REVOKED", "INACTIVE"]:
print("Certificate issuance failed.")
sys.exit(1)
time.sleep(10)
state["cert_arn"] = cert_arn
state["last_step"] = 5
save_state(project_name, state)
print(f"Certificate ARN: {cert_arn}")
return cert_arn
def build_and_push_docker(project_name, state, config):
if state["last_step"] >= 6:
print("Skipping build_and_push_docker (already completed)")
return state["fastapi_image"], state["nginx_image"]
if not confirm_step("Build and Push Docker Images"):
sys.exit("User aborted.")
with open("./version.txt", "r") as f:
version = f.read().strip()
account_id = json.loads(run_command(
["aws", "sts", "get-caller-identity"],
"Failed to get AWS account ID"
).stdout)["Account"]
region = config["aws_region"]
login_password = run_command(
["aws", "ecr", "get-login-password", "--region", region],
"Failed to get ECR login password"
).stdout.strip()
run_command(
["docker", "login", "--username", "AWS", "--password", login_password, f"{account_id}.dkr.ecr.{region}.amazonaws.com"],
"Failed to authenticate Docker to ECR"
)
fastapi_image = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{project_name}:{version}"
run_command(
["docker", "build", "-f", "Dockerfile", "-t", fastapi_image, "."],
"Failed to build FastAPI Docker image"
)
run_command(
["docker", "push", fastapi_image],
"Failed to push FastAPI image"
)
nginx_image = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{project_name}-nginx:{version}"
run_command(
["docker", "build", "-f", "Dockerfile", "-t", nginx_image, "."],
"Failed to build Nginx Docker image",
cwd="./nginx"
)
run_command(
["docker", "push", nginx_image],
"Failed to push Nginx image"
)
state["fastapi_image"] = fastapi_image
state["nginx_image"] = nginx_image
state["last_step"] = 6
save_state(project_name, state)
print("Docker images built and pushed.")
return fastapi_image, nginx_image
def create_task_definition(project_name, state, config):
if state["last_step"] >= 7:
print("Skipping create_task_definition (already completed)")
return state["task_def_arn"]
if not confirm_step("Create Task Definition"):
sys.exit("User aborted.")
log_group = f"/ecs/{project_name}-logs"
result = run_command(
["aws", "logs", "describe-log-groups", "--log-group-name-prefix", log_group, "--region", config["aws_region"]],
"Failed to describe log groups"
)
if not any(lg["logGroupName"] == log_group for lg in json.loads(result.stdout).get("logGroups", [])):
run_command(
["aws", "logs", "create-log-group", "--log-group-name", log_group, "--region", config["aws_region"]],
f"Failed to create log group {log_group}"
)
task_definition = {
"family": f"{project_name}-taskdef",
"networkMode": "awsvpc",
"requiresCompatibilities": ["FARGATE"],
"cpu": "512",
"memory": "2048",
"executionRoleArn": state["execution_role_arn"],
"containerDefinitions": [
{
"name": "fastapi",
"image": state["fastapi_image"],
"portMappings": [{"containerPort": 8000, "hostPort": 8000, "protocol": "tcp"}],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": log_group,
"awslogs-region": config["aws_region"],
"awslogs-stream-prefix": "fastapi"
}
}
},
{
"name": "nginx",
"image": state["nginx_image"],
"portMappings": [{"containerPort": 80, "hostPort": 80, "protocol": "tcp"}],
"logConfiguration": {
"logDriver": "awslogs",
"options": {
"awslogs-group": log_group,
"awslogs-region": config["aws_region"],
"awslogs-stream-prefix": "nginx"
}
}
}
]
}
with open("task_def.json", "w") as f:
json.dump(task_definition, f)
result = run_command(
["aws", "ecs", "register-task-definition", "--cli-input-json", "file://task_def.json", "--region", config["aws_region"]],
"Failed to register task definition"
)
task_def_arn = json.loads(result.stdout)["taskDefinition"]["taskDefinitionArn"]
os.remove("task_def.json")
state["task_def_arn"] = task_def_arn
state["last_step"] = 7
save_state(project_name, state)
print("Task definition created.")
return task_def_arn
def setup_alb(project_name, state, config):
if state["last_step"] >= 8:
print("Skipping setup_alb (already completed)")
return state["alb_arn"], state["tg_arn"], state["alb_dns"]
if not confirm_step("Set Up ALB"):
sys.exit("User aborted.")
vpc_id = state["vpc_id"]
public_subnets = state["public_subnets"]
alb_name = f"{project_name}-alb"
result = subprocess.run(
["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
capture_output=True, text=True
)
if result.returncode != 0:
run_command(
["aws", "elbv2", "create-load-balancer", "--name", alb_name, "--subnets"] + public_subnets + ["--security-groups", state["alb_sg_id"], "--region", config["aws_region"]],
"Failed to create ALB"
)
alb_arn = json.loads(run_command(
["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
"Failed to describe ALB"
).stdout)["LoadBalancers"][0]["LoadBalancerArn"]
alb_dns = json.loads(run_command(
["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]],
"Failed to get ALB DNS name"
).stdout)["LoadBalancers"][0]["DNSName"]
tg_name = f"{project_name}-tg"
result = subprocess.run(
["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]],
capture_output=True, text=True
)
if result.returncode != 0:
run_command(
["aws", "elbv2", "create-target-group", "--name", tg_name, "--protocol", "HTTP", "--port", "80", "--vpc-id", vpc_id, "--region", config["aws_region"]],
"Failed to create target group"
)
tg_arn = json.loads(run_command(
["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]],
"Failed to describe target group"
).stdout)["TargetGroups"][0]["TargetGroupArn"]
result = run_command(
["aws", "elbv2", "describe-listeners", "--load-balancer-arn", alb_arn, "--region", config["aws_region"]],
"Failed to describe listeners"
)
listeners = json.loads(result.stdout).get("Listeners", [])
if not any(l["Port"] == 80 for l in listeners):
run_command(
["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTP", "--port", "80", "--default-actions", "Type=redirect,RedirectConfig={Protocol=HTTPS,Port=443,StatusCode=HTTP_301}", "--region", config["aws_region"]],
"Failed to create HTTP listener"
)
if not any(l["Port"] == 443 for l in listeners):
run_command(
["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTPS", "--port", "443", "--certificates", f"CertificateArn={state['cert_arn']}", "--default-actions", f"Type=forward,TargetGroupArn={tg_arn}", "--region", config["aws_region"]],
"Failed to create HTTPS listener"
)
state["alb_arn"] = alb_arn
state["tg_arn"] = tg_arn
state["alb_dns"] = alb_dns
state["last_step"] = 8
save_state(project_name, state)
print("ALB configured.")
return alb_arn, tg_arn, alb_dns
def deploy_ecs_service(project_name, state, config):
if state["last_step"] >= 9:
print("Skipping deploy_ecs_service (already completed)")
return
if not confirm_step("Deploy ECS Service"):
sys.exit("User aborted.")
cluster_name = f"{project_name}-cluster"
result = run_command(
["aws", "ecs", "describe-clusters", "--clusters", cluster_name, "--region", config["aws_region"]],
"Failed to describe clusters"
)
if not json.loads(result.stdout).get("clusters"):
run_command(
["aws", "ecs", "create-cluster", "--cluster-name", cluster_name, "--region", config["aws_region"]],
"Failed to create ECS cluster"
)
service_name = f"{project_name}-service"
result = run_command(
["aws", "ecs", "describe-services", "--cluster", cluster_name, "--services", service_name, "--region", config["aws_region"]],
"Failed to describe services",
additional_diagnostics=[["aws", "ecs", "list-tasks", "--cluster", cluster_name, "--service-name", service_name, "--region", config["aws_region"]]]
)
services = json.loads(result.stdout).get("services", [])
if not services or services[0]["status"] == "INACTIVE":
run_command(
["aws", "ecs", "create-service", "--cluster", cluster_name, "--service-name", service_name, "--task-definition", state["task_def_arn"], "--desired-count", "1", "--launch-type", "FARGATE", "--network-configuration", f"awsvpcConfiguration={{subnets={json.dumps(state['public_subnets'])},securityGroups=[{state['ecs_sg_id']}],assignPublicIp=ENABLED}}", "--load-balancers", f"targetGroupArn={state['tg_arn']},containerName=nginx,containerPort=80", "--region", config["aws_region"]],
"Failed to create ECS service"
)
else:
run_command(
["aws", "ecs", "update-service", "--cluster", cluster_name, "--service", service_name, "--task-definition", state["task_def_arn"], "--region", config["aws_region"]],
"Failed to update ECS service"
)
state["last_step"] = 9
save_state(project_name, state)
print("ECS service deployed.")
def configure_custom_domain(project_name, state, config):
if state["last_step"] >= 10:
print("Skipping configure_custom_domain (already completed)")
return
if not confirm_step("Configure Custom Domain"):
sys.exit("User aborted.")
domain_name = config["domain_name"]
alb_dns = state["alb_dns"]
print(f"Please add a CNAME record for {domain_name} pointing to {alb_dns} in your DNS provider.")
print("Press Enter after updating the DNS record...")
input()
while not check_dns_propagation(domain_name, alb_dns):
print("DNS propagation not complete. Waiting 30 seconds before retrying...")
time.sleep(30)
print("DNS propagation confirmed.")
state["last_step"] = 10
save_state(project_name, state)
print("Custom domain configured.")
def test_endpoints(project_name, state, config):
if state["last_step"] >= 11:
print("Skipping test_endpoints (already completed)")
return
if not confirm_step("Test Endpoints"):
sys.exit("User aborted.")
domain = config["domain_name"]
time.sleep(30) # Wait for service to stabilize
response = requests.get(f"https://{domain}/health", verify=False)
if response.status_code != 200:
with open("error_context.md", "w") as f:
f.write("Health endpoint test failed:\n")
f.write(f"Status Code: {response.status_code}\n")
f.write(f"Response: {response.text}\n")
sys.exit(1)
print("Health endpoint test passed.")
payload = {
"urls": ["https://example.com"],
"browser_config": {"headless": True},
"crawler_config": {"stream": False}
}
response = requests.post(f"https://{domain}/crawl", json=payload, verify=False)
if response.status_code != 200:
with open("error_context.md", "w") as f:
f.write("Crawl endpoint test failed:\n")
f.write(f"Status Code: {response.status_code}\n")
f.write(f"Response: {response.text}\n")
sys.exit(1)
print("Crawl endpoint test passed.")
state["last_step"] = 11
save_state(project_name, state)
print("Endpoints tested successfully.")
# Main Deployment Function
def deploy(project_name, force=False):
config_file = f"{project_name}-config.yml"
if not os.path.exists(config_file):
print(f"Configuration file {config_file} not found. Run 'init' first.")
sys.exit(1)
with open(config_file, "r") as f:
config = yaml.safe_load(f)
state = load_state(project_name)
if force:
state = {"last_step": -1}
last_step = state.get("last_step", -1)
    for step_idx, step_name in enumerate(STEPS):
        if step_idx <= last_step:
            print(f"Skipping {step_name} (already completed)")
            continue
        print(f"Executing step: {step_name}")
        # Every step function shares the same signature; any values it returns
        # are persisted in the state file, so none need to be captured here.
        func = globals()[step_name]
        func(project_name, state, config)
# Init Command
def init(project_name, domain_name, aws_region):
config = {
"project_name": project_name,
"domain_name": domain_name,
"aws_region": aws_region
}
config_file = f"{project_name}-config.yml"
with open(config_file, "w") as f:
yaml.dump(config, f)
print(f"Configuration file {config_file} created.")
# Argument Parser
parser = argparse.ArgumentParser(description="Crawl4AI Deployment Script")
subparsers = parser.add_subparsers(dest="command")
# Init Parser
init_parser = subparsers.add_parser("init", help="Initialize configuration")
init_parser.add_argument("--project", required=True, help="Project name")
init_parser.add_argument("--domain", required=True, help="Domain name")
init_parser.add_argument("--region", required=True, help="AWS region")
# Deploy Parser
deploy_parser = subparsers.add_parser("deploy", help="Deploy the project")
deploy_parser.add_argument("--project", required=True, help="Project name")
deploy_parser.add_argument("--force", action="store_true", help="Force redeployment from start")
args = parser.parse_args()
if args.command == "init":
init(args.project, args.domain, args.region)
elif args.command == "deploy":
deploy(args.project, args.force)
else:
parser.print_help()
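For reference, a typical invocation sequence for this script (the project name, domain, and region below are placeholders):

python deploy.py init --project myproject --domain crawl.example.com --region us-east-1
python deploy.py deploy --project myproject            # resumes from the last completed step
python deploy.py deploy --project myproject --force    # restarts from the first step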


@@ -0,0 +1,31 @@
# .dockerignore
*
# Allow specific files and directories when using local installation
!crawl4ai/
!docs/
!deploy/docker/
!setup.py
!pyproject.toml
!README.md
!LICENSE
!MANIFEST.in
!setup.cfg
!mkdocs.yml
.git/
__pycache__/
*.pyc
*.pyo
*.pyd
.DS_Store
.env
.venv
venv/
tests/
coverage.xml
*.log
*.swp
*.egg-info/
dist/
build/


@@ -0,0 +1,8 @@
# LLM Provider Keys
OPENAI_API_KEY=your_openai_key_here
DEEPSEEK_API_KEY=your_deepseek_key_here
ANTHROPIC_API_KEY=your_anthropic_key_here
GROQ_API_KEY=your_groq_key_here
TOGETHER_API_KEY=your_together_key_here
MISTRAL_API_KEY=your_mistral_key_here
GEMINI_API_TOKEN=your_gemini_key_here

deploy/aws/docker/README.md Normal file

@@ -0,0 +1,847 @@
# Crawl4AI Docker Guide 🐳
## Table of Contents
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Local Build](#local-build)
- [Docker Hub](#docker-hub)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
- [Understanding Request Schema](#understanding-request-schema)
- [REST API Examples](#rest-api-examples)
- [Python SDK](#python-sdk)
- [Metrics & Monitoring](#metrics--monitoring)
- [Deployment Scenarios](#deployment-scenarios)
- [Complete Examples](#complete-examples)
- [Server Configuration](#server-configuration)
- [Getting Help](#getting-help)
## Prerequisites
Before we dive in, make sure you have:
- Docker installed and running (version 20.10.0 or higher)
- At least 4GB of RAM available for the container
- Python 3.10+ (if using the Python SDK)
- Node.js 16+ (if using the Node.js examples)
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
## Installation
### Local Build
Let's get your local environment set up step by step!
#### 1. Building the Image
First, clone the repository and build the Docker image:
```bash
# Clone the repository
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai/deploy
# Build the Docker image
docker build --platform=linux/amd64 --no-cache -t crawl4ai .
# Or build for arm64
docker build --platform=linux/arm64 --no-cache -t crawl4ai .
```
#### 2. Environment Setup
If you plan to use LLMs (Large Language Models), you'll need to set up your API keys. Create a `.llm.env` file:
```env
# OpenAI
OPENAI_API_KEY=sk-your-key
# Anthropic
ANTHROPIC_API_KEY=your-anthropic-key
# DeepSeek
DEEPSEEK_API_KEY=your-deepseek-key
# Check out https://docs.litellm.ai/docs/providers for more providers!
```
> 🔑 **Note**: Keep your API keys secure! Never commit them to version control.
#### 3. Running the Container
You have several options for running the container:
Basic run (no LLM support):
```bash
docker run -d -p 8000:8000 --name crawl4ai crawl4ai
```
With LLM support:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
--name crawl4ai \
crawl4ai
```
Using host environment variables (Not a good practice, but works for local testing):
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
--env "$(env)" \
--name crawl4ai \
crawl4ai
```
#### Multi-Platform Build
For distributing your image across different architectures, use `buildx`:
```bash
# Set up buildx builder
docker buildx create --use
# Build for multiple platforms
docker buildx build \
--platform linux/amd64,linux/arm64 \
-t crawl4ai \
--push \
.
```
> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry.
#### Development Build
For development, you might want to enable all features:
```bash
docker build -t crawl4ai \
--build-arg INSTALL_TYPE=all \
--build-arg PYTHON_VERSION=3.10 \
--build-arg ENABLE_GPU=true \
.
```
#### GPU-Enabled Build
If you plan to use GPU acceleration:
```bash
docker build -t crawl4ai \
--build-arg ENABLE_GPU=true \
deploy/docker/
```
### Build Arguments Explained
| Argument | Description | Default | Options |
|----------|-------------|---------|----------|
| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
| ENABLE_GPU | GPU support | false | true, false |
| APP_HOME | Install path | /app | any valid path |
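As a quick sketch, here's a build that sets several of these arguments at once (the values are illustrative, not recommendations):
```bash
docker build \
  --build-arg PYTHON_VERSION=3.10 \
  --build-arg INSTALL_TYPE=torch \
  --build-arg ENABLE_GPU=false \
  --build-arg APP_HOME=/app \
  -t crawl4ai .
```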
### Build Best Practices
1. **Choose the Right Install Type**
- `default`: Basic installation and the smallest image; to be honest, this is what I use most of the time.
- `all`: Full features and a larger image (includes transformer and nltk; make sure you really need them)
2. **Platform Considerations**
- Let Docker auto-detect platform unless you need cross-compilation
- Use --platform for specific architecture requirements
- Consider buildx for multi-architecture distribution
3. **Performance Optimization**
- The image automatically includes platform-specific optimizations
- AMD64 gets OpenMP optimizations
- ARM64 gets OpenBLAS optimizations
### Docker Hub
> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned!
## Using the API
In the following sections, we cover two ways to communicate with the Docker server. The first is the client SDK I developed for Python (a Node.js version is coming soon); I highly recommend this approach to avoid mistakes. Alternatively, you can take the more technical route of constructing the JSON request structure yourself and sending it directly to the endpoints, which I will explain in detail.
### Python SDK
The SDK makes things easier! Here's how to use it:
```python
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig
import asyncio
async def main():
async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client:
# If JWT is enabled, you can authenticate like this: (more on this later)
# await client.authenticate("test@example.com")
# Non-streaming crawl
results = await client.crawl(
["https://example.com", "https://python.org"],
browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig()
)
print(f"Non-streaming results: {results}")
# Streaming crawl
crawler_config = CrawlerRunConfig(stream=True)
async for result in await client.crawl(
["https://example.com", "https://python.org"],
browser_config=BrowserConfig(headless=True),
crawler_config=crawler_config
):
print(f"Streamed result: {result}")
# Get schema
schema = await client.get_schema()
print(f"Schema: {schema}")
if __name__ == "__main__":
asyncio.run(main())
```
`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
- `base_url` (str): Base URL of the Crawl4AI Docker server
- `timeout` (float): Default timeout for requests in seconds
- `verify_ssl` (bool): Whether to verify SSL certificates
- `verbose` (bool): Whether to show logging output
- `log_file` (str, optional): Path to log file if file logging is desired
This client SDK generates a properly structured JSON request for the server's HTTP API.
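As a minimal sketch, a client tuned for a slow remote server might be constructed like this (the parameter values are illustrative):
```python
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient

async def main():
    async with Crawl4aiDockerClient(
        base_url="http://localhost:8000",
        timeout=120.0,                  # give long crawls room to finish
        verify_ssl=False,               # e.g. self-signed certificates in testing
        verbose=True,
        log_file="crawl4ai-client.log"  # optional file logging
    ) as client:
        schema = await client.get_schema()
        print(f"Schema: {schema}")

asyncio.run(main())
```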
## Second Approach: Direct API Calls
This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
### Understanding Configuration Structure
Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity.
#### The Basic Pattern
Try this in Python to understand the structure:
```python
from crawl4ai import BrowserConfig
# Create a config and see its structure
config = BrowserConfig(headless=True)
print(config.dump())
```
This outputs:
```json
{
"type": "BrowserConfig",
"params": {
"headless": true
}
}
```
#### Simple vs Complex Values
The structure follows these rules:
- Simple values (strings, numbers, booleans, lists) are passed directly
- Complex values (classes, dictionaries) use the type-params pattern
For example, with dictionaries:
```json
{
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": true, // Simple boolean - direct value
"viewport": { // Complex dictionary - needs type-params
"type": "dict",
"value": {
"width": 1200,
"height": 800
}
}
}
}
}
```
#### Strategy Pattern and Nesting
Strategies (like chunking or content filtering) demonstrate why we need this structure. Consider this chunking configuration:
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"chunking_strategy": {
"type": "RegexChunking", // Strategy implementation
"params": {
"patterns": ["\n\n", "\\.\\s+"]
}
}
}
}
}
```
Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy.
#### Complex Nested Example
Let's look at a more complex example with content filtering:
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
```
This shows how deeply configurations can nest while maintaining a consistent structure.
#### Quick Grammar Overview
```
config := {
"type": string,
"params": {
key: simple_value | complex_value
}
}
simple_value := string | number | boolean | [simple_value]
complex_value := config | dict_value
dict_value := {
"type": "dict",
"value": object
}
```
#### Important Rules 🚨
- Always use the type-params pattern for class instances
- Use direct values for primitives (numbers, strings, booleans)
- Wrap dictionaries with {"type": "dict", "value": {...}}
- Arrays/lists are passed directly without type-params
- All parameters are optional unless specifically required
#### Pro Tip 💡
The easiest way to get the correct structure is to:
1. Create configuration objects in Python
2. Use the `dump()` method to see their JSON representation
3. Use that JSON in your API calls
Example:
```python
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator, PruningContentFilter

config = CrawlerRunConfig(
    markdown_generator=DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.48)
    )
)
print(config.dump()) # Use this JSON in your API calls
```
#### More Examples
**Advanced Crawler Configuration**
```json
{
"urls": ["https://example.com"],
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "bypass",
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed",
"min_word_threshold": 0
}
}
}
}
}
}
}
```
**Extraction Strategy**:
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
}
}
}
}
}
```
**LLM Extraction Strategy**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract article title, author, publication date and main content",
"provider": "openai/gpt-4",
"api_token": "your-api-token",
"schema": {
"type": "dict",
"value": {
"title": "Article Schema",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The article's headline"
},
"author": {
"type": "string",
"description": "The author's name"
},
"published_date": {
"type": "string",
"format": "date-time",
"description": "Publication date and time"
},
"content": {
"type": "string",
"description": "The main article content"
}
},
"required": ["title", "content"]
}
}
}
}
}
}
}
```
**Deep Crawler Example**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 3,
"max_pages": 100,
"filter_chain": {
"type": "FastFilterChain",
"params": {
"filters": [
{
"type": "FastContentTypeFilter",
"params": {
"allowed_types": ["text/html", "application/xhtml+xml"]
}
},
{
"type": "FastDomainFilter",
"params": {
"allowed_domains": ["blog.*", "docs.*"],
"blocked_domains": ["ads.*", "analytics.*"]
}
},
{
"type": "FastURLPatternFilter",
"params": {
"allowed_patterns": ["^/blog/", "^/docs/"],
"blocked_patterns": [".*/ads/", ".*/sponsored/"]
}
}
]
}
},
"url_scorer": {
"type": "FastCompositeScorer",
"params": {
"scorers": [
{
"type": "FastKeywordRelevanceScorer",
"params": {
"keywords": ["tutorial", "guide", "documentation"],
"weight": 1.0
}
},
{
"type": "FastPathDepthScorer",
"params": {
"weight": 0.5,
"preferred_depth": 2
}
},
{
"type": "FastFreshnessScorer",
"params": {
"weight": 0.8,
"max_age_days": 365
}
}
]
}
}
}
}
}
}
}
```
### REST API Examples
Let's look at some practical examples:
#### Simple Crawl
```python
import requests
crawl_payload = {
"urls": ["https://example.com"],
"browser_config": {"headless": True},
"crawler_config": {"stream": False}
}
response = requests.post(
"http://localhost:8000/crawl",
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later
json=crawl_payload
)
print(response.json()) # Print the response for debugging
```
#### Streaming Results
```python
import json  # the example assumes `session` is an aiohttp.ClientSession
async def test_stream_crawl(session, token: str):
"""Test the /crawl/stream endpoint with multiple URLs."""
url = "http://localhost:8000/crawl/stream"
payload = {
"urls": [
"https://example.com",
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
],
"browser_config": {"headless": True, "viewport": {"width": 1200}},
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
}
    headers = {}
    # headers = {"Authorization": f"Bearer {token}"}  # If JWT is enabled, more on this later
    try:
        async with session.post(url, json=payload, headers=headers) as response:
status = response.status
print(f"Status: {status} (Expected: 200)")
assert status == 200, f"Expected 200, got {status}"
# Read streaming response line-by-line (NDJSON)
async for line in response.content:
if line:
data = json.loads(line.decode('utf-8').strip())
print(f"Streamed Result: {json.dumps(data, indent=2)}")
except Exception as e:
print(f"Error in streaming crawl test: {str(e)}")
```
## Metrics & Monitoring
Keep an eye on your crawler with these endpoints:
- `/health` - Quick health check
- `/metrics` - Detailed Prometheus metrics
- `/schema` - Full API schema
Example health check:
```bash
curl http://localhost:8000/health
```
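The other two endpoints work the same way:
```bash
# Prometheus metrics (when enabled in config.yml)
curl http://localhost:8000/metrics

# Full API schema
curl http://localhost:8000/schema
```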
## Deployment Scenarios
> 🚧 Coming soon! We'll cover:
> - Kubernetes deployment
> - Cloud provider setups (AWS, GCP, Azure)
> - High-availability configurations
> - Load balancing strategies
## Complete Examples
Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py)
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py)
## Server Configuration
The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security.
### Understanding config.yml
The configuration file is located at `deploy/docker/config.yml`. You can either modify this file before building the image or mount a custom configuration when running the container.
Here's a detailed breakdown of the configuration options:
```yaml
# Application Configuration
app:
title: "Crawl4AI API" # Server title in OpenAPI docs
version: "1.0.0" # API version
host: "0.0.0.0" # Listen on all interfaces
port: 8000 # Server port
reload: True # Enable hot reloading (development only)
timeout_keep_alive: 300 # Keep-alive timeout in seconds
# Rate Limiting Configuration
rate_limiting:
enabled: True # Enable/disable rate limiting
default_limit: "100/minute" # Rate limit format: "number/timeunit"
trusted_proxies: [] # List of trusted proxy IPs
storage_uri: "memory://" # Use "redis://localhost:6379" for production
# Security Configuration
security:
enabled: false # Master toggle for security features
jwt_enabled: true # Enable JWT authentication
https_redirect: True # Force HTTPS
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
headers: # Security headers
x_content_type_options: "nosniff"
x_frame_options: "DENY"
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
memory_threshold_percent: 95.0 # Memory usage threshold
rate_limiter:
base_delay: [1.0, 2.0] # Min and max delay between requests
timeouts:
stream_init: 30.0 # Stream initialization timeout
batch_process: 300.0 # Batch processing timeout
# Logging Configuration
logging:
level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR)
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
prometheus:
enabled: True # Enable Prometheus metrics
endpoint: "/metrics" # Metrics endpoint
health_check:
endpoint: "/health" # Health check endpoint
```
### JWT Authentication
When `security.jwt_enabled` is set to `true` in your config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works:
#### Getting a Token
```http
POST /token
Content-Type: application/json
{
"email": "user@example.com"
}
```
The endpoint returns:
```json
{
"email": "user@example.com",
"access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...",
"token_type": "bearer"
}
```
#### Using the Token
Add the token to your requests:
```bash
curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl
```
Using the Python SDK:
```python
from crawl4ai.docker_client import Crawl4aiDockerClient
async with Crawl4aiDockerClient() as client:
# Authenticate first
await client.authenticate("user@example.com")
# Now all requests will include the token automatically
result = await client.crawl(urls=["https://example.com"])
```
#### Production Considerations 💡
The default implementation uses a simple email verification. For production use, consider:
- Email verification via OTP/magic links
- OAuth2 integration
- Rate limiting token generation
- Token expiration and refresh mechanisms
- IP-based restrictions
### Configuration Tips and Best Practices
1. **Production Settings** 🏭
```yaml
app:
reload: False # Disable reload in production
timeout_keep_alive: 120 # Lower timeout for better resource management
rate_limiting:
storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting
default_limit: "50/minute" # More conservative rate limit
security:
enabled: true # Enable all security features
trusted_hosts: ["your-domain.com"] # Restrict to your domain
```
2. **Development Settings** 🛠️
```yaml
app:
reload: True # Enable hot reloading
timeout_keep_alive: 300 # Longer timeout for debugging
logging:
level: "DEBUG" # More verbose logging
```
3. **High-Traffic Settings** 🚦
```yaml
crawler:
memory_threshold_percent: 85.0 # More conservative memory limit
rate_limiter:
base_delay: [2.0, 4.0] # More aggressive rate limiting
```
### Customizing Your Configuration
#### Method 1: Pre-build Configuration
```bash
# Edit the default config before building
cd crawl4ai/deploy
vim docker/config.yml  # Or use any editor
# Build with custom config
docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest .
```
#### Method 2: Build-time Configuration
Use a custom config during build:
```bash
# Build with custom config
docker build --platform=linux/amd64 --no-cache \
--build-arg CONFIG_PATH=/path/to/custom-config.yml \
-t crawl4ai:latest .
```
#### Method 3: Runtime Configuration
```bash
# Mount custom config at runtime
docker run -d -p 8000:8000 \
-v $(pwd)/custom-config.yml:/app/config.yml \
crawl4ai-server:prod
```
> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to the deploy directory.
> 💡 Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config.
### Configuration Recommendations
1. **Security First** 🔒
- Always enable security in production
- Use specific trusted_hosts instead of wildcards
- Set up proper rate limiting to protect your server
- Consider your environment before enabling HTTPS redirect
2. **Resource Management** 💻
- Adjust memory_threshold_percent based on available RAM
- Set timeouts according to your content size and network conditions
- Use Redis for rate limiting in multi-container setups
3. **Monitoring** 📊
- Enable Prometheus if you need metrics
- Set DEBUG logging in development, INFO in production
- Regular health check monitoring is crucial
4. **Performance Tuning** ⚡
- Start with conservative rate limiter delays
- Increase batch_process timeout for large content
- Adjust stream_init timeout based on initial response times
## Getting Help
We're here to help you succeed with Crawl4AI! Here's how to get support:
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
- ⭐ Star us on GitHub to show support!
## Summary
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
- Building and running the Docker container
- Configuring the environment
- Making API requests with proper typing
- Using the Python SDK
- Monitoring your deployment
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
Happy crawling! 🕷️

deploy/aws/docker/api.py Normal file

@@ -0,0 +1,442 @@
import os
import json
import asyncio
from typing import List, Tuple, Optional, AsyncGenerator
import logging
from urllib.parse import unquote
from fastapi import HTTPException, Request, status
from fastapi.background import BackgroundTasks
from fastapi.responses import JSONResponse
from redis import asyncio as aioredis
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
LLMExtractionStrategy,
CacheMode,
BrowserConfig,
MemoryAdaptiveDispatcher,
RateLimiter
)
from crawl4ai.utils import perform_completion_with_backoff
from crawl4ai.content_filter_strategy import (
PruningContentFilter,
BM25ContentFilter,
LLMContentFilter
)
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from utils import (
TaskStatus,
FilterType,
get_base_url,
is_task_id,
should_cleanup_task,
decode_redis_hash
)
logger = logging.getLogger(__name__)
async def handle_llm_qa(
url: str,
query: str,
config: dict
) -> str:
"""Process QA using LLM with crawled content as context."""
try:
# Extract base URL by finding last '?q=' occurrence
last_q_index = url.rfind('?q=')
if last_q_index != -1:
url = url[:last_q_index]
# Get markdown content
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url)
if not result.success:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=result.error_message
)
content = result.markdown_v2.fit_markdown
# Create prompt and get LLM response
prompt = f"""Use the following content as context to answer the question.
Content:
{content}
Question: {query}
Answer:"""
response = perform_completion_with_backoff(
provider=config["llm"]["provider"],
prompt_with_variables=prompt,
api_token=os.environ.get(config["llm"].get("api_key_env", ""))
)
return response.choices[0].message.content
except Exception as e:
logger.error(f"QA processing error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
async def process_llm_extraction(
redis: aioredis.Redis,
config: dict,
task_id: str,
url: str,
instruction: str,
schema: Optional[str] = None,
cache: str = "0"
) -> None:
"""Process LLM extraction in background."""
try:
# If config['llm'] has api_key then ignore the api_key_env
api_key = ""
if "api_key" in config["llm"]:
api_key = config["llm"]["api_key"]
else:
        api_key = os.environ.get(config["llm"].get("api_key_env", ""), "")
llm_strategy = LLMExtractionStrategy(
provider=config["llm"]["provider"],
api_token=api_key,
instruction=instruction,
schema=json.loads(schema) if schema else None,
)
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url,
config=CrawlerRunConfig(
extraction_strategy=llm_strategy,
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=cache_mode
)
)
if not result.success:
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.FAILED,
"error": result.error_message
})
return
try:
content = json.loads(result.extracted_content)
except json.JSONDecodeError:
content = result.extracted_content
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.COMPLETED,
"result": json.dumps(content)
})
except Exception as e:
logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.FAILED,
"error": str(e)
})
async def handle_markdown_request(
url: str,
filter_type: FilterType,
query: Optional[str] = None,
cache: str = "0",
config: Optional[dict] = None
) -> str:
"""Handle markdown generation requests."""
try:
decoded_url = unquote(url)
if not decoded_url.startswith(('http://', 'https://')):
decoded_url = 'https://' + decoded_url
if filter_type == FilterType.RAW:
md_generator = DefaultMarkdownGenerator()
else:
            # Build only the requested filter; constructing all three eagerly
            # would instantiate the LLM filter (and require `config`) on every call.
            if filter_type == FilterType.FIT:
                content_filter = PruningContentFilter()
            elif filter_type == FilterType.BM25:
                content_filter = BM25ContentFilter(user_query=query or "")
            else:  # FilterType.LLM
                content_filter = LLMContentFilter(
                    provider=config["llm"]["provider"],
                    api_token=os.environ.get(config["llm"].get("api_key_env", ""), ""),
                    instruction=query or "Extract main content"
                )
md_generator = DefaultMarkdownGenerator(content_filter=content_filter)
cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=decoded_url,
config=CrawlerRunConfig(
markdown_generator=md_generator,
scraping_strategy=LXMLWebScrapingStrategy(),
cache_mode=cache_mode
)
)
if not result.success:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=result.error_message
)
return (result.markdown_v2.raw_markdown
if filter_type == FilterType.RAW
else result.markdown_v2.fit_markdown)
except Exception as e:
logger.error(f"Markdown error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
async def handle_llm_request(
redis: aioredis.Redis,
background_tasks: BackgroundTasks,
request: Request,
input_path: str,
query: Optional[str] = None,
schema: Optional[str] = None,
cache: str = "0",
config: Optional[dict] = None
) -> JSONResponse:
"""Handle LLM extraction requests."""
base_url = get_base_url(request)
try:
if is_task_id(input_path):
return await handle_task_status(
redis, input_path, base_url
)
if not query:
return JSONResponse({
"message": "Please provide an instruction",
"_links": {
"example": {
"href": f"{base_url}/llm/{input_path}?q=Extract+main+content",
"title": "Try this example"
}
}
})
return await create_new_task(
redis,
background_tasks,
input_path,
query,
schema,
cache,
base_url,
config
)
except Exception as e:
logger.error(f"LLM endpoint error: {str(e)}", exc_info=True)
return JSONResponse({
"error": str(e),
"_links": {
"retry": {"href": str(request.url)}
}
}, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
async def handle_task_status(
redis: aioredis.Redis,
task_id: str,
base_url: str
) -> JSONResponse:
"""Handle task status check requests."""
task = await redis.hgetall(f"task:{task_id}")
if not task:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail="Task not found"
)
task = decode_redis_hash(task)
response = create_task_response(task, task_id, base_url)
if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
if should_cleanup_task(task["created_at"]):
await redis.delete(f"task:{task_id}")
return JSONResponse(response)
async def create_new_task(
redis: aioredis.Redis,
background_tasks: BackgroundTasks,
input_path: str,
query: str,
schema: Optional[str],
cache: str,
base_url: str,
config: dict
) -> JSONResponse:
"""Create and initialize a new task."""
decoded_url = unquote(input_path)
if not decoded_url.startswith(('http://', 'https://')):
decoded_url = 'https://' + decoded_url
from datetime import datetime
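    # Timestamp plus id() of the BackgroundTasks instance keeps IDs distinct for tasks created in the same second.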
task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
await redis.hset(f"task:{task_id}", mapping={
"status": TaskStatus.PROCESSING,
"created_at": datetime.now().isoformat(),
"url": decoded_url
})
background_tasks.add_task(
process_llm_extraction,
redis,
config,
task_id,
decoded_url,
query,
schema,
cache
)
return JSONResponse({
"task_id": task_id,
"status": TaskStatus.PROCESSING,
"url": decoded_url,
"_links": {
"self": {"href": f"{base_url}/llm/{task_id}"},
"status": {"href": f"{base_url}/llm/{task_id}"}
}
})
def create_task_response(task: dict, task_id: str, base_url: str) -> dict:
"""Create response for task status check."""
response = {
"task_id": task_id,
"status": task["status"],
"created_at": task["created_at"],
"url": task["url"],
"_links": {
"self": {"href": f"{base_url}/llm/{task_id}"},
"refresh": {"href": f"{base_url}/llm/{task_id}"}
}
}
if task["status"] == TaskStatus.COMPLETED:
response["result"] = json.loads(task["result"])
elif task["status"] == TaskStatus.FAILED:
response["error"] = task["error"]
return response
async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
"""Stream results with heartbeats and completion markers."""
import json
from utils import datetime_handler
try:
async for result in results_gen:
try:
result_dict = result.model_dump()
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
data = json.dumps(result_dict, default=datetime_handler) + "\n"
yield data.encode('utf-8')
except Exception as e:
logger.error(f"Serialization error: {e}")
error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')}
yield (json.dumps(error_response) + "\n").encode('utf-8')
yield json.dumps({"status": "completed"}).encode('utf-8')
except asyncio.CancelledError:
logger.warning("Client disconnected during streaming")
finally:
try:
await crawler.close()
except Exception as e:
logger.error(f"Crawler cleanup error: {e}")
async def handle_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
config: dict
) -> dict:
"""Handle non-streaming crawl requests."""
try:
browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
results = await crawler.arun_many(
urls=urls,
config=crawler_config,
dispatcher=dispatcher
)
return {
"success": True,
"results": [result.model_dump() for result in results]
}
except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)
async def handle_stream_crawl_request(
urls: List[str],
browser_config: dict,
crawler_config: dict,
config: dict
) -> Tuple[AsyncWebCrawler, AsyncGenerator]:
"""Handle streaming crawl requests."""
try:
browser_config = BrowserConfig.load(browser_config)
browser_config.verbose = True
crawler_config = CrawlerRunConfig.load(crawler_config)
crawler_config.scraping_strategy = LXMLWebScrapingStrategy()
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
rate_limiter=RateLimiter(
base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
)
)
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
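        # Started without a context manager on purpose: stream_results() closes the crawler in its finally block.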
results_gen = await crawler.arun_many(
urls=urls,
config=crawler_config,
dispatcher=dispatcher
)
return crawler, results_gen
except Exception as e:
if 'crawler' in locals():
await crawler.close()
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e)
)

46
deploy/aws/docker/auth.py Normal file
View File

@@ -0,0 +1,46 @@
import os
from datetime import datetime, timedelta, timezone
from typing import Dict, Optional
from jwt import JWT, jwk_from_dict
from jwt.utils import get_int_from_datetime
from fastapi import Depends, HTTPException
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel, EmailStr
import base64
instance = JWT()
security = HTTPBearer()
SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret")
ACCESS_TOKEN_EXPIRE_MINUTES = 60
def get_jwk_from_secret(secret: str):
"""Convert a secret string into a JWK object."""
secret_bytes = secret.encode('utf-8')
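    # JOSE uses unpadded base64url (RFC 7515), hence the '=' padding is stripped below.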
b64_secret = base64.urlsafe_b64encode(secret_bytes).rstrip(b'=').decode('utf-8')
return jwk_from_dict({"kty": "oct", "k": b64_secret})
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str:
"""Create a JWT access token with an expiration."""
to_encode = data.copy()
expire = datetime.now(timezone.utc) + (expires_delta or timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES))
to_encode.update({"exp": get_int_from_datetime(expire)})
signing_key = get_jwk_from_secret(SECRET_KEY)
return instance.encode(to_encode, signing_key, alg='HS256')
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
"""Verify the JWT token from the Authorization header."""
token = credentials.credentials
verifying_key = get_jwk_from_secret(SECRET_KEY)
try:
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
return payload
except Exception:
raise HTTPException(status_code=401, detail="Invalid or expired token")
def get_token_dependency(config: Dict):
    """Return the token verifier if JWT is enabled, else a no-op dependency."""
    if config.get("security", {}).get("jwt_enabled", False):
        return verify_token
    # Returning None would make Depends() fall back to the parameter's type annotation and fail at request time.
    return lambda: None
class TokenRequest(BaseModel):
email: EmailStr

71
deploy/aws/docker/config.yml Normal file
View File

@@ -0,0 +1,71 @@
# Application Configuration
app:
title: "Crawl4AI API"
version: "1.0.0"
host: "0.0.0.0"
port: 8000
reload: True
timeout_keep_alive: 300
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini"
api_key_env: "OPENAI_API_KEY"
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration
redis:
host: "localhost"
port: 6379
db: 0
password: ""
ssl: False
  ssl_cert_reqs: null
  ssl_ca_certs: null
  ssl_certfile: null
  ssl_keyfile: null
# Rate Limiting Configuration
rate_limiting:
enabled: True
default_limit: "1000/minute"
trusted_proxies: []
storage_uri: "memory://" # Use "redis://localhost:6379" for production
# Security Configuration
security:
enabled: true
jwt_enabled: true
https_redirect: false
trusted_hosts: ["*"]
  headers:
    # Keys are literal HTTP header names; the security middleware copies them onto every response.
    X-Content-Type-Options: "nosniff"
    X-Frame-Options: "DENY"
    Content-Security-Policy: "default-src 'self'"
    Strict-Transport-Security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
memory_threshold_percent: 95.0
rate_limiter:
base_delay: [1.0, 2.0]
timeouts:
stream_init: 30.0 # Timeout for stream initialization
batch_process: 300.0 # Timeout for batch processing
# Logging Configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
prometheus:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"

10
deploy/aws/docker/requirements.txt Normal file
View File

@@ -0,0 +1,10 @@
crawl4ai
fastapi
uvicorn
gunicorn>=23.0.0
slowapi>=0.1.9
prometheus-fastapi-instrumentator>=7.0.2
redis>=5.2.1
jwt>=1.3.1
dnspython>=2.7.0
email-validator>=2.2.0

181
deploy/aws/docker/server.py Normal file
View File

@@ -0,0 +1,181 @@
import os
import sys
import time
from typing import List, Optional, Dict
from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends
from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
from fastapi.middleware.trustedhost import TrustedHostMiddleware
from pydantic import BaseModel, Field
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from prometheus_fastapi_instrumentator import Instrumentator
from redis import asyncio as aioredis
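# Make sibling modules (utils, api, auth) importable regardless of the working directory.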
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
from utils import FilterType, load_config, setup_logging, verify_email_domain
from api import (
handle_markdown_request,
handle_llm_qa,
handle_stream_crawl_request,
handle_crawl_request,
stream_results
)
from auth import create_access_token, get_token_dependency, TokenRequest # Import from auth.py
__version__ = "0.2.6"
class CrawlRequest(BaseModel):
urls: List[str] = Field(min_length=1, max_length=100)
browser_config: Optional[Dict] = Field(default_factory=dict)
crawler_config: Optional[Dict] = Field(default_factory=dict)
# Load configuration and setup
config = load_config()
setup_logging(config)
# Initialize Redis
redis = aioredis.from_url(
    config["redis"].get("uri", f"redis://{config['redis'].get('host', 'localhost')}:{config['redis'].get('port', 6379)}")
)
# Initialize rate limiter
limiter = Limiter(
key_func=get_remote_address,
default_limits=[config["rate_limiting"]["default_limit"]],
storage_uri=config["rate_limiting"]["storage_uri"]
)
app = FastAPI(
    title=config["app"]["title"],
    version=config["app"]["version"]
)
# Standard slowapi setup: attach the limiter to app state and return 429s on rate-limit violations.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
# Configure middleware
def setup_security_middleware(app, config):
sec_config = config.get("security", {})
if sec_config.get("enabled", False):
if sec_config.get("https_redirect", False):
app.add_middleware(HTTPSRedirectMiddleware)
if sec_config.get("trusted_hosts", []) != ["*"]:
app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"])
setup_security_middleware(app, config)
# Prometheus instrumentation
if config["observability"]["prometheus"]["enabled"]:
Instrumentator().instrument(app).expose(app)
# Get token dependency based on config
token_dependency = get_token_dependency(config)
# Middleware for security headers
@app.middleware("http")
async def add_security_headers(request: Request, call_next):
response = await call_next(request)
if config["security"]["enabled"]:
response.headers.update(config["security"]["headers"])
return response
# Token endpoint (always available, but usage depends on config)
@app.post("/token")
async def get_token(request_data: TokenRequest):
if not verify_email_domain(request_data.email):
raise HTTPException(status_code=400, detail="Invalid email domain")
token = create_access_token({"sub": request_data.email})
return {"email": request_data.email, "access_token": token, "token_type": "bearer"}
# Endpoints with conditional auth
@app.get("/md/{url:path}")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def get_markdown(
request: Request,
url: str,
f: FilterType = FilterType.FIT,
q: Optional[str] = None,
c: Optional[str] = "0",
token_data: Optional[Dict] = Depends(token_dependency)
):
result = await handle_markdown_request(url, f, q, c, config)
return PlainTextResponse(result)
@app.get("/llm/{url:path}", description="URL should be without http/https prefix")
async def llm_endpoint(
request: Request,
url: str = Path(...),
q: Optional[str] = Query(None),
token_data: Optional[Dict] = Depends(token_dependency)
):
if not q:
raise HTTPException(status_code=400, detail="Query parameter 'q' is required")
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
try:
answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer})
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@app.get("/schema")
async def get_schema():
from crawl4ai import BrowserConfig, CrawlerRunConfig
return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()}
@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
return {"status": "ok", "timestamp": time.time(), "version": __version__}
@app.get(config["observability"]["prometheus"]["endpoint"])
async def metrics():
return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])
@app.post("/crawl")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl(
request: Request,
crawl_request: CrawlRequest,
token_data: Optional[Dict] = Depends(token_dependency)
):
if not crawl_request.urls:
raise HTTPException(status_code=400, detail="At least one URL required")
results = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config
)
return JSONResponse(results)
@app.post("/crawl/stream")
@limiter.limit(config["rate_limiting"]["default_limit"])
async def crawl_stream(
request: Request,
crawl_request: CrawlRequest,
token_data: Optional[Dict] = Depends(token_dependency)
):
if not crawl_request.urls:
raise HTTPException(status_code=400, detail="At least one URL required")
crawler, results_gen = await handle_stream_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
crawler_config=crawl_request.crawler_config,
config=config
)
return StreamingResponse(
stream_results(crawler, results_gen),
media_type='application/x-ndjson',
headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'}
)
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"server:app",
host=config["app"]["host"],
port=config["app"]["port"],
reload=config["app"]["reload"],
timeout_keep_alive=config["app"]["timeout_keep_alive"]
)

12
deploy/aws/docker/supervisord.conf Normal file
View File

@@ -0,0 +1,12 @@
[supervisord]
nodaemon=true
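; Lower priority values start first, so Redis (10) is up before gunicorn (20).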
[program:redis]
command=redis-server
autorestart=true
priority=10
[program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
autorestart=true
priority=20

66
deploy/aws/docker/utils.py Normal file
View File

@@ -0,0 +1,66 @@
import dns.resolver
import logging
import yaml
from datetime import datetime
from enum import Enum
from pathlib import Path
from fastapi import Request
from typing import Any, Dict, Optional
class TaskStatus(str, Enum):
PROCESSING = "processing"
FAILED = "failed"
COMPLETED = "completed"
class FilterType(str, Enum):
RAW = "raw"
FIT = "fit"
BM25 = "bm25"
LLM = "llm"
def load_config() -> Dict:
"""Load and return application configuration."""
config_path = Path(__file__).parent / "config.yml"
with open(config_path, "r") as config_file:
return yaml.safe_load(config_file)
def setup_logging(config: Dict) -> None:
"""Configure application logging."""
logging.basicConfig(
level=config["logging"]["level"],
format=config["logging"]["format"]
)
def get_base_url(request: Request) -> str:
"""Get base URL including scheme and host."""
return f"{request.url.scheme}://{request.url.netloc}"
def is_task_id(value: str) -> bool:
"""Check if the value matches task ID pattern."""
return value.startswith("llm_") and "_" in value
def datetime_handler(obj: Any) -> Optional[str]:
"""Handle datetime serialization for JSON."""
if hasattr(obj, 'isoformat'):
return obj.isoformat()
raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
def should_cleanup_task(created_at: str) -> bool:
"""Check if task should be cleaned up based on creation time."""
created = datetime.fromisoformat(created_at)
return (datetime.now() - created).total_seconds() > 3600
def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
"""Decode Redis hash data from bytes to strings."""
return {k.decode('utf-8'): v.decode('utf-8') for k, v in hash_data.items()}
def verify_email_domain(email: str) -> bool:
    """Check that the email's domain publishes at least one MX record."""
    try:
        domain = email.split('@')[1]
        records = dns.resolver.resolve(domain, 'MX')
        return bool(records)
    except Exception:
        return False

77
deploy/aws/howto.md Normal file
View File

@@ -0,0 +1,77 @@
# Crawl4AI API Quickstart
This document shows how to generate an API token and use it to call the `/crawl` and `/md` endpoints.
---
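## 1. Generate an API Token
If JWT auth is enabled (`security.jwt_enabled` in `config.yml`), first request a token from the `/token` endpoint. Tokens are only issued for email addresses whose domain resolves valid MX records. A minimal sketch using `requests` (the base URL and email below are placeholders):
```python
import requests

BASE_URL = "https://api.crawl4ai.com"

# The server validates the email domain's MX records before issuing a token.
response = requests.post(f"{BASE_URL}/token", json={"email": "you@example.com"})
response.raise_for_status()
token = response.json()["access_token"]  # send as "Authorization: Bearer <token>"
print(token)
```
---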
## 2. Crawl Example
Send a POST request to `/crawl` with the following JSON payload:
```json
{
"urls": ["https://example.com"],
"browser_config": { "headless": true, "verbose": true },
"crawler_config": { "stream": false, "cache_mode": "enabled" }
}
```
**cURL Command:**
```bash
curl -X POST "https://api.crawl4ai.com/crawl" \
-H "Authorization: Bearer YOUR_API_TOKEN" \
-H "Content-Type: application/json" \
-d '{
"urls": ["https://example.com"],
"browser_config": {"headless": true, "verbose": true},
"crawler_config": {"stream": false, "cache_mode": "enabled"}
}'
```
---
## 3. Markdown Retrieval Example
To retrieve markdown from a given URL (e.g., `https://example.com`), use:
```bash
curl -X GET "https://api.crawl4ai.com/md/example.com" \
-H "Authorization: Bearer YOUR_API_TOKEN"
```
---
## 4. Python Code Example (Using `requests`)
Below is a sample Python script that demonstrates using the `requests` library to call the API endpoints:
```python
import requests
BASE_URL = "https://api.crawl4ai.com"
TOKEN = "YOUR_API_TOKEN" # Replace with your actual token
headers = {
"Authorization": f"Bearer {TOKEN}",
"Content-Type": "application/json"
}
# Crawl endpoint example
crawl_payload = {
"urls": ["https://example.com"],
"browser_config": {"headless": True, "verbose": True},
"crawler_config": {"stream": False, "cache_mode": "enabled"}
}
crawl_response = requests.post(f"{BASE_URL}/crawl", json=crawl_payload, headers=headers)
print("Crawl Response:", crawl_response.json())
# /md endpoint example
md_response = requests.get(f"{BASE_URL}/md/example.com", headers=headers)
print("Markdown Content:", md_response.text)
```
---
Happy crawling!

View File

@@ -0,0 +1,2 @@
FROM nginx:alpine
COPY nginx.conf /etc/nginx/conf.d/default.conf

View File

@@ -0,0 +1,55 @@
server {
listen 80;
server_name api.crawl4ai.com;
# Main logging settings
error_log /var/log/nginx/error.log debug;
access_log /var/log/nginx/access.log combined buffer=512k flush=1m;
# Timeout and buffering settings
proxy_connect_timeout 300;
proxy_send_timeout 300;
proxy_read_timeout 300;
send_timeout 300;
proxy_buffer_size 128k;
proxy_buffers 4 256k;
proxy_busy_buffers_size 256k;
# Health check location
location /health {
proxy_pass http://127.0.0.1:8000/health;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Main proxy for application endpoints
location / {
proxy_pass http://127.0.0.1:8000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
add_header X-Debug-Info $request_uri;
proxy_request_buffering off;
proxy_http_version 1.1;
proxy_set_header Connection "";
proxy_buffering off;
}
    # Debug endpoint: serve the Nginx error log.
    # Exposing logs publicly can leak internal details; restrict access with
    # "allow"/"deny" directives (or remove this location) before production use.
    location /nginx/error {
        alias /var/log/nginx/error.log;
    }
    # Debug endpoint: serve the Nginx access log (same caveat as above).
location /nginx/access {
alias /var/log/nginx/access.log;
}
client_max_body_size 10M;
client_body_buffer_size 128k;
}

1
deploy/aws/version.txt Normal file
View File

@@ -0,0 +1 @@
v0.1.0