diff --git a/deploy/aws/Dockerfile b/deploy/aws/Dockerfile new file mode 100644 index 00000000..3043bd57 --- /dev/null +++ b/deploy/aws/Dockerfile @@ -0,0 +1,137 @@ +FROM python:3.10-slim + +# Set build arguments +ARG APP_HOME=/app +ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git +ARG GITHUB_BRANCH=next +ARG USE_LOCAL=False +ARG CONFIG_PATH="" + +ENV PYTHONFAULTHANDLER=1 \ + PYTHONHASHSEED=random \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 \ + PIP_DEFAULT_TIMEOUT=100 \ + DEBIAN_FRONTEND=noninteractive \ + REDIS_HOST=localhost \ + REDIS_PORT=6379 + +ARG PYTHON_VERSION=3.10 +ARG INSTALL_TYPE=default +ARG ENABLE_GPU=false +ARG TARGETARCH + +LABEL maintainer="unclecode" +LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +LABEL version="1.0" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + wget \ + gnupg \ + git \ + cmake \ + pkg-config \ + python3-dev \ + libjpeg-dev \ + redis-server \ + supervisor \ + && rm -rf /var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libglib2.0-0 \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxcb1 \ + libxkbcommon0 \ + libx11-6 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + echo "🦾 Installing ARM-specific optimizations"; \ + apt-get update && apt-get install -y --no-install-recommends \ 
+ libopenblas-dev \ + && rm -rf /var/lib/apt/lists/*; \ +elif [ "$TARGETARCH" = "amd64" ]; then \ + echo "🖥️ Installing AMD64-specific optimizations"; \ + apt-get update && apt-get install -y --no-install-recommends \ + libomp-dev \ + && rm -rf /var/lib/apt/lists/*; \ +else \ + echo "Skipping platform-specific optimizations (unsupported platform)"; \ +fi + +WORKDIR ${APP_HOME} + +RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai + +COPY docker/supervisord.conf . +COPY docker/requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ + pip install "/tmp/crawl4ai/[all]" && \ + python -m nltk.downloader punkt stopwords && \ + python -m crawl4ai.model_loader ; \ + elif [ "$INSTALL_TYPE" = "torch" ] ; then \ + pip install "/tmp/crawl4ai/[torch]" ; \ + elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ + pip install "/tmp/crawl4ai/[transformer]" && \ + python -m crawl4ai.model_loader ; \ + else \ + pip install "/tmp/crawl4ai" ; \ + fi + +RUN pip install --no-cache-dir --upgrade pip && \ + python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ + python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" + +RUN playwright install --with-deps chromium + +COPY docker/* ${APP_HOME}/ +RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \ + echo "Using custom config from $CONFIG_PATH" && \ + cp $CONFIG_PATH /app/config.yml; \ +fi + +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD bash -c '\ + MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ + if [ $MEM -lt 2048 ]; then \ + echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 
🚀"; \ + exit 1; \ + fi && \ + redis-cli ping > /dev/null && \ + curl -f http://localhost:8000/health || exit 1' + +# EXPOSE 6379 + +CMD ["supervisord", "-c", "supervisord.conf"] + diff --git a/deploy/aws/deploy-config.yml b/deploy/aws/deploy-config.yml new file mode 100755 index 00000000..aa10a50c --- /dev/null +++ b/deploy/aws/deploy-config.yml @@ -0,0 +1,3 @@ +project_name: PROJECT_NAME +domain_name: DOMAIN_NAME +aws_region: AWS_REGION \ No newline at end of file diff --git a/deploy/aws/deploy.py b/deploy/aws/deploy.py new file mode 100755 index 00000000..9ddc7662 --- /dev/null +++ b/deploy/aws/deploy.py @@ -0,0 +1,729 @@ +#!/usr/bin/env python3 +import argparse +import subprocess +import sys +import time +import json +import yaml +import requests +import os + +# Steps for deployment +STEPS = [ + "refresh_aws_auth", + "fetch_or_create_vpc_and_subnets", + "create_ecr_repositories", + "create_iam_role", + "create_security_groups", + "request_acm_certificate", + "build_and_push_docker", + "create_task_definition", + "setup_alb", + "deploy_ecs_service", + "configure_custom_domain", + "test_endpoints" +] + +# Utility function to prompt user for confirmation +def confirm_step(step_name): + while True: + response = input(f"Proceed with {step_name}? 
(yes/no): ").strip().lower() + if response in ["yes", "no"]: + return response == "yes" + print("Please enter 'yes' or 'no'.") + +# Utility function to run AWS CLI or shell commands and handle errors +def run_command(command, error_message, additional_diagnostics=None, cwd="."): + try: + result = subprocess.run(command, capture_output=True, text=True, check=True, cwd=cwd) + return result + except subprocess.CalledProcessError as e: + with open("error_context.md", "w") as f: + f.write(f"{error_message}:\n") + f.write(f"Command: {' '.join(command)}\n") + f.write(f"Exit Code: {e.returncode}\n") + f.write(f"Stdout: {e.stdout}\n") + f.write(f"Stderr: {e.stderr}\n") + if additional_diagnostics: + for diag_cmd in additional_diagnostics: + diag_result = subprocess.run(diag_cmd, capture_output=True, text=True) + f.write(f"\nDiagnostic command: {' '.join(diag_cmd)}\n") + f.write(f"Stdout: {diag_result.stdout}\n") + f.write(f"Stderr: {diag_result.stderr}\n") + raise Exception(f"{error_message}: {e.stderr}") + +# Utility function to load or initialize state +def load_state(project_name): + state_file = f"{project_name}-state.json" + if os.path.exists(state_file): + with open(state_file, "r") as f: + return json.load(f) + return {"last_step": -1} + +# Utility function to save state +def save_state(project_name, state): + state_file = f"{project_name}-state.json" + with open(state_file, "w") as f: + json.dump(state, f, indent=4) + +# DNS Check Function +def check_dns_propagation(domain, alb_dns): + try: + result = subprocess.run(["dig", "+short", domain], capture_output=True, text=True) + if alb_dns in result.stdout: + return True + return False + except Exception as e: + print(f"Failed to check DNS: {e}") + return False + +# Step Functions +def refresh_aws_auth(project_name, state, config): + if state["last_step"] >= 0: + print("Skipping refresh_aws_auth (already completed)") + return + if not confirm_step("Refresh AWS authentication"): + sys.exit("User aborted.") + 
run_command( + ["aws", "sts", "get-caller-identity"], + "Failed to verify AWS credentials" + ) + print("AWS authentication verified.") + state["last_step"] = 0 + save_state(project_name, state) + +def fetch_or_create_vpc_and_subnets(project_name, state, config): + if state["last_step"] >= 1: + print("Skipping fetch_or_create_vpc_and_subnets (already completed)") + return state["vpc_id"], state["public_subnets"] + if not confirm_step("Fetch or Create VPC and Subnets"): + sys.exit("User aborted.") + + # Fetch AWS account ID + result = run_command( + ["aws", "sts", "get-caller-identity"], + "Failed to get AWS account ID" + ) + account_id = json.loads(result.stdout)["Account"] + + # Fetch default VPC + result = run_command( + ["aws", "ec2", "describe-vpcs", "--filters", "Name=isDefault,Values=true", "--region", config["aws_region"]], + "Failed to describe VPCs" + ) + vpcs = json.loads(result.stdout).get("Vpcs", []) + if not vpcs: + result = run_command( + ["aws", "ec2", "create-vpc", "--cidr-block", "10.0.0.0/16", "--region", config["aws_region"]], + "Failed to create VPC" + ) + vpc_id = json.loads(result.stdout)["Vpc"]["VpcId"] + run_command( + ["aws", "ec2", "modify-vpc-attribute", "--vpc-id", vpc_id, "--enable-dns-hostnames", "--region", config["aws_region"]], + "Failed to enable DNS hostnames" + ) + else: + vpc_id = vpcs[0]["VpcId"] + + # Fetch or create subnets + result = run_command( + ["aws", "ec2", "describe-subnets", "--filters", f"Name=vpc-id,Values={vpc_id}", "--region", config["aws_region"]], + "Failed to describe subnets" + ) + subnets = json.loads(result.stdout).get("Subnets", []) + if len(subnets) < 2: + azs = json.loads(run_command( + ["aws", "ec2", "describe-availability-zones", "--region", config["aws_region"]], + "Failed to describe availability zones" + ).stdout)["AvailabilityZones"][:2] + subnet_ids = [] + for i, az in enumerate(azs): + az_name = az["ZoneName"] + result = run_command( + ["aws", "ec2", "create-subnet", "--vpc-id", vpc_id, 
"--cidr-block", f"10.0.{i}.0/24", "--availability-zone", az_name, "--region", config["aws_region"]], + f"Failed to create subnet in {az_name}" + ) + subnet_id = json.loads(result.stdout)["Subnet"]["SubnetId"] + subnet_ids.append(subnet_id) + run_command( + ["aws", "ec2", "modify-subnet-attribute", "--subnet-id", subnet_id, "--map-public-ip-on-launch", "--region", config["aws_region"]], + f"Failed to make subnet {subnet_id} public" + ) + else: + subnet_ids = [s["SubnetId"] for s in subnets[:2]] + + # Ensure internet gateway + result = run_command( + ["aws", "ec2", "describe-internet-gateways", "--filters", f"Name=attachment.vpc-id,Values={vpc_id}", "--region", config["aws_region"]], + "Failed to describe internet gateways" + ) + igws = json.loads(result.stdout).get("InternetGateways", []) + if not igws: + result = run_command( + ["aws", "ec2", "create-internet-gateway", "--region", config["aws_region"]], + "Failed to create internet gateway" + ) + igw_id = json.loads(result.stdout)["InternetGateway"]["InternetGatewayId"] + run_command( + ["aws", "ec2", "attach-internet-gateway", "--vpc-id", vpc_id, "--internet-gateway-id", igw_id, "--region", config["aws_region"]], + "Failed to attach internet gateway" + ) + + state["vpc_id"] = vpc_id + state["public_subnets"] = subnet_ids + state["last_step"] = 1 + save_state(project_name, state) + print(f"VPC ID: {vpc_id}, Subnets: {subnet_ids}") + return vpc_id, subnet_ids + +def create_ecr_repositories(project_name, state, config): + if state["last_step"] >= 2: + print("Skipping create_ecr_repositories (already completed)") + return + if not confirm_step("Create ECR Repositories"): + sys.exit("User aborted.") + + account_id = json.loads(run_command( + ["aws", "sts", "get-caller-identity"], + "Failed to get AWS account ID" + ).stdout)["Account"] + repos = [project_name, f"{project_name}-nginx"] + for repo in repos: + result = subprocess.run( + ["aws", "ecr", "describe-repositories", "--repository-names", repo, "--region", 
config["aws_region"]], + capture_output=True, text=True + ) + if result.returncode != 0: + run_command( + ["aws", "ecr", "create-repository", "--repository-name", repo, "--region", config["aws_region"]], + f"Failed to create ECR repository {repo}" + ) + print(f"ECR repository {repo} is ready.") + state["last_step"] = 2 + save_state(project_name, state) + +def create_iam_role(project_name, state, config): + if state["last_step"] >= 3: + print("Skipping create_iam_role (already completed)") + return + if not confirm_step("Create IAM Role"): + sys.exit("User aborted.") + + account_id = json.loads(run_command( + ["aws", "sts", "get-caller-identity"], + "Failed to get AWS account ID" + ).stdout)["Account"] + role_name = "ecsTaskExecutionRole" + trust_policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "ecs-tasks.amazonaws.com"}, + "Action": "sts:AssumeRole" + } + ] + } + with open("trust_policy.json", "w") as f: + json.dump(trust_policy, f) + + result = subprocess.run( + ["aws", "iam", "get-role", "--role-name", role_name], + capture_output=True, text=True + ) + if result.returncode != 0: + run_command( + ["aws", "iam", "create-role", "--role-name", role_name, "--assume-role-policy-document", "file://trust_policy.json"], + f"Failed to create IAM role {role_name}" + ) + run_command( + ["aws", "iam", "attach-role-policy", "--role-name", role_name, "--policy-arn", "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"], + "Failed to attach ECS task execution policy" + ) + os.remove("trust_policy.json") + state["execution_role_arn"] = f"arn:aws:iam::{account_id}:role/{role_name}" + state["last_step"] = 3 + save_state(project_name, state) + print(f"IAM role {role_name} configured.") + +def create_security_groups(project_name, state, config): + if state["last_step"] >= 4: + print("Skipping create_security_groups (already completed)") + return state["alb_sg_id"], state["ecs_sg_id"] + if not 
confirm_step("Create Security Groups"): + sys.exit("User aborted.") + + vpc_id = state["vpc_id"] + alb_sg_name = f"{project_name}-alb-sg" + result = run_command( + ["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={alb_sg_name}", "--region", config["aws_region"]], + "Failed to describe ALB security group" + ) + if not json.loads(result.stdout).get("SecurityGroups"): + result = run_command( + ["aws", "ec2", "create-security-group", "--group-name", alb_sg_name, "--description", "Security group for ALB", "--vpc-id", vpc_id, "--region", config["aws_region"]], + "Failed to create ALB security group" + ) + alb_sg_id = json.loads(result.stdout)["GroupId"] + run_command( + ["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "80", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]], + "Failed to authorize HTTP ingress" + ) + run_command( + ["aws", "ec2", "authorize-security-group-ingress", "--group-id", alb_sg_id, "--protocol", "tcp", "--port", "443", "--cidr", "0.0.0.0/0", "--region", config["aws_region"]], + "Failed to authorize HTTPS ingress" + ) + else: + alb_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"] + + ecs_sg_name = f"{project_name}-ecs-sg" + result = run_command( + ["aws", "ec2", "describe-security-groups", "--filters", f"Name=vpc-id,Values={vpc_id}", f"Name=group-name,Values={ecs_sg_name}", "--region", config["aws_region"]], + "Failed to describe ECS security group" + ) + if not json.loads(result.stdout).get("SecurityGroups"): + result = run_command( + ["aws", "ec2", "create-security-group", "--group-name", ecs_sg_name, "--description", "Security group for ECS tasks", "--vpc-id", vpc_id, "--region", config["aws_region"]], + "Failed to create ECS security group" + ) + ecs_sg_id = json.loads(result.stdout)["GroupId"] + run_command( + ["aws", "ec2", "authorize-security-group-ingress", "--group-id", ecs_sg_id, "--protocol", 
"tcp", "--port", "80", "--source-group", alb_sg_id, "--region", config["aws_region"]], + "Failed to authorize ECS ingress" + ) + else: + ecs_sg_id = json.loads(result.stdout)["SecurityGroups"][0]["GroupId"] + + state["alb_sg_id"] = alb_sg_id + state["ecs_sg_id"] = ecs_sg_id + state["last_step"] = 4 + save_state(project_name, state) + print("Security groups configured.") + return alb_sg_id, ecs_sg_id + +def request_acm_certificate(project_name, state, config): + if state["last_step"] >= 5: + print("Skipping request_acm_certificate (already completed)") + return state["cert_arn"] + if not confirm_step("Request ACM Certificate"): + sys.exit("User aborted.") + + domain_name = config["domain_name"] + result = run_command( + ["aws", "acm", "describe-certificates", "--certificate-statuses", "ISSUED", "--region", config["aws_region"]], + "Failed to describe certificates" + ) + certificates = json.loads(result.stdout).get("CertificateSummaryList", []) + cert_arn = next((c["CertificateArn"] for c in certificates if c["DomainName"] == domain_name), None) + + if not cert_arn: + result = run_command( + ["aws", "acm", "request-certificate", "--domain-name", domain_name, "--validation-method", "DNS", "--region", config["aws_region"]], + "Failed to request ACM certificate" + ) + cert_arn = json.loads(result.stdout)["CertificateArn"] + + time.sleep(10) + result = run_command( + ["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]], + "Failed to describe certificate" + ) + cert_details = json.loads(result.stdout)["Certificate"] + dns_validations = cert_details.get("DomainValidationOptions", []) + for validation in dns_validations: + if validation["ValidationMethod"] == "DNS" and "ResourceRecord" in validation: + record = validation["ResourceRecord"] + print(f"Please add this DNS record to validate the certificate for {domain_name}:") + print(f"Name: {record['Name']}") + print(f"Type: {record['Type']}") + print(f"Value: 
{record['Value']}") + print("Press Enter after adding the DNS record...") + input() + + while True: + result = run_command( + ["aws", "acm", "describe-certificate", "--certificate-arn", cert_arn, "--region", config["aws_region"]], + "Failed to check certificate status" + ) + status = json.loads(result.stdout)["Certificate"]["Status"] + if status == "ISSUED": + break + elif status in ["FAILED", "REVOKED", "INACTIVE"]: + print("Certificate issuance failed.") + sys.exit(1) + time.sleep(10) + + state["cert_arn"] = cert_arn + state["last_step"] = 5 + save_state(project_name, state) + print(f"Certificate ARN: {cert_arn}") + return cert_arn + +def build_and_push_docker(project_name, state, config): + if state["last_step"] >= 6: + print("Skipping build_and_push_docker (already completed)") + return state["fastapi_image"], state["nginx_image"] + if not confirm_step("Build and Push Docker Images"): + sys.exit("User aborted.") + + with open("./version.txt", "r") as f: + version = f.read().strip() + + account_id = json.loads(run_command( + ["aws", "sts", "get-caller-identity"], + "Failed to get AWS account ID" + ).stdout)["Account"] + region = config["aws_region"] + + login_password = run_command( + ["aws", "ecr", "get-login-password", "--region", region], + "Failed to get ECR login password" + ).stdout.strip() + run_command( + ["docker", "login", "--username", "AWS", "--password", login_password, f"{account_id}.dkr.ecr.{region}.amazonaws.com"], + "Failed to authenticate Docker to ECR" + ) + + fastapi_image = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{project_name}:{version}" + run_command( + ["docker", "build", "-f", "Dockerfile", "-t", fastapi_image, "."], + "Failed to build FastAPI Docker image" + ) + run_command( + ["docker", "push", fastapi_image], + "Failed to push FastAPI image" + ) + + nginx_image = f"{account_id}.dkr.ecr.{region}.amazonaws.com/{project_name}-nginx:{version}" + run_command( + ["docker", "build", "-f", "Dockerfile", "-t", nginx_image, "."], + 
"Failed to build Nginx Docker image", + cwd="./nginx" + ) + run_command( + ["docker", "push", nginx_image], + "Failed to push Nginx image" + ) + + state["fastapi_image"] = fastapi_image + state["nginx_image"] = nginx_image + state["last_step"] = 6 + save_state(project_name, state) + print("Docker images built and pushed.") + return fastapi_image, nginx_image + +def create_task_definition(project_name, state, config): + if state["last_step"] >= 7: + print("Skipping create_task_definition (already completed)") + return state["task_def_arn"] + if not confirm_step("Create Task Definition"): + sys.exit("User aborted.") + + log_group = f"/ecs/{project_name}-logs" + result = run_command( + ["aws", "logs", "describe-log-groups", "--log-group-name-prefix", log_group, "--region", config["aws_region"]], + "Failed to describe log groups" + ) + if not any(lg["logGroupName"] == log_group for lg in json.loads(result.stdout).get("logGroups", [])): + run_command( + ["aws", "logs", "create-log-group", "--log-group-name", log_group, "--region", config["aws_region"]], + f"Failed to create log group {log_group}" + ) + + task_definition = { + "family": f"{project_name}-taskdef", + "networkMode": "awsvpc", + "requiresCompatibilities": ["FARGATE"], + "cpu": "512", + "memory": "2048", + "executionRoleArn": state["execution_role_arn"], + "containerDefinitions": [ + { + "name": "fastapi", + "image": state["fastapi_image"], + "portMappings": [{"containerPort": 8000, "hostPort": 8000, "protocol": "tcp"}], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": log_group, + "awslogs-region": config["aws_region"], + "awslogs-stream-prefix": "fastapi" + } + } + }, + { + "name": "nginx", + "image": state["nginx_image"], + "portMappings": [{"containerPort": 80, "hostPort": 80, "protocol": "tcp"}], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": log_group, + "awslogs-region": config["aws_region"], + "awslogs-stream-prefix": "nginx" + } 
+ } + } + ] + } + + with open("task_def.json", "w") as f: + json.dump(task_definition, f) + result = run_command( + ["aws", "ecs", "register-task-definition", "--cli-input-json", "file://task_def.json", "--region", config["aws_region"]], + "Failed to register task definition" + ) + task_def_arn = json.loads(result.stdout)["taskDefinition"]["taskDefinitionArn"] + os.remove("task_def.json") + + state["task_def_arn"] = task_def_arn + state["last_step"] = 7 + save_state(project_name, state) + print("Task definition created.") + return task_def_arn + +def setup_alb(project_name, state, config): + if state["last_step"] >= 8: + print("Skipping setup_alb (already completed)") + return state["alb_arn"], state["tg_arn"], state["alb_dns"] + if not confirm_step("Set Up ALB"): + sys.exit("User aborted.") + + vpc_id = state["vpc_id"] + public_subnets = state["public_subnets"] + alb_name = f"{project_name}-alb" + + result = subprocess.run( + ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]], + capture_output=True, text=True + ) + if result.returncode != 0: + run_command( + ["aws", "elbv2", "create-load-balancer", "--name", alb_name, "--subnets"] + public_subnets + ["--security-groups", state["alb_sg_id"], "--region", config["aws_region"]], + "Failed to create ALB" + ) + alb_arn = json.loads(run_command( + ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]], + "Failed to describe ALB" + ).stdout)["LoadBalancers"][0]["LoadBalancerArn"] + alb_dns = json.loads(run_command( + ["aws", "elbv2", "describe-load-balancers", "--names", alb_name, "--region", config["aws_region"]], + "Failed to get ALB DNS name" + ).stdout)["LoadBalancers"][0]["DNSName"] + + tg_name = f"{project_name}-tg" + result = subprocess.run( + ["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]], + capture_output=True, text=True + ) + if result.returncode != 0: + run_command( + 
["aws", "elbv2", "create-target-group", "--name", tg_name, "--protocol", "HTTP", "--port", "80", "--vpc-id", vpc_id, "--region", config["aws_region"]], + "Failed to create target group" + ) + tg_arn = json.loads(run_command( + ["aws", "elbv2", "describe-target-groups", "--names", tg_name, "--region", config["aws_region"]], + "Failed to describe target group" + ).stdout)["TargetGroups"][0]["TargetGroupArn"] + + result = run_command( + ["aws", "elbv2", "describe-listeners", "--load-balancer-arn", alb_arn, "--region", config["aws_region"]], + "Failed to describe listeners" + ) + listeners = json.loads(result.stdout).get("Listeners", []) + if not any(l["Port"] == 80 for l in listeners): + run_command( + ["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTP", "--port", "80", "--default-actions", "Type=redirect,RedirectConfig={Protocol=HTTPS,Port=443,StatusCode=HTTP_301}", "--region", config["aws_region"]], + "Failed to create HTTP listener" + ) + if not any(l["Port"] == 443 for l in listeners): + run_command( + ["aws", "elbv2", "create-listener", "--load-balancer-arn", alb_arn, "--protocol", "HTTPS", "--port", "443", "--certificates", f"CertificateArn={state['cert_arn']}", "--default-actions", f"Type=forward,TargetGroupArn={tg_arn}", "--region", config["aws_region"]], + "Failed to create HTTPS listener" + ) + + state["alb_arn"] = alb_arn + state["tg_arn"] = tg_arn + state["alb_dns"] = alb_dns + state["last_step"] = 8 + save_state(project_name, state) + print("ALB configured.") + return alb_arn, tg_arn, alb_dns + +def deploy_ecs_service(project_name, state, config): + if state["last_step"] >= 9: + print("Skipping deploy_ecs_service (already completed)") + return + if not confirm_step("Deploy ECS Service"): + sys.exit("User aborted.") + + cluster_name = f"{project_name}-cluster" + result = run_command( + ["aws", "ecs", "describe-clusters", "--clusters", cluster_name, "--region", config["aws_region"]], + "Failed to describe clusters" + ) + 
if not json.loads(result.stdout).get("clusters"): + run_command( + ["aws", "ecs", "create-cluster", "--cluster-name", cluster_name, "--region", config["aws_region"]], + "Failed to create ECS cluster" + ) + + service_name = f"{project_name}-service" + result = run_command( + ["aws", "ecs", "describe-services", "--cluster", cluster_name, "--services", service_name, "--region", config["aws_region"]], + "Failed to describe services", + additional_diagnostics=[["aws", "ecs", "list-tasks", "--cluster", cluster_name, "--service-name", service_name, "--region", config["aws_region"]]] + ) + services = json.loads(result.stdout).get("services", []) + if not services or services[0]["status"] == "INACTIVE": + run_command( + ["aws", "ecs", "create-service", "--cluster", cluster_name, "--service-name", service_name, "--task-definition", state["task_def_arn"], "--desired-count", "1", "--launch-type", "FARGATE", "--network-configuration", f"awsvpcConfiguration={{subnets={json.dumps(state['public_subnets'])},securityGroups=[{state['ecs_sg_id']}],assignPublicIp=ENABLED}}", "--load-balancers", f"targetGroupArn={state['tg_arn']},containerName=nginx,containerPort=80", "--region", config["aws_region"]], + "Failed to create ECS service" + ) + else: + run_command( + ["aws", "ecs", "update-service", "--cluster", cluster_name, "--service", service_name, "--task-definition", state["task_def_arn"], "--region", config["aws_region"]], + "Failed to update ECS service" + ) + + state["last_step"] = 9 + save_state(project_name, state) + print("ECS service deployed.") + +def configure_custom_domain(project_name, state, config): + if state["last_step"] >= 10: + print("Skipping configure_custom_domain (already completed)") + return + if not confirm_step("Configure Custom Domain"): + sys.exit("User aborted.") + + domain_name = config["domain_name"] + alb_dns = state["alb_dns"] + print(f"Please add a CNAME record for {domain_name} pointing to {alb_dns} in your DNS provider.") + print("Press Enter after 
updating the DNS record...") + input() + + while not check_dns_propagation(domain_name, alb_dns): + print("DNS propagation not complete. Waiting 30 seconds before retrying...") + time.sleep(30) + print("DNS propagation confirmed.") + + state["last_step"] = 10 + save_state(project_name, state) + print("Custom domain configured.") + +def test_endpoints(project_name, state, config): + if state["last_step"] >= 11: + print("Skipping test_endpoints (already completed)") + return + if not confirm_step("Test Endpoints"): + sys.exit("User aborted.") + + domain = config["domain_name"] + time.sleep(30) # Wait for service to stabilize + + response = requests.get(f"https://{domain}/health", verify=False) + if response.status_code != 200: + with open("error_context.md", "w") as f: + f.write("Health endpoint test failed:\n") + f.write(f"Status Code: {response.status_code}\n") + f.write(f"Response: {response.text}\n") + sys.exit(1) + print("Health endpoint test passed.") + + payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True}, + "crawler_config": {"stream": False} + } + response = requests.post(f"https://{domain}/crawl", json=payload, verify=False) + if response.status_code != 200: + with open("error_context.md", "w") as f: + f.write("Crawl endpoint test failed:\n") + f.write(f"Status Code: {response.status_code}\n") + f.write(f"Response: {response.text}\n") + sys.exit(1) + print("Crawl endpoint test passed.") + + state["last_step"] = 11 + save_state(project_name, state) + print("Endpoints tested successfully.") + +# Main Deployment Function +def deploy(project_name, force=False): + config_file = f"{project_name}-config.yml" + if not os.path.exists(config_file): + print(f"Configuration file {config_file} not found. 
Run 'init' first.") + sys.exit(1) + + with open(config_file, "r") as f: + config = yaml.safe_load(f) + + state = load_state(project_name) + if force: + state = {"last_step": -1} + + last_step = state.get("last_step", -1) + + for step_idx, step_name in enumerate(STEPS): + if step_idx <= last_step: + print(f"Skipping {step_name} (already completed)") + continue + print(f"Executing step: {step_name}") + func = globals()[step_name] + if step_name == "fetch_or_create_vpc_and_subnets": + vpc_id, public_subnets = func(project_name, state, config) + elif step_name == "create_security_groups": + alb_sg_id, ecs_sg_id = func(project_name, state, config) + elif step_name == "request_acm_certificate": + cert_arn = func(project_name, state, config) + elif step_name == "build_and_push_docker": + fastapi_image, nginx_image = func(project_name, state, config) + elif step_name == "create_task_definition": + task_def_arn = func(project_name, state, config) + elif step_name == "setup_alb": + alb_arn, tg_arn, alb_dns = func(project_name, state, config) + elif step_name == "deploy_ecs_service": + func(project_name, state, config) + elif step_name == "configure_custom_domain": + func(project_name, state, config) + elif step_name == "test_endpoints": + func(project_name, state, config) + else: + func(project_name, state, config) + +# Init Command +def init(project_name, domain_name, aws_region): + config = { + "project_name": project_name, + "domain_name": domain_name, + "aws_region": aws_region + } + config_file = f"{project_name}-config.yml" + with open(config_file, "w") as f: + yaml.dump(config, f) + print(f"Configuration file {config_file} created.") + +# Argument Parser +parser = argparse.ArgumentParser(description="Crawl4AI Deployment Script") +subparsers = parser.add_subparsers(dest="command") + +# Init Parser +init_parser = subparsers.add_parser("init", help="Initialize configuration") +init_parser.add_argument("--project", required=True, help="Project name") 
+init_parser.add_argument("--domain", required=True, help="Domain name") +init_parser.add_argument("--region", required=True, help="AWS region") + +# Deploy Parser +deploy_parser = subparsers.add_parser("deploy", help="Deploy the project") +deploy_parser.add_argument("--project", required=True, help="Project name") +deploy_parser.add_argument("--force", action="store_true", help="Force redeployment from start") + +args = parser.parse_args() + +if args.command == "init": + init(args.project, args.domain, args.region) +elif args.command == "deploy": + deploy(args.project, args.force) +else: + parser.print_help() \ No newline at end of file diff --git a/deploy/aws/docker/.dockerignore b/deploy/aws/docker/.dockerignore new file mode 100644 index 00000000..6f126444 --- /dev/null +++ b/deploy/aws/docker/.dockerignore @@ -0,0 +1,31 @@ +# .dockerignore +* + +# Allow specific files and directories when using local installation +!crawl4ai/ +!docs/ +!deploy/docker/ +!setup.py +!pyproject.toml +!README.md +!LICENSE +!MANIFEST.in +!setup.cfg +!mkdocs.yml + +.git/ +__pycache__/ +*.pyc +*.pyo +*.pyd +.DS_Store +.env +.venv +venv/ +tests/ +coverage.xml +*.log +*.swp +*.egg-info/ +dist/ +build/ \ No newline at end of file diff --git a/deploy/aws/docker/.llm.env.example b/deploy/aws/docker/.llm.env.example new file mode 100644 index 00000000..5fee4a93 --- /dev/null +++ b/deploy/aws/docker/.llm.env.example @@ -0,0 +1,8 @@ +# LLM Provider Keys +OPENAI_API_KEY=your_openai_key_here +DEEPSEEK_API_KEY=your_deepseek_key_here +ANTHROPIC_API_KEY=your_anthropic_key_here +GROQ_API_KEY=your_groq_key_here +TOGETHER_API_KEY=your_together_key_here +MISTRAL_API_KEY=your_mistral_key_here +GEMINI_API_TOKEN=your_gemini_key_here \ No newline at end of file diff --git a/deploy/aws/docker/README.md b/deploy/aws/docker/README.md new file mode 100644 index 00000000..f62e58c4 --- /dev/null +++ b/deploy/aws/docker/README.md @@ -0,0 +1,847 @@ +# Crawl4AI Docker Guide 🐳 + +## Table of Contents +- 
[Prerequisites](#prerequisites) +- [Installation](#installation) + - [Local Build](#local-build) + - [Docker Hub](#docker-hub) +- [Dockerfile Parameters](#dockerfile-parameters) +- [Using the API](#using-the-api) + - [Understanding Request Schema](#understanding-request-schema) + - [REST API Examples](#rest-api-examples) + - [Python SDK](#python-sdk) +- [Metrics & Monitoring](#metrics--monitoring) +- [Deployment Scenarios](#deployment-scenarios) +- [Complete Examples](#complete-examples) +- [Getting Help](#getting-help) + +## Prerequisites + +Before we dive in, make sure you have: +- Docker installed and running (version 20.10.0 or higher) +- At least 4GB of RAM available for the container +- Python 3.10+ (if using the Python SDK) +- Node.js 16+ (if using the Node.js examples) + +> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources. + +## Installation + +### Local Build + +Let's get your local environment set up step by step! + +#### 1. Building the Image + +First, clone the repository and build the Docker image: + +```bash +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai/deploy + +# Build the Docker image +docker build --platform=linux/amd64 --no-cache -t crawl4ai . + +# Or build for arm64 +docker build --platform=linux/arm64 --no-cache -t crawl4ai . +``` + +#### 2. Environment Setup + +If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file: + +```env +# OpenAI +OPENAI_API_KEY=sk-your-key + +# Anthropic +ANTHROPIC_API_KEY=your-anthropic-key + +# DeepSeek +DEEPSEEK_API_KEY=your-deepseek-key + +# Check out https://docs.litellm.ai/docs/providers for more providers! +``` + +> 🔑 **Note**: Keep your API keys secure! Never commit them to version control. + +#### 3. 
Running the Container + +You have several options for running the container: + +Basic run (no LLM support): +```bash +docker run -d -p 8000:8000 --name crawl4ai crawl4ai +``` + +With LLM support: +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --name crawl4ai \ + crawl4ai +``` + +Using host environment variables (Not a good practice, but works for local testing): +```bash +docker run -d -p 8000:8000 \ + --env-file .llm.env \ + --env "$(env)" \ + --name crawl4ai \ + crawl4ai +``` + +#### Multi-Platform Build +For distributing your image across different architectures, use `buildx`: + +```bash +# Set up buildx builder +docker buildx create --use + +# Build for multiple platforms +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + -t crawl4ai \ + --push \ + . +``` + +> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry. + +#### Development Build +For development, you might want to enable all features: + +```bash +docker build -t crawl4ai + --build-arg INSTALL_TYPE=all \ + --build-arg PYTHON_VERSION=3.10 \ + --build-arg ENABLE_GPU=true \ + . +``` + +#### GPU-Enabled Build +If you plan to use GPU acceleration: + +```bash +docker build -t crawl4ai + --build-arg ENABLE_GPU=true \ + deploy/docker/ +``` + +### Build Arguments Explained + +| Argument | Description | Default | Options | +|----------|-------------|---------|----------| +| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 | +| INSTALL_TYPE | Feature set | default | default, all, torch, transformer | +| ENABLE_GPU | GPU support | false | true, false | +| APP_HOME | Install path | /app | any valid path | + +### Build Best Practices + +1. **Choose the Right Install Type** + - `default`: Basic installation, smallest image, to be honest, I use this most of the time. + - `all`: Full features, larger image (include transformer, and nltk, make sure you really need them) + +2. 
**Platform Considerations** + - Let Docker auto-detect platform unless you need cross-compilation + - Use --platform for specific architecture requirements + - Consider buildx for multi-architecture distribution + +3. **Performance Optimization** + - The image automatically includes platform-specific optimizations + - AMD64 gets OpenMP optimizations + - ARM64 gets OpenBLAS optimizations + +### Docker Hub + +> 🚧 Coming soon! The image will be available at `crawl4ai`. Stay tuned! + +## Using the API + +In the following sections, we discuss two ways to communicate with the Docker server. One option is to use the client SDK that I developed for Python, and I will soon develop one for Node.js. I highly recommend this approach to avoid mistakes. Alternatively, you can take a more technical route by using the JSON structure and passing it to all the URLs, which I will explain in detail. + +### Python SDK + +The SDK makes things easier! Here's how to use it: + +```python +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:8000", verbose=True) as client: + # If JWT is enabled, you can authenticate like this: (more on this later) + # await client.authenticate("test@example.com") + + # Non-streaming crawl + results = await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig() + ) + print(f"Non-streaming results: {results}") + + # Streaming crawl + crawler_config = CrawlerRunConfig(stream=True) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=crawler_config + ): + print(f"Streamed result: {result}") + + # Get schema + schema = await client.get_schema() + print(f"Schema: {schema}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + 
+`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control: + +- `base_url` (str): Base URL of the Crawl4AI Docker server +- `timeout` (float): Default timeout for requests in seconds +- `verify_ssl` (bool): Whether to verify SSL certificates +- `verbose` (bool): Whether to show logging output +- `log_file` (str, optional): Path to log file if file logging is desired + +This client SDK generates a properly structured JSON request for the server's HTTP API. + +## Second Approach: Direct API Calls + +This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works. + +### Understanding Configuration Structure + +Let's dive deep into how configurations work in Crawl4AI. Every configuration object follows a consistent pattern of `type` and `params`. This structure enables complex, nested configurations while maintaining clarity. + +#### The Basic Pattern + +Try this in Python to understand the structure: +```python +from crawl4ai import BrowserConfig + +# Create a config and see its structure +config = BrowserConfig(headless=True) +print(config.dump()) +``` + +This outputs: +```json +{ + "type": "BrowserConfig", + "params": { + "headless": true + } +} +``` + +#### Simple vs Complex Values + +The structure follows these rules: +- Simple values (strings, numbers, booleans, lists) are passed directly +- Complex values (classes, dictionaries) use the type-params pattern + +For example, with dictionaries: +```json +{ + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": true, // Simple boolean - direct value + "viewport": { // Complex dictionary - needs type-params + "type": "dict", + "value": { + "width": 1200, + "height": 800 + } + } + } + } +} +``` + +#### Strategy Pattern and Nesting + +Strategies (like chunking or content filtering) demonstrate why we need this structure. 
Consider this chunking configuration: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "chunking_strategy": { + "type": "RegexChunking", // Strategy implementation + "params": { + "patterns": ["\n\n", "\\.\\s+"] + } + } + } + } +} +``` + +Here, `chunking_strategy` accepts any chunking implementation. The `type` field tells the system which strategy to use, and `params` configures that specific strategy. + +#### Complex Nested Example + +Let's look at a more complex example with content filtering: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} +``` + +This shows how deeply configurations can nest while maintaining a consistent structure. + +#### Quick Grammar Overview +``` +config := { + "type": string, + "params": { + key: simple_value | complex_value + } +} + +simple_value := string | number | boolean | [simple_value] +complex_value := config | dict_value + +dict_value := { + "type": "dict", + "value": object +} +``` + +#### Important Rules 🚨 + +- Always use the type-params pattern for class instances +- Use direct values for primitives (numbers, strings, booleans) +- Wrap dictionaries with {"type": "dict", "value": {...}} +- Arrays/lists are passed directly without type-params +- All parameters are optional unless specifically required + +#### Pro Tip 💡 + +The easiest way to get the correct structure is to: +1. Create configuration objects in Python +2. Use the `dump()` method to see their JSON representation +3. 
Use that JSON in your API calls + +Example: +```python +from crawl4ai import CrawlerRunConfig, PruningContentFilter + +config = CrawlerRunConfig( + content_filter=PruningContentFilter(threshold=0.48) +) +print(config.dump()) # Use this JSON in your API calls +``` + + +#### More Examples + +**Advanced Crawler Configuration** + +```json +{ + "urls": ["https://example.com"], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed", + "min_word_threshold": 0 + } + } + } + } + } + } +} +``` + +**Extraction Strategy**: + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "baseSelector": "article.post", + "fields": [ + {"name": "title", "selector": "h1", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] + } + } + } + } + } +} +``` + +**LLM Extraction Strategy** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "instruction": "Extract article title, author, publication date and main content", + "provider": "openai/gpt-4", + "api_token": "your-api-token", + "schema": { + "type": "dict", + "value": { + "title": "Article Schema", + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "The article's headline" + }, + "author": { + "type": "string", + "description": "The author's name" + }, + "published_date": { + "type": "string", + "format": "date-time", + "description": "Publication date and time" + }, + "content": { + "type": "string", + "description": "The main article content" + } + }, + "required": ["title", "content"] + } + } + } + } + } + } +} +``` + 
+**Deep Crawler Example** + +```json +{ + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": { + "max_depth": 3, + "max_pages": 100, + "filter_chain": { + "type": "FastFilterChain", + "params": { + "filters": [ + { + "type": "FastContentTypeFilter", + "params": { + "allowed_types": ["text/html", "application/xhtml+xml"] + } + }, + { + "type": "FastDomainFilter", + "params": { + "allowed_domains": ["blog.*", "docs.*"], + "blocked_domains": ["ads.*", "analytics.*"] + } + }, + { + "type": "FastURLPatternFilter", + "params": { + "allowed_patterns": ["^/blog/", "^/docs/"], + "blocked_patterns": [".*/ads/", ".*/sponsored/"] + } + } + ] + } + }, + "url_scorer": { + "type": "FastCompositeScorer", + "params": { + "scorers": [ + { + "type": "FastKeywordRelevanceScorer", + "params": { + "keywords": ["tutorial", "guide", "documentation"], + "weight": 1.0 + } + }, + { + "type": "FastPathDepthScorer", + "params": { + "weight": 0.5, + "preferred_depth": 2 + } + }, + { + "type": "FastFreshnessScorer", + "params": { + "weight": 0.8, + "max_age_days": 365 + } + } + ] + } + } + } + } + } + } +} +``` + +### REST API Examples + +Let's look at some practical examples: + +#### Simple Crawl + +```python +import requests + +crawl_payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True}, + "crawler_config": {"stream": False} +} +response = requests.post( + "http://localhost:8000/crawl", + # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled, more on this later + json=crawl_payload +) +print(response.json()) # Print the response for debugging +``` + +#### Streaming Results + +```python +async def test_stream_crawl(session, token: str): + """Test the /crawl/stream endpoint with multiple URLs.""" + url = "http://localhost:8000/crawl/stream" + payload = { + "urls": [ + "https://example.com", + "https://example.com/page1", + "https://example.com/page2", + 
"https://example.com/page3", + ], + "browser_config": {"headless": True, "viewport": {"width": 1200}}, + "crawler_config": {"stream": True, "cache_mode": "aggressive"} + } + + # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later + + try: + async with session.post(url, json=payload, headers=headers) as response: + status = response.status + print(f"Status: {status} (Expected: 200)") + assert status == 200, f"Expected 200, got {status}" + + # Read streaming response line-by-line (NDJSON) + async for line in response.content: + if line: + data = json.loads(line.decode('utf-8').strip()) + print(f"Streamed Result: {json.dumps(data, indent=2)}") + except Exception as e: + print(f"Error in streaming crawl test: {str(e)}") +``` + +## Metrics & Monitoring + +Keep an eye on your crawler with these endpoints: + +- `/health` - Quick health check +- `/metrics` - Detailed Prometheus metrics +- `/schema` - Full API schema + +Example health check: +```bash +curl http://localhost:8000/health +``` + +## Deployment Scenarios + +> 🚧 Coming soon! We'll cover: +> - Kubernetes deployment +> - Cloud provider setups (AWS, GCP, Azure) +> - High-availability configurations +> - Load balancing strategies + +## Complete Examples + +Check out the `examples` folder in our repository for full working examples! Here are two to get you started: +[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py) +[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py) + +## Server Configuration + +The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security. + +### Understanding config.yml + +The configuration file is located at `deploy/docker/config.yml`. 
You can either modify this file before building the image or mount a custom configuration when running the container. + +Here's a detailed breakdown of the configuration options: + +```yaml +# Application Configuration +app: + title: "Crawl4AI API" # Server title in OpenAPI docs + version: "1.0.0" # API version + host: "0.0.0.0" # Listen on all interfaces + port: 8000 # Server port + reload: True # Enable hot reloading (development only) + timeout_keep_alive: 300 # Keep-alive timeout in seconds + +# Rate Limiting Configuration +rate_limiting: + enabled: True # Enable/disable rate limiting + default_limit: "100/minute" # Rate limit format: "number/timeunit" + trusted_proxies: [] # List of trusted proxy IPs + storage_uri: "memory://" # Use "redis://localhost:6379" for production + +# Security Configuration +security: + enabled: false # Master toggle for security features + jwt_enabled: true # Enable JWT authentication + https_redirect: True # Force HTTPS + trusted_hosts: ["*"] # Allowed hosts (use specific domains in production) + headers: # Security headers + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 # Memory usage threshold + rate_limiter: + base_delay: [1.0, 2.0] # Min and max delay between requests + timeouts: + stream_init: 30.0 # Stream initialization timeout + batch_process: 300.0 # Batch processing timeout + +# Logging Configuration +logging: + level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR) + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True # Enable Prometheus metrics + endpoint: "/metrics" # Metrics endpoint + health_check: + endpoint: "/health" # Health check endpoint +``` + +### JWT Authentication + +When `security.jwt_enabled` is set to `true` in your 
config.yml, all endpoints require JWT authentication via bearer tokens. Here's how it works: + +#### Getting a Token +```python +POST /token +Content-Type: application/json + +{ + "email": "user@example.com" +} +``` + +The endpoint returns: +```json +{ + "email": "user@example.com", + "access_token": "eyJ0eXAiOiJKV1QiLCJhbGciOi...", + "token_type": "bearer" +} +``` + +#### Using the Token +Add the token to your requests: +```bash +curl -H "Authorization: Bearer eyJ0eXAiOiJKV1QiLCJhbGci..." http://localhost:8000/crawl +``` + +Using the Python SDK: +```python +from crawl4ai.docker_client import Crawl4aiDockerClient + +async with Crawl4aiDockerClient() as client: + # Authenticate first + await client.authenticate("user@example.com") + + # Now all requests will include the token automatically + result = await client.crawl(urls=["https://example.com"]) +``` + +#### Production Considerations 💡 +The default implementation uses a simple email verification. For production use, consider: +- Email verification via OTP/magic links +- OAuth2 integration +- Rate limiting token generation +- Token expiration and refresh mechanisms +- IP-based restrictions + +### Configuration Tips and Best Practices + +1. **Production Settings** 🏭 + + ```yaml + app: + reload: False # Disable reload in production + timeout_keep_alive: 120 # Lower timeout for better resource management + + rate_limiting: + storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting + default_limit: "50/minute" # More conservative rate limit + + security: + enabled: true # Enable all security features + trusted_hosts: ["your-domain.com"] # Restrict to your domain + ``` + +2. **Development Settings** 🛠️ + + ```yaml + app: + reload: True # Enable hot reloading + timeout_keep_alive: 300 # Longer timeout for debugging + + logging: + level: "DEBUG" # More verbose logging + ``` + +3. 
**High-Traffic Settings** 🚦 + + ```yaml + crawler: + memory_threshold_percent: 85.0 # More conservative memory limit + rate_limiter: + base_delay: [2.0, 4.0] # More aggressive rate limiting + ``` + +### Customizing Your Configuration + +#### Method 1: Pre-build Configuration + +```bash +# Copy and modify config before building +cd crawl4ai/deploy +vim custom-config.yml # Or use any editor + +# Build with custom config +docker build --platform=linux/amd64 --no-cache -t crawl4ai:latest . +``` + +#### Method 2: Build-time Configuration + +Use a custom config during build: + +```bash +# Build with custom config +docker build --platform=linux/amd64 --no-cache \ + --build-arg CONFIG_PATH=/path/to/custom-config.yml \ + -t crawl4ai:latest . +``` + +#### Method 3: Runtime Configuration +```bash +# Mount custom config at runtime +docker run -d -p 8000:8000 \ + -v $(pwd)/custom-config.yml:/app/config.yml \ + crawl4ai-server:prod +``` + +> 💡 Note: When using Method 2, `/path/to/custom-config.yml` is relative to deploy directory. +> 💡 Note: When using Method 3, ensure your custom config file has all required fields as the container will use this instead of the built-in config. + +### Configuration Recommendations + +1. **Security First** 🔒 + - Always enable security in production + - Use specific trusted_hosts instead of wildcards + - Set up proper rate limiting to protect your server + - Consider your environment before enabling HTTPS redirect + +2. **Resource Management** 💻 + - Adjust memory_threshold_percent based on available RAM + - Set timeouts according to your content size and network conditions + - Use Redis for rate limiting in multi-container setups + +3. **Monitoring** 📊 + - Enable Prometheus if you need metrics + - Set DEBUG logging in development, INFO in production + - Regular health check monitoring is crucial + +4. 
**Performance Tuning** ⚡ + - Start with conservative rate limiter delays + - Increase batch_process timeout for large content + - Adjust stream_init timeout based on initial response times + +## Getting Help + +We're here to help you succeed with Crawl4AI! Here's how to get support: + +- 📖 Check our [full documentation](https://docs.crawl4ai.com) +- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues) +- 💬 Join our [Discord community](https://discord.gg/crawl4ai) +- ⭐ Star us on GitHub to show support! + +## Summary + +In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment: +- Building and running the Docker container +- Configuring the environment +- Making API requests with proper typing +- Using the Python SDK +- Monitoring your deployment + +Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs. + +Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀 + +Happy crawling! 
🕷️ \ No newline at end of file diff --git a/deploy/aws/docker/api.py b/deploy/aws/docker/api.py new file mode 100644 index 00000000..df3d5c49 --- /dev/null +++ b/deploy/aws/docker/api.py @@ -0,0 +1,442 @@ +import os +import json +import asyncio +from typing import List, Tuple + +import logging +from typing import Optional, AsyncGenerator +from urllib.parse import unquote +from fastapi import HTTPException, Request, status +from fastapi.background import BackgroundTasks +from fastapi.responses import JSONResponse +from redis import asyncio as aioredis + +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + LLMExtractionStrategy, + CacheMode, + BrowserConfig, + MemoryAdaptiveDispatcher, + RateLimiter +) +from crawl4ai.utils import perform_completion_with_backoff +from crawl4ai.content_filter_strategy import ( + PruningContentFilter, + BM25ContentFilter, + LLMContentFilter +) +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + +from utils import ( + TaskStatus, + FilterType, + get_base_url, + is_task_id, + should_cleanup_task, + decode_redis_hash +) + +logger = logging.getLogger(__name__) + +async def handle_llm_qa( + url: str, + query: str, + config: dict +) -> str: + """Process QA using LLM with crawled content as context.""" + try: + # Extract base URL by finding last '?q=' occurrence + last_q_index = url.rfind('?q=') + if last_q_index != -1: + url = url[:last_q_index] + + # Get markdown content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url) + if not result.success: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=result.error_message + ) + content = result.markdown_v2.fit_markdown + + # Create prompt and get LLM response + prompt = f"""Use the following content as context to answer the question. 
+ Content: + {content} + + Question: {query} + + Answer:""" + + response = perform_completion_with_backoff( + provider=config["llm"]["provider"], + prompt_with_variables=prompt, + api_token=os.environ.get(config["llm"].get("api_key_env", "")) + ) + + return response.choices[0].message.content + except Exception as e: + logger.error(f"QA processing error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) + +async def process_llm_extraction( + redis: aioredis.Redis, + config: dict, + task_id: str, + url: str, + instruction: str, + schema: Optional[str] = None, + cache: str = "0" +) -> None: + """Process LLM extraction in background.""" + try: + # If config['llm'] has api_key then ignore the api_key_env + api_key = "" + if "api_key" in config["llm"]: + api_key = config["llm"]["api_key"] + else: + api_key = os.environ.get(config["llm"].get("api_key_env", None), "") + llm_strategy = LLMExtractionStrategy( + provider=config["llm"]["provider"], + api_token=api_key, + instruction=instruction, + schema=json.loads(schema) if schema else None, + ) + + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, + config=CrawlerRunConfig( + extraction_strategy=llm_strategy, + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=cache_mode + ) + ) + + if not result.success: + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.FAILED, + "error": result.error_message + }) + return + + try: + content = json.loads(result.extracted_content) + except json.JSONDecodeError: + content = result.extracted_content + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.COMPLETED, + "result": json.dumps(content) + }) + + except Exception as e: + logger.error(f"LLM extraction error: {str(e)}", exc_info=True) + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.FAILED, 
+ "error": str(e) + }) + +async def handle_markdown_request( + url: str, + filter_type: FilterType, + query: Optional[str] = None, + cache: str = "0", + config: Optional[dict] = None +) -> str: + """Handle markdown generation requests.""" + try: + decoded_url = unquote(url) + if not decoded_url.startswith(('http://', 'https://')): + decoded_url = 'https://' + decoded_url + + if filter_type == FilterType.RAW: + md_generator = DefaultMarkdownGenerator() + else: + content_filter = { + FilterType.FIT: PruningContentFilter(), + FilterType.BM25: BM25ContentFilter(user_query=query or ""), + FilterType.LLM: LLMContentFilter( + provider=config["llm"]["provider"], + api_token=os.environ.get(config["llm"].get("api_key_env", None), ""), + instruction=query or "Extract main content" + ) + }[filter_type] + md_generator = DefaultMarkdownGenerator(content_filter=content_filter) + + cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.WRITE_ONLY + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=decoded_url, + config=CrawlerRunConfig( + markdown_generator=md_generator, + scraping_strategy=LXMLWebScrapingStrategy(), + cache_mode=cache_mode + ) + ) + + if not result.success: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=result.error_message + ) + + return (result.markdown_v2.raw_markdown + if filter_type == FilterType.RAW + else result.markdown_v2.fit_markdown) + + except Exception as e: + logger.error(f"Markdown error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) + +async def handle_llm_request( + redis: aioredis.Redis, + background_tasks: BackgroundTasks, + request: Request, + input_path: str, + query: Optional[str] = None, + schema: Optional[str] = None, + cache: str = "0", + config: Optional[dict] = None +) -> JSONResponse: + """Handle LLM extraction requests.""" + base_url = get_base_url(request) + + try: + if 
is_task_id(input_path): + return await handle_task_status( + redis, input_path, base_url + ) + + if not query: + return JSONResponse({ + "message": "Please provide an instruction", + "_links": { + "example": { + "href": f"{base_url}/llm/{input_path}?q=Extract+main+content", + "title": "Try this example" + } + } + }) + + return await create_new_task( + redis, + background_tasks, + input_path, + query, + schema, + cache, + base_url, + config + ) + + except Exception as e: + logger.error(f"LLM endpoint error: {str(e)}", exc_info=True) + return JSONResponse({ + "error": str(e), + "_links": { + "retry": {"href": str(request.url)} + } + }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + +async def handle_task_status( + redis: aioredis.Redis, + task_id: str, + base_url: str +) -> JSONResponse: + """Handle task status check requests.""" + task = await redis.hgetall(f"task:{task_id}") + if not task: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Task not found" + ) + + task = decode_redis_hash(task) + response = create_task_response(task, task_id, base_url) + + if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]: + if should_cleanup_task(task["created_at"]): + await redis.delete(f"task:{task_id}") + + return JSONResponse(response) + +async def create_new_task( + redis: aioredis.Redis, + background_tasks: BackgroundTasks, + input_path: str, + query: str, + schema: Optional[str], + cache: str, + base_url: str, + config: dict +) -> JSONResponse: + """Create and initialize a new task.""" + decoded_url = unquote(input_path) + if not decoded_url.startswith(('http://', 'https://')): + decoded_url = 'https://' + decoded_url + + from datetime import datetime + task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}" + + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.PROCESSING, + "created_at": datetime.now().isoformat(), + "url": decoded_url + }) + + background_tasks.add_task( + process_llm_extraction, 
+ redis, + config, + task_id, + decoded_url, + query, + schema, + cache + ) + + return JSONResponse({ + "task_id": task_id, + "status": TaskStatus.PROCESSING, + "url": decoded_url, + "_links": { + "self": {"href": f"{base_url}/llm/{task_id}"}, + "status": {"href": f"{base_url}/llm/{task_id}"} + } + }) + +def create_task_response(task: dict, task_id: str, base_url: str) -> dict: + """Create response for task status check.""" + response = { + "task_id": task_id, + "status": task["status"], + "created_at": task["created_at"], + "url": task["url"], + "_links": { + "self": {"href": f"{base_url}/llm/{task_id}"}, + "refresh": {"href": f"{base_url}/llm/{task_id}"} + } + } + + if task["status"] == TaskStatus.COMPLETED: + response["result"] = json.loads(task["result"]) + elif task["status"] == TaskStatus.FAILED: + response["error"] = task["error"] + + return response + +async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]: + """Stream results with heartbeats and completion markers.""" + import json + from utils import datetime_handler + + try: + async for result in results_gen: + try: + result_dict = result.model_dump() + logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}") + data = json.dumps(result_dict, default=datetime_handler) + "\n" + yield data.encode('utf-8') + except Exception as e: + logger.error(f"Serialization error: {e}") + error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')} + yield (json.dumps(error_response) + "\n").encode('utf-8') + + yield json.dumps({"status": "completed"}).encode('utf-8') + + except asyncio.CancelledError: + logger.warning("Client disconnected during streaming") + finally: + try: + await crawler.close() + except Exception as e: + logger.error(f"Crawler cleanup error: {e}") + +async def handle_crawl_request( + urls: List[str], + browser_config: dict, + crawler_config: dict, + config: dict +) -> dict: + """Handle non-streaming crawl 
requests.""" + try: + browser_config = BrowserConfig.load(browser_config) + crawler_config = CrawlerRunConfig.load(crawler_config) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=config["crawler"]["memory_threshold_percent"], + rate_limiter=RateLimiter( + base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) + ) + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + results = await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher + ) + + return { + "success": True, + "results": [result.model_dump() for result in results] + } + + except Exception as e: + logger.error(f"Crawl error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) + +async def handle_stream_crawl_request( + urls: List[str], + browser_config: dict, + crawler_config: dict, + config: dict +) -> Tuple[AsyncWebCrawler, AsyncGenerator]: + """Handle streaming crawl requests.""" + try: + browser_config = BrowserConfig.load(browser_config) + browser_config.verbose = True + crawler_config = CrawlerRunConfig.load(crawler_config) + crawler_config.scraping_strategy = LXMLWebScrapingStrategy() + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=config["crawler"]["memory_threshold_percent"], + rate_limiter=RateLimiter( + base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) + ) + ) + + crawler = AsyncWebCrawler(config=browser_config) + await crawler.start() + + results_gen = await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher + ) + + return crawler, results_gen + + except Exception as e: + if 'crawler' in locals(): + await crawler.close() + logger.error(f"Stream crawl error: {str(e)}", exc_info=True) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(e) + ) \ No newline at end of file diff --git a/deploy/aws/docker/auth.py b/deploy/aws/docker/auth.py new file 
mode 100644 index 00000000..8851bd36 --- /dev/null +++ b/deploy/aws/docker/auth.py @@ -0,0 +1,46 @@ +import os +from datetime import datetime, timedelta, timezone +from typing import Dict, Optional +from jwt import JWT, jwk_from_dict +from jwt.utils import get_int_from_datetime +from fastapi import Depends, HTTPException +from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials +from pydantic import EmailStr +from pydantic.main import BaseModel +import base64 + +instance = JWT() +security = HTTPBearer() +SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret") +ACCESS_TOKEN_EXPIRE_MINUTES = 60 + +def get_jwk_from_secret(secret: str): + """Convert a secret string into a JWK object.""" + secret_bytes = secret.encode('utf-8') + b64_secret = base64.urlsafe_b64encode(secret_bytes).rstrip(b'=').decode('utf-8') + return jwk_from_dict({"kty": "oct", "k": b64_secret}) + +def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -> str: + """Create a JWT access token with an expiration.""" + to_encode = data.copy() + expire = datetime.now(timezone.utc) + (expires_delta or timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)) + to_encode.update({"exp": get_int_from_datetime(expire)}) + signing_key = get_jwk_from_secret(SECRET_KEY) + return instance.encode(to_encode, signing_key, alg='HS256') + +def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict: + """Verify the JWT token from the Authorization header.""" + token = credentials.credentials + verifying_key = get_jwk_from_secret(SECRET_KEY) + try: + payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256') + return payload + except Exception: + raise HTTPException(status_code=401, detail="Invalid or expired token") + +def get_token_dependency(config: Dict): + """Return the token dependency if JWT is enabled, else None.""" + return verify_token if config.get("security", {}).get("jwt_enabled", False) else None + +class 
TokenRequest(BaseModel): + email: EmailStr \ No newline at end of file diff --git a/deploy/aws/docker/config.yml b/deploy/aws/docker/config.yml new file mode 100644 index 00000000..fc118bf4 --- /dev/null +++ b/deploy/aws/docker/config.yml @@ -0,0 +1,67 @@ +# Application Configuration +app: + title: "Crawl4AI API" + version: "1.0.0" + host: "0.0.0.0" + port: 8000 + reload: True + timeout_keep_alive: 300 + +# Default LLM Configuration +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored + +# Redis Configuration +redis: + host: "localhost" + port: 6379 + db: 0 + password: "" + ssl: False + ssl_cert_reqs: None + ssl_ca_certs: None + ssl_certfile: None + ssl_keyfile: None + +# Rate Limiting Configuration +rate_limiting: + enabled: True + default_limit: "1000/minute" + trusted_proxies: [] + storage_uri: "memory://" # Use "redis://localhost:6379" for production + +# Security Configuration +security: + enabled: true + jwt_enabled: true + https_redirect: false + trusted_hosts: ["*"] + headers: + x_content_type_options: "nosniff" + x_frame_options: "DENY" + content_security_policy: "default-src 'self'" + strict_transport_security: "max-age=63072000; includeSubDomains" + +# Crawler Configuration +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] + timeouts: + stream_init: 30.0 # Timeout for stream initialization + batch_process: 300.0 # Timeout for batch processing + +# Logging Configuration +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + +# Observability Configuration +observability: + prometheus: + enabled: True + endpoint: "/metrics" + health_check: + endpoint: "/health" \ No newline at end of file diff --git a/deploy/aws/docker/requirements.txt b/deploy/aws/docker/requirements.txt new file mode 100644 index
00000000..b7e6d8ad --- /dev/null +++ b/deploy/aws/docker/requirements.txt @@ -0,0 +1,10 @@ +crawl4ai +fastapi +uvicorn +gunicorn>=23.0.0 +slowapi>=0.1.9 +prometheus-fastapi-instrumentator>=7.0.2 +redis>=5.2.1 +jwt>=1.3.1 +dnspython>=2.7.0 +email-validator>=2.2.0 \ No newline at end of file diff --git a/deploy/aws/docker/server.py b/deploy/aws/docker/server.py new file mode 100644 index 00000000..edb55130 --- /dev/null +++ b/deploy/aws/docker/server.py @@ -0,0 +1,181 @@ +import os +import sys +import time +from typing import List, Optional, Dict +from fastapi import FastAPI, HTTPException, Request, Query, Path, Depends +from fastapi.responses import StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse +from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware +from fastapi.middleware.trustedhost import TrustedHostMiddleware +from pydantic import BaseModel, Field +from slowapi import Limiter +from slowapi.util import get_remote_address +from prometheus_fastapi_instrumentator import Instrumentator +from redis import asyncio as aioredis +
+sys.path.append(os.path.dirname(os.path.realpath(__file__))) +from utils import FilterType, load_config, setup_logging, verify_email_domain +from api import ( + handle_markdown_request, + handle_llm_qa, + handle_stream_crawl_request, + handle_crawl_request, + stream_results +) +from auth import create_access_token, get_token_dependency, TokenRequest # Import from auth.py +
+__version__ = "0.2.6" +
+class CrawlRequest(BaseModel): + urls: List[str] = Field(min_length=1, max_length=100) + browser_config: Optional[Dict] = Field(default_factory=dict) + crawler_config: Optional[Dict] = Field(default_factory=dict) +
+# Load configuration and setup +config = load_config() +setup_logging(config) +
+# Initialize Redis from config.yml: honor the host/port keys it defines, with an
+# optional "uri" key taking precedence when present (the old code always fell
+# back to localhost because config.yml has no "uri" key). +redis = aioredis.from_url(config["redis"].get("uri") or f"redis://{config['redis'].get('host', 'localhost')}:{config['redis'].get('port', 6379)}") +
+# Initialize rate limiter +limiter = Limiter( + key_func=get_remote_address, 
default_limits=[config["rate_limiting"]["default_limit"]], + storage_uri=config["rate_limiting"]["storage_uri"] +) + +app = FastAPI( + title=config["app"]["title"], + version=config["app"]["version"] +) + +# Configure middleware +def setup_security_middleware(app, config): + sec_config = config.get("security", {}) + if sec_config.get("enabled", False): + if sec_config.get("https_redirect", False): + app.add_middleware(HTTPSRedirectMiddleware) + if sec_config.get("trusted_hosts", []) != ["*"]: + app.add_middleware(TrustedHostMiddleware, allowed_hosts=sec_config["trusted_hosts"]) + +setup_security_middleware(app, config) + +# Prometheus instrumentation +if config["observability"]["prometheus"]["enabled"]: + Instrumentator().instrument(app).expose(app) + +# Get token dependency based on config +token_dependency = get_token_dependency(config) + +# Middleware for security headers +@app.middleware("http") +async def add_security_headers(request: Request, call_next): + response = await call_next(request) + if config["security"]["enabled"]: + response.headers.update(config["security"]["headers"]) + return response + +# Token endpoint (always available, but usage depends on config) +@app.post("/token") +async def get_token(request_data: TokenRequest): + if not verify_email_domain(request_data.email): + raise HTTPException(status_code=400, detail="Invalid email domain") + token = create_access_token({"sub": request_data.email}) + return {"email": request_data.email, "access_token": token, "token_type": "bearer"} + +# Endpoints with conditional auth +@app.get("/md/{url:path}") +@limiter.limit(config["rate_limiting"]["default_limit"]) +async def get_markdown( + request: Request, + url: str, + f: FilterType = FilterType.FIT, + q: Optional[str] = None, + c: Optional[str] = "0", + token_data: Optional[Dict] = Depends(token_dependency) +): + result = await handle_markdown_request(url, f, q, c, config) + return PlainTextResponse(result) + +@app.get("/llm/{url:path}", 
description="URL should be without http/https prefix") +async def llm_endpoint( + request: Request, + url: str = Path(...), + q: Optional[str] = Query(None), + token_data: Optional[Dict] = Depends(token_dependency) +): + if not q: + raise HTTPException(status_code=400, detail="Query parameter 'q' is required") + if not url.startswith(('http://', 'https://')): + url = 'https://' + url + try: + answer = await handle_llm_qa(url, q, config) + return JSONResponse({"answer": answer}) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) +
+@app.get("/schema") +async def get_schema(): + from crawl4ai import BrowserConfig, CrawlerRunConfig + return {"browser": BrowserConfig().dump(), "crawler": CrawlerRunConfig().dump()} +
+@app.get(config["observability"]["health_check"]["endpoint"]) +async def health(): + return {"status": "ok", "timestamp": time.time(), "version": __version__} +
+# NOTE: the Prometheus metrics route is registered by Instrumentator().expose(app) +# above when enabled; a manual handler here would be shadowed by it (or, when +# Prometheus is disabled, would redirect to its own path in an endless loop). +
+@app.post("/crawl") +@limiter.limit(config["rate_limiting"]["default_limit"]) +async def crawl( + request: Request, + crawl_request: CrawlRequest, + token_data: Optional[Dict] = Depends(token_dependency) +): + if not crawl_request.urls: + raise HTTPException(status_code=400, detail="At least one URL required") + + results = await handle_crawl_request( + urls=crawl_request.urls, + browser_config=crawl_request.browser_config, + crawler_config=crawl_request.crawler_config, + config=config + ) + + return JSONResponse(results) + + +@app.post("/crawl/stream") +@limiter.limit(config["rate_limiting"]["default_limit"]) +async def crawl_stream( + request: Request, + crawl_request: CrawlRequest, + token_data: Optional[Dict] = Depends(token_dependency) +): + if not crawl_request.urls: + raise HTTPException(status_code=400, detail="At least one URL required") + + crawler, results_gen = await handle_stream_crawl_request( 
+ urls=crawl_request.urls, + browser_config=crawl_request.browser_config, + crawler_config=crawl_request.crawler_config, + config=config + ) + + return StreamingResponse( + stream_results(crawler, results_gen), + media_type='application/x-ndjson', + headers={'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'X-Stream-Status': 'active'} + ) + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "server:app", + host=config["app"]["host"], + port=config["app"]["port"], + reload=config["app"]["reload"], + timeout_keep_alive=config["app"]["timeout_keep_alive"] + ) \ No newline at end of file diff --git a/deploy/aws/docker/supervisord.conf b/deploy/aws/docker/supervisord.conf new file mode 100644 index 00000000..1274f2c3 --- /dev/null +++ b/deploy/aws/docker/supervisord.conf @@ -0,0 +1,12 @@ +[supervisord] +nodaemon=true + +[program:redis] +command=redis-server +autorestart=true +priority=10 + +[program:gunicorn] +command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app +autorestart=true +priority=20 \ No newline at end of file diff --git a/deploy/aws/docker/utils.py b/deploy/aws/docker/utils.py new file mode 100644 index 00000000..ff0aa2df --- /dev/null +++ b/deploy/aws/docker/utils.py @@ -0,0 +1,66 @@ +import dns.resolver +import logging +import yaml +from datetime import datetime +from enum import Enum +from pathlib import Path +from fastapi import Request +from typing import Dict, Optional + +class TaskStatus(str, Enum): + PROCESSING = "processing" + FAILED = "failed" + COMPLETED = "completed" + +class FilterType(str, Enum): + RAW = "raw" + FIT = "fit" + BM25 = "bm25" + LLM = "llm" + +def load_config() -> Dict: + """Load and return application configuration.""" + config_path = Path(__file__).parent / "config.yml" + with open(config_path, "r") as config_file: + return 
yaml.safe_load(config_file) + +def setup_logging(config: Dict) -> None: + """Configure application logging.""" + logging.basicConfig( + level=config["logging"]["level"], + format=config["logging"]["format"] + ) + +def get_base_url(request: Request) -> str: + """Get base URL including scheme and host.""" + return f"{request.url.scheme}://{request.url.netloc}" + +def is_task_id(value: str) -> bool: + """Check if the value matches task ID pattern.""" + return value.startswith("llm_") and "_" in value + +def datetime_handler(obj: any) -> Optional[str]: + """Handle datetime serialization for JSON.""" + if hasattr(obj, 'isoformat'): + return obj.isoformat() + raise TypeError(f"Object of type {type(obj)} is not JSON serializable") + +def should_cleanup_task(created_at: str) -> bool: + """Check if task should be cleaned up based on creation time.""" + created = datetime.fromisoformat(created_at) + return (datetime.now() - created).total_seconds() > 3600 + +def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]: + """Decode Redis hash data from bytes to strings.""" + return {k.decode('utf-8'): v.decode('utf-8') for k, v in hash_data.items()} + + + +def verify_email_domain(email: str) -> bool: + try: + domain = email.split('@')[1] + # Try to resolve MX records for the domain. + records = dns.resolver.resolve(domain, 'MX') + return True if records else False + except Exception as e: + return False \ No newline at end of file diff --git a/deploy/aws/howto.md b/deploy/aws/howto.md new file mode 100644 index 00000000..b8912b12 --- /dev/null +++ b/deploy/aws/howto.md @@ -0,0 +1,77 @@ +# Crawl4AI API Quickstart + +This document shows how to generate an API token and use it to call the `/crawl` and `/md` endpoints. + +--- + +## 1. 
Crawl Example + +Send a POST request to `/crawl` with the following JSON payload: + +```json +{ + "urls": ["https://example.com"], + "browser_config": { "headless": true, "verbose": true }, + "crawler_config": { "stream": false, "cache_mode": "enabled" } +} +``` + +**cURL Command:** + +```bash +curl -X POST "https://api.crawl4ai.com/crawl" \ + -H "Authorization: Bearer YOUR_API_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{ + "urls": ["https://example.com"], + "browser_config": {"headless": true, "verbose": true}, + "crawler_config": {"stream": false, "cache_mode": "enabled"} + }' +``` + +--- + +## 2. Markdown Retrieval Example + +To retrieve markdown from a given URL (e.g., `https://example.com`), use: + +```bash +curl -X GET "https://api.crawl4ai.com/md/example.com" \ + -H "Authorization: Bearer YOUR_API_TOKEN" +``` + +--- + +## 3. Python Code Example (Using `requests`) + +Below is a sample Python script that demonstrates using the `requests` library to call the API endpoints: + +```python +import requests + +BASE_URL = "https://api.crawl4ai.com" +TOKEN = "YOUR_API_TOKEN" # Replace with your actual token + +headers = { + "Authorization": f"Bearer {TOKEN}", + "Content-Type": "application/json" +} + +# Crawl endpoint example +crawl_payload = { + "urls": ["https://example.com"], + "browser_config": {"headless": True, "verbose": True}, + "crawler_config": {"stream": False, "cache_mode": "enabled"} +} + +crawl_response = requests.post(f"{BASE_URL}/crawl", json=crawl_payload, headers=headers) +print("Crawl Response:", crawl_response.json()) + +# /md endpoint example +md_response = requests.get(f"{BASE_URL}/md/example.com", headers=headers) +print("Markdown Content:", md_response.text) +``` + +--- + +Happy crawling! 
diff --git a/deploy/aws/nginx/Dockerfile b/deploy/aws/nginx/Dockerfile new file mode 100644 index 00000000..67fb0773 --- /dev/null +++ b/deploy/aws/nginx/Dockerfile @@ -0,0 +1,2 @@ +FROM nginx:alpine +COPY nginx.conf /etc/nginx/conf.d/default.conf diff --git a/deploy/aws/nginx/nginx.conf b/deploy/aws/nginx/nginx.conf new file mode 100644 index 00000000..73f24f17 --- /dev/null +++ b/deploy/aws/nginx/nginx.conf @@ -0,0 +1,55 @@ +server { + listen 80; + server_name api.crawl4ai.com; + + # Main logging settings + error_log /var/log/nginx/error.log debug; + access_log /var/log/nginx/access.log combined buffer=512k flush=1m; + + # Timeout and buffering settings + proxy_connect_timeout 300; + proxy_send_timeout 300; + proxy_read_timeout 300; + send_timeout 300; + proxy_buffer_size 128k; + proxy_buffers 4 256k; + proxy_busy_buffers_size 256k; + + # Health check location + location /health { + proxy_pass http://127.0.0.1:8000/health; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Main proxy for application endpoints + location / { + proxy_pass http://127.0.0.1:8000; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + add_header X-Debug-Info $request_uri; + proxy_request_buffering off; + proxy_http_version 1.1; + proxy_set_header Connection ""; + proxy_buffering off; + } + + # New endpoint: serve Nginx error log + location /nginx/error { + # Using "alias" to serve the error log file + alias /var/log/nginx/error.log; + # Optionally, you might restrict access with "allow" and "deny" directives. 
+ } + + # New endpoint: serve Nginx access log + location /nginx/access { + alias /var/log/nginx/access.log; + } + + client_max_body_size 10M; + client_body_buffer_size 128k; +} diff --git a/deploy/aws/version.txt b/deploy/aws/version.txt new file mode 100644 index 00000000..9ff151c5 --- /dev/null +++ b/deploy/aws/version.txt @@ -0,0 +1 @@ +v0.1.0 \ No newline at end of file diff --git a/deploy/gcloud-function/Dockerfile b/deploy/gcloud-function/Dockerfile new file mode 100644 index 00000000..36e28df8 --- /dev/null +++ b/deploy/gcloud-function/Dockerfile @@ -0,0 +1,63 @@ +FROM --platform=linux/amd64 python:3.10-slim + +# Install system dependencies required for Chromium and Git +RUN apt-get update && apt-get install -y \ + python3-dev \ + pkg-config \ + libjpeg-dev \ + gcc \ + build-essential \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libxkbcommon0 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libasound2 \ + libpango-1.0-0 \ + libcairo2 \ + procps \ + git \ + socat \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +# Make a directory for crawl4ai call it crawl4ai_repo +# RUN mkdir crawl4ai_repo + +# # Clone Crawl4ai from the next branch and install it +# RUN git clone --branch next https://github.com/unclecode/crawl4ai.git ./crawl4ai_repo \ +# && cd crawl4ai_repo \ +# && pip install . \ +# && cd .. \ +# && rm -rf crawl4ai_repo + +RUN python3 -m venv /app/venv +ENV PATH="/app/venv/bin:$PATH" +# RUN pip install git+https://github.com/unclecode/crawl4ai.git@next + +# Copy requirements and install remaining dependencies +COPY requirements.txt . +RUN pip install -r requirements.txt + +# Copy application files +COPY resources /app/resources +COPY main.py . +COPY start.sh . 
+ +# Set permissions for Chrome binary and start script +RUN chmod +x /app/resources/chrome/headless_shell && \ + chmod -R 755 /app/resources/chrome && \ + chmod +x start.sh + +ENV FUNCTION_TARGET=crawl +EXPOSE 8080 9223 + +CMD ["/app/start.sh"] \ No newline at end of file diff --git a/deploy/gcloud-function/config.yml b/deploy/gcloud-function/config.yml new file mode 100644 index 00000000..3c49d386 --- /dev/null +++ b/deploy/gcloud-function/config.yml @@ -0,0 +1,8 @@ +project_id: PROJECT_ID +region: REGION_NAME +artifact_repo: ARTIFACT_REPO_NAME +function_name: FUNCTION_NAME +memory: "2048MB" +timeout: "540s" +local_image: "gcr.io/ARTIFACT_REPO_NAME/crawl4ai:latest" +test_query_url: "https://example.com" diff --git a/deploy/gcloud-function/deploy.py b/deploy/gcloud-function/deploy.py new file mode 100644 index 00000000..4cd6e5b0 --- /dev/null +++ b/deploy/gcloud-function/deploy.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 +import argparse +import subprocess +import sys +import yaml +import requests + +def run_command(cmd, explanation, require_confirm=True, allow_already_exists=False): + print("\n=== {} ===".format(explanation)) + if require_confirm: + input("Press Enter to run: [{}]\n".format(cmd)) + print("Running: {}".format(cmd)) + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + if result.returncode != 0: + if allow_already_exists and "ALREADY_EXISTS" in result.stderr: + print("Repository already exists, skipping creation.") + return "" + print("Error:\n{}".format(result.stderr)) + sys.exit(1) + out = result.stdout.strip() + if out: + print("Output:\n{}".format(out)) + return out + +def load_config(): + try: + with open("config.yml", "r") as f: + config = yaml.safe_load(f) + except Exception as e: + print("Failed to load config.yml: {}".format(e)) + sys.exit(1) + required = ["project_id", "region", "artifact_repo", "function_name", "local_image"] + for key in required: + if key not in config or not config[key]: + print("Missing 
required config parameter: {}".format(key)) + sys.exit(1) + return config + +def deploy_function(config): + project_id = config["project_id"] + region = config["region"] + artifact_repo = config["artifact_repo"] + function_name = config["function_name"] + memory = config.get("memory", "2048MB") + timeout = config.get("timeout", "540s") + local_image = config["local_image"] + test_query_url = config.get("test_query_url", "https://example.com") + + # Repository image format: "-docker.pkg.dev///:latest" + repo_image = f"{region}-docker.pkg.dev/{project_id}/{artifact_repo}/{function_name}:latest" + + # 1. Create Artifact Registry repository (skip if exists) + cmd = f"gcloud artifacts repositories create {artifact_repo} --repository-format=docker --location={region} --project={project_id}" + run_command(cmd, "Creating Artifact Registry repository (if it doesn't exist)", allow_already_exists=True) + + # 2. Tag the local Docker image with the repository image name + cmd = f"docker tag {local_image} {repo_image}" + run_command(cmd, "Tagging Docker image for Artifact Registry") + + # 3. Authenticate Docker to Artifact Registry + cmd = f"gcloud auth configure-docker {region}-docker.pkg.dev" + run_command(cmd, "Authenticating Docker to Artifact Registry") + + # 4. Push the tagged Docker image to Artifact Registry + cmd = f"docker push {repo_image}" + run_command(cmd, "Pushing Docker image to Artifact Registry") + + # 5. Deploy the Cloud Function using the custom container + cmd = ( + f"gcloud beta functions deploy {function_name} " + f"--gen2 " + f"--runtime=python310 " + f"--entry-point=crawl " + f"--region={region} " + f"--docker-repository={region}-docker.pkg.dev/{project_id}/{artifact_repo} " + f"--trigger-http " + f"--memory={memory} " + f"--timeout={timeout} " + f"--project={project_id}" + ) + run_command(cmd, "Deploying Cloud Function using custom container") + + # 6. 
Set the Cloud Function to allow public (unauthenticated) invocations + cmd = ( + f"gcloud functions add-iam-policy-binding {function_name} " + f"--region={region} " + f"--member='allUsers' " + f"--role='roles/cloudfunctions.invoker' " + f"--project={project_id} " + f"--quiet" + ) + run_command(cmd, "Setting Cloud Function IAM to allow public invocations") + + # 7. Retrieve the deployed Cloud Function URL + cmd = ( + f"gcloud functions describe {function_name} " + f"--region={region} " + f"--project={project_id} " + f"--format='value(serviceConfig.uri)'" + ) + deployed_url = run_command(cmd, "Extracting deployed Cloud Function URL", require_confirm=False) + print("\nDeployed URL: {}\n".format(deployed_url)) + + # 8. Test the deployed function + test_url = f"{deployed_url}?url={test_query_url}" + print("Testing function with: {}".format(test_url)) + try: + response = requests.get(test_url) + print("Response status: {}".format(response.status_code)) + print("Response body:\n{}".format(response.text)) + if response.status_code == 200: + print("Test successful!") + else: + print("Non-200 response; check function logs.") + except Exception as e: + print("Test request error: {}".format(e)) + sys.exit(1) + + # 9.
Final usage help + print("\nDeployment complete!") + print("Invoke your function with:") + print(f"curl '{deployed_url}?url={test_query_url}'") + print("For further instructions, refer to your documentation.") + +def delete_function(config): + project_id = config["project_id"] + region = config["region"] + function_name = config["function_name"] + cmd = f"gcloud functions delete {function_name} --region={region} --project={project_id} --quiet" + run_command(cmd, "Deleting Cloud Function") + +def describe_function(config): + project_id = config["project_id"] + region = config["region"] + function_name = config["function_name"] + cmd = ( + f"gcloud functions describe {function_name} " + f"--region={region} " + f"--project={project_id} " + f"--format='value(serviceConfig.uri)'" + ) + deployed_url = run_command(cmd, "Describing Cloud Function to extract URL", require_confirm=False) + print("\nCloud Function URL: {}\n".format(deployed_url)) + +def clear_all(config): + print("\n=== CLEAR ALL RESOURCES ===") + project_id = config["project_id"] + region = config["region"] + artifact_repo = config["artifact_repo"] + + confirm = input("WARNING: This will DELETE the Cloud Function and the Artifact Registry repository. Are you sure? 
(y/N): ") + if confirm.lower() != "y": + print("Aborting clear operation.") + sys.exit(0) + + # Delete the Cloud Function + delete_function(config) + # Delete the Artifact Registry repository + cmd = f"gcloud artifacts repositories delete {artifact_repo} --location={region} --project={project_id} --quiet" + run_command(cmd, "Deleting Artifact Registry repository", require_confirm=False) + print("All resources cleared.") + +def main(): + parser = argparse.ArgumentParser(description="Deploy, delete, describe, or clear Cloud Function resources using config.yml") + subparsers = parser.add_subparsers(dest="command", required=True) + + subparsers.add_parser("deploy", help="Deploy the Cloud Function") + subparsers.add_parser("delete", help="Delete the deployed Cloud Function") + subparsers.add_parser("describe", help="Describe the Cloud Function and return its URL") + subparsers.add_parser("clear", help="Delete the Cloud Function and Artifact Registry repository") + + args = parser.parse_args() + config = load_config() + + if args.command == "deploy": + deploy_function(config) + elif args.command == "delete": + delete_function(config) + elif args.command == "describe": + describe_function(config) + elif args.command == "clear": + clear_all(config) + else: + parser.print_help() + +if __name__ == "__main__": + main() diff --git a/deploy/gcloud-function/guide.md b/deploy/gcloud-function/guide.md new file mode 100644 index 00000000..e72ee567 --- /dev/null +++ b/deploy/gcloud-function/guide.md @@ -0,0 +1,204 @@ +# Deploying Crawl4ai on Google Cloud Functions + +This guide explains how to deploy **Crawl4ai**—an open‑source web crawler library—on Google Cloud Functions Gen2 using a custom container. We assume your project folder already includes: + +- **Dockerfile:** Builds your container image (which installs Crawl4ai from its Git repository). +- **start.sh:** Activates your virtual environment and starts the function (using the Functions Framework). 
+- **main.py:** Contains your function logic with the entry point `crawl` (and imports Crawl4ai). + +The guide is divided into two parts: +1. Manual deployment steps (using CLI commands) +2. Automated deployment using a Python script (`deploy.py`) + +--- + +## Part 1: Manual Deployment Process + +### Prerequisites + +- **Google Cloud Project:** Ensure your project is active and billing is enabled. +- **Google Cloud CLI & Docker:** Installed and configured on your local machine. +- **Permissions:** You must have rights to create Cloud Functions and Artifact Registry repositories. +- **Files:** Your Dockerfile, start.sh, and main.py should be in the same directory. + +### Step 1: Build Your Docker Image + +Your Dockerfile packages Crawl4ai along with all its dependencies. Build your image with: + +```bash +docker build -t gcr.io//:latest . +``` + +Replace `` with your Google Cloud project ID and `` with your chosen function name (for example, `crawl4ai-t1`). + +### Step 2: Create an Artifact Registry Repository + +Cloud Functions Gen2 requires your custom container image to reside in an Artifact Registry repository. Create one by running: + +```bash +gcloud artifacts repositories create \ + --repository-format=docker \ + --location= \ + --project= +``` + +Replace `` (for example, `crawl4ai`) and `` (for example, `asia-east1`). +> **Note:** If you receive an `ALREADY_EXISTS` error, the repository is already created; simply proceed to the next step. + +### Step 3: Tag Your Docker Image + +Tag your locally built Docker image so it matches the Artifact Registry format: + +```bash +docker tag gcr.io//:latest -docker.pkg.dev///:latest +``` + +This step “renames” the image so you can push it to your repository. 
+ +### Step 4: Authenticate Docker to Artifact Registry + +Configure Docker authentication to the Artifact Registry: + +```bash +gcloud auth configure-docker -docker.pkg.dev +``` + +This ensures Docker can securely push images to your registry using your Cloud credentials. + +### Step 5: Push the Docker Image + +Push the tagged image to Artifact Registry: + +```bash +docker push -docker.pkg.dev///:latest +``` + +Once complete, your container image (with Crawl4ai installed) is hosted in Artifact Registry. + +### Step 6: Deploy the Cloud Function + +Deploy your function using the custom container image. Run: + +```bash +gcloud beta functions deploy \ + --gen2 \ + --region= \ + --docker-repository=-docker.pkg.dev// \ + --trigger-http \ + --memory=2048MB \ + --timeout=540s \ + --project= +``` + +This command tells Cloud Functions Gen2 to pull your container image from Artifact Registry and deploy it. Make sure your main.py defines the `crawl` entry point. + +### Step 7: Make the Function Public + +To allow external (unauthenticated) access, update the function’s IAM policy: + +```bash +gcloud functions add-iam-policy-binding \ + --region= \ + --member="allUsers" \ + --role="roles/cloudfunctions.invoker" \ + --project= \ + --quiet +``` + +Using the `--quiet` flag ensures the command runs non‑interactively so the policy is applied immediately. + +### Step 8: Retrieve and Test Your Function URL + +Get the URL for your deployed function: + +```bash +gcloud functions describe \ + --region= \ + --project= \ + --format='value(serviceConfig.uri)' +``` + +Test your deployment with a sample GET request (using curl or your browser): + +```bash +curl "?url=https://example.com" +``` + +Replace `` with the output URL from the previous command. A successful test (HTTP status 200) means Crawl4ai is running on Cloud Functions. + +--- + +## Part 2: Automated Deployment with deploy.py + +For a more streamlined process, use the provided `deploy.py` script. 
This Python script automates the manual steps, prompting you to confirm key actions and providing detailed logs throughout the process. + +### What deploy.py Does: + +- **Reads Parameters:** It loads a `config.yml` file containing all necessary parameters such as `project_id`, `region`, `artifact_repo`, `function_name`, `local_image`, etc. +- **Creates/Skips Repository:** It creates the Artifact Registry repository (or skips if it already exists). +- **Tags & Pushes:** It tags your local Docker image and pushes it to the Artifact Registry. +- **Deploys the Function:** It deploys the Cloud Function with your custom container. +- **Updates IAM:** It sets the IAM policy to allow public access (using the `--quiet` flag). +- **Tests the Deployment:** It extracts the deployed URL and performs a test request. +- **Additional Commands:** You can also use subcommands in the script to delete or describe the deployed function, or even clear all resources. + +### Example config.yml + +Create a `config.yml` file in the same folder as your Dockerfile. An example configuration: + +```yaml +project_id: your-project-id +region: asia-east1 +artifact_repo: crawl4ai +function_name: crawl4ai-t1 +memory: "2048MB" +timeout: "540s" +local_image: "gcr.io/your-project-id/crawl4ai-t1:latest" +test_query_url: "https://example.com" +``` + +### How to Use deploy.py + +- **Deploy the Function:** + + ```bash + python deploy.py deploy + ``` + + The script will guide you through each step, display the output, and ask for confirmation before executing critical commands. 
+ +- **Describe the Function:** + + If you forget the function URL and want to retrieve it later: + + ```bash + python deploy.py describe + ``` + +- **Delete the Function:** + + To remove just the Cloud Function: + + ```bash + python deploy.py delete + ``` + +- **Clear All Resources:** + + To delete both the Cloud Function and the Artifact Registry repository: + + ```bash + python deploy.py clear + ``` + +--- + +## Conclusion + +This guide has walked you through two deployment methods for Crawl4ai on Google Cloud Functions Gen2: + +1. **Manual Deployment:** Building your Docker image, pushing it to Artifact Registry, deploying the Cloud Function, and setting up IAM. +2. **Automated Deployment:** Using `deploy.py` with a configuration file to handle the entire process interactively. + +By following these instructions, you can deploy, test, and manage your Crawl4ai-based Cloud Function with ease. Enjoy using Crawl4ai in your cloud environment! + diff --git a/deploy/gcloud-function/main.py b/deploy/gcloud-function/main.py new file mode 100644 index 00000000..7660fcb9 --- /dev/null +++ b/deploy/gcloud-function/main.py @@ -0,0 +1,158 @@ +# Cleanup Chrome process on module unload +import atexit +import asyncio +import logging +import functions_framework +from flask import jsonify, Request +import os +import sys +import time +import subprocess +import signal +import requests + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +logger.info(f"Python version: {sys.version}") +logger.info(f"Python path: {sys.path}") + +# Try to find where crawl4ai is coming from +try: + import crawl4ai + logger.info(f"Crawl4AI module location: {crawl4ai.__file__}") + logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}") +except ImportError: + logger.error("Crawl4AI module not found") + +# Now attempt the import +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult + +# Configure logging 
# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Paths and constants
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
# Headless Chromium shell bundled inside the function package.
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
# Port the Chrome DevTools Protocol (CDP) endpoint listens on.
CDP_PORT = 9222

def start_chrome():
    """Start the bundled headless Chrome and wait for its CDP endpoint.

    Launches ``CHROME_BINARY`` with remote debugging enabled, then polls
    ``http://127.0.0.1:{CDP_PORT}/json/version`` with exponential backoff
    until the DevTools endpoint answers.

    Returns:
        subprocess.Popen: the running Chrome process.

    Raises:
        Exception: if Chrome exits prematurely or the CDP endpoint never
            becomes reachable within the retry budget.
    """
    logger.debug("Starting Chrome process...")
    chrome_args = [
        CHROME_BINARY,
        f"--remote-debugging-port={CDP_PORT}",
        "--remote-debugging-address=0.0.0.0",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--headless=new",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-zygote",
        "--single-process",
        "--disable-features=site-per-process",
        "--no-first-run",
        "--disable-extensions"
    ]

    # New session/process group so shutdown code can kill Chrome and any
    # children with a single os.killpg().
    process = subprocess.Popen(
        chrome_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        preexec_fn=os.setsid
    )

    logger.debug(f"Chrome process started with PID: {process.pid}")

    # Wait for CDP endpoint with exponential backoff
    wait_time = 1        # Start with 1 second
    max_wait_time = 16   # Cap at 16 seconds per retry
    max_attempts = 10    # Total attempts
    for attempt in range(max_attempts):
        # If Chrome has already exited there is no point in polling further.
        if process.poll() is not None:
            break
        try:
            response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
            if response.status_code == 200:
                # Get ws URL from response
                ws_url = response.json()['webSocketDebuggerUrl']
                logger.debug("Chrome CDP is ready")
                logger.debug(f"CDP URL: {ws_url}")
                return process
        except requests.exceptions.RequestException:
            # RequestException covers both ConnectionError (endpoint not up
            # yet) AND Timeout -- previously only ConnectionError was caught,
            # so a slow read would escape the loop and crash module import.
            pass
        logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
        time.sleep(wait_time)
        wait_time = min(wait_time * 2, max_wait_time)  # Double wait time, up to max

    # All retries failed (or Chrome died). Kill the process group BEFORE
    # communicate(): on a still-running Chrome, communicate() would block
    # forever waiting for the stdout/stderr pipes to close.
    if process.poll() is None:
        try:
            os.killpg(os.getpgid(process.pid), signal.SIGKILL)
        except OSError:
            pass
    stdout, stderr = process.communicate()  # Get output for debugging
    logger.error(f"Chrome stdout: {stdout.decode()}")
    logger.error(f"Chrome stderr: {stderr.decode()}")
    raise Exception("Chrome CDP endpoint failed to start after retries")
async def fetch_with_crawl4ai(url: str) -> dict:
    """Fetch page content using Crawl4ai and return the result object.

    Connects Crawl4ai to the already-running headless Chrome via its CDP
    websocket URL, crawls ``url`` with caching bypassed, and returns the
    crawl result serialized to a plain dict.

    Args:
        url: The page URL to crawl.

    Returns:
        dict: ``CrawlResult`` fields as a JSON-serializable dict.
    """
    # Get CDP URL from the running Chrome instance
    version_response = requests.get(f'http://localhost:{CDP_PORT}/json/version')
    cdp_url = version_response.json()['webSocketDebuggerUrl']

    # Configure and run Crawl4ai against the externally managed browser
    # (Chrome was started at module load; Crawl4ai must not spawn its own).
    browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,  # always fetch fresh content
        )
        result : CrawlResult = await crawler.arun(
            url=url, config=crawler_config
        )
        return result.model_dump()  # Convert Pydantic model to dict for JSON response

# Start Chrome when the module loads so every request reuses one browser
logger.info("Starting Chrome process on module load")
chrome_process = start_chrome()

@functions_framework.http
def crawl(request: Request):
    """HTTP Cloud Function to fetch web content using Crawl4ai.

    Query parameters:
        url: required -- the page to crawl.

    Returns:
        Flask JSON response: 200 with the crawl result, 400 when the
        ``url`` parameter is missing, 500 on any other failure.
    """
    try:
        url = request.args.get('url')
        if not url:
            return jsonify({'error': 'URL parameter is required', 'status': 400}), 400

        # Create and run an asyncio event loop (Cloud Functions workers are
        # synchronous, so each request drives its own fresh loop).
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            # NOTE(review): 10s is far below the function's deployed timeout;
            # slow pages surface here as asyncio.TimeoutError -> 500.
            result = loop.run_until_complete(
                asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
            )
            return jsonify({
                'status': 200,
                'data': result
            })
        finally:
            loop.close()

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return jsonify({
            'error': error_msg,
            'status': 500,
            'details': {
                'error_type': type(e).__name__,
                'stack_trace': str(e),
                # Report whether the shared Chrome process is still alive
                'chrome_running': chrome_process.poll() is None if chrome_process else False
            }
        }), 500


@atexit.register
def cleanup():
    """Cleanup Chrome process on shutdown."""
    # SIGTERM the whole process group created via os.setsid in start_chrome
    if chrome_process and chrome_process.poll() is None:
        try:
            os.killpg(os.getpgid(chrome_process.pid), signal.SIGTERM)
            logger.info("Chrome process terminated")
        except Exception as e:
            logger.error(f"Failed to terminate Chrome process: {e}")
except Exception as e: + logger.error(f"Failed to terminate Chrome process: {e}") \ No newline at end of file diff --git a/deploy/gcloud-function/requirements.txt b/deploy/gcloud-function/requirements.txt new file mode 100644 index 00000000..6cc3d354 --- /dev/null +++ b/deploy/gcloud-function/requirements.txt @@ -0,0 +1,5 @@ +functions-framework==3.* +flask==2.3.3 +requests==2.31.0 +websockets==12.0 +git+https://github.com/unclecode/crawl4ai.git@next \ No newline at end of file diff --git a/deploy/gcloud-function/resources/chrome/fonts.conf b/deploy/gcloud-function/resources/chrome/fonts.conf new file mode 100755 index 00000000..3f8207ea --- /dev/null +++ b/deploy/gcloud-function/resources/chrome/fonts.conf @@ -0,0 +1,10 @@ + + + + /var/task/.fonts + /var/task/fonts + /opt/fonts + /tmp/fonts + /tmp/fonts-cache/ + + diff --git a/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf new file mode 100755 index 00000000..b7fadfa4 Binary files /dev/null and b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf differ diff --git a/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf new file mode 100755 index 00000000..e99cb92d Binary files /dev/null and b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf differ diff --git a/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf new file mode 100755 index 00000000..a0ba2043 Binary files /dev/null and b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf differ diff --git a/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf new file mode 100755 index 
00000000..8529c432 Binary files /dev/null and b/deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf differ diff --git a/deploy/gcloud-function/resources/chrome/libvulkan.so.1 b/deploy/gcloud-function/resources/chrome/libvulkan.so.1 new file mode 100755 index 00000000..f941f2c0 Binary files /dev/null and b/deploy/gcloud-function/resources/chrome/libvulkan.so.1 differ diff --git a/deploy/gcloud-function/resources/chrome/vk_swiftshader_icd.json b/deploy/gcloud-function/resources/chrome/vk_swiftshader_icd.json new file mode 100644 index 00000000..28be1f3a --- /dev/null +++ b/deploy/gcloud-function/resources/chrome/vk_swiftshader_icd.json @@ -0,0 +1 @@ +{"file_format_version": "1.0.0", "ICD": {"library_path": "./libvk_swiftshader.so", "api_version": "1.0.5"}} \ No newline at end of file diff --git a/deploy/lambda/Dockerfile b/deploy/lambda/Dockerfile new file mode 100644 index 00000000..d929b3eb --- /dev/null +++ b/deploy/lambda/Dockerfile @@ -0,0 +1,104 @@ +FROM python:3.12-bookworm AS python-builder + +RUN pip install poetry + +ENV POETRY_NO_INTERACTION=1 \ + POETRY_CACHE_DIR=/tmp/poetry_cache + +WORKDIR /app + +COPY pyproject.toml poetry.lock ./ +RUN --mount=type=cache,target=$POETRY_CACHE_DIR poetry export -f requirements.txt -o requirements.txt + +# Install build dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3-dev \ + python3-setuptools \ + python3-wheel \ + python3-pip \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Install specific dependencies that have build issues +RUN pip install --no-cache-dir cchardet + +FROM python:3.12-bookworm + +# Install AWS Lambda Runtime Interface Client +RUN python3 -m pip install --no-cache-dir awslambdaric + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + wget \ + gnupg \ + git \ + cmake \ + pkg-config \ + python3-dev \ + libjpeg-dev \ + redis-server \ + supervisor \ + && rm -rf 
/var/lib/apt/lists/* + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libglib2.0-0 \ + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libdrm2 \ + libdbus-1-3 \ + libxcb1 \ + libxkbcommon0 \ + libx11-6 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxrandr2 \ + libgbm1 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 \ + libatspi2.0-0 \ + && rm -rf /var/lib/apt/lists/* + +# Install build essentials for any compilations needed +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + python3-dev \ + && rm -rf /var/lib/apt/lists/* + +# Set up function directory and browser path +ARG FUNCTION_DIR="/function" +RUN mkdir -p "${FUNCTION_DIR}/pw-browsers" +RUN mkdir -p "/tmp/.crawl4ai" + +# Set critical environment variables +ENV PLAYWRIGHT_BROWSERS_PATH="${FUNCTION_DIR}/pw-browsers" \ + HOME="/tmp" \ + CRAWL4_AI_BASE_DIRECTORY="/tmp/.crawl4ai" + +# Create Craw4ai base directory +RUN mkdir -p ${CRAWL4_AI_BASE_DIRECTORY} + +RUN pip install --no-cache-dir faust-cchardet + +# Install Crawl4ai and dependencies +RUN pip install --no-cache-dir git+https://github.com/unclecode/crawl4ai.git@next + +# Install Chromium only (no deps flag) +RUN playwright install chromium + +# Copy function code +COPY lambda_function.py ${FUNCTION_DIR}/ + +# Set working directory +WORKDIR ${FUNCTION_DIR} + +ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] +CMD [ "lambda_function.handler" ] \ No newline at end of file diff --git a/deploy/lambda/deploy.py b/deploy/lambda/deploy.py new file mode 100644 index 00000000..16e5bb98 --- /dev/null +++ b/deploy/lambda/deploy.py @@ -0,0 +1,1081 @@ +#!/usr/bin/env python3 +""" +Crawl4ai AWS Lambda Deployment Script + +This script automates the deployment of the Crawl4ai web crawler as an AWS Lambda function, +providing an interactive step-by-step process with rich terminal UI. 
# Standard-library imports
import os
import sys
import json
import time
import subprocess
from typing import Optional, Dict, List, Any, Tuple

# Third-party: typer drives the CLI, rich renders the terminal UI
import typer
from rich.console import Console
from rich.panel import Panel
from rich.prompt import Prompt, Confirm
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.syntax import Syntax
from rich.table import Table
from rich import print as rprint

# Initialize typer app and console
app = typer.Typer(help="Deploy Crawl4ai to AWS Lambda")
console = Console()

# Default configuration used when the user does not override values.
DEFAULT_CONFIG = {
    "aws_region": "us-east-1",
    "ecr_repository_name": "crawl4ai-lambda",
    "lambda_function_name": "crawl4ai-function",
    "api_gateway_name": "crawl4ai-api",
    "memory_size": 4096,                        # MB
    "timeout": 300,                             # seconds
    "enable_provisioned_concurrency": False,
    "provisioned_concurrency_count": 2,
    "ephemeral_storage_size": 10240,            # MB (/tmp size)
}

def run_command(command: List[str], capture_output: bool = False) -> Tuple[int, str, str]:
    """Run a shell command and return exit code, stdout, and stderr.

    Output is ALWAYS captured and returned to the caller; ``capture_output``
    only controls console echoing: when False (the default) stdout is
    printed and stderr is printed on failure, when True the command runs
    quietly and the caller inspects the returned strings.

    Args:
        command: Command and arguments (executed without a shell).
        capture_output: Suppress echoing of the command's output.

    Returns:
        Tuple of (exit code, stripped stdout, stripped stderr).
    """
    # Echo the command so the user can follow along / reproduce it by hand.
    console.print(f"[dim]$ {' '.join(command)}[/dim]")

    result = subprocess.run(
        command,
        capture_output=True,
        text=True
    )

    stdout = result.stdout.strip()
    stderr = result.stderr.strip()

    if not capture_output:
        if stdout:
            console.print(stdout)
        # Only surface stderr when the command actually failed.
        if stderr and result.returncode != 0:
            console.print(f"[bold red]Error:[/bold red] {stderr}")

    return result.returncode, stdout, stderr

def show_step_header(step_number: int, step_title: str) -> None:
    """Display a step header with step number and title."""
    console.print(f"\n[bold blue]Step {step_number}:[/bold blue] [bold]{step_title}[/bold]")
    console.print("=" * 80)

def wait_for_confirmation(message: str = "Press Enter to continue...") -> None:
    """Wait for user confirmation to proceed (blocks until Enter)."""
    console.print()
    Prompt.ask(f"[yellow]{message}[/yellow]")
def check_aws_credentials() -> bool:
    """Check if AWS credentials are configured.

    Runs ``aws sts get-caller-identity`` and reports the caller ARN on
    success.

    Returns:
        bool: True when the CLI is authenticated, False otherwise.
    """
    code, stdout, stderr = run_command(["aws", "sts", "get-caller-identity"], capture_output=True)
    if code != 0:
        console.print(Panel(
            "[bold red]AWS credentials not found or invalid![/bold red]\n\n"
            "Please configure your AWS credentials by running:\n"
            "  aws configure\n\n"
            "You'll need to provide your AWS Access Key ID, Secret Access Key, and default region.",
            title="AWS Authentication Error",
            expand=False
        ))
        return False

    try:
        identity = json.loads(stdout)
        console.print(f"[green]Authenticated as:[/green] [bold]{identity.get('Arn')}[/bold]")
        return True
    except json.JSONDecodeError:
        # get-caller-identity succeeded but emitted something unparsable.
        console.print("[bold red]Error parsing AWS identity information[/bold red]")
        return False

def check_prerequisites() -> bool:
    """Check if all required tools are installed.

    Uses :func:`shutil.which` instead of shelling out to ``which`` -- the
    ``which`` binary does not exist on Windows, and a subprocess per tool
    is needless overhead for a PATH lookup.

    Returns:
        bool: True when every required tool is on PATH.
    """
    import shutil  # local import: only this check needs it

    prerequisites = {
        "aws": "AWS CLI",
        "docker": "Docker"
    }

    all_installed = True

    with console.status("[bold blue]Checking prerequisites...[/bold blue]"):
        for cmd, name in prerequisites.items():
            if shutil.which(cmd):
                console.print(f"[green]✓[/green] {name} is installed")
            else:
                console.print(f"[red]✗[/red] {name} is [bold red]not installed[/bold red]")
                all_installed = False

    if not all_installed:
        console.print(Panel(
            "Please install the missing prerequisites and try again.",
            title="Missing Prerequisites",
            expand=False
        ))

    return all_installed
def verify_iam_role(config: Dict[str, Any]) -> Optional[str]:
    """Ensure the Lambda execution role exists and return its ARN.

    Looks up the hard-coded ``lambda-execution-role``; when absent, creates
    it with a trust policy for the Lambda service and attaches the managed
    basic execution policy (CloudWatch logging).

    Args:
        config: Deployment configuration (kept for signature parity with
            the other deployment steps).

    Returns:
        The role ARN on success, or None when creation fails.
    """
    role_name = "lambda-execution-role"

    lookup_cmd = [
        "aws", "iam", "get-role",
        "--role-name", role_name,
        "--query", "Role.Arn",
        "--output", "text",
    ]
    with console.status(f"[bold blue]Verifying IAM role '{role_name}'...[/bold blue]"):
        exit_code, lookup_out, _ = run_command(lookup_cmd, capture_output=True)

    if exit_code == 0:
        # Role already present -- reuse it as-is.
        console.print(f"[green]Found IAM role:[/green] {lookup_out.strip()}")
        return lookup_out.strip()

    console.print(f"[bold yellow]IAM role '{role_name}' not found. Creating it...[/bold yellow]")

    # Trust policy allowing the Lambda service to assume this role.
    policy_document = json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Effect": "Allow",
            "Principal": {"Service": "lambda.amazonaws.com"},
            "Action": "sts:AssumeRole"
        }]
    })

    exit_code, create_out, create_err = run_command([
        "aws", "iam", "create-role",
        "--role-name", role_name,
        "--assume-role-policy-document", policy_document,
        "--query", "Role.Arn",
        "--output", "text"
    ], capture_output=True)
    if exit_code != 0:
        console.print(f"[bold red]Failed to create IAM role:[/bold red] {create_err}")
        return None

    # Attach basic Lambda execution policy (log writing permissions).
    run_command([
        "aws", "iam", "attach-role-policy",
        "--role-name", role_name,
        "--policy-arn", "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"
    ])

    console.print(f"[green]Created IAM role:[/green] {create_out.strip()}")
    return create_out.strip()
def build_docker_image(config: Dict[str, Any]) -> bool:
    """Build the Docker image for the Lambda function.

    Requires ``Dockerfile`` and ``lambda_function.py`` in the current
    directory; tags the image with the ECR repository name from config.

    Returns:
        bool: True when the build succeeds.
    """
    show_step_header(1, "Building Docker Image")
    console.print("This step will build the Docker image that contains Crawl4ai and its dependencies.")

    # Fail fast if the required build inputs are missing.
    if not os.path.exists("Dockerfile"):
        console.print("[bold red]Error:[/bold red] Dockerfile not found in the current directory.")
        return False

    if not os.path.exists("lambda_function.py"):
        console.print("[bold red]Error:[/bold red] lambda_function.py not found in the current directory.")
        return False

    wait_for_confirmation()

    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]Building Docker image...[/bold blue]"),
        console=console
    ) as progress:
        progress.add_task("build", total=None)
        code, stdout, stderr = run_command([
            "docker", "build", "-t", config["ecr_repository_name"], "."
        ])

    if code != 0:
        console.print("[bold red]Docker build failed![/bold red]")
        return False

    console.print("[bold green]Docker image built successfully![/bold green]")
    return True

def setup_ecr_repository(config: Dict[str, Any]) -> Optional[str]:
    """Create ECR repository if it doesn't exist and return repository URI.

    Returns:
        Optional[str]: the repository URI, or None when creation fails.
    """
    show_step_header(2, "Setting Up Amazon ECR Repository")
    console.print("This step will create an Amazon ECR repository to store the Docker image.")

    wait_for_confirmation()

    # Check if repository exists (non-zero exit means "not found" here).
    code, stdout, stderr = run_command([
        "aws", "ecr", "describe-repositories",
        "--repository-names", config["ecr_repository_name"],
        "--region", config["aws_region"],
        "--query", "repositories[0].repositoryUri",
        "--output", "text"
    ], capture_output=True)

    if code != 0:
        console.print(f"[yellow]Creating new ECR repository: {config['ecr_repository_name']}[/yellow]")
        code, create_stdout, create_stderr = run_command([
            "aws", "ecr", "create-repository",
            "--repository-name", config["ecr_repository_name"],
            "--region", config["aws_region"],
            "--query", "repository.repositoryUri",
            "--output", "text"
        ], capture_output=True)

        if code != 0:
            console.print(f"[bold red]Failed to create ECR repository:[/bold red] {create_stderr}")
            return None

        repository_uri = create_stdout.strip()
    else:
        repository_uri = stdout.strip()
        console.print(f"[green]Found existing ECR repository:[/green] {repository_uri}")

    return repository_uri
def push_image_to_ecr(config: Dict[str, Any], repository_uri: str) -> bool:
    """Push the Docker image to ECR.

    Resolves the account ID, logs Docker in to the regional ECR registry
    (password piped via stdin so it never appears in the process list),
    tags the local image, and pushes it.

    Args:
        config: Deployment configuration.
        repository_uri: URI returned by :func:`setup_ecr_repository`.

    Returns:
        bool: True when the image is pushed successfully.
    """
    show_step_header(3, "Pushing Docker Image to ECR")
    console.print("This step will push the Docker image to Amazon ECR.")

    wait_for_confirmation()

    # Get account ID
    code, account_id, stderr = run_command([
        "aws", "sts", "get-caller-identity",
        "--query", "Account",
        "--output", "text"
    ], capture_output=True)

    if code != 0:
        console.print("[bold red]Failed to get AWS account ID[/bold red]")
        return False

    account_id = account_id.strip()

    # Get ECR login password
    console.print("[blue]Logging in to Amazon ECR...[/blue]")
    code, password, stderr = run_command([
        "aws", "ecr", "get-login-password",
        "--region", config["aws_region"]
    ], capture_output=True)

    if code != 0:
        console.print("[bold red]Failed to get ECR login password[/bold red]")
        return False

    # Log in to ECR -- password goes through stdin (--password-stdin),
    # never on the command line.
    login_cmd = ["docker", "login", "--username", "AWS", "--password-stdin",
                 f"{account_id}.dkr.ecr.{config['aws_region']}.amazonaws.com"]

    login_process = subprocess.Popen(
        login_cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )

    stdout, stderr = login_process.communicate(input=password)

    if login_process.returncode != 0:
        console.print(f"[bold red]Failed to log in to ECR:[/bold red] {stderr}")
        return False

    console.print("[green]Successfully logged in to ECR[/green]")

    # Tag and push image
    console.print(f"[blue]Tagging image as {repository_uri}:latest[/blue]")
    code, stdout, stderr = run_command([
        "docker", "tag",
        f"{config['ecr_repository_name']}:latest",
        f"{repository_uri}:latest"
    ])

    if code != 0:
        console.print("[bold red]Failed to tag Docker image[/bold red]")
        return False

    with Progress(
        SpinnerColumn(),
        TextColumn("[bold blue]Pushing image to ECR...[/bold blue]"),
        console=console
    ) as progress:
        progress.add_task("push", total=None)
        code, stdout, stderr = run_command([
            "docker", "push", f"{repository_uri}:latest"
        ])

    if code != 0:
        console.print("[bold red]Failed to push image to ECR[/bold red]")
        return False

    console.print("[bold green]Successfully pushed image to ECR![/bold green]")
    return True
def deploy_lambda_function(config: Dict[str, Any], repository_uri: str, role_arn: str) -> bool:
    """Create or update the Lambda function from the ECR container image.

    Args:
        config: Deployment configuration (function name, region, sizing).
        repository_uri: ECR repository URI holding the ``:latest`` image.
        role_arn: Execution role ARN the function runs under.

    Returns:
        bool: True on success, False when any AWS CLI call fails.
    """
    show_step_header(4, "Deploying Lambda Function")
    console.print("This step will create or update the AWS Lambda function.")

    wait_for_confirmation()

    # Shorthand syntax accepted by --environment: Variables={K=V,...}
    environment = (
        "Variables={"
        "CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,"
        "HOME=/tmp,"
        "PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers"
        "}"
    )

    # Check if Lambda function exists
    code, stdout, stderr = run_command([
        "aws", "lambda", "list-functions",
        "--region", config["aws_region"],
        "--query", f"Functions[?FunctionName=='{config['lambda_function_name']}'].FunctionName",
        "--output", "text"
    ], capture_output=True)

    function_exists = stdout.strip() != ""

    if function_exists:
        console.print(f"[yellow]Updating existing Lambda function: {config['lambda_function_name']}[/yellow]")

        # Update function code
        with console.status("[bold blue]Updating Lambda function code...[/bold blue]"):
            code, stdout, stderr = run_command([
                "aws", "lambda", "update-function-code",
                "--region", config["aws_region"],
                "--function-name", config["lambda_function_name"],
                "--image-uri", f"{repository_uri}:latest"
            ])

        if code != 0:
            console.print("[bold red]Failed to update Lambda function code[/bold red]")
            return False

        # The code update runs asynchronously; issuing
        # update-function-configuration while it is still in progress fails
        # with ResourceConflictException. Block until the update settles.
        with console.status("[bold blue]Waiting for code update to complete...[/bold blue]"):
            run_command([
                "aws", "lambda", "wait", "function-updated",
                "--region", config["aws_region"],
                "--function-name", config["lambda_function_name"]
            ], capture_output=True)

        # Update function configuration
        with console.status("[bold blue]Updating Lambda function configuration...[/bold blue]"):
            code, stdout, stderr = run_command([
                "aws", "lambda", "update-function-configuration",
                "--region", config["aws_region"],
                "--function-name", config["lambda_function_name"],
                "--timeout", str(config["timeout"]),
                "--memory-size", str(config["memory_size"]),
                "--ephemeral-storage", f"Size={config['ephemeral_storage_size']}",
                "--environment", environment
            ])

        if code != 0:
            console.print("[bold red]Failed to update Lambda function configuration[/bold red]")
            return False
    else:
        console.print(f"[blue]Creating new Lambda function: {config['lambda_function_name']}[/blue]")

        with console.status("[bold blue]Creating Lambda function...[/bold blue]"):
            code, stdout, stderr = run_command([
                "aws", "lambda", "create-function",
                "--region", config["aws_region"],
                "--function-name", config["lambda_function_name"],
                "--package-type", "Image",
                "--code", f"ImageUri={repository_uri}:latest",
                "--role", role_arn,
                "--timeout", str(config["timeout"]),
                "--memory-size", str(config["memory_size"]),
                "--ephemeral-storage", f"Size={config['ephemeral_storage_size']}",
                "--environment", environment
            ])

        if code != 0:
            console.print("[bold red]Failed to create Lambda function[/bold red]")
            return False

    console.print("[bold green]Lambda function deployed successfully![/bold green]")
    return True
def setup_api_gateway(config: Dict[str, Any]) -> Optional[str]:
    """Create or update API Gateway.

    When no REST API with the configured name exists, builds the full
    chain: REST API -> /crawl resource -> POST method -> AWS_PROXY
    integration with the Lambda -> prod deployment -> invoke permission.
    An existing API is reused as-is (no re-wiring).

    Returns:
        Optional[str]: the API Gateway ID, or None when any step fails.
    """
    show_step_header(5, "Setting Up API Gateway")
    console.print("This step will create an API Gateway to expose your Lambda function as a REST API.")

    wait_for_confirmation()

    # Check if API Gateway exists (empty query result -> not found)
    code, api_id, stderr = run_command([
        "aws", "apigateway", "get-rest-apis",
        "--region", config["aws_region"],
        "--query", f"items[?name=='{config['api_gateway_name']}'].id",
        "--output", "text"
    ], capture_output=True)

    api_id = api_id.strip()

    if not api_id:
        console.print(f"[blue]Creating new API Gateway: {config['api_gateway_name']}[/blue]")

        # Create API Gateway
        code, api_id, stderr = run_command([
            "aws", "apigateway", "create-rest-api",
            "--name", config["api_gateway_name"],
            "--region", config["aws_region"],
            "--query", "id",
            "--output", "text"
        ], capture_output=True)

        if code != 0:
            console.print("[bold red]Failed to create API Gateway[/bold red]")
            return None

        api_id = api_id.strip()

        # Get root resource ID ('/' is the parent for the /crawl resource)
        code, parent_id, stderr = run_command([
            "aws", "apigateway", "get-resources",
            "--rest-api-id", api_id,
            "--region", config["aws_region"],
            "--query", "items[?path=='/'].id",
            "--output", "text"
        ], capture_output=True)

        if code != 0:
            console.print("[bold red]Failed to get API Gateway root resource[/bold red]")
            return None

        parent_id = parent_id.strip()

        # Create resource
        console.print("[blue]Creating API Gateway resource...[/blue]")
        code, resource_id, stderr = run_command([
            "aws", "apigateway", "create-resource",
            "--rest-api-id", api_id,
            "--region", config["aws_region"],
            "--parent-id", parent_id,
            "--path-part", "crawl",
            "--query", "id",
            "--output", "text"
        ], capture_output=True)

        if code != 0:
            console.print("[bold red]Failed to create API Gateway resource[/bold red]")
            return None

        resource_id = resource_id.strip()

        # Create POST method (no auth -- the endpoint is public)
        console.print("[blue]Creating POST method...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "apigateway", "put-method",
            "--rest-api-id", api_id,
            "--resource-id", resource_id,
            "--http-method", "POST",
            "--authorization-type", "NONE",
            "--region", config["aws_region"]
        ])

        if code != 0:
            console.print("[bold red]Failed to create API Gateway method[/bold red]")
            return None

        # Get Lambda function ARN (needed for the integration URI)
        code, lambda_arn, stderr = run_command([
            "aws", "lambda", "get-function",
            "--function-name", config["lambda_function_name"],
            "--region", config["aws_region"],
            "--query", "Configuration.FunctionArn",
            "--output", "text"
        ], capture_output=True)

        if code != 0:
            console.print("[bold red]Failed to get Lambda function ARN[/bold red]")
            return None

        lambda_arn = lambda_arn.strip()

        # Set Lambda integration (AWS_PROXY forwards the raw request/response)
        console.print("[blue]Setting up Lambda integration...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "apigateway", "put-integration",
            "--rest-api-id", api_id,
            "--resource-id", resource_id,
            "--http-method", "POST",
            "--type", "AWS_PROXY",
            "--integration-http-method", "POST",
            "--uri", f"arn:aws:apigateway:{config['aws_region']}:lambda:path/2015-03-31/functions/{lambda_arn}/invocations",
            "--region", config["aws_region"]
        ])

        if code != 0:
            console.print("[bold red]Failed to set API Gateway integration[/bold red]")
            return None

        # Deploy API to the 'prod' stage so the endpoint goes live
        console.print("[blue]Deploying API...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "apigateway", "create-deployment",
            "--rest-api-id", api_id,
            "--stage-name", "prod",
            "--region", config["aws_region"]
        ])

        if code != 0:
            console.print("[bold red]Failed to deploy API Gateway[/bold red]")
            return None

        # Set Lambda permission so this API may invoke the function.
        # ARN format: arn:aws:lambda:region:ACCOUNT:function:name -> index 4
        account_id = lambda_arn.split(":")[4]

        console.print("[blue]Setting Lambda permissions...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "lambda", "add-permission",
            "--function-name", config["lambda_function_name"],
            "--statement-id", "apigateway",
            "--action", "lambda:InvokeFunction",
            "--principal", "apigateway.amazonaws.com",
            "--source-arn", f"arn:aws:execute-api:{config['aws_region']}:{account_id}:{api_id}/*/POST/crawl",
            "--region", config["aws_region"]
        ])

        if code != 0:
            console.print("[bold red]Failed to set Lambda permission[/bold red]")
            return None
    else:
        console.print(f"[green]Found existing API Gateway:[/green] {api_id}")

    console.print("[bold green]API Gateway setup complete![/bold green]")
    return api_id
def configure_provisioned_concurrency(config: Dict[str, Any]) -> bool:
    """Configure provisioned concurrency if enabled.

    Publishes a new function version, points a ``prod`` alias at it,
    applies provisioned concurrency to the alias, then re-wires the API
    Gateway integration to invoke the alias and redeploys the API.
    A no-op (returning True) when the feature is disabled in config.

    Returns:
        bool: True on success or skip, False when any step fails.
    """
    if not config["enable_provisioned_concurrency"]:
        console.print("[yellow]Skipping provisioned concurrency setup (not enabled)[/yellow]")
        return True

    show_step_header(6, "Setting Up Provisioned Concurrency")
    console.print("This step will configure provisioned concurrency to avoid cold starts.")

    wait_for_confirmation()

    # Publish a version (provisioned concurrency requires a version/alias,
    # not $LATEST)
    console.print("[blue]Publishing Lambda version...[/blue]")
    code, version, stderr = run_command([
        "aws", "lambda", "publish-version",
        "--function-name", config["lambda_function_name"],
        "--region", config["aws_region"],
        "--query", "Version",
        "--output", "text"
    ], capture_output=True)

    if code != 0:
        console.print("[bold red]Failed to publish Lambda version[/bold red]")
        return False

    version = version.strip()
    console.print(f"[green]Published version:[/green] {version}")

    # Check if alias exists (get-alias exit code 0 means it does)
    alias_exists = False
    code, stdout, stderr = run_command([
        "aws", "lambda", "get-alias",
        "--function-name", config["lambda_function_name"],
        "--name", "prod",
        "--region", config["aws_region"]
    ], capture_output=True)

    alias_exists = code == 0

    # Create or update alias to point at the freshly published version
    if alias_exists:
        console.print("[blue]Updating 'prod' alias...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "lambda", "update-alias",
            "--function-name", config["lambda_function_name"],
            "--name", "prod",
            "--function-version", version,
            "--region", config["aws_region"]
        ])
    else:
        console.print("[blue]Creating 'prod' alias...[/blue]")
        code, stdout, stderr = run_command([
            "aws", "lambda", "create-alias",
            "--function-name", config["lambda_function_name"],
            "--name", "prod",
            "--function-version", version,
            "--region", config["aws_region"]
        ])

    if code != 0:
        console.print("[bold red]Failed to create/update alias[/bold red]")
        return False

    # Configure provisioned concurrency on the alias
    console.print(f"[blue]Configuring provisioned concurrency ({config['provisioned_concurrency_count']} instances)...[/blue]")
    code, stdout, stderr = run_command([
        "aws", "lambda", "put-provisioned-concurrency-config",
        "--function-name", config["lambda_function_name"],
        "--qualifier", "prod",
        "--provisioned-concurrent-executions", str(config["provisioned_concurrency_count"]),
        "--region", config["aws_region"]
    ])

    if code != 0:
        console.print("[bold red]Failed to configure provisioned concurrency[/bold red]")
        return False

    # Update API Gateway to use alias (re-resolve IDs; [1] is stdout)
    api_id = run_command([
        "aws", "apigateway", "get-rest-apis",
        "--region", config["aws_region"],
        "--query", f"items[?name=='{config['api_gateway_name']}'].id",
        "--output", "text"
    ], capture_output=True)[1].strip()

    if not api_id:
        console.print("[bold red]Failed to find API Gateway ID[/bold red]")
        return False

    resource_id = run_command([
        "aws", "apigateway", "get-resources",
        "--rest-api-id", api_id,
        "--region", config["aws_region"],
        "--query", "items[?path=='/crawl'].id",
        "--output", "text"
    ], capture_output=True)[1].strip()

    if not resource_id:
        console.print("[bold red]Failed to find API Gateway resource ID[/bold red]")
        return False

    account_id = run_command([
        "aws", "sts", "get-caller-identity",
        "--query", "Account",
        "--output", "text"
    ], capture_output=True)[1].strip()

    # Alias ARN = function ARN + ":prod" qualifier
    lambda_alias_arn = f"arn:aws:lambda:{config['aws_region']}:{account_id}:function:{config['lambda_function_name']}:prod"

    console.print("[blue]Updating API Gateway to use Lambda alias...[/blue]")
    code, stdout, stderr = run_command([
        "aws", "apigateway", "put-integration",
        "--rest-api-id", api_id,
        "--resource-id", resource_id,
        "--http-method", "POST",
        "--type", "AWS_PROXY",
        "--integration-http-method", "POST",
        "--uri", f"arn:aws:apigateway:{config['aws_region']}:lambda:path/2015-03-31/functions/{lambda_alias_arn}/invocations",
        "--region", config["aws_region"]
    ])

    if code != 0:
        console.print("[bold red]Failed to update API Gateway integration[/bold red]")
        return False

    # Redeploy API Gateway so the new integration takes effect
    console.print("[blue]Redeploying API Gateway...[/blue]")
    code, stdout, stderr = run_command([
        "aws", "apigateway", "create-deployment",
        "--rest-api-id", api_id,
        "--stage-name", "prod",
        "--region", config["aws_region"]
    ])

    if code != 0:
        console.print("[bold red]Failed to redeploy API Gateway[/bold red]")
        return False

    console.print("[bold green]Provisioned concurrency setup complete![/bold green]")
    return True
def show_deployment_summary(config: Dict[str, Any], api_id: str) -> None:
    """Show a summary of the deployment.

    Renders a rich table with the deployed endpoint and Lambda settings,
    then prints ready-to-paste curl and Python usage examples.

    Args:
        config: Deployment settings used for the summary rows.
        api_id: The API Gateway REST API id (used to build the URL).
    """
    # Public invoke URL for the 'prod' stage /crawl resource.
    endpoint_url = f"https://{api_id}.execute-api.{config['aws_region']}.amazonaws.com/prod/crawl"

    # Create a summary table
    table = Table(title="Deployment Summary")
    table.add_column("Component", style="cyan")
    table.add_column("Details", style="green")

    table.add_row("API Endpoint", endpoint_url)
    table.add_row("Lambda Function", config["lambda_function_name"])
    table.add_row("Memory Size", f"{config['memory_size']} MB")
    table.add_row("Timeout", f"{config['timeout']} seconds")
    table.add_row("Ephemeral Storage", f"{config['ephemeral_storage_size']} MB")
    table.add_row("Provisioned Concurrency",
                  "Enabled" if config["enable_provisioned_concurrency"]
                  else "Disabled")

    if config["enable_provisioned_concurrency"]:
        table.add_row("Concurrency Units", str(config["provisioned_concurrency_count"]))

    console.print("\n")
    console.print(Panel(
        "🚀 [bold green]Crawl4ai has been successfully deployed to AWS Lambda![/bold green]",
        expand=False
    ))
    console.print(table)

    # Example usage (curl one-liner against the new endpoint)
    console.print("\n[bold]Example Usage:[/bold]")
    url = "https://example.com"
    example_cmd = f"curl -X POST {endpoint_url} -H 'Content-Type: application/json' -d '{{\"url\": \"{url}\"}}'"
    console.print(Syntax(example_cmd, "bash", theme="monokai", line_numbers=False))

    # Python example: note the doubled braces -- this is an f-string, so
    # literal { } in the generated snippet must be escaped as {{ }}.
    console.print("\n[bold]Python Example:[/bold]")
    python_example = f'''import requests
import json

url = "{endpoint_url}"
payload = {{
    "url": "https://example.com",
    "browser_config": {{
        "headless": True,
        "verbose": False
    }},
    "crawler_config": {{
        "crawler_config": {{
            "type": "CrawlerRunConfig",
            "params": {{
                "markdown_generator": {{
                    "type": "DefaultMarkdownGenerator",
                    "params": {{
                        "content_filter": {{
                            "type": "PruningContentFilter",
                            "params": {{
                                "threshold": 0.48,
                                "threshold_type": "fixed"
                            }}
                        }}
                    }}
                }}
            }}
        }}
    }}
}}

response = requests.post(url, json=payload)
result = response.json()
print(json.dumps(result, indent=2))
'''
    console.print(Syntax(python_example, "python", theme="monokai", line_numbers=False))

    console.print("\n[bold green]Thank you for using Crawl4ai on AWS Lambda![/bold green]")
def cleanup_resources(config: Dict[str, Any]) -> None:
    """Clean up all AWS resources created for Crawl4ai deployment.

    Best-effort teardown: each resource is checked for existence first,
    and a failure at one step is reported but does not abort the rest.

    Args:
        config: Deployment settings naming the resources to remove.
    """
    # NOTE(review): other steps pass an int step number to
    # show_step_header; here it receives a string -- confirm the helper
    # accepts both.
    show_step_header("Cleanup", "Removing AWS Resources")
    console.print("This will remove all AWS resources created for Crawl4ai deployment, including:")
    console.print(" • Lambda Function")
    console.print(" • API Gateway")
    console.print(" • ECR Repository and Images")
    console.print(" • IAM Permissions")

    # Destructive action: require an explicit opt-in, defaulting to "no".
    if not Confirm.ask(
        "[bold red]⚠️ Are you sure you want to delete all resources?[/bold red]",
        default=False
    ):
        console.print("[yellow]Cleanup cancelled.[/yellow]")
        return

    # Get API Gateway ID (empty string means "not found")
    api_id = None
    with console.status("[blue]Finding API Gateway...[/blue]"):
        code, api_id, stderr = run_command([
            "aws", "apigateway", "get-rest-apis",
            "--region", config["aws_region"],
            "--query", f"items[?name=='{config['api_gateway_name']}'].id",
            "--output", "text"
        ], capture_output=True)
        api_id = api_id.strip()

    # Delete API Gateway
    if api_id:
        console.print(f"[blue]Deleting API Gateway: {api_id}[/blue]")
        code, stdout, stderr = run_command([
            "aws", "apigateway", "delete-rest-api",
            "--rest-api-id", api_id,
            "--region", config["aws_region"]
        ])

        if code == 0:
            console.print("[green]✓[/green] API Gateway deleted successfully")
        else:
            console.print(f"[red]✗[/red] Failed to delete API Gateway: {stderr}")
    else:
        console.print("[yellow]No API Gateway found to delete[/yellow]")

    # Check if Lambda function exists (get-function exits non-zero if not)
    lambda_exists = False
    with console.status("[blue]Checking Lambda function...[/blue]"):
        code, stdout, stderr = run_command([
            "aws", "lambda", "get-function",
            "--function-name", config["lambda_function_name"],
            "--region", config["aws_region"]
        ], capture_output=True)

        lambda_exists = code == 0

    # Delete provisioned concurrency if it exists -- must be removed
    # (along with the alias) before the function itself.
    if lambda_exists and config.get("enable_provisioned_concurrency", False):
        try:
            console.print("[blue]Removing provisioned concurrency...[/blue]")
            run_command([
                "aws", "lambda", "delete-provisioned-concurrency-config",
                "--function-name", config["lambda_function_name"],
                "--qualifier", "prod",
                "--region", config["aws_region"]
            ], capture_output=True)

            console.print("[blue]Deleting function alias...[/blue]")
            run_command([
                "aws", "lambda", "delete-alias",
                "--function-name", config["lambda_function_name"],
                "--name", "prod",
                "--region", config["aws_region"]
            ], capture_output=True)
        except Exception as e:
            console.print(f"[yellow]Warning: {str(e)}[/yellow]")

    # Delete Lambda function
    if lambda_exists:
        console.print(f"[blue]Deleting Lambda function: {config['lambda_function_name']}[/blue]")
        code, stdout, stderr = run_command([
            "aws", "lambda", "delete-function",
            "--function-name", config["lambda_function_name"],
            "--region", config["aws_region"]
        ])

        if code == 0:
            console.print("[green]✓[/green] Lambda function deleted successfully")
        else:
            console.print(f"[red]✗[/red] Failed to delete Lambda function: {stderr}")
    else:
        console.print("[yellow]No Lambda function found to delete[/yellow]")

    # Check if ECR repository exists
    ecr_exists = False
    with console.status("[blue]Checking ECR repository...[/blue]"):
        code, stdout, stderr = run_command([
            "aws", "ecr", "describe-repositories",
            "--repository-names", config["ecr_repository_name"],
            "--region", config["aws_region"]
        ], capture_output=True)
        ecr_exists = code == 0

    # Delete ECR repository
    if ecr_exists:
        console.print(f"[blue]Deleting ECR repository: {config['ecr_repository_name']}[/blue]")
        code, stdout, stderr = run_command([
            "aws", "ecr", "delete-repository",
            "--repository-name", config["ecr_repository_name"],
            "--force",  # Force delete even if it contains images
            "--region", config["aws_region"]
        ])

        if code == 0:
            console.print("[green]✓[/green] ECR repository deleted successfully")
        else:
            console.print(f"[red]✗[/red] Failed to delete ECR repository: {stderr}")
    else:
        console.print("[yellow]No ECR repository found to delete[/yellow]")

    # Check and clean up IAM permissions
    console.print("[blue]Cleaning up IAM permissions...[/blue]")
    try:
        # Remove the Lambda permission for API Gateway
        run_command([
            "aws", "lambda", "remove-permission",
            "--function-name", config["lambda_function_name"],
            "--statement-id", "apigateway",
            "--region", config["aws_region"]
        ], capture_output=True)
    except Exception:
        # It's okay if this fails, the function might already be deleted
        pass

    # Clean up local Docker images (both the ECR-tagged and local tags)
    console.print("[blue]Cleaning up local Docker images...[/blue]")
    try:
        # Get account ID to reconstruct the ECR image URI
        code, account_id, stderr = run_command([
            "aws", "sts", "get-caller-identity",
            "--query", "Account",
            "--output", "text"
        ], capture_output=True)

        if code == 0:
            account_id = account_id.strip()
            repo_uri = f"{account_id}.dkr.ecr.{config['aws_region']}.amazonaws.com/{config['ecr_repository_name']}"

            run_command([
                "docker", "rmi",
                f"{repo_uri}:latest",
                "--force"
            ], capture_output=True)

            run_command([
                "docker", "rmi",
                f"{config['ecr_repository_name']}:latest",
                "--force"
            ], capture_output=True)
    except Exception as e:
        console.print(f"[yellow]Warning: Failed to clean up Docker images: {str(e)}[/yellow]")

    console.print("\n[bold green]Cleanup Complete![/bold green]")
    console.print("All AWS resources for the Crawl4ai deployment have been removed.")
@app.command()
def cleanup():
    """
    Remove all AWS resources created for Crawl4ai deployment.

    This will delete the Lambda function, API Gateway, and ECR repository.
    """
    console.print(Panel(
        "[bold red]Crawl4ai AWS Resources Cleanup[/bold red]\n\n"
        "This will remove all AWS resources created for Crawl4ai deployment.",
        title="Warning",
        expand=False
    ))

    # Bail out early when no valid AWS credentials are configured.
    if not check_aws_credentials():
        return

    # Start from the defaults and let the operator confirm each value.
    config = DEFAULT_CONFIG.copy()

    console.print("\n[bold]Configuration[/bold]")
    console.print("Please confirm the resources to clean up:")

    # Same four prompts, in the same order, driven by a table rather
    # than four copy-pasted Prompt.ask blocks.
    for key, label in (
        ("aws_region", "AWS Region"),
        ("lambda_function_name", "Lambda Function Name"),
        ("api_gateway_name", "API Gateway Name"),
        ("ecr_repository_name", "ECR Repository Name"),
    ):
        config[key] = Prompt.ask(label, default=config[key])

    # Hand off to the shared teardown routine.
    cleanup_resources(config)
@app.command()
def main() -> None:
    """
    Deploy Crawl4ai to AWS Lambda.

    This script guides you through the process of deploying Crawl4ai
    as an AWS Lambda function with API Gateway integration.
    """
    # Show welcome banner
    console.print(Panel(
        "[bold blue]Crawl4ai AWS Lambda Deployment Wizard[/bold blue]\n\n"
        "This tool will help you deploy Crawl4ai to AWS Lambda with API Gateway integration.",
        title="Welcome",
        expand=False
    ))

    # Check prerequisites (local tooling: docker, aws cli, etc.)
    if not check_prerequisites():
        return

    # Check AWS credentials
    if not check_aws_credentials():
        return

    # Get configuration: start from defaults, then prompt for overrides.
    config = DEFAULT_CONFIG.copy()

    console.print("\n[bold]Configuration[/bold]")
    console.print("Please configure your deployment:")

    config["aws_region"] = Prompt.ask(
        "AWS Region",
        default=config["aws_region"]
    )

    config["lambda_function_name"] = Prompt.ask(
        "Lambda Function Name",
        default=config["lambda_function_name"]
    )

    config["api_gateway_name"] = Prompt.ask(
        "API Gateway Name",
        default=config["api_gateway_name"]
    )

    # Numeric settings are prompted as text and converted; an invalid
    # entry will raise ValueError here rather than later in deployment.
    config["memory_size"] = int(Prompt.ask(
        "Lambda Memory Size (MB)",
        default=str(config["memory_size"])
    ))

    config["timeout"] = int(Prompt.ask(
        "Lambda Timeout (seconds)",
        default=str(config["timeout"])
    ))

    config["enable_provisioned_concurrency"] = Confirm.ask(
        "Enable Provisioned Concurrency (reduces cold starts)?",
        default=config["enable_provisioned_concurrency"]
    )

    if config["enable_provisioned_concurrency"]:
        config["provisioned_concurrency_count"] = int(Prompt.ask(
            "Number of Provisioned Concurrency instances",
            default=str(config["provisioned_concurrency_count"])
        ))

    # Deployment pipeline: each step returns a falsy value on failure,
    # which aborts the run (steps already completed are NOT rolled back;
    # use the cleanup command for that).

    # Verify IAM role
    role_arn = verify_iam_role(config)
    if not role_arn:
        console.print("[bold red]Failed to verify or create IAM role[/bold red]")
        return

    # Build Docker image
    if not build_docker_image(config):
        return

    # Setup ECR repository
    repository_uri = setup_ecr_repository(config)
    if not repository_uri:
        return

    # Push image to ECR
    if not push_image_to_ecr(config, repository_uri):
        return

    # Deploy Lambda function
    if not deploy_lambda_function(config, repository_uri, role_arn):
        return

    # Setup API Gateway
    api_id = setup_api_gateway(config)
    if not api_id:
        return

    # Configure provisioned concurrency if enabled
    if not configure_provisioned_concurrency(config):
        return

    # Show deployment summary
    show_deployment_summary(config, api_id)

if __name__ == "__main__":
    app()
+ +## Step 3: Manual Deployment (Alternative to the Script) + +If you prefer to deploy manually or understand what the script does, follow these steps: + +### Building and Pushing the Docker Image + +```bash +# Build the Docker image +docker build -t crawl4ai-lambda . + +# Create an ECR repository (if it doesn't exist) +aws ecr create-repository --repository-name crawl4ai-lambda + +# Get ECR login password and login +aws ecr get-login-password | docker login --username AWS --password-stdin $(aws sts get-caller-identity --query Account --output text).dkr.ecr.us-east-1.amazonaws.com + +# Tag the image +ECR_URI=$(aws ecr describe-repositories --repository-names crawl4ai-lambda --query 'repositories[0].repositoryUri' --output text) +docker tag crawl4ai-lambda:latest $ECR_URI:latest + +# Push the image to ECR +docker push $ECR_URI:latest +``` + +### Creating the Lambda Function + +```bash +# Get IAM role ARN (create it if needed) +ROLE_ARN=$(aws iam get-role --role-name lambda-execution-role --query 'Role.Arn' --output text) + +# Create Lambda function +aws lambda create-function \ + --function-name crawl4ai-function \ + --package-type Image \ + --code ImageUri=$ECR_URI:latest \ + --role $ROLE_ARN \ + --timeout 300 \ + --memory-size 4096 \ + --ephemeral-storage Size=10240 \ + --environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}" +``` + +If you're updating an existing function: + +```bash +# Update function code +aws lambda update-function-code \ + --function-name crawl4ai-function \ + --image-uri $ECR_URI:latest + +# Update function configuration +aws lambda update-function-configuration \ + --function-name crawl4ai-function \ + --timeout 300 \ + --memory-size 4096 \ + --ephemeral-storage Size=10240 \ + --environment "Variables={CRAWL4_AI_BASE_DIRECTORY=/tmp/.crawl4ai,HOME=/tmp,PLAYWRIGHT_BROWSERS_PATH=/function/pw-browsers}" +``` + +### Setting Up API Gateway + +```bash +# Create API Gateway 
+API_ID=$(aws apigateway create-rest-api --name crawl4ai-api --query 'id' --output text) + +# Get root resource ID +PARENT_ID=$(aws apigateway get-resources --rest-api-id $API_ID --query 'items[?path==`/`].id' --output text) + +# Create resource +RESOURCE_ID=$(aws apigateway create-resource --rest-api-id $API_ID --parent-id $PARENT_ID --path-part "crawl" --query 'id' --output text) + +# Create POST method +aws apigateway put-method --rest-api-id $API_ID --resource-id $RESOURCE_ID --http-method POST --authorization-type NONE + +# Get Lambda function ARN +LAMBDA_ARN=$(aws lambda get-function --function-name crawl4ai-function --query 'Configuration.FunctionArn' --output text) + +# Set Lambda integration +aws apigateway put-integration \ + --rest-api-id $API_ID \ + --resource-id $RESOURCE_ID \ + --http-method POST \ + --type AWS_PROXY \ + --integration-http-method POST \ + --uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ARN/invocations + +# Deploy API +aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod + +# Set Lambda permission +ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) +aws lambda add-permission \ + --function-name crawl4ai-function \ + --statement-id apigateway \ + --action lambda:InvokeFunction \ + --principal apigateway.amazonaws.com \ + --source-arn "arn:aws:execute-api:us-east-1:$ACCOUNT_ID:$API_ID/*/POST/crawl" +``` + +### Setting Up Provisioned Concurrency (Optional) + +This reduces cold starts: + +```bash +# Publish a version +VERSION=$(aws lambda publish-version --function-name crawl4ai-function --query 'Version' --output text) + +# Create alias +aws lambda create-alias \ + --function-name crawl4ai-function \ + --name prod \ + --function-version $VERSION + +# Configure provisioned concurrency +aws lambda put-provisioned-concurrency-config \ + --function-name crawl4ai-function \ + --qualifier prod \ + --provisioned-concurrent-executions 2 + +# Update API Gateway to use alias 
+LAMBDA_ALIAS_ARN="arn:aws:lambda:us-east-1:$ACCOUNT_ID:function:crawl4ai-function:prod" +aws apigateway put-integration \ + --rest-api-id $API_ID \ + --resource-id $RESOURCE_ID \ + --http-method POST \ + --type AWS_PROXY \ + --integration-http-method POST \ + --uri arn:aws:apigateway:us-east-1:lambda:path/2015-03-31/functions/$LAMBDA_ALIAS_ARN/invocations + +# Redeploy API Gateway +aws apigateway create-deployment --rest-api-id $API_ID --stage-name prod +``` + +## Step 4: Testing the Deployment + +Once deployed, test your function with: + +```bash +ENDPOINT_URL="https://$API_ID.execute-api.us-east-1.amazonaws.com/prod/crawl" + +# Test with curl +curl -X POST $ENDPOINT_URL \ + -H "Content-Type: application/json" \ + -d '{"url":"https://example.com"}' +``` + +Or using Python: + +```python +import requests +import json + +url = "https://your-api-id.execute-api.us-east-1.amazonaws.com/prod/crawl" +payload = { + "url": "https://example.com", + "browser_config": { + "headless": True, + "verbose": False + }, + "crawler_config": { + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } + } +} + +response = requests.post(url, json=payload) +result = response.json() +print(json.dumps(result, indent=2)) +``` + +## Step 5: Cleaning Up Resources + +To remove all AWS resources created for this deployment: + +```bash +python deploy.py cleanup +``` + +Or manually: + +```bash +# Delete API Gateway +aws apigateway delete-rest-api --rest-api-id $API_ID + +# Remove provisioned concurrency (if configured) +aws lambda delete-provisioned-concurrency-config \ + --function-name crawl4ai-function \ + --qualifier prod + +# Delete alias (if created) +aws lambda delete-alias \ + --function-name crawl4ai-function \ + --name prod + +# Delete Lambda function +aws 
lambda delete-function --function-name crawl4ai-function + +# Delete ECR repository +aws ecr delete-repository --repository-name crawl4ai-lambda --force +``` + +## Troubleshooting + +### Cold Start Issues + +If experiencing long cold starts: +- Enable provisioned concurrency +- Increase memory allocation (4096 MB recommended) +- Ensure the Lambda function has enough ephemeral storage + +### Permission Errors + +If you encounter permission errors: +- Check the IAM role has the necessary permissions +- Ensure API Gateway has permission to invoke the Lambda function + +### Container Size Issues + +If your container is too large: +- Optimize the Dockerfile +- Use multi-stage builds +- Consider removing unnecessary dependencies + +## Performance Considerations + +- Lambda memory affects CPU allocation - higher memory means faster execution +- Provisioned concurrency eliminates cold starts but costs more +- Optimize the Playwright setup for faster browser initialization + +## Security Best Practices + +- Use the principle of least privilege for IAM roles +- Implement API Gateway authentication for production deployments +- Consider using AWS KMS for storing sensitive configuration + +## Useful AWS Console Links + +Here are quick links to access important AWS console pages for monitoring and managing your deployment: + +| Resource | Console Link | +|----------|-------------| +| Lambda Functions | [AWS Lambda Console](https://console.aws.amazon.com/lambda/home#/functions) | +| Lambda Function Logs | [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups) | +| API Gateway | [API Gateway Console](https://console.aws.amazon.com/apigateway/home) | +| ECR Repositories | [ECR Console](https://console.aws.amazon.com/ecr/repositories) | +| IAM Roles | [IAM Console](https://console.aws.amazon.com/iamv2/home#/roles) | +| CloudWatch Metrics | [CloudWatch Metrics](https://console.aws.amazon.com/cloudwatch/home#metricsV2) | + +### Monitoring Lambda Execution 
+ +To monitor your Lambda function: + +1. Go to the [Lambda function console](https://console.aws.amazon.com/lambda/home#/functions) +2. Select your function (`crawl4ai-function`) +3. Click the "Monitor" tab to see: + - Invocation metrics + - Success/failure rates + - Duration statistics + +### Viewing Lambda Logs + +To see detailed execution logs: + +1. Go to [CloudWatch Logs](https://console.aws.amazon.com/cloudwatch/home#logsV2:log-groups) +2. Find the log group named `/aws/lambda/crawl4ai-function` +3. Click to see the latest log streams +4. Each stream contains logs from a function execution + +### Checking API Gateway Traffic + +To monitor API requests: + +1. Go to the [API Gateway console](https://console.aws.amazon.com/apigateway/home) +2. Select your API (`crawl4ai-api`) +3. Click "Dashboard" to see: + - API calls + - Latency + - Error rates + +## Conclusion + +You now have Crawl4ai running as a serverless function on AWS Lambda! This setup allows you to crawl websites on-demand without maintaining infrastructure, while paying only for the compute time you use. 
import json
import asyncio
import os

# Lambda's filesystem is read-only except /tmp, so redirect anything
# that writes under HOME or the crawl4ai base directory there. Must be
# set before crawl4ai is imported.
os.environ['CRAWL4_AI_BASE_DIRECTORY'] = '/tmp/.crawl4ai'
os.environ['HOME'] = '/tmp'

# Create directory if it doesn't exist
os.makedirs('/tmp/.crawl4ai', exist_ok=True)

from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode
)


def handler(event, context):
    """AWS Lambda entry point (API Gateway proxy integration).

    Expects a JSON request body with a required ``url`` and optional
    ``browser_config`` / ``crawler_config`` dictionaries.

    Returns:
        An API-Gateway-shaped response dict: 200 with the serialized
        crawl result, 400 when ``url`` is missing, or 500 with the
        error and traceback on failure.
    """
    try:
        # BUGFIX: API Gateway can deliver "body": null (e.g. an empty
        # POST). event.get('body', '{}') returns that None, and
        # json.loads(None) raises TypeError before we can answer with a
        # clean 400. `or '{}'` covers both a missing and a null body.
        body = json.loads(event.get('body') or '{}')

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'URL is required'})
            }

        # Get optional configurations or use defaults
        browser_config_dict = body.get('browser_config', {})
        crawler_config_dict = body.get('crawler_config', {})

        # Drive the async crawler to completion inside the sync handler.
        result = asyncio.run(crawl(url, browser_config_dict, crawler_config_dict))

        # Return successful response
        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json'
            },
            'body': json.dumps(result)
        }

    except Exception as e:
        # NOTE(review): returning the traceback to the caller is handy
        # for debugging but can leak internals -- consider logging it
        # instead for production.
        import traceback
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': str(e),
                'traceback': traceback.format_exc()
            })
        }

async def crawl(url, browser_config_dict, crawler_config_dict):
    """
    Run the crawler with the provided configurations, with Lambda-specific settings
    """
    # Parse (and thereby validate) the user-supplied browser config.
    # Its values are currently overridden below by the Lambda-required
    # settings, but an invalid dict still fails fast here.
    base_browser_config = BrowserConfig.load(browser_config_dict) if browser_config_dict else BrowserConfig()

    # Lambda-specific browser configuration: headless Chromium with the
    # usual flags for restricted container environments (no sandbox,
    # no /dev/shm, single process).
    browser_config = BrowserConfig(
        verbose=True,
        browser_type="chromium",
        headless=True,
        user_agent_mode="random",
        light_mode=True,
        use_managed_browser=False,
        extra_args=[
            "--headless=new",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-setuid-sandbox",
            "--remote-allow-origins=*",
            "--autoplay-policy=user-gesture-required",
            "--single-process",
        ],
    )

    # Start with user-provided crawler config but ensure cache is bypassed
    base_crawler_config = CrawlerRunConfig.load(crawler_config_dict) if crawler_config_dict else CrawlerRunConfig()

    # Apply Lambda-specific crawler configurations; the on-disk cache in
    # /tmp is ephemeral between invocations anyway, so always bypass it.
    crawler_config = CrawlerRunConfig(
        exclude_external_links=base_crawler_config.exclude_external_links,
        remove_overlay_elements=True,
        magic=True,
        cache_mode=CacheMode.BYPASS,
        # Carry over markdown generator and other settings
        markdown_generator=base_crawler_config.markdown_generator
    )

    # Perform the crawl with Lambda-optimized settings
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        # Return serializable results
        return result.model_dump()
@app.function(volumes={"/data": user_db})
def init_db() -> None:
    """Initialize database with indexes.

    Creates the unique lookup indexes on the users collection and the
    compound index used by usage-stat queries. Safe to call repeatedly:
    create_index is a no-op when the index already exists.
    """
    # NOTE(review): this connects to mongodb://localhost:27017, but the
    # image built above never installs or starts a mongod, and the
    # mounted /data volume is not referenced by this code -- confirm a
    # MongoDB server actually runs inside this container, otherwise
    # every DB function in this module will fail to connect.
    from pymongo import MongoClient, ASCENDING

    client = MongoClient("mongodb://localhost:27017")
    db = client.crawl4ai_db

    # Ensure indexes for faster lookups; `unique=True` is what lets
    # create_user detect duplicate emails/tokens.
    db.users.create_index([("api_token", ASCENDING)], unique=True)
    db.users.create_index([("email", ASCENDING)], unique=True)

    # Create usage stats collection (implicitly, via its index)
    db.usage_stats.create_index([("user_id", ASCENDING), ("timestamp", ASCENDING)])

    print("Database initialized with required indexes")
from pymongo import MongoClient + from bson.objectid import ObjectId + + client = MongoClient("mongodb://localhost:27017") + db = client.crawl4ai_db + + user = db.users.find_one({"api_token": api_token}) + if not user: + return None + + # Convert ObjectId to string for serialization + user["_id"] = str(user["_id"]) + return user + +@app.function(volumes={"/data": user_db}) +def create_user(email: str, name: str) -> Dict[str, Any]: + """Create a new user with initial credits.""" + from pymongo import MongoClient + from bson.objectid import ObjectId + + client = MongoClient("mongodb://localhost:27017") + db = client.crawl4ai_db + + # Generate API token + api_token = str(uuid.uuid4()) + + user = { + "email": email, + "name": name, + "api_token": api_token, + "credits": DEFAULT_CREDITS, + "created_at": datetime.utcnow(), + "updated_at": datetime.utcnow(), + "is_active": True + } + + try: + result = db.users.insert_one(user) + user["_id"] = str(result.inserted_id) + return user + except Exception as e: + if "duplicate key error" in str(e): + return {"error": "User with this email already exists"} + raise + +@app.function(volumes={"/data": user_db}) +def update_user_credits(api_token: str, amount: int) -> Dict[str, Any]: + """Update user credits (add or subtract).""" + from pymongo import MongoClient + + client = MongoClient("mongodb://localhost:27017") + db = client.crawl4ai_db + + # First get current user to check credits + user = db.users.find_one({"api_token": api_token}) + if not user: + return {"success": False, "error": "User not found"} + + # For deductions, ensure sufficient credits + if amount < 0 and user["credits"] + amount < 0: + return {"success": False, "error": "Insufficient credits"} + + # Update credits + result = db.users.update_one( + {"api_token": api_token}, + { + "$inc": {"credits": amount}, + "$set": {"updated_at": datetime.utcnow()} + } + ) + + if result.modified_count == 1: + # Get updated user + updated_user = db.users.find_one({"api_token": 
api_token}) + return { + "success": True, + "credits": updated_user["credits"] + } + else: + return {"success": False, "error": "Failed to update credits"} + +@app.function(volumes={"/data": user_db}) +def log_usage(user_id: str, url: str, success: bool, error: Optional[str] = None) -> None: + """Log usage statistics.""" + from pymongo import MongoClient + from bson.objectid import ObjectId + + client = MongoClient("mongodb://localhost:27017") + db = client.crawl4ai_db + + log_entry = { + "user_id": user_id, + "url": url, + "timestamp": datetime.utcnow(), + "success": success, + "error": error + } + + db.usage_stats.insert_one(log_entry) + +# Main crawling function +@app.function(timeout=300) # 5 minute timeout +async def crawl( + url: str, + browser_config: Optional[Dict[str, Any]] = None, + crawler_config: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """ + Crawl a given URL using Crawl4ai. + + Args: + url: The URL to crawl + browser_config: Optional browser configuration to override defaults + crawler_config: Optional crawler configuration to override defaults + + Returns: + A dictionary containing the crawl results + """ + from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CrawlResult + ) + + # Prepare browser config using the loader method + if browser_config is None: + browser_config = DEFAULT_BROWSER_CONFIG + browser_config_obj = BrowserConfig.load(browser_config) + + # Prepare crawler config using the loader method + if crawler_config is None: + crawler_config = DEFAULT_CRAWLER_CONFIG + crawler_config_obj = CrawlerRunConfig.load(crawler_config) + + # Perform the crawl + async with AsyncWebCrawler(config=browser_config_obj) as crawler: + result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj) + + # Return serializable results + try: + # Try newer Pydantic v2 method + return result.model_dump() + except AttributeError: + try: + # Try older Pydantic v1 method + return result.dict() + except 
AttributeError: + # Fallback to manual conversion + return { + "url": result.url, + "title": result.title, + "status": result.status, + "content": str(result.content) if hasattr(result, "content") else None, + "links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [], + "markdown_v2": { + "raw_markdown": result.markdown_v2.raw_markdown if hasattr(result, "markdown_v2") else None + } + } + +# API endpoints +@app.function() +@web_endpoint(method="POST") +def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]: + """ + Web endpoint that accepts POST requests with JSON data containing: + - api_token: User's API token + - url: The URL to crawl + - browser_config: Optional browser configuration + - crawler_config: Optional crawler configuration + + Returns the crawl results and remaining credits. + """ + # Extract and validate API token + api_token = data.get("api_token") + if not api_token: + return { + "success": False, + "error": "API token is required", + "status_code": 401 + } + + # Verify user + user = get_user_by_token.remote(api_token) + if not user: + return { + "success": False, + "error": "Invalid API token", + "status_code": 401 + } + + if not user.get("is_active", False): + return { + "success": False, + "error": "Account is inactive", + "status_code": 403 + } + + # Validate URL + url = data.get("url") + if not url: + return { + "success": False, + "error": "URL is required", + "status_code": 400 + } + + # Check credits + if user.get("credits", 0) <= 0: + return { + "success": False, + "error": "Insufficient credits", + "status_code": 403 + } + + # Deduct credit first (1 credit per call) + credit_result = update_user_credits.remote(api_token, -1) + if not credit_result.get("success", False): + return { + "success": False, + "error": credit_result.get("error", "Failed to process credits"), + "status_code": 500 + } + + # Extract configs + browser_config = data.get("browser_config") + crawler_config = 
data.get("crawler_config") + + # Perform crawl + try: + start_time = time.time() + result = crawl.remote(url, browser_config, crawler_config) + execution_time = time.time() - start_time + + # Log successful usage + log_usage.spawn(user["_id"], url, True) + + return { + "success": True, + "data": result, + "credits_remaining": credit_result.get("credits"), + "execution_time_seconds": round(execution_time, 2), + "status_code": 200 + } + except Exception as e: + # Log failed usage + log_usage.spawn(user["_id"], url, False, str(e)) + + # Return error + return { + "success": False, + "error": f"Crawling error: {str(e)}", + "credits_remaining": credit_result.get("credits"), + "status_code": 500 + } + +# Admin endpoints +@app.function(secrets=[admin_secret]) +@web_endpoint(method="POST") +def admin_create_user(data: Dict[str, Any]) -> Dict[str, Any]: + """Admin endpoint to create new users.""" + # Validate admin token + admin_token = data.get("admin_token") + if admin_token != os.environ.get("ADMIN_TOKEN"): + return { + "success": False, + "error": "Invalid admin token", + "status_code": 401 + } + + # Validate input + email = data.get("email") + name = data.get("name") + + if not email or not name: + return { + "success": False, + "error": "Email and name are required", + "status_code": 400 + } + + # Create user + user = create_user.remote(email, name) + + if "error" in user: + return { + "success": False, + "error": user["error"], + "status_code": 400 + } + + return { + "success": True, + "data": { + "user_id": user["_id"], + "email": user["email"], + "name": user["name"], + "api_token": user["api_token"], + "credits": user["credits"], + "created_at": user["created_at"].isoformat() if isinstance(user["created_at"], datetime) else user["created_at"] + }, + "status_code": 201 + } + +@app.function(secrets=[admin_secret]) +@web_endpoint(method="POST") +def admin_update_credits(data: Dict[str, Any]) -> Dict[str, Any]: + """Admin endpoint to update user credits.""" + # 
Validate admin token + admin_token = data.get("admin_token") + if admin_token != os.environ.get("ADMIN_TOKEN"): + return { + "success": False, + "error": "Invalid admin token", + "status_code": 401 + } + + # Validate input + api_token = data.get("api_token") + amount = data.get("amount") + + if not api_token: + return { + "success": False, + "error": "API token is required", + "status_code": 400 + } + + if not isinstance(amount, int): + return { + "success": False, + "error": "Amount must be an integer", + "status_code": 400 + } + + # Update credits + result = update_user_credits.remote(api_token, amount) + + if not result.get("success", False): + return { + "success": False, + "error": result.get("error", "Failed to update credits"), + "status_code": 400 + } + + return { + "success": True, + "data": { + "credits": result["credits"] + }, + "status_code": 200 + } + +@app.function(secrets=[admin_secret]) +@web_endpoint(method="GET") +def admin_get_users(admin_token: str) -> Dict[str, Any]: + """Admin endpoint to list all users.""" + # Validate admin token + if admin_token != os.environ.get("ADMIN_TOKEN"): + return { + "success": False, + "error": "Invalid admin token", + "status_code": 401 + } + + users = get_all_users.remote() + + return { + "success": True, + "data": users, + "status_code": 200 + } + +@app.function(volumes={"/data": user_db}) +def get_all_users() -> List[Dict[str, Any]]: + """Get all users (for admin).""" + from pymongo import MongoClient + + client = MongoClient("mongodb://localhost:27017") + db = client.crawl4ai_db + + users = [] + for user in db.users.find(): + # Convert ObjectId to string + user["_id"] = str(user["_id"]) + # Convert datetime to ISO format + for field in ["created_at", "updated_at"]: + if field in user and isinstance(user[field], datetime): + user[field] = user[field].isoformat() + users.append(user) + + return users + +# Public endpoints +@app.function() +@web_endpoint(method="GET") +def health_check() -> Dict[str, Any]: + 
"""Health check endpoint.""" + return { + "status": "online", + "service": APP_NAME, + "version": CRAWL4AI_VERSION, + "timestamp": datetime.utcnow().isoformat() + } + +@app.function() +@web_endpoint(method="GET") +def check_credits(api_token: str) -> Dict[str, Any]: + """Check user credits.""" + if not api_token: + return { + "success": False, + "error": "API token is required", + "status_code": 401 + } + + user = get_user_by_token.remote(api_token) + if not user: + return { + "success": False, + "error": "Invalid API token", + "status_code": 401 + } + + return { + "success": True, + "data": { + "credits": user["credits"], + "email": user["email"], + "name": user["name"] + }, + "status_code": 200 + } + +# Local entrypoint for testing +@app.local_entrypoint() +def main(url: str = "https://www.modal.com"): + """Command line entrypoint for local testing.""" + print("Initializing database...") + init_db.remote() + + print(f"Testing crawl on URL: {url}") + result = crawl.remote(url) + + # Print sample of result + print("\nCrawl Result Sample:") + if "title" in result: + print(f"Title: {result['title']}") + if "status" in result: + print(f"Status: {result['status']}") + if "links" in result: + print(f"Links found: {len(result['links'])}") + if "markdown_v2" in result and result["markdown_v2"] and "raw_markdown" in result["markdown_v2"]: + print("\nMarkdown Preview (first 300 chars):") + print(result["markdown_v2"]["raw_markdown"][:300] + "...") \ No newline at end of file diff --git a/deploy/modal/entry.py b/deploy/modal/entry.py new file mode 100644 index 00000000..1721487d --- /dev/null +++ b/deploy/modal/entry.py @@ -0,0 +1,127 @@ +import modal +from typing import Optional, Dict, Any + +# Create a custom image with Crawl4ai and its dependencies +# "pip install crawl4ai", +image = modal.Image.debian_slim(python_version="3.10").pip_install(["fastapi[standard]"]).run_commands( + "apt-get update", + "apt-get install -y software-properties-common", + "apt-get install -y 
git", + "apt-add-repository non-free", + "apt-add-repository contrib", + "pip install -U git+https://github.com/unclecode/crawl4ai.git@next", + "pip install -U fastapi[standard]", + "pip install -U pydantic", + "crawl4ai-setup", # This installs playwright and downloads chromium + # Print fastpi version + "python -m fastapi --version", +) + +# Define the app +app = modal.App("crawl4ai", image=image) + +# Define default configurations +DEFAULT_BROWSER_CONFIG = { + "headless": True, + "verbose": False, +} + +DEFAULT_CRAWLER_CONFIG = { + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} + +@app.function(timeout=300) # 5 minute timeout +async def crawl( + url: str, + browser_config: Optional[Dict[str, Any]] = None, + crawler_config: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """ + Crawl a given URL using Crawl4ai. 
+ + Args: + url: The URL to crawl + browser_config: Optional browser configuration to override defaults + crawler_config: Optional crawler configuration to override defaults + + Returns: + A dictionary containing the crawl results + """ + from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CrawlResult + ) + + + # Prepare browser config using the loader method + if browser_config is None: + browser_config = DEFAULT_BROWSER_CONFIG + browser_config_obj = BrowserConfig.load(browser_config) + + # Prepare crawler config using the loader method + if crawler_config is None: + crawler_config = DEFAULT_CRAWLER_CONFIG + crawler_config_obj = CrawlerRunConfig.load(crawler_config) + + + # Perform the crawl + async with AsyncWebCrawler(config=browser_config_obj) as crawler: + result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj) + + # Return serializable results + try: + # Try newer Pydantic v2 method + return result.model_dump() + except AttributeError: + try: + # Try older Pydantic v1 method + return result.__dict__ + except AttributeError: + # Fallback to returning the raw result + return result + +@app.function() +@modal.web_endpoint(method="POST") +def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]: + """ + Web endpoint that accepts POST requests with JSON data containing: + - url: The URL to crawl + - browser_config: Optional browser configuration + - crawler_config: Optional crawler configuration + + Returns the crawl results. + """ + url = data.get("url") + if not url: + return {"error": "URL is required"} + + browser_config = data.get("browser_config") + crawler_config = data.get("crawler_config") + + return crawl.remote(url, browser_config, crawler_config) + +@app.local_entrypoint() +def main(url: str = "https://www.modal.com"): + """ + Command line entrypoint for local testing. 
+ """ + result = crawl.remote(url) + print(result) diff --git a/deploy/modal/guide.md b/deploy/modal/guide.md new file mode 100644 index 00000000..7930bce0 --- /dev/null +++ b/deploy/modal/guide.md @@ -0,0 +1,453 @@ +# Deploying Crawl4ai with Modal: A Comprehensive Tutorial + +Hey there! UncleCode here. I'm excited to show you how to deploy Crawl4ai using Modal - a fantastic serverless platform that makes deployment super simple and scalable. + +In this tutorial, I'll walk you through deploying your own Crawl4ai instance on Modal's infrastructure. This will give you a powerful, scalable web crawling solution without having to worry about infrastructure management. + +## What is Modal? + +Modal is a serverless platform that allows you to run Python functions in the cloud without managing servers. It's perfect for deploying Crawl4ai because: + +1. It handles all the infrastructure for you +2. It scales automatically based on demand +3. It makes deployment incredibly simple + +## Prerequisites + +Before we get started, you'll need: + +- A Modal account (sign up at [modal.com](https://modal.com)) +- Python 3.10 or later installed on your local machine +- Basic familiarity with Python and command-line operations + +## Step 1: Setting Up Your Modal Account + +First, sign up for a Modal account at [modal.com](https://modal.com) if you haven't already. Modal offers a generous free tier that's perfect for getting started. + +After signing up, install the Modal CLI and authenticate: + +```bash +pip install modal +modal token new +``` + +This will open a browser window where you can authenticate and generate a token for the CLI. 
+ +## Step 2: Creating Your Crawl4ai Deployment + +Now, let's create a Python file called `crawl4ai_modal.py` with our deployment code: + +```python +import modal +from typing import Optional, Dict, Any + +# Create a custom image with Crawl4ai and its dependencies +image = modal.Image.debian_slim(python_version="3.10").pip_install( + ["fastapi[standard]"] +).run_commands( + "apt-get update", + "apt-get install -y software-properties-common", + "apt-get install -y git", + "apt-add-repository non-free", + "apt-add-repository contrib", + "pip install -U crawl4ai", + "pip install -U fastapi[standard]", + "pip install -U pydantic", + "crawl4ai-setup", # This installs playwright and downloads chromium +) + +# Define the app +app = modal.App("crawl4ai", image=image) + +# Define default configurations +DEFAULT_BROWSER_CONFIG = { + "headless": True, + "verbose": False, +} + +DEFAULT_CRAWLER_CONFIG = { + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} + +@app.function(timeout=300) # 5 minute timeout +async def crawl( + url: str, + browser_config: Optional[Dict[str, Any]] = None, + crawler_config: Optional[Dict[str, Any]] = None, +) -> Dict[str, Any]: + """ + Crawl a given URL using Crawl4ai. 
+ + Args: + url: The URL to crawl + browser_config: Optional browser configuration to override defaults + crawler_config: Optional crawler configuration to override defaults + + Returns: + A dictionary containing the crawl results + """ + from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + CrawlResult + ) + + # Prepare browser config using the loader method + if browser_config is None: + browser_config = DEFAULT_BROWSER_CONFIG + browser_config_obj = BrowserConfig.load(browser_config) + + # Prepare crawler config using the loader method + if crawler_config is None: + crawler_config = DEFAULT_CRAWLER_CONFIG + crawler_config_obj = CrawlerRunConfig.load(crawler_config) + + # Perform the crawl + async with AsyncWebCrawler(config=browser_config_obj) as crawler: + result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj) + + # Return serializable results + try: + # Try newer Pydantic v2 method + return result.model_dump() + except AttributeError: + try: + # Try older Pydantic v1 method + return result.dict() + except AttributeError: + # Fallback to manual conversion + return { + "url": result.url, + "title": result.title, + "status": result.status, + "content": str(result.content) if hasattr(result, "content") else None, + "links": [{"url": link.url, "text": link.text} for link in result.links] if hasattr(result, "links") else [], + "markdown_v2": { + "raw_markdown": result.markdown_v2.raw_markdown if hasattr(result, "markdown_v2") else None + } + } + +@app.function() +@modal.web_endpoint(method="POST") +def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]: + """ + Web endpoint that accepts POST requests with JSON data containing: + - url: The URL to crawl + - browser_config: Optional browser configuration + - crawler_config: Optional crawler configuration + + Returns the crawl results. 
+ """ + url = data.get("url") + if not url: + return {"error": "URL is required"} + + browser_config = data.get("browser_config") + crawler_config = data.get("crawler_config") + + return crawl.remote(url, browser_config, crawler_config) + +@app.local_entrypoint() +def main(url: str = "https://www.modal.com"): + """ + Command line entrypoint for local testing. + """ + result = crawl.remote(url) + print(result) +``` + +## Step 3: Understanding the Code Components + +Let's break down what's happening in this code: + +### 1. Image Definition + +```python +image = modal.Image.debian_slim(python_version="3.10").pip_install( + ["fastapi[standard]"] +).run_commands( + "apt-get update", + "apt-get install -y software-properties-common", + "apt-get install -y git", + "apt-add-repository non-free", + "apt-add-repository contrib", + "pip install -U git+https://github.com/unclecode/crawl4ai.git@next", + "pip install -U fastapi[standard]", + "pip install -U pydantic", + "crawl4ai-setup", # This installs playwright and downloads chromium +) +``` + +This section defines the container image that Modal will use to run your code. It: +- Starts with a Debian Slim base image with Python 3.10 +- Installs FastAPI +- Updates the system packages +- Installs Git and other dependencies +- Installs Crawl4ai from the GitHub repository +- Runs the Crawl4ai setup to install Playwright and download Chromium + +### 2. Modal App Definition + +```python +app = modal.App("crawl4ai", image=image) +``` + +This creates a Modal application named "crawl4ai" that uses the image we defined above. + +### 3. 
Default Configurations + +```python +DEFAULT_BROWSER_CONFIG = { + "headless": True, + "verbose": False, +} + +DEFAULT_CRAWLER_CONFIG = { + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.48, + "threshold_type": "fixed" + } + } + } + } + } + } +} +``` + +These define the default configurations for the browser and crawler. You can customize these settings based on your specific needs. + +### 4. The Crawl Function + +```python +@app.function(timeout=300) +async def crawl(url, browser_config, crawler_config): + # Function implementation +``` + +This is the main function that performs the crawling. It: +- Takes a URL and optional configurations +- Sets up the browser and crawler with those configurations +- Performs the crawl +- Returns the results in a serializable format + +The `@app.function(timeout=300)` decorator tells Modal to run this function in the cloud with a 5-minute timeout. + +### 5. The Web Endpoint + +```python +@app.function() +@modal.web_endpoint(method="POST") +def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]: + # Function implementation +``` + +This creates a web endpoint that accepts POST requests. It: +- Extracts the URL and configurations from the request +- Calls the crawl function with those parameters +- Returns the results + +### 6. Local Entrypoint + +```python +@app.local_entrypoint() +def main(url: str = "https://www.modal.com"): + # Function implementation +``` + +This provides a way to test the application from the command line. + +## Step 4: Testing Locally + +Before deploying, let's test our application locally: + +```bash +modal run crawl4ai_modal.py --url "https://example.com" +``` + +This command will: +1. Upload your code to Modal +2. Create the necessary containers +3. Run the `main` function with the specified URL +4. 
Return the results + +Modal will handle all the infrastructure setup for you. You should see the crawling results printed to your console. + +## Step 5: Deploying Your Application + +Once you're satisfied with the local testing, it's time to deploy: + +```bash +modal deploy crawl4ai_modal.py +``` + +This will deploy your application to Modal's cloud. The deployment process will output URLs for your web endpoints. + +You should see output similar to: + +``` +✓ Deployed crawl4ai. + URLs: + crawl_endpoint => https://your-username--crawl-endpoint.modal.run +``` + +Save this URL - you'll need it to make requests to your deployment. + +## Step 6: Using Your Deployment + +Now that your application is deployed, you can use it by sending POST requests to the endpoint URL: + +```bash +curl -X POST https://your-username--crawl-endpoint.modal.run \ + -H "Content-Type: application/json" \ + -d '{"url": "https://example.com"}' +``` + +Or in Python: + +```python +import requests + +response = requests.post( + "https://your-username--crawl-endpoint.modal.run", + json={"url": "https://example.com"} +) + +result = response.json() +print(result) +``` + +You can also customize the browser and crawler configurations: + +```python +requests.post( + "https://your-username--crawl-endpoint.modal.run", + json={ + "url": "https://example.com", + "browser_config": { + "headless": False, + "verbose": True + }, + "crawler_config": { + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": { + "threshold": 0.6, # Adjusted threshold + "threshold_type": "fixed" + } + } + } + } + } + } + } + } +) +``` + +## Step 7: Calling Your Deployment from Another Python Script + +You can also call your deployed function directly from another Python script: + +```python +import modal + +# Get a reference to the deployed function +crawl_function = 
modal.Function.from_name("crawl4ai", "crawl") + +# Call the function +result = crawl_function.remote("https://example.com") +print(result) +``` + +## Understanding Modal's Execution Flow + +To understand how Modal works, it's important to know: + +1. **Local vs. Remote Execution**: When you call a function with `.remote()`, it runs in Modal's cloud, not on your local machine. + +2. **Container Lifecycle**: Modal creates containers on-demand and destroys them when they're not needed. + +3. **Caching**: Modal caches your container images to speed up subsequent runs. + +4. **Serverless Scaling**: Modal automatically scales your application based on demand. + +## Customizing Your Deployment + +You can customize your deployment in several ways: + +### Changing the Crawl4ai Version + +To use a different version of Crawl4ai, update the installation command in the image definition: + +```python +"pip install -U git+https://github.com/unclecode/crawl4ai.git@main", # Use main branch +``` + +### Adjusting Resource Limits + +You can change the resources allocated to your functions: + +```python +@app.function(timeout=600, cpu=2, memory=4096) # 10 minute timeout, 2 CPUs, 4GB RAM +async def crawl(...): + # Function implementation +``` + +### Keeping Containers Warm + +To reduce cold start times, you can keep containers warm: + +```python +@app.function(keep_warm=1) # Keep 1 container warm +async def crawl(...): + # Function implementation +``` + +## Conclusion + +That's it! You've successfully deployed Crawl4ai on Modal. You now have a scalable web crawling solution that can handle as many requests as you need without requiring any infrastructure management. + +The beauty of this setup is its simplicity - Modal handles all the hard parts, letting you focus on using Crawl4ai to extract the data you need. + +Feel free to reach out if you have any questions or need help with your deployment! + +Happy crawling! 
+- UncleCode + +## Additional Resources + +- [Modal Documentation](https://modal.com/docs) +- [Crawl4ai GitHub Repository](https://github.com/unclecode/crawl4ai) +- [Crawl4ai Documentation](https://docs.crawl4ai.com) diff --git a/deploy/modal/test_modal.py b/deploy/modal/test_modal.py new file mode 100644 index 00000000..f844f2d1 --- /dev/null +++ b/deploy/modal/test_modal.py @@ -0,0 +1,317 @@ + +#!/usr/bin/env python3 +""" +Crawl4ai API Testing Script + +This script tests all endpoints of the Crawl4ai API service and demonstrates their usage. +""" + +import argparse +import json +import sys +import time +from typing import Dict, Any, List, Optional + +import requests + +# Colors for terminal output +class Colors: + HEADER = '\033[95m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + +def print_header(text: str) -> None: + """Print a formatted header.""" + print(f"\n{Colors.HEADER}{Colors.BOLD}{'=' * 80}{Colors.ENDC}") + print(f"{Colors.HEADER}{Colors.BOLD}{text.center(80)}{Colors.ENDC}") + print(f"{Colors.HEADER}{Colors.BOLD}{'=' * 80}{Colors.ENDC}\n") + +def print_step(text: str) -> None: + """Print a formatted step description.""" + print(f"{Colors.BLUE}{Colors.BOLD}>> {text}{Colors.ENDC}") + +def print_success(text: str) -> None: + """Print a success message.""" + print(f"{Colors.GREEN}✓ {text}{Colors.ENDC}") + +def print_warning(text: str) -> None: + """Print a warning message.""" + print(f"{Colors.YELLOW}⚠ {text}{Colors.ENDC}") + +def print_error(text: str) -> None: + """Print an error message.""" + print(f"{Colors.RED}✗ {text}{Colors.ENDC}") + +def print_json(data: Dict[str, Any]) -> None: + """Pretty print JSON data.""" + print(json.dumps(data, indent=2)) + +def make_request(method: str, url: str, params: Optional[Dict[str, Any]] = None, + json_data: Optional[Dict[str, Any]] = None, + expected_status: int = 200) -> Dict[str, Any]: + """Make an HTTP request and 
handle errors.""" + print_step(f"Making {method.upper()} request to {url}") + + if params: + print(f" Parameters: {params}") + if json_data: + print(f" JSON Data: {json_data}") + + try: + response = requests.request( + method=method, + url=url, + params=params, + json=json_data, + timeout=300 # 5 minute timeout for crawling operations + ) + + status_code = response.status_code + print(f" Status Code: {status_code}") + + try: + data = response.json() + print(" Response:") + print_json(data) + + if status_code != expected_status: + print_error(f"Expected status code {expected_status}, got {status_code}") + return data + + print_success("Request successful") + return data + except ValueError: + print_error("Response is not valid JSON") + print(response.text) + return {"error": "Invalid JSON response"} + + except requests.RequestException as e: + print_error(f"Request failed: {str(e)}") + return {"error": str(e)} + +def test_health_check(base_url: str) -> bool: + """Test the health check endpoint.""" + print_header("Testing Health Check Endpoint") + + response = make_request("GET", f"{base_url}/health_check") + + if "status" in response and response["status"] == "online": + print_success("Health check passed") + return True + else: + print_error("Health check failed") + return False + +def test_admin_create_user(base_url: str, admin_token: str, email: str, name: str) -> Optional[str]: + """Test creating a new user.""" + print_header("Testing Admin User Creation") + + response = make_request( + "POST", + f"{base_url}/admin_create_user", + json_data={ + "admin_token": admin_token, + "email": email, + "name": name + }, + expected_status=201 + ) + + if response.get("success") and "data" in response: + api_token = response["data"].get("api_token") + if api_token: + print_success(f"User created successfully with API token: {api_token}") + return api_token + + print_error("Failed to create user") + return None + +def test_check_credits(base_url: str, api_token: str) -> 
Optional[int]: + """Test checking user credits.""" + print_header("Testing Check Credits Endpoint") + + response = make_request( + "GET", + f"{base_url}/check_credits", + params={"api_token": api_token} + ) + + if response.get("success") and "data" in response: + credits = response["data"].get("credits") + if credits is not None: + print_success(f"User has {credits} credits") + return credits + + print_error("Failed to check credits") + return None + +def test_crawl_endpoint(base_url: str, api_token: str, url: str) -> bool: + """Test the crawl endpoint.""" + print_header("Testing Crawl Endpoint") + + response = make_request( + "POST", + f"{base_url}/crawl_endpoint", + json_data={ + "api_token": api_token, + "url": url + } + ) + + if response.get("success") and "data" in response: + print_success("Crawl completed successfully") + + # Display some crawl result data + data = response["data"] + if "title" in data: + print(f"Page Title: {data['title']}") + if "status" in data: + print(f"Status: {data['status']}") + if "links" in data: + print(f"Links found: {len(data['links'])}") + if "markdown_v2" in data and data["markdown_v2"] and "raw_markdown" in data["markdown_v2"]: + print("Markdown Preview (first 200 chars):") + print(data["markdown_v2"]["raw_markdown"][:200] + "...") + + credits_remaining = response.get("credits_remaining") + if credits_remaining is not None: + print(f"Credits remaining: {credits_remaining}") + + return True + + print_error("Crawl failed") + return False + +def test_admin_update_credits(base_url: str, admin_token: str, api_token: str, amount: int) -> bool: + """Test updating user credits.""" + print_header("Testing Admin Update Credits") + + response = make_request( + "POST", + f"{base_url}/admin_update_credits", + json_data={ + "admin_token": admin_token, + "api_token": api_token, + "amount": amount + } + ) + + if response.get("success") and "data" in response: + print_success(f"Credits updated successfully, new balance: 
{response['data'].get('credits')}") + return True + + print_error("Failed to update credits") + return False + +def test_admin_get_users(base_url: str, admin_token: str) -> List[Dict[str, Any]]: + """Test getting all users.""" + print_header("Testing Admin Get All Users") + + response = make_request( + "GET", + f"{base_url}/admin_get_users", + params={"admin_token": admin_token} + ) + + if response.get("success") and "data" in response: + users = response["data"] + print_success(f"Retrieved {len(users)} users") + return users + + print_error("Failed to get users") + return [] + +def run_full_test(base_url: str, admin_token: str) -> None: + """Run all tests in sequence.""" + # Remove trailing slash if present + base_url = base_url.rstrip('/') + + # Test 1: Health Check + if not test_health_check(base_url): + print_error("Health check failed, aborting tests") + sys.exit(1) + + # Test 2: Create a test user + email = f"test-user-{int(time.time())}@example.com" + name = "Test User" + api_token = test_admin_create_user(base_url, admin_token, email, name) + + if not api_token: + print_error("User creation failed, aborting tests") + sys.exit(1) + + # Test 3: Check initial credits + initial_credits = test_check_credits(base_url, api_token) + + if initial_credits is None: + print_error("Credit check failed, aborting tests") + sys.exit(1) + + # Test 4: Perform a crawl + test_url = "https://news.ycombinator.com" + crawl_success = test_crawl_endpoint(base_url, api_token, test_url) + + if not crawl_success: + print_warning("Crawl test failed, but continuing with other tests") + + # Test 5: Check credits after crawl + post_crawl_credits = test_check_credits(base_url, api_token) + + if post_crawl_credits is not None and initial_credits is not None: + if post_crawl_credits == initial_credits - 1: + print_success("Credit deduction verified") + else: + print_warning(f"Unexpected credit change: {initial_credits} -> {post_crawl_credits}") + + # Test 6: Add credits + add_credits_amount 
= 50 + if test_admin_update_credits(base_url, admin_token, api_token, add_credits_amount): + print_success(f"Added {add_credits_amount} credits") + + # Test 7: Check credits after addition + post_addition_credits = test_check_credits(base_url, api_token) + + if post_addition_credits is not None and post_crawl_credits is not None: + if post_addition_credits == post_crawl_credits + add_credits_amount: + print_success("Credit addition verified") + else: + print_warning(f"Unexpected credit change: {post_crawl_credits} -> {post_addition_credits}") + + # Test 8: Get all users + users = test_admin_get_users(base_url, admin_token) + + if users: + # Check if our test user is in the list + test_user = next((user for user in users if user.get("email") == email), None) + if test_user: + print_success("Test user found in users list") + else: + print_warning("Test user not found in users list") + + # Final report + print_header("Test Summary") + + print_success("All endpoints tested successfully") + print(f"Test user created with email: {email}") + print(f"API token: {api_token}") + print(f"Final credit balance: {post_addition_credits}") + +def main(): + parser = argparse.ArgumentParser(description="Test Crawl4ai API endpoints") + parser.add_argument("--base-url", required=True, help="Base URL of the Crawl4ai API (e.g., https://username--crawl4ai-api.modal.run)") + parser.add_argument("--admin-token", required=True, help="Admin token for authentication") + + args = parser.parse_args() + + print_header("Crawl4ai API Test Script") + print(f"Testing API at: {args.base_url}") + + run_full_test(args.base_url, args.admin_token) + +if __name__ == "__main__": + main() \ No newline at end of file