Add all 5 deployments solution for testing

This commit is contained in:
UncleCode
2025-03-10 18:57:14 +08:00
parent 9547bada3a
commit 3ea3c0520d
38 changed files with 6431 additions and 0 deletions

View File

@@ -0,0 +1,63 @@
FROM --platform=linux/amd64 python:3.10-slim
# Install system dependencies required for Chromium and Git
RUN apt-get update && apt-get install -y \
python3-dev \
pkg-config \
libjpeg-dev \
gcc \
build-essential \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libxkbcommon0 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libasound2 \
libpango-1.0-0 \
libcairo2 \
procps \
git \
socat \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Make a directory for crawl4ai call it crawl4ai_repo
# RUN mkdir crawl4ai_repo
# # Clone Crawl4ai from the next branch and install it
# RUN git clone --branch next https://github.com/unclecode/crawl4ai.git ./crawl4ai_repo \
# && cd crawl4ai_repo \
# && pip install . \
# && cd .. \
# && rm -rf crawl4ai_repo
RUN python3 -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
# RUN pip install git+https://github.com/unclecode/crawl4ai.git@next
# Copy requirements and install remaining dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
# Copy application files
COPY resources /app/resources
COPY main.py .
COPY start.sh .
# Set permissions for Chrome binary and start script
RUN chmod +x /app/resources/chrome/headless_shell && \
chmod -R 755 /app/resources/chrome && \
chmod +x start.sh
ENV FUNCTION_TARGET=crawl
EXPOSE 8080 9223
CMD ["/app/start.sh"]

View File

@@ -0,0 +1,8 @@
project_id: PROJECT_ID
region: REGION_NAME
artifact_repo: ARTIFACT_REPO_NAME
function_name: FUNCTION_NAME
memory: "2048MB"
timeout: "540s"
local_image: "gcr.io/ARTIFACT_REPO_NAME/crawl4ai:latest"
test_query_url: "https://example.com"

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python3
import argparse
import subprocess
import sys
import yaml
import requests
def run_command(cmd, explanation, require_confirm=True, allow_already_exists=False):
print("\n=== {} ===".format(explanation))
if require_confirm:
input("Press Enter to run: [{}]\n".format(cmd))
print("Running: {}".format(cmd))
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
if result.returncode != 0:
if allow_already_exists and "ALREADY_EXISTS" in result.stderr:
print("Repository already exists, skipping creation.")
return ""
print("Error:\n{}".format(result.stderr))
sys.exit(1)
out = result.stdout.strip()
if out:
print("Output:\n{}".format(out))
return out
def load_config():
try:
with open("config.yml", "r") as f:
config = yaml.safe_load(f)
except Exception as e:
print("Failed to load config.yml: {}".format(e))
sys.exit(1)
required = ["project_id", "region", "artifact_repo", "function_name", "local_image"]
for key in required:
if key not in config or not config[key]:
print("Missing required config parameter: {}".format(key))
sys.exit(1)
return config
def deploy_function(config):
project_id = config["project_id"]
region = config["region"]
artifact_repo = config["artifact_repo"]
function_name = config["function_name"]
memory = config.get("memory", "2048MB")
timeout = config.get("timeout", "540s")
local_image = config["local_image"]
test_query_url = config.get("test_query_url", "https://example.com")
# Repository image format: "<region>-docker.pkg.dev/<project_id>/<artifact_repo>/<function_name>:latest"
repo_image = f"{region}-docker.pkg.dev/{project_id}/{artifact_repo}/{function_name}:latest"
# 1. Create Artifact Registry repository (skip if exists)
cmd = f"gcloud artifacts repositories create {artifact_repo} --repository-format=docker --location={region} --project={project_id}"
run_command(cmd, "Creating Artifact Registry repository (if it doesn't exist)", allow_already_exists=True)
# 2. Tag the local Docker image with the repository image name
cmd = f"docker tag {local_image} {repo_image}"
run_command(cmd, "Tagging Docker image for Artifact Registry")
# 3. Authenticate Docker to Artifact Registry
cmd = f"gcloud auth configure-docker {region}-docker.pkg.dev"
run_command(cmd, "Authenticating Docker to Artifact Registry")
# 4. Push the tagged Docker image to Artifact Registry
cmd = f"docker push {repo_image}"
run_command(cmd, "Pushing Docker image to Artifact Registry")
# 5. Deploy the Cloud Function using the custom container
cmd = (
f"gcloud beta functions deploy {function_name} "
f"--gen2 "
f"--runtime=python310 "
f"--entry-point=crawl "
f"--region={region} "
f"--docker-repository={region}-docker.pkg.dev/{project_id}/{artifact_repo} "
f"--trigger-http "
f"--memory={memory} "
f"--timeout={timeout} "
f"--project={project_id}"
)
run_command(cmd, "Deploying Cloud Function using custom container")
# 6. Set the Cloud Function to allow public (unauthenticated) invocations
cmd = (
f"gcloud functions add-iam-policy-binding {function_name} "
f"--region={region} "
f"--member='allUsers' "
f"--role='roles/cloudfunctions.invoker' "
f"--project={project_id}"
f"--quiet"
)
run_command(cmd, "Setting Cloud Function IAM to allow public invocations")
# 7. Retrieve the deployed Cloud Function URL
cmd = (
f"gcloud functions describe {function_name} "
f"--region={region} "
f"--project={project_id} "
f"--format='value(serviceConfig.uri)'"
)
deployed_url = run_command(cmd, "Extracting deployed Cloud Function URL", require_confirm=False)
print("\nDeployed URL: {}\n".format(deployed_url))
# 8. Test the deployed function
test_url = f"{deployed_url}?url={test_query_url}"
print("Testing function with: {}".format(test_url))
try:
response = requests.get(test_url)
print("Response status: {}".format(response.status_code))
print("Response body:\n{}".format(response.text))
if response.status_code == 200:
print("Test successful!")
else:
print("Non-200 response; check function logs.")
except Exception as e:
print("Test request error: {}".format(e))
sys.exit(1)
# 9. Final usage help
print("\nDeployment complete!")
print("Invoke your function with:")
print(f"curl '{deployed_url}?url={test_query_url}'")
print("For further instructions, refer to your documentation.")
def delete_function(config):
project_id = config["project_id"]
region = config["region"]
function_name = config["function_name"]
cmd = f"gcloud functions delete {function_name} --region={region} --project={project_id} --quiet"
run_command(cmd, "Deleting Cloud Function")
def describe_function(config):
project_id = config["project_id"]
region = config["region"]
function_name = config["function_name"]
cmd = (
f"gcloud functions describe {function_name} "
f"--region={region} "
f"--project={project_id} "
f"--format='value(serviceConfig.uri)'"
)
deployed_url = run_command(cmd, "Describing Cloud Function to extract URL", require_confirm=False)
print("\nCloud Function URL: {}\n".format(deployed_url))
def clear_all(config):
print("\n=== CLEAR ALL RESOURCES ===")
project_id = config["project_id"]
region = config["region"]
artifact_repo = config["artifact_repo"]
confirm = input("WARNING: This will DELETE the Cloud Function and the Artifact Registry repository. Are you sure? (y/N): ")
if confirm.lower() != "y":
print("Aborting clear operation.")
sys.exit(0)
# Delete the Cloud Function
delete_function(config)
# Delete the Artifact Registry repository
cmd = f"gcloud artifacts repositories delete {artifact_repo} --location={region} --project={project_id} --quiet"
run_command(cmd, "Deleting Artifact Registry repository", require_confirm=False)
print("All resources cleared.")
def main():
parser = argparse.ArgumentParser(description="Deploy, delete, describe, or clear Cloud Function resources using config.yml")
subparsers = parser.add_subparsers(dest="command", required=True)
subparsers.add_parser("deploy", help="Deploy the Cloud Function")
subparsers.add_parser("delete", help="Delete the deployed Cloud Function")
subparsers.add_parser("describe", help="Describe the Cloud Function and return its URL")
subparsers.add_parser("clear", help="Delete the Cloud Function and Artifact Registry repository")
args = parser.parse_args()
config = load_config()
if args.command == "deploy":
deploy_function(config)
elif args.command == "delete":
delete_function(config)
elif args.command == "describe":
describe_function(config)
elif args.command == "clear":
clear_all(config)
else:
parser.print_help()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,204 @@
# Deploying Crawl4ai on Google Cloud Functions
This guide explains how to deploy **Crawl4ai**—an opensource web crawler library—on Google Cloud Functions Gen2 using a custom container. We assume your project folder already includes:
- **Dockerfile:** Builds your container image (which installs Crawl4ai from its Git repository).
- **start.sh:** Activates your virtual environment and starts the function (using the Functions Framework).
- **main.py:** Contains your function logic with the entry point `crawl` (and imports Crawl4ai).
The guide is divided into two parts:
1. Manual deployment steps (using CLI commands)
2. Automated deployment using a Python script (`deploy.py`)
---
## Part 1: Manual Deployment Process
### Prerequisites
- **Google Cloud Project:** Ensure your project is active and billing is enabled.
- **Google Cloud CLI & Docker:** Installed and configured on your local machine.
- **Permissions:** You must have rights to create Cloud Functions and Artifact Registry repositories.
- **Files:** Your Dockerfile, start.sh, and main.py should be in the same directory.
### Step 1: Build Your Docker Image
Your Dockerfile packages Crawl4ai along with all its dependencies. Build your image with:
```bash
docker build -t gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest .
```
Replace `<PROJECT_ID>` with your Google Cloud project ID and `<FUNCTION_NAME>` with your chosen function name (for example, `crawl4ai-t1`).
### Step 2: Create an Artifact Registry Repository
Cloud Functions Gen2 requires your custom container image to reside in an Artifact Registry repository. Create one by running:
```bash
gcloud artifacts repositories create <ARTIFACT_REPO> \
--repository-format=docker \
--location=<REGION> \
--project=<PROJECT_ID>
```
Replace `<ARTIFACT_REPO>` (for example, `crawl4ai`) and `<REGION>` (for example, `asia-east1`).
> **Note:** If you receive an `ALREADY_EXISTS` error, the repository is already created; simply proceed to the next step.
### Step 3: Tag Your Docker Image
Tag your locally built Docker image so it matches the Artifact Registry format:
```bash
docker tag gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
```
This step “renames” the image so you can push it to your repository.
### Step 4: Authenticate Docker to Artifact Registry
Configure Docker authentication to the Artifact Registry:
```bash
gcloud auth configure-docker <REGION>-docker.pkg.dev
```
This ensures Docker can securely push images to your registry using your Cloud credentials.
### Step 5: Push the Docker Image
Push the tagged image to Artifact Registry:
```bash
docker push <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
```
Once complete, your container image (with Crawl4ai installed) is hosted in Artifact Registry.
### Step 6: Deploy the Cloud Function
Deploy your function using the custom container image. Run:
```bash
gcloud beta functions deploy <FUNCTION_NAME> \
--gen2 \
--region=<REGION> \
--docker-repository=<REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO> \
--trigger-http \
--memory=2048MB \
--timeout=540s \
--project=<PROJECT_ID>
```
This command tells Cloud Functions Gen2 to pull your container image from Artifact Registry and deploy it. Make sure your main.py defines the `crawl` entry point.
### Step 7: Make the Function Public
To allow external (unauthenticated) access, update the functions IAM policy:
```bash
gcloud functions add-iam-policy-binding <FUNCTION_NAME> \
--region=<REGION> \
--member="allUsers" \
--role="roles/cloudfunctions.invoker" \
--project=<PROJECT_ID> \
--quiet
```
Using the `--quiet` flag ensures the command runs noninteractively so the policy is applied immediately.
### Step 8: Retrieve and Test Your Function URL
Get the URL for your deployed function:
```bash
gcloud functions describe <FUNCTION_NAME> \
--region=<REGION> \
--project=<PROJECT_ID> \
--format='value(serviceConfig.uri)'
```
Test your deployment with a sample GET request (using curl or your browser):
```bash
curl "<FUNCTION_URL>?url=https://example.com"
```
Replace `<FUNCTION_URL>` with the output URL from the previous command. A successful test (HTTP status 200) means Crawl4ai is running on Cloud Functions.
---
## Part 2: Automated Deployment with deploy.py
For a more streamlined process, use the provided `deploy.py` script. This Python script automates the manual steps, prompting you to confirm key actions and providing detailed logs throughout the process.
### What deploy.py Does:
- **Reads Parameters:** It loads a `config.yml` file containing all necessary parameters such as `project_id`, `region`, `artifact_repo`, `function_name`, `local_image`, etc.
- **Creates/Skips Repository:** It creates the Artifact Registry repository (or skips if it already exists).
- **Tags & Pushes:** It tags your local Docker image and pushes it to the Artifact Registry.
- **Deploys the Function:** It deploys the Cloud Function with your custom container.
- **Updates IAM:** It sets the IAM policy to allow public access (using the `--quiet` flag).
- **Tests the Deployment:** It extracts the deployed URL and performs a test request.
- **Additional Commands:** You can also use subcommands in the script to delete or describe the deployed function, or even clear all resources.
### Example config.yml
Create a `config.yml` file in the same folder as your Dockerfile. An example configuration:
```yaml
project_id: your-project-id
region: asia-east1
artifact_repo: crawl4ai
function_name: crawl4ai-t1
memory: "2048MB"
timeout: "540s"
local_image: "gcr.io/your-project-id/crawl4ai-t1:latest"
test_query_url: "https://example.com"
```
### How to Use deploy.py
- **Deploy the Function:**
```bash
python deploy.py deploy
```
The script will guide you through each step, display the output, and ask for confirmation before executing critical commands.
- **Describe the Function:**
If you forget the function URL and want to retrieve it later:
```bash
python deploy.py describe
```
- **Delete the Function:**
To remove just the Cloud Function:
```bash
python deploy.py delete
```
- **Clear All Resources:**
To delete both the Cloud Function and the Artifact Registry repository:
```bash
python deploy.py clear
```
---
## Conclusion
This guide has walked you through two deployment methods for Crawl4ai on Google Cloud Functions Gen2:
1. **Manual Deployment:** Building your Docker image, pushing it to Artifact Registry, deploying the Cloud Function, and setting up IAM.
2. **Automated Deployment:** Using `deploy.py` with a configuration file to handle the entire process interactively.
By following these instructions, you can deploy, test, and manage your Crawl4ai-based Cloud Function with ease. Enjoy using Crawl4ai in your cloud environment!

View File

@@ -0,0 +1,158 @@
# Cleanup Chrome process on module unload
import atexit
import asyncio
import logging
import functions_framework
from flask import jsonify, Request
import os
import sys
import time
import subprocess
import signal
import requests
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"Python version: {sys.version}")
logger.info(f"Python path: {sys.path}")
# Try to find where crawl4ai is coming from
try:
import crawl4ai
logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")
except ImportError:
logger.error("Crawl4AI module not found")
# Now attempt the import
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Paths and constants
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
CDP_PORT = 9222
def start_chrome():
"""Start Chrome process synchronously with exponential backoff."""
logger.debug("Starting Chrome process...")
chrome_args = [
CHROME_BINARY,
f"--remote-debugging-port={CDP_PORT}",
"--remote-debugging-address=0.0.0.0",
"--no-sandbox",
"--disable-setuid-sandbox",
"--headless=new",
"--disable-gpu",
"--disable-dev-shm-usage",
"--no-zygote",
"--single-process",
"--disable-features=site-per-process",
"--no-first-run",
"--disable-extensions"
]
process = subprocess.Popen(
chrome_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setsid
)
logger.debug(f"Chrome process started with PID: {process.pid}")
# Wait for CDP endpoint with exponential backoff
wait_time = 1 # Start with 1 second
max_wait_time = 16 # Cap at 16 seconds per retry
max_attempts = 10 # Total attempts
for attempt in range(max_attempts):
try:
response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
if response.status_code == 200:
# Get ws URL from response
ws_url = response.json()['webSocketDebuggerUrl']
logger.debug("Chrome CDP is ready")
logger.debug(f"CDP URL: {ws_url}")
return process
except requests.exceptions.ConnectionError:
logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
time.sleep(wait_time)
wait_time = min(wait_time * 2, max_wait_time) # Double wait time, up to max
# If we get here, all retries failed
stdout, stderr = process.communicate() # Get output for debugging
logger.error(f"Chrome stdout: {stdout.decode()}")
logger.error(f"Chrome stderr: {stderr.decode()}")
raise Exception("Chrome CDP endpoint failed to start after retries")
async def fetch_with_crawl4ai(url: str) -> dict:
"""Fetch page content using Crawl4ai and return the result object"""
# Get CDP URL from the running Chrome instance
version_response = requests.get(f'http://localhost:{CDP_PORT}/json/version')
cdp_url = version_response.json()['webSocketDebuggerUrl']
# Configure and run Crawl4ai
browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
)
result : CrawlResult = await crawler.arun(
url=url, config=crawler_config
)
return result.model_dump() # Convert Pydantic model to dict for JSON response
# Start Chrome when the module loads
logger.info("Starting Chrome process on module load")
chrome_process = start_chrome()
@functions_framework.http
def crawl(request: Request):
"""HTTP Cloud Function to fetch web content using Crawl4ai"""
try:
url = request.args.get('url')
if not url:
return jsonify({'error': 'URL parameter is required', 'status': 400}), 400
# Create and run an asyncio event loop
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
)
return jsonify({
'status': 200,
'data': result
})
finally:
loop.close()
except Exception as e:
error_msg = f"Unexpected error: {str(e)}"
logger.error(error_msg, exc_info=True)
return jsonify({
'error': error_msg,
'status': 500,
'details': {
'error_type': type(e).__name__,
'stack_trace': str(e),
'chrome_running': chrome_process.poll() is None if chrome_process else False
}
}), 500
@atexit.register
def cleanup():
"""Cleanup Chrome process on shutdown"""
if chrome_process and chrome_process.poll() is None:
try:
os.killpg(os.getpgid(chrome_process.pid), signal.SIGTERM)
logger.info("Chrome process terminated")
except Exception as e:
logger.error(f"Failed to terminate Chrome process: {e}")

View File

@@ -0,0 +1,5 @@
functions-framework==3.*
flask==2.3.3
requests==2.31.0
websockets==12.0
git+https://github.com/unclecode/crawl4ai.git@next

View File

@@ -0,0 +1,10 @@
<?xml version="1.0" ?>
<!DOCTYPE fontconfig SYSTEM "fonts.dtd">
<fontconfig>
<dir>/var/task/.fonts</dir>
<dir>/var/task/fonts</dir>
<dir>/opt/fonts</dir>
<dir>/tmp/fonts</dir>
<cachedir>/tmp/fonts-cache/</cachedir>
<config></config>
</fontconfig>

Binary file not shown.

View File

@@ -0,0 +1 @@
{"file_format_version": "1.0.0", "ICD": {"library_path": "./libvk_swiftshader.so", "api_version": "1.0.5"}}