Add all 5 deployments solution for testing
This commit is contained in:
63
deploy/gcloud-function/Dockerfile
Normal file
63
deploy/gcloud-function/Dockerfile
Normal file
@@ -0,0 +1,63 @@
|
||||
FROM --platform=linux/amd64 python:3.10-slim
|
||||
|
||||
# Install system dependencies required for Chromium and Git
|
||||
RUN apt-get update && apt-get install -y \
|
||||
python3-dev \
|
||||
pkg-config \
|
||||
libjpeg-dev \
|
||||
gcc \
|
||||
build-essential \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libxkbcommon0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libasound2 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
procps \
|
||||
git \
|
||||
socat \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Make a directory for crawl4ai call it crawl4ai_repo
|
||||
# RUN mkdir crawl4ai_repo
|
||||
|
||||
# # Clone Crawl4ai from the next branch and install it
|
||||
# RUN git clone --branch next https://github.com/unclecode/crawl4ai.git ./crawl4ai_repo \
|
||||
# && cd crawl4ai_repo \
|
||||
# && pip install . \
|
||||
# && cd .. \
|
||||
# && rm -rf crawl4ai_repo
|
||||
|
||||
RUN python3 -m venv /app/venv
|
||||
ENV PATH="/app/venv/bin:$PATH"
|
||||
# RUN pip install git+https://github.com/unclecode/crawl4ai.git@next
|
||||
|
||||
# Copy requirements and install remaining dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
# Copy application files
|
||||
COPY resources /app/resources
|
||||
COPY main.py .
|
||||
COPY start.sh .
|
||||
|
||||
# Set permissions for Chrome binary and start script
|
||||
RUN chmod +x /app/resources/chrome/headless_shell && \
|
||||
chmod -R 755 /app/resources/chrome && \
|
||||
chmod +x start.sh
|
||||
|
||||
ENV FUNCTION_TARGET=crawl
|
||||
EXPOSE 8080 9223
|
||||
|
||||
CMD ["/app/start.sh"]
|
||||
8
deploy/gcloud-function/config.yml
Normal file
8
deploy/gcloud-function/config.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
project_id: PROJECT_ID
|
||||
region: REGION_NAME
|
||||
artifact_repo: ARTIFACT_REPO_NAME
|
||||
function_name: FUNCTION_NAME
|
||||
memory: "2048MB"
|
||||
timeout: "540s"
|
||||
local_image: "gcr.io/ARTIFACT_REPO_NAME/crawl4ai:latest"
|
||||
test_query_url: "https://example.com"
|
||||
187
deploy/gcloud-function/deploy.py
Normal file
187
deploy/gcloud-function/deploy.py
Normal file
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import yaml
|
||||
import requests
|
||||
|
||||
def run_command(cmd, explanation, require_confirm=True, allow_already_exists=False):
|
||||
print("\n=== {} ===".format(explanation))
|
||||
if require_confirm:
|
||||
input("Press Enter to run: [{}]\n".format(cmd))
|
||||
print("Running: {}".format(cmd))
|
||||
result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
if allow_already_exists and "ALREADY_EXISTS" in result.stderr:
|
||||
print("Repository already exists, skipping creation.")
|
||||
return ""
|
||||
print("Error:\n{}".format(result.stderr))
|
||||
sys.exit(1)
|
||||
out = result.stdout.strip()
|
||||
if out:
|
||||
print("Output:\n{}".format(out))
|
||||
return out
|
||||
|
||||
def load_config():
|
||||
try:
|
||||
with open("config.yml", "r") as f:
|
||||
config = yaml.safe_load(f)
|
||||
except Exception as e:
|
||||
print("Failed to load config.yml: {}".format(e))
|
||||
sys.exit(1)
|
||||
required = ["project_id", "region", "artifact_repo", "function_name", "local_image"]
|
||||
for key in required:
|
||||
if key not in config or not config[key]:
|
||||
print("Missing required config parameter: {}".format(key))
|
||||
sys.exit(1)
|
||||
return config
|
||||
|
||||
def deploy_function(config):
|
||||
project_id = config["project_id"]
|
||||
region = config["region"]
|
||||
artifact_repo = config["artifact_repo"]
|
||||
function_name = config["function_name"]
|
||||
memory = config.get("memory", "2048MB")
|
||||
timeout = config.get("timeout", "540s")
|
||||
local_image = config["local_image"]
|
||||
test_query_url = config.get("test_query_url", "https://example.com")
|
||||
|
||||
# Repository image format: "<region>-docker.pkg.dev/<project_id>/<artifact_repo>/<function_name>:latest"
|
||||
repo_image = f"{region}-docker.pkg.dev/{project_id}/{artifact_repo}/{function_name}:latest"
|
||||
|
||||
# 1. Create Artifact Registry repository (skip if exists)
|
||||
cmd = f"gcloud artifacts repositories create {artifact_repo} --repository-format=docker --location={region} --project={project_id}"
|
||||
run_command(cmd, "Creating Artifact Registry repository (if it doesn't exist)", allow_already_exists=True)
|
||||
|
||||
# 2. Tag the local Docker image with the repository image name
|
||||
cmd = f"docker tag {local_image} {repo_image}"
|
||||
run_command(cmd, "Tagging Docker image for Artifact Registry")
|
||||
|
||||
# 3. Authenticate Docker to Artifact Registry
|
||||
cmd = f"gcloud auth configure-docker {region}-docker.pkg.dev"
|
||||
run_command(cmd, "Authenticating Docker to Artifact Registry")
|
||||
|
||||
# 4. Push the tagged Docker image to Artifact Registry
|
||||
cmd = f"docker push {repo_image}"
|
||||
run_command(cmd, "Pushing Docker image to Artifact Registry")
|
||||
|
||||
# 5. Deploy the Cloud Function using the custom container
|
||||
cmd = (
|
||||
f"gcloud beta functions deploy {function_name} "
|
||||
f"--gen2 "
|
||||
f"--runtime=python310 "
|
||||
f"--entry-point=crawl "
|
||||
f"--region={region} "
|
||||
f"--docker-repository={region}-docker.pkg.dev/{project_id}/{artifact_repo} "
|
||||
f"--trigger-http "
|
||||
f"--memory={memory} "
|
||||
f"--timeout={timeout} "
|
||||
f"--project={project_id}"
|
||||
)
|
||||
run_command(cmd, "Deploying Cloud Function using custom container")
|
||||
|
||||
# 6. Set the Cloud Function to allow public (unauthenticated) invocations
|
||||
cmd = (
|
||||
f"gcloud functions add-iam-policy-binding {function_name} "
|
||||
f"--region={region} "
|
||||
f"--member='allUsers' "
|
||||
f"--role='roles/cloudfunctions.invoker' "
|
||||
f"--project={project_id}"
|
||||
f"--quiet"
|
||||
)
|
||||
run_command(cmd, "Setting Cloud Function IAM to allow public invocations")
|
||||
|
||||
# 7. Retrieve the deployed Cloud Function URL
|
||||
cmd = (
|
||||
f"gcloud functions describe {function_name} "
|
||||
f"--region={region} "
|
||||
f"--project={project_id} "
|
||||
f"--format='value(serviceConfig.uri)'"
|
||||
)
|
||||
deployed_url = run_command(cmd, "Extracting deployed Cloud Function URL", require_confirm=False)
|
||||
print("\nDeployed URL: {}\n".format(deployed_url))
|
||||
|
||||
# 8. Test the deployed function
|
||||
test_url = f"{deployed_url}?url={test_query_url}"
|
||||
print("Testing function with: {}".format(test_url))
|
||||
try:
|
||||
response = requests.get(test_url)
|
||||
print("Response status: {}".format(response.status_code))
|
||||
print("Response body:\n{}".format(response.text))
|
||||
if response.status_code == 200:
|
||||
print("Test successful!")
|
||||
else:
|
||||
print("Non-200 response; check function logs.")
|
||||
except Exception as e:
|
||||
print("Test request error: {}".format(e))
|
||||
sys.exit(1)
|
||||
|
||||
# 9. Final usage help
|
||||
print("\nDeployment complete!")
|
||||
print("Invoke your function with:")
|
||||
print(f"curl '{deployed_url}?url={test_query_url}'")
|
||||
print("For further instructions, refer to your documentation.")
|
||||
|
||||
def delete_function(config):
|
||||
project_id = config["project_id"]
|
||||
region = config["region"]
|
||||
function_name = config["function_name"]
|
||||
cmd = f"gcloud functions delete {function_name} --region={region} --project={project_id} --quiet"
|
||||
run_command(cmd, "Deleting Cloud Function")
|
||||
|
||||
def describe_function(config):
|
||||
project_id = config["project_id"]
|
||||
region = config["region"]
|
||||
function_name = config["function_name"]
|
||||
cmd = (
|
||||
f"gcloud functions describe {function_name} "
|
||||
f"--region={region} "
|
||||
f"--project={project_id} "
|
||||
f"--format='value(serviceConfig.uri)'"
|
||||
)
|
||||
deployed_url = run_command(cmd, "Describing Cloud Function to extract URL", require_confirm=False)
|
||||
print("\nCloud Function URL: {}\n".format(deployed_url))
|
||||
|
||||
def clear_all(config):
|
||||
print("\n=== CLEAR ALL RESOURCES ===")
|
||||
project_id = config["project_id"]
|
||||
region = config["region"]
|
||||
artifact_repo = config["artifact_repo"]
|
||||
|
||||
confirm = input("WARNING: This will DELETE the Cloud Function and the Artifact Registry repository. Are you sure? (y/N): ")
|
||||
if confirm.lower() != "y":
|
||||
print("Aborting clear operation.")
|
||||
sys.exit(0)
|
||||
|
||||
# Delete the Cloud Function
|
||||
delete_function(config)
|
||||
# Delete the Artifact Registry repository
|
||||
cmd = f"gcloud artifacts repositories delete {artifact_repo} --location={region} --project={project_id} --quiet"
|
||||
run_command(cmd, "Deleting Artifact Registry repository", require_confirm=False)
|
||||
print("All resources cleared.")
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Deploy, delete, describe, or clear Cloud Function resources using config.yml")
|
||||
subparsers = parser.add_subparsers(dest="command", required=True)
|
||||
|
||||
subparsers.add_parser("deploy", help="Deploy the Cloud Function")
|
||||
subparsers.add_parser("delete", help="Delete the deployed Cloud Function")
|
||||
subparsers.add_parser("describe", help="Describe the Cloud Function and return its URL")
|
||||
subparsers.add_parser("clear", help="Delete the Cloud Function and Artifact Registry repository")
|
||||
|
||||
args = parser.parse_args()
|
||||
config = load_config()
|
||||
|
||||
if args.command == "deploy":
|
||||
deploy_function(config)
|
||||
elif args.command == "delete":
|
||||
delete_function(config)
|
||||
elif args.command == "describe":
|
||||
describe_function(config)
|
||||
elif args.command == "clear":
|
||||
clear_all(config)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
204
deploy/gcloud-function/guide.md
Normal file
204
deploy/gcloud-function/guide.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# Deploying Crawl4ai on Google Cloud Functions
|
||||
|
||||
This guide explains how to deploy **Crawl4ai**—an open‑source web crawler library—on Google Cloud Functions Gen2 using a custom container. We assume your project folder already includes:
|
||||
|
||||
- **Dockerfile:** Builds your container image (which installs Crawl4ai from its Git repository).
|
||||
- **start.sh:** Activates your virtual environment and starts the function (using the Functions Framework).
|
||||
- **main.py:** Contains your function logic with the entry point `crawl` (and imports Crawl4ai).
|
||||
|
||||
The guide is divided into two parts:
|
||||
1. Manual deployment steps (using CLI commands)
|
||||
2. Automated deployment using a Python script (`deploy.py`)
|
||||
|
||||
---
|
||||
|
||||
## Part 1: Manual Deployment Process
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- **Google Cloud Project:** Ensure your project is active and billing is enabled.
|
||||
- **Google Cloud CLI & Docker:** Installed and configured on your local machine.
|
||||
- **Permissions:** You must have rights to create Cloud Functions and Artifact Registry repositories.
|
||||
- **Files:** Your Dockerfile, start.sh, and main.py should be in the same directory.
|
||||
|
||||
### Step 1: Build Your Docker Image
|
||||
|
||||
Your Dockerfile packages Crawl4ai along with all its dependencies. Build your image with:
|
||||
|
||||
```bash
|
||||
docker build -t gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest .
|
||||
```
|
||||
|
||||
Replace `<PROJECT_ID>` with your Google Cloud project ID and `<FUNCTION_NAME>` with your chosen function name (for example, `crawl4ai-t1`).
|
||||
|
||||
### Step 2: Create an Artifact Registry Repository
|
||||
|
||||
Cloud Functions Gen2 requires your custom container image to reside in an Artifact Registry repository. Create one by running:
|
||||
|
||||
```bash
|
||||
gcloud artifacts repositories create <ARTIFACT_REPO> \
|
||||
--repository-format=docker \
|
||||
--location=<REGION> \
|
||||
--project=<PROJECT_ID>
|
||||
```
|
||||
|
||||
Replace `<ARTIFACT_REPO>` (for example, `crawl4ai`) and `<REGION>` (for example, `asia-east1`).
|
||||
> **Note:** If you receive an `ALREADY_EXISTS` error, the repository is already created; simply proceed to the next step.
|
||||
|
||||
### Step 3: Tag Your Docker Image
|
||||
|
||||
Tag your locally built Docker image so it matches the Artifact Registry format:
|
||||
|
||||
```bash
|
||||
docker tag gcr.io/<PROJECT_ID>/<FUNCTION_NAME>:latest <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
|
||||
```
|
||||
|
||||
This step “renames” the image so you can push it to your repository.
|
||||
|
||||
### Step 4: Authenticate Docker to Artifact Registry
|
||||
|
||||
Configure Docker authentication to the Artifact Registry:
|
||||
|
||||
```bash
|
||||
gcloud auth configure-docker <REGION>-docker.pkg.dev
|
||||
```
|
||||
|
||||
This ensures Docker can securely push images to your registry using your Cloud credentials.
|
||||
|
||||
### Step 5: Push the Docker Image
|
||||
|
||||
Push the tagged image to Artifact Registry:
|
||||
|
||||
```bash
|
||||
docker push <REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO>/<FUNCTION_NAME>:latest
|
||||
```
|
||||
|
||||
Once complete, your container image (with Crawl4ai installed) is hosted in Artifact Registry.
|
||||
|
||||
### Step 6: Deploy the Cloud Function
|
||||
|
||||
Deploy your function using the custom container image. Run:
|
||||
|
||||
```bash
|
||||
gcloud beta functions deploy <FUNCTION_NAME> \
|
||||
--gen2 \
|
||||
--region=<REGION> \
|
||||
--docker-repository=<REGION>-docker.pkg.dev/<PROJECT_ID>/<ARTIFACT_REPO> \
|
||||
--trigger-http \
|
||||
--memory=2048MB \
|
||||
--timeout=540s \
|
||||
--project=<PROJECT_ID>
|
||||
```
|
||||
|
||||
This command tells Cloud Functions Gen2 to pull your container image from Artifact Registry and deploy it. Make sure your main.py defines the `crawl` entry point.
|
||||
|
||||
### Step 7: Make the Function Public
|
||||
|
||||
To allow external (unauthenticated) access, update the function’s IAM policy:
|
||||
|
||||
```bash
|
||||
gcloud functions add-iam-policy-binding <FUNCTION_NAME> \
|
||||
--region=<REGION> \
|
||||
--member="allUsers" \
|
||||
--role="roles/cloudfunctions.invoker" \
|
||||
--project=<PROJECT_ID> \
|
||||
--quiet
|
||||
```
|
||||
|
||||
Using the `--quiet` flag ensures the command runs non‑interactively so the policy is applied immediately.
|
||||
|
||||
### Step 8: Retrieve and Test Your Function URL
|
||||
|
||||
Get the URL for your deployed function:
|
||||
|
||||
```bash
|
||||
gcloud functions describe <FUNCTION_NAME> \
|
||||
--region=<REGION> \
|
||||
--project=<PROJECT_ID> \
|
||||
--format='value(serviceConfig.uri)'
|
||||
```
|
||||
|
||||
Test your deployment with a sample GET request (using curl or your browser):
|
||||
|
||||
```bash
|
||||
curl "<FUNCTION_URL>?url=https://example.com"
|
||||
```
|
||||
|
||||
Replace `<FUNCTION_URL>` with the output URL from the previous command. A successful test (HTTP status 200) means Crawl4ai is running on Cloud Functions.
|
||||
|
||||
---
|
||||
|
||||
## Part 2: Automated Deployment with deploy.py
|
||||
|
||||
For a more streamlined process, use the provided `deploy.py` script. This Python script automates the manual steps, prompting you to confirm key actions and providing detailed logs throughout the process.
|
||||
|
||||
### What deploy.py Does:
|
||||
|
||||
- **Reads Parameters:** It loads a `config.yml` file containing all necessary parameters such as `project_id`, `region`, `artifact_repo`, `function_name`, `local_image`, etc.
|
||||
- **Creates/Skips Repository:** It creates the Artifact Registry repository (or skips if it already exists).
|
||||
- **Tags & Pushes:** It tags your local Docker image and pushes it to the Artifact Registry.
|
||||
- **Deploys the Function:** It deploys the Cloud Function with your custom container.
|
||||
- **Updates IAM:** It sets the IAM policy to allow public access (using the `--quiet` flag).
|
||||
- **Tests the Deployment:** It extracts the deployed URL and performs a test request.
|
||||
- **Additional Commands:** You can also use subcommands in the script to delete or describe the deployed function, or even clear all resources.
|
||||
|
||||
### Example config.yml
|
||||
|
||||
Create a `config.yml` file in the same folder as your Dockerfile. An example configuration:
|
||||
|
||||
```yaml
|
||||
project_id: your-project-id
|
||||
region: asia-east1
|
||||
artifact_repo: crawl4ai
|
||||
function_name: crawl4ai-t1
|
||||
memory: "2048MB"
|
||||
timeout: "540s"
|
||||
local_image: "gcr.io/your-project-id/crawl4ai-t1:latest"
|
||||
test_query_url: "https://example.com"
|
||||
```
|
||||
|
||||
### How to Use deploy.py
|
||||
|
||||
- **Deploy the Function:**
|
||||
|
||||
```bash
|
||||
python deploy.py deploy
|
||||
```
|
||||
|
||||
The script will guide you through each step, display the output, and ask for confirmation before executing critical commands.
|
||||
|
||||
- **Describe the Function:**
|
||||
|
||||
If you forget the function URL and want to retrieve it later:
|
||||
|
||||
```bash
|
||||
python deploy.py describe
|
||||
```
|
||||
|
||||
- **Delete the Function:**
|
||||
|
||||
To remove just the Cloud Function:
|
||||
|
||||
```bash
|
||||
python deploy.py delete
|
||||
```
|
||||
|
||||
- **Clear All Resources:**
|
||||
|
||||
To delete both the Cloud Function and the Artifact Registry repository:
|
||||
|
||||
```bash
|
||||
python deploy.py clear
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
This guide has walked you through two deployment methods for Crawl4ai on Google Cloud Functions Gen2:
|
||||
|
||||
1. **Manual Deployment:** Building your Docker image, pushing it to Artifact Registry, deploying the Cloud Function, and setting up IAM.
|
||||
2. **Automated Deployment:** Using `deploy.py` with a configuration file to handle the entire process interactively.
|
||||
|
||||
By following these instructions, you can deploy, test, and manage your Crawl4ai-based Cloud Function with ease. Enjoy using Crawl4ai in your cloud environment!
|
||||
|
||||
158
deploy/gcloud-function/main.py
Normal file
158
deploy/gcloud-function/main.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# Cleanup Chrome process on module unload
|
||||
import atexit
|
||||
import asyncio
|
||||
import logging
|
||||
import functions_framework
|
||||
from flask import jsonify, Request
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
import signal
|
||||
import requests
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
logger.info(f"Python version: {sys.version}")
|
||||
logger.info(f"Python path: {sys.path}")
|
||||
|
||||
# Try to find where crawl4ai is coming from
|
||||
try:
|
||||
import crawl4ai
|
||||
logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
|
||||
logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")
|
||||
except ImportError:
|
||||
logger.error("Crawl4AI module not found")
|
||||
|
||||
# Now attempt the import
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Paths and constants
|
||||
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
|
||||
CDP_PORT = 9222
|
||||
|
||||
def start_chrome():
|
||||
"""Start Chrome process synchronously with exponential backoff."""
|
||||
logger.debug("Starting Chrome process...")
|
||||
chrome_args = [
|
||||
CHROME_BINARY,
|
||||
f"--remote-debugging-port={CDP_PORT}",
|
||||
"--remote-debugging-address=0.0.0.0",
|
||||
"--no-sandbox",
|
||||
"--disable-setuid-sandbox",
|
||||
"--headless=new",
|
||||
"--disable-gpu",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-zygote",
|
||||
"--single-process",
|
||||
"--disable-features=site-per-process",
|
||||
"--no-first-run",
|
||||
"--disable-extensions"
|
||||
]
|
||||
|
||||
process = subprocess.Popen(
|
||||
chrome_args,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
preexec_fn=os.setsid
|
||||
)
|
||||
|
||||
logger.debug(f"Chrome process started with PID: {process.pid}")
|
||||
|
||||
# Wait for CDP endpoint with exponential backoff
|
||||
wait_time = 1 # Start with 1 second
|
||||
max_wait_time = 16 # Cap at 16 seconds per retry
|
||||
max_attempts = 10 # Total attempts
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
|
||||
if response.status_code == 200:
|
||||
# Get ws URL from response
|
||||
ws_url = response.json()['webSocketDebuggerUrl']
|
||||
logger.debug("Chrome CDP is ready")
|
||||
logger.debug(f"CDP URL: {ws_url}")
|
||||
return process
|
||||
except requests.exceptions.ConnectionError:
|
||||
logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
|
||||
time.sleep(wait_time)
|
||||
wait_time = min(wait_time * 2, max_wait_time) # Double wait time, up to max
|
||||
|
||||
# If we get here, all retries failed
|
||||
stdout, stderr = process.communicate() # Get output for debugging
|
||||
logger.error(f"Chrome stdout: {stdout.decode()}")
|
||||
logger.error(f"Chrome stderr: {stderr.decode()}")
|
||||
raise Exception("Chrome CDP endpoint failed to start after retries")
|
||||
|
||||
async def fetch_with_crawl4ai(url: str) -> dict:
|
||||
"""Fetch page content using Crawl4ai and return the result object"""
|
||||
# Get CDP URL from the running Chrome instance
|
||||
version_response = requests.get(f'http://localhost:{CDP_PORT}/json/version')
|
||||
cdp_url = version_response.json()['webSocketDebuggerUrl']
|
||||
|
||||
# Configure and run Crawl4ai
|
||||
browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
)
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url=url, config=crawler_config
|
||||
)
|
||||
return result.model_dump() # Convert Pydantic model to dict for JSON response
|
||||
|
||||
# Start Chrome when the module loads
|
||||
logger.info("Starting Chrome process on module load")
|
||||
chrome_process = start_chrome()
|
||||
|
||||
@functions_framework.http
|
||||
def crawl(request: Request):
|
||||
"""HTTP Cloud Function to fetch web content using Crawl4ai"""
|
||||
try:
|
||||
url = request.args.get('url')
|
||||
if not url:
|
||||
return jsonify({'error': 'URL parameter is required', 'status': 400}), 400
|
||||
|
||||
# Create and run an asyncio event loop
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
try:
|
||||
result = loop.run_until_complete(
|
||||
asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
|
||||
)
|
||||
return jsonify({
|
||||
'status': 200,
|
||||
'data': result
|
||||
})
|
||||
finally:
|
||||
loop.close()
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"Unexpected error: {str(e)}"
|
||||
logger.error(error_msg, exc_info=True)
|
||||
return jsonify({
|
||||
'error': error_msg,
|
||||
'status': 500,
|
||||
'details': {
|
||||
'error_type': type(e).__name__,
|
||||
'stack_trace': str(e),
|
||||
'chrome_running': chrome_process.poll() is None if chrome_process else False
|
||||
}
|
||||
}), 500
|
||||
|
||||
|
||||
@atexit.register
|
||||
def cleanup():
|
||||
"""Cleanup Chrome process on shutdown"""
|
||||
if chrome_process and chrome_process.poll() is None:
|
||||
try:
|
||||
os.killpg(os.getpgid(chrome_process.pid), signal.SIGTERM)
|
||||
logger.info("Chrome process terminated")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to terminate Chrome process: {e}")
|
||||
5
deploy/gcloud-function/requirements.txt
Normal file
5
deploy/gcloud-function/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
functions-framework==3.*
|
||||
flask==2.3.3
|
||||
requests==2.31.0
|
||||
websockets==12.0
|
||||
git+https://github.com/unclecode/crawl4ai.git@next
|
||||
10
deploy/gcloud-function/resources/chrome/fonts.conf
Executable file
10
deploy/gcloud-function/resources/chrome/fonts.conf
Executable file
@@ -0,0 +1,10 @@
|
||||
<?xml version="1.0" ?>
|
||||
<!DOCTYPE fontconfig SYSTEM "fonts.dtd">
|
||||
<fontconfig>
|
||||
<dir>/var/task/.fonts</dir>
|
||||
<dir>/var/task/fonts</dir>
|
||||
<dir>/opt/fonts</dir>
|
||||
<dir>/tmp/fonts</dir>
|
||||
<cachedir>/tmp/fonts-cache/</cachedir>
|
||||
<config></config>
|
||||
</fontconfig>
|
||||
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Bold.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Italic.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Light.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf
Executable file
BIN
deploy/gcloud-function/resources/chrome/fonts/Open_Sans/OpenSans-Regular.ttf
Executable file
Binary file not shown.
BIN
deploy/gcloud-function/resources/chrome/libvulkan.so.1
Executable file
BIN
deploy/gcloud-function/resources/chrome/libvulkan.so.1
Executable file
Binary file not shown.
@@ -0,0 +1 @@
|
||||
{"file_format_version": "1.0.0", "ICD": {"library_path": "./libvk_swiftshader.so", "api_version": "1.0.5"}}
|
||||
Reference in New Issue
Block a user