Add all 5 deployments solution for testing
This commit is contained in:
158
deploy/gcloud-function/main.py
Normal file
158
deploy/gcloud-function/main.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# Cleanup Chrome process on module unload
|
||||
import atexit
|
||||
import asyncio
|
||||
import logging
|
||||
import functions_framework
|
||||
from flask import jsonify, Request
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import subprocess
|
||||
import signal
|
||||
import requests
|
||||
|
||||
# Configure root logging once, then create the logger used throughout this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Record interpreter details at import time to help diagnose packaging problems.
logger.info(f"Python version: {sys.version}")
logger.info(f"Python path: {sys.path}")

# Diagnostic probe: report which crawl4ai installation is picked up. A failure
# here is only logged; the unconditional import below still raises ImportError.
try:
    import crawl4ai
except ImportError:
    logger.error("Crawl4AI module not found")
else:
    logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
    logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")

# The names this module actually uses from crawl4ai.
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    CrawlResult,
)
|
||||
|
||||
# Configure logging
# logging.basicConfig() does nothing once the root logger has handlers, and the
# root logger is already configured earlier in this module, so a second
# basicConfig(level=DEBUG) would be silently ignored. Set the level on this
# module's logger instead so its logger.debug() calls are actually emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Paths and constants
# Directory containing this file; the Chrome binary is resolved relative to it.
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
# Port Chrome is told to expose its DevTools Protocol (CDP) endpoint on.
CDP_PORT = 9222
|
||||
|
||||
def _stop_chrome(process):
    """Terminate Chrome's process group and return its captured (stdout, stderr)."""
    for sig in (signal.SIGTERM, signal.SIGKILL):
        if process.poll() is None:
            try:
                os.killpg(os.getpgid(process.pid), sig)
            except OSError as exc:
                logger.debug(f"Could not signal Chrome process group: {exc}")
        try:
            # communicate() waits for the child to exit, so bound the wait and
            # escalate to SIGKILL if SIGTERM was not enough.
            return process.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            continue
    return b"", b""


def start_chrome():
    """Start Chrome process synchronously with exponential backoff.

    Launches the bundled headless Chrome binary with remote debugging enabled
    on CDP_PORT, then polls ``/json/version`` on 127.0.0.1 until it answers
    with HTTP 200. The delay between polls starts at 1 second and doubles up
    to 16 seconds, for at most 10 attempts.

    Returns:
        subprocess.Popen: handle of the running Chrome process.

    Raises:
        RuntimeError: if the CDP endpoint is not ready after all attempts or
            Chrome exits early. Chrome is stopped and its captured output is
            logged before raising.
    """
    logger.debug("Starting Chrome process...")
    chrome_args = [
        CHROME_BINARY,
        f"--remote-debugging-port={CDP_PORT}",
        # NOTE(review): 0.0.0.0 exposes the debugging port on every interface;
        # this module only connects via loopback - confirm this is required.
        "--remote-debugging-address=0.0.0.0",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--headless=new",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-zygote",
        "--single-process",
        "--disable-features=site-per-process",
        "--no-first-run",
        "--disable-extensions",
    ]

    # NOTE(review): stdout/stderr are captured for the failure path below but
    # never drained while Chrome keeps running; a very chatty Chrome could
    # eventually block on a full pipe.
    process = subprocess.Popen(
        chrome_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Own session/process group so the whole group can be signalled later.
        start_new_session=True,
    )

    logger.debug(f"Chrome process started with PID: {process.pid}")

    # Wait for CDP endpoint with exponential backoff
    wait_time = 1  # Start with 1 second
    max_wait_time = 16  # Cap at 16 seconds per retry
    max_attempts = 10  # Total attempts
    version_url = f"http://127.0.0.1:{CDP_PORT}/json/version"
    for attempt in range(max_attempts):
        if process.poll() is not None:
            # Chrome already exited; further polling cannot succeed.
            logger.error(f"Chrome exited early with code {process.returncode}")
            break
        try:
            response = requests.get(version_url, timeout=2)
            if response.status_code == 200:
                # Get ws URL from response
                ws_url = response.json()['webSocketDebuggerUrl']
                logger.debug("Chrome CDP is ready")
                logger.debug(f"CDP URL: {ws_url}")
                return process
        except requests.exceptions.RequestException as exc:
            # Covers refused connections as well as the 2-second timeout.
            logger.debug(f"CDP endpoint not reachable yet: {exc}")

        # Back off after every unsuccessful attempt, including non-200 replies.
        logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
        time.sleep(wait_time)
        wait_time = min(wait_time * 2, max_wait_time)  # Double wait time, up to max

    # If we get here, all retries failed. Stop Chrome first: communicate()
    # only returns once the child has exited.
    stdout, stderr = _stop_chrome(process)
    logger.error(f"Chrome stdout: {stdout.decode(errors='replace')}")
    logger.error(f"Chrome stderr: {stderr.decode(errors='replace')}")
    raise RuntimeError("Chrome CDP endpoint failed to start after retries")
|
||||
|
||||
async def fetch_with_crawl4ai(url: str) -> dict:
    """Fetch page content using Crawl4ai and return the result as a dict.

    Looks up the WebSocket debugger URL of the Chrome instance listening on
    CDP_PORT, attaches Crawl4ai to it, crawls ``url`` with caching bypassed
    and returns the dumped result model.

    Args:
        url: Address of the page to crawl.

    Returns:
        dict: ``CrawlResult.model_dump()`` of the crawl.

    Raises:
        requests.exceptions.RequestException: if the CDP version endpoint does
            not answer in time or replies with an error status.
    """
    # Get CDP URL from the running Chrome instance. requests is blocking, so
    # an explicit timeout is needed: asyncio.wait_for() in the caller cannot
    # interrupt a blocking call made inside this coroutine.
    version_response = requests.get(
        f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2
    )
    version_response.raise_for_status()
    cdp_url = version_response.json()['webSocketDebuggerUrl']

    # Configure and run Crawl4ai
    browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        result: CrawlResult = await crawler.arun(
            url=url, config=crawler_config
        )
        return result.model_dump()  # Convert Pydantic model to dict for JSON response
|
||||
|
||||
# Launch one Chrome instance at import time and keep its handle in a module
# global; the HTTP handler and the atexit cleanup hook both read it.
logger.info("Starting Chrome process on module load")
chrome_process: subprocess.Popen = start_chrome()
|
||||
|
||||
def _is_http_url(url):
    """Return True when *url* is an absolute http:// or https:// URL."""
    from urllib.parse import urlparse

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
        return False
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)


@functions_framework.http
def crawl(request: Request):
    """HTTP Cloud Function to fetch web content using Crawl4ai.

    Reads the target address from the ``url`` query parameter, crawls it with
    a 10-second limit and returns the result as JSON.

    Args:
        request: Incoming Flask request.

    Returns:
        A JSON response with ``status`` 200 and the crawl ``data`` on success;
        a ``(response, 400)`` tuple when ``url`` is missing or is not an
        absolute http(s) URL; a ``(response, 500)`` tuple with error details
        on any other failure.
    """
    try:
        url = request.args.get('url')
        if not url:
            return jsonify({'error': 'URL parameter is required', 'status': 400}), 400

        # The URL comes straight from the caller: only let web URLs through so
        # other schemes never reach the crawler running on this host.
        if not _is_http_url(url):
            return jsonify({'error': 'URL must be an absolute http(s) URL', 'status': 400}), 400

        # asyncio.run() creates a fresh event loop, runs the coroutine and
        # always closes the loop afterwards, even on error or timeout.
        result = asyncio.run(
            asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
        )
        return jsonify({
            'status': 200,
            'data': result
        })

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return jsonify({
            'error': error_msg,
            'status': 500,
            'details': {
                'error_type': type(e).__name__,
                # Carries the exception message only; the full traceback goes
                # to the log via exc_info above.
                'stack_trace': str(e),
                'chrome_running': chrome_process.poll() is None if chrome_process else False
            }
        }), 500
|
||||
|
||||
|
||||
@atexit.register
def cleanup():
    """Send SIGTERM to Chrome's process group at interpreter exit.

    Does nothing when there is no Chrome handle or the process has already
    exited. Any failure while signalling is logged rather than raised.
    """
    # Nothing to do unless Chrome is still alive.
    if not chrome_process or chrome_process.poll() is not None:
        return

    try:
        chrome_pgid = os.getpgid(chrome_process.pid)
        os.killpg(chrome_pgid, signal.SIGTERM)
        logger.info("Chrome process terminated")
    except Exception as exc:
        logger.error(f"Failed to terminate Chrome process: {exc}")
|
||||
Reference in New Issue
Block a user