Add all 5 deployments solution for testing

2025-03-10 18:57:14 +08:00
parent 9547bada3a
commit 3ea3c0520d
38 changed files with 6431 additions and 0 deletions
--- a/deploy/gcloud-function/main.py
+++ b/deploy/gcloud-function/main.py
@@ -0,0 +1,158 @@
+# atexit is imported so the Chrome process can be cleaned up at interpreter exit
+import atexit
+import asyncio
+import logging
+import functions_framework
+from flask import jsonify, Request
+import os
+import sys
+import time
+import subprocess
+import signal
+import requests
+
+# Configure logging exactly once. logging.basicConfig() does nothing when the
+# root logger already has handlers, so a second call cannot change the level.
+# The root logger stays at INFO (keeps third-party libraries quiet) and only
+# this module's logger is lowered to DEBUG so its logger.debug() diagnostics
+# are actually emitted.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+logger.info(f"Python version: {sys.version}")
+logger.info(f"Python path: {sys.path}")
+
+# Import crawl4ai and log where it was loaded from, which helps diagnose
+# packaging problems. A missing package is logged and then re-raised: the
+# names imported below are required, so the module cannot load without it.
+try:
+    import crawl4ai
+except ImportError:
+    logger.error("Crawl4AI module not found")
+    raise
+else:
+    logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
+    logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult
+
+# Paths and constants
+# Directory containing this file, with symlinks resolved.
+FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
+# Chrome executable, expected under resources/chrome/ next to this file.
+CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
+# Port passed to Chrome as --remote-debugging-port and queried for /json/version.
+CDP_PORT = 9222
+
+def start_chrome():
+    """Launch headless Chrome and block until its CDP endpoint answers.
+
+    Chrome is started in its own session (and therefore its own process
+    group) so the whole group can be signalled later. Its stdout/stderr are
+    inherited from this process instead of being piped: nothing reads the
+    pipes while Chrome is running, so a full pipe buffer could stall it.
+
+    The ``/json/version`` endpoint is polled with exponential backoff
+    (1s, 2s, 4s, ... capped at 16s) for at most 10 attempts.
+
+    Returns:
+        subprocess.Popen: Handle of the running Chrome process.
+
+    Raises:
+        RuntimeError: If Chrome exits early or the CDP endpoint never becomes
+            ready. A still-running Chrome is terminated before raising.
+    """
+    logger.debug("Starting Chrome process...")
+    chrome_args = [
+        CHROME_BINARY,
+        f"--remote-debugging-port={CDP_PORT}",
+        # NOTE(review): this exposes the debugging port on all interfaces,
+        # although this module only connects over loopback - confirm whether
+        # binding to 127.0.0.1 would be sufficient.
+        "--remote-debugging-address=0.0.0.0",
+        "--no-sandbox",
+        "--disable-setuid-sandbox",
+        "--headless=new",
+        "--disable-gpu",
+        "--disable-dev-shm-usage",
+        "--no-zygote",
+        "--single-process",
+        "--disable-features=site-per-process",
+        "--no-first-run",
+        "--disable-extensions",
+    ]
+
+    # start_new_session=True makes the child call setsid() itself; unlike
+    # preexec_fn it runs no Python code between fork() and exec().
+    process = subprocess.Popen(chrome_args, start_new_session=True)
+
+    logger.debug(f"Chrome process started with PID: {process.pid}")
+
+    # Wait for CDP endpoint with exponential backoff
+    version_url = f"http://127.0.0.1:{CDP_PORT}/json/version"
+    wait_time = 1  # Start with 1 second
+    max_wait_time = 16  # Cap at 16 seconds per retry
+    max_attempts = 10  # Total attempts
+    for attempt in range(1, max_attempts + 1):
+        # No point polling an endpoint whose process is already gone.
+        if process.poll() is not None:
+            logger.error(f"Chrome exited early with return code {process.returncode}")
+            break
+
+        try:
+            response = requests.get(version_url, timeout=2)
+            if response.status_code == 200:
+                # Get ws URL from response
+                ws_url = response.json()['webSocketDebuggerUrl']
+                logger.debug("Chrome CDP is ready")
+                logger.debug(f"CDP URL: {ws_url}")
+                return process
+            failure = f"HTTP {response.status_code}"
+        except (requests.exceptions.RequestException, KeyError, ValueError) as exc:
+            # Connection refused, timeout, or a malformed/incomplete reply:
+            # all mean "not ready yet" and are retried.
+            failure = repr(exc)
+
+        if attempt < max_attempts:
+            logger.debug(
+                f"Waiting for CDP endpoint (attempt {attempt}/{max_attempts}, {failure}), "
+                f"retrying in {wait_time} seconds"
+            )
+            time.sleep(wait_time)
+            wait_time = min(wait_time * 2, max_wait_time)  # Double wait time, up to max
+
+    # All retries failed: stop Chrome so it is not left orphaned, then report.
+    if process.poll() is None:
+        try:
+            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
+            process.wait(timeout=5)
+        except subprocess.TimeoutExpired:
+            # Chrome ignored SIGTERM; force it down.
+            process.kill()
+            process.wait()
+        except ProcessLookupError:
+            # Chrome went away between the poll() above and the signal.
+            pass
+    logger.error(f"Chrome did not provide a CDP endpoint (return code: {process.returncode})")
+    raise RuntimeError("Chrome CDP endpoint failed to start after retries")
+
+async def fetch_with_crawl4ai(url: str) -> dict:
+    """Crawl *url* through the already-running Chrome and return the result as a dict.
+
+    The WebSocket debugger URL is read from Chrome's ``/json/version``
+    endpoint on ``CDP_PORT`` and handed to Crawl4ai, which runs the crawl
+    with caching bypassed.
+
+    Args:
+        url: Address of the page to crawl.
+
+    Returns:
+        The ``CrawlResult`` converted with ``model_dump()``.
+
+    Raises:
+        requests.exceptions.RequestException: If the CDP endpoint cannot be
+            reached, answers with an error status, or does not answer within
+            2 seconds.
+        KeyError: If the endpoint's reply has no ``webSocketDebuggerUrl``.
+    """
+    # Use 127.0.0.1 rather than "localhost" so the lookup cannot resolve to an
+    # IPv6 address. The timeout matters because this blocking call runs inside
+    # the event loop, where an outer asyncio.wait_for() cannot interrupt it.
+    version_response = requests.get(f'http://127.0.0.1:{CDP_PORT}/json/version', timeout=2)
+    version_response.raise_for_status()
+    cdp_url = version_response.json()['webSocketDebuggerUrl']
+
+    # Configure and run Crawl4ai
+    browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+        )
+        result: CrawlResult = await crawler.arun(
+            url=url, config=crawler_config
+        )
+        return result.model_dump()  # Convert Pydantic model to dict for JSON response
+
+# Start Chrome when the module loads
+# This runs once at import time: start_chrome() polls the CDP endpoint and
+# raises if it does not come up, which aborts the import. The HTTP handler and
+# the atexit hook below both read the module-level chrome_process.
+logger.info("Starting Chrome process on module load")
+chrome_process = start_chrome()
+
+@functions_framework.http
+def crawl(request: Request):
+    """HTTP Cloud Function that crawls the page named by the ``url`` query parameter.
+
+    Args:
+        request: Incoming Flask request; ``request.args['url']`` is the page
+            to crawl and must be an http:// or https:// address.
+
+    Returns:
+        A JSON response (optionally paired with a status code):
+        * 200 - ``{'status': 200, 'data': <crawl result dict>}``
+        * 400 - ``url`` is missing or is not an http(s) URL
+        * 504 - the crawl did not finish within 10 seconds
+        * 500 - any other failure, with ``details`` describing the error
+    """
+    try:
+        url = request.args.get('url')
+        if not url:
+            return jsonify({'error': 'URL parameter is required', 'status': 400}), 400
+
+        # The URL comes straight from the caller; only web addresses are
+        # accepted so the crawler cannot be pointed at other schemes
+        # (for example file://).
+        if not url.lower().startswith(('http://', 'https://')):
+            return jsonify({'error': 'URL must start with http:// or https://', 'status': 400}), 400
+
+        # asyncio.run() creates a fresh event loop, runs the coroutine and
+        # always closes and unregisters the loop afterwards.
+        result = asyncio.run(
+            asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
+        )
+        return jsonify({
+            'status': 200,
+            'data': result
+        })
+
+    except asyncio.TimeoutError as e:
+        # str() of a timeout error is empty, so build the message explicitly.
+        error_msg = "Crawl timed out after 10 seconds"
+        logger.error(error_msg, exc_info=True)
+        return jsonify({
+            'error': error_msg,
+            'status': 504,
+            'details': {
+                'error_type': type(e).__name__,
+                'stack_trace': error_msg,
+                'chrome_running': chrome_process.poll() is None if chrome_process else False
+            }
+        }), 504
+
+    except Exception as e:
+        error_msg = f"Unexpected error: {str(e)}"
+        logger.error(error_msg, exc_info=True)
+        return jsonify({
+            'error': error_msg,
+            'status': 500,
+            'details': {
+                'error_type': type(e).__name__,
+                # Carries the exception message only; the full traceback is
+                # written to the log above rather than returned to the caller.
+                'stack_trace': str(e),
+                'chrome_running': chrome_process.poll() is None if chrome_process else False
+            }
+        }), 500
+
+
+@atexit.register
+def cleanup():
+    """Send SIGTERM to Chrome's process group at interpreter exit if Chrome is still alive."""
+    still_running = bool(chrome_process) and chrome_process.poll() is None
+    if not still_running:
+        return
+
+    try:
+        # Signal the whole group so any child processes of Chrome are included.
+        group_id = os.getpgid(chrome_process.pid)
+        os.killpg(group_id, signal.SIGTERM)
+        logger.info("Chrome process terminated")
+    except Exception as e:
+        logger.error(f"Failed to terminate Chrome process: {e}")