"""HTTP Cloud Function that crawls a URL with Crawl4AI through a bundled headless Chrome."""
import asyncio
import atexit
import logging
import os
import signal
import subprocess
import sys
import time
import traceback

import functions_framework
import requests
from flask import jsonify, Request

# Configure logging once; DEBUG so the diagnostic logger.debug calls below are visible
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

logger.info(f"Python version: {sys.version}")
logger.info(f"Python path: {sys.path}")

# Log where crawl4ai is loaded from, to help debug packaging issues
try:
    import crawl4ai
    logger.info(f"Crawl4AI module location: {crawl4ai.__file__}")
    logger.info(f"Contents of crawl4ai: {dir(crawl4ai)}")
except ImportError:
    logger.error("Crawl4AI module not found")

# Now attempt the import
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, CrawlResult

# Paths and constants
FUNCTION_DIR = os.path.dirname(os.path.realpath(__file__))
CHROME_BINARY = os.path.join(FUNCTION_DIR, "resources/chrome/headless_shell")
CDP_PORT = 9222
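
# The CHROME_BINARY path above implies the deployment bundle ships the browser
# next to this file, roughly like this (layout is an assumption, not verified):
#   main.py
#   resources/chrome/headless_shell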


def start_chrome():
    """Start the Chrome process synchronously and wait for CDP with exponential backoff."""
    logger.debug("Starting Chrome process...")
    chrome_args = [
        CHROME_BINARY,
        f"--remote-debugging-port={CDP_PORT}",
        "--remote-debugging-address=0.0.0.0",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--headless=new",
        "--disable-gpu",
        "--disable-dev-shm-usage",
        "--no-zygote",
        "--single-process",
        "--disable-features=site-per-process",
        "--no-first-run",
        "--disable-extensions",
    ]

    # Run Chrome in its own process group so cleanup() can terminate the whole tree
    process = subprocess.Popen(
        chrome_args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        preexec_fn=os.setsid,
    )

    logger.debug(f"Chrome process started with PID: {process.pid}")

    # Wait for the CDP endpoint with exponential backoff
    wait_time = 1       # Start with 1 second
    max_wait_time = 16  # Cap at 16 seconds per retry
    max_attempts = 10   # Total attempts
    for attempt in range(max_attempts):
        try:
            response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
            if response.status_code == 200:
                # Get the websocket URL from the response
                ws_url = response.json()["webSocketDebuggerUrl"]
                logger.debug("Chrome CDP is ready")
                logger.debug(f"CDP URL: {ws_url}")
                return process
        except requests.exceptions.RequestException:
            pass  # Endpoint not up yet (connection refused or timed out)
        logger.debug(f"Waiting for CDP endpoint (attempt {attempt + 1}/{max_attempts}), retrying in {wait_time} seconds")
        time.sleep(wait_time)
        wait_time = min(wait_time * 2, max_wait_time)  # Double the wait, up to the cap

    # All retries failed: kill Chrome first so communicate() cannot block on a
    # still-running process, then surface its output for debugging
    if process.poll() is None:
        os.killpg(os.getpgid(process.pid), signal.SIGKILL)
    stdout, stderr = process.communicate()
    logger.error(f"Chrome stdout: {stdout.decode()}")
    logger.error(f"Chrome stderr: {stderr.decode()}")
    raise RuntimeError("Chrome CDP endpoint failed to start after retries")
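
# For reference, the /json/version endpoint polled above (and reused in
# fetch_with_crawl4ai below) returns JSON shaped roughly like this
# (abridged; exact fields vary by Chrome build):
#   {"Browser": "HeadlessChrome/...",
#    "webSocketDebuggerUrl": "ws://127.0.0.1:9222/devtools/browser/<id>"}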


async def fetch_with_crawl4ai(url: str) -> dict:
    """Fetch page content using Crawl4AI and return the result as a dict."""
    # Get the CDP websocket URL from the running Chrome instance
    version_response = requests.get(f"http://127.0.0.1:{CDP_PORT}/json/version", timeout=2)
    cdp_url = version_response.json()["webSocketDebuggerUrl"]

    # Configure Crawl4AI to attach to the already-running browser over CDP
    browser_config = BrowserConfig(cdp_url=cdp_url, use_managed_browser=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
        )
        result: CrawlResult = await crawler.arun(url=url, config=crawler_config)
        return result.model_dump()  # Convert the Pydantic model to a dict for the JSON response
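
# Minimal local smoke test (a sketch, not part of the deployed function; assumes
# Chrome is already listening on CDP_PORT and crawl4ai is installed):
#   import asyncio
#   data = asyncio.run(fetch_with_crawl4ai("https://example.com"))
#   print(data.get("markdown", "")[:200])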


# Start Chrome when the module loads so the browser is warm before the first request
logger.info("Starting Chrome process on module load")
chrome_process = start_chrome()


@functions_framework.http
def crawl(request: Request):
    """HTTP Cloud Function to fetch web content using Crawl4AI."""
    try:
        url = request.args.get('url')
        if not url:
            return jsonify({'error': 'URL parameter is required', 'status': 400}), 400

        # Create and run a dedicated asyncio event loop for this request
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

        try:
            result = loop.run_until_complete(
                asyncio.wait_for(fetch_with_crawl4ai(url), timeout=10.0)
            )
            return jsonify({
                'status': 200,
                'data': result
            })
        finally:
            loop.close()

    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        logger.error(error_msg, exc_info=True)
        return jsonify({
            'error': error_msg,
            'status': 500,
            'details': {
                'error_type': type(e).__name__,
                'stack_trace': traceback.format_exc(),  # Full traceback for debugging
                'chrome_running': chrome_process.poll() is None if chrome_process else False
            }
        }), 500
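
# Example invocation once deployed (URL shape is a placeholder):
#   curl "https://<region>-<project>.cloudfunctions.net/crawl?url=https://example.com"
# A successful response looks like {"status": 200, "data": {...CrawlResult fields...}}.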


@atexit.register
def cleanup():
    """Cleanup Chrome process on shutdown."""
    if chrome_process and chrome_process.poll() is None:
        try:
            os.killpg(os.getpgid(chrome_process.pid), signal.SIGTERM)
            logger.info("Chrome process terminated")
        except Exception as e:
            logger.error(f"Failed to terminate Chrome process: {e}")
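
# Deployment sketch (hedged; runtime, memory, and timeout values are assumptions
# to adapt to your project):
#   gcloud functions deploy crawl --gen2 --runtime=python312 --trigger-http \
#     --allow-unauthenticated --memory=1Gi --timeout=60s --entry-point=crawl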