blob: 956d9ae5f73033b773dec3d2e4b8b7d34a19cf86 [file] [log] [blame]
#!/bin/bash
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# Strict mode for the whole script:
#   errexit  - exit if any command fails
#   errtrace - make the ERR trap inherited
#   pipefail - a pipeline returns an error if any part of it errors
#   nounset  - error if an undefined variable is used
set -o errexit -o errtrace -o pipefail -o nounset
# Whether the instance is deleted once the script finishes: a value of 1 means
# "delete", anything else leaves the instance around for debugging.
SUCCESS_DELETE_INSTANCE=1
FAILURE_DELETE_INSTANCE=0

# Runner flavor. Overridable from the environment, defaults to "cpu", and is
# normalized to lower case before use.
: "${RUNNER_TYPE:=cpu}"
RUNNER_TYPE="${RUNNER_TYPE,,}"

# The timestamp keeps auto-generated instance/image names unique per run.
TIME_STRING="$(date '+%Y-%m-%d-%s')"
: "${INSTANCE_NAME:=github-runner-template-${RUNNER_TYPE}-${TIME_STRING}}"
# The image is named after the instance with the first "-template" removed.
IMAGE_NAME="${INSTANCE_NAME/-template/}"
: "${ZONE:=us-central1-a}"
PROJECT=iree-oss

# Only arm64 runners need an architecture suffix in the base image name.
if [[ "${RUNNER_TYPE}" == "arm64" ]]; then
  BASE_IMAGE_ARCH="-arm64"
else
  BASE_IMAGE_ARCH=""
fi
: "${BASE_IMAGE:=projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy${BASE_IMAGE_ARCH}-v20231030}"

# We create the image using n1 machines with attached T4 GPUs. This image works
# for the A100 machines as well though.
GPU_MACHINE_TYPE="n1-standard-16"
X86_64_MACHINE_TYPE="e2-medium"
ARM64_MACHINE_TYPE="t2a-standard-8"

CPU_IMAGE_SIZE_GB=10
# We need enough space to fetch Docker images that we test with
# TODO(gcmn): See if we can make the image smaller, e.g. by resizing after setup
# or using a local ssd for scratch space during setup.
GPU_IMAGE_SIZE_GB=100

# It takes a little while to bring up ssh on the instance. I haven't found a
# better way to wait for this than just polling, so these bound the polling.
MAX_IP_ATTEMPTS=5
MAX_SSH_ATTEMPTS=10
MAX_SCP_ATTEMPTS=5
# Command (as an argv array) that deletes the build instance. Callers append
# extra flags such as --quiet when invoking it.
DELETE_INSTANCE_CMD=(gcloud compute instances delete "${INSTANCE_NAME}" --zone="${ZONE}")

# Command (as an argv array) that opens an ssh session on the build instance.
# It is printed for the user as a debugging hint.
SSH_CMD=(gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}" --no-user-output-enabled)
# Prints the ssh command for debugging the instance and the command to delete
# it afterwards, one item per line on stdout.
# Globals (read): SSH_CMD, DELETE_INSTANCE_CMD, INSTANCE_NAME
function cleanup_reminder() {
  # "${arr[*]}" joins the argv with spaces (default IFS), which is how the
  # commands are shown to the user.
  printf '%s\n' \
    "You can ssh in to debug with the following command:" \
    "${SSH_CMD[*]}" \
    "Make sure to delete ${INSTANCE_NAME} when you're done debugging:" \
    "${DELETE_INSTANCE_CMD[*]}"
}
# Trap handler for INT, ERR and EXIT. Captures the status of the command that
# triggered it and always exits with that same status. On a non-zero status it
# either deletes the instance (FAILURE_DELETE_INSTANCE == 1) or prints
# instructions for debugging and cleaning it up manually.
# Globals (read): FAILURE_DELETE_INSTANCE, INSTANCE_NAME, DELETE_INSTANCE_CMD
function failure_exit() {
  # Must be the first statement: any other command would overwrite $?.
  local status="$?"
  # Clear the traps so this handler cannot be re-entered while it runs.
  trap - INT ERR EXIT

  if (( status == 0 )); then
    exit "${status}"
  fi

  echo "Image creation was not successful."
  if (( FAILURE_DELETE_INSTANCE == 1 )); then
    echo "Attempting to delete instance ${INSTANCE_NAME}"
    "${DELETE_INSTANCE_CMD[@]}" --quiet
  else
    cleanup_reminder
  fi
  exit "${status}"
}
# Run failure_exit when the script is interrupted, when a command fails, and on
# any exit, so a leftover instance is always either deleted or reported.
trap failure_exit INT ERR EXIT

# Directory containing this script, with symlinks in its path resolved.
SCRIPT_DIR="$(dirname -- "$(readlink -f -- "$0")")"
# Prints the value gcloud reports for the first access config of the
# instance's first network interface (its `ip` field); used by the script as
# the instance's external IP. The exit status is that of gcloud.
# Globals (read): INSTANCE_NAME, ZONE
function get_ip() {
  local -a describe_cmd=(
    gcloud compute instances describe
    "${INSTANCE_NAME}"
    --zone="${ZONE}"
    --format='value(networkInterfaces[0].accessConfigs[0].ip)'
  )
  "${describe_cmd[@]}"
}
# Runs `gcloud compute ssh` against the build instance with gcloud's own user
# output disabled. All arguments are forwarded to gcloud unchanged
# (e.g. --command=...). The exit status is that of gcloud.
# Globals (read): INSTANCE_NAME, ZONE
function instance_ssh() {
  local -a ssh_args=(
    "${INSTANCE_NAME}"
    --zone="${ZONE}"
    --no-user-output-enabled
    "$@"
  )
  gcloud compute ssh "${ssh_args[@]}"
}
# Checks that the instance accepts ssh connections by running the shell no-op
# `:` on it. Returns the exit status of the ssh invocation.
function ssh_ping() {
  local -r noop_command=":"
  instance_ssh --command="${noop_command}"
}
# Polls once per second until the instance reports an external IP.
# Arguments:
#   $1 - maximum number of failed polls tolerated; the lookup is attempted up
#        to $1 + 1 times.
# Outputs: a '.' on stdout for every failed poll, plus an error message if the
#   IP never appears.
# Returns: 0 as soon as get_ip prints a non-empty value; otherwise exits the
#   script with status 1.
function wait_for_ip() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  local ip=""
  while (( failed_attempts <= max_attempts )); do
    # get_ip must actually be invoked: comparing the bare word `get_ip` to ""
    # is always false and would skip the wait entirely. A failed lookup is
    # treated like "no IP yet" so that it is retried instead of aborting.
    ip="$(get_ip)" || ip=""
    if [[ -n "${ip}" ]]; then
      return 0
    fi
    echo -n '.'
    failed_attempts=$(( failed_attempts + 1 ))
    sleep 1
  done
  echo "Instance was never assigned an external IP. Aborting"
  exit 1
}
# Retries ssh_ping until the instance accepts an ssh connection.
# Arguments:
#   $1 - maximum number of failed attempts tolerated; the ping is attempted up
#        to $1 + 1 times.
# Outputs: a '.' on stdout for every failed attempt; on giving up, an error
#   message followed by the output of the last ssh attempt.
# Returns: 0 once a ping succeeds; otherwise exits the script with status 1.
function wait_for_ssh() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  # Holds the combined stdout/stderr of the most recent attempt. Declared
  # local so it does not leak into the global scope.
  local ssh_output=""
  while (( failed_attempts <= max_attempts )) && ! ssh_output="$(ssh_ping 2>&1)"; do
    echo -n '.'
    failed_attempts=$(( failed_attempts + 1 ))
  done
  if (( failed_attempts > max_attempts )); then
    echo "Failed to connect to instance via ssh. Output from ssh command:"
    echo "${ssh_output}"
    exit 1
  fi
}
# Builds the runner disk image end to end:
#   1. creates the template instance (or reuses one with the same name),
#   2. waits for it to get an IP and accept ssh,
#   3. streams the startup-script log until that script's process exits,
#   4. checks the exit code the startup script left in /startup-exit.txt,
#   5. stops the instance and creates ${IMAGE_NAME} from its boot disk,
#   6. deletes the instance if SUCCESS_DELETE_INSTANCE == 1.
# Globals (read): INSTANCE_NAME, IMAGE_NAME, ZONE, PROJECT, RUNNER_TYPE,
#   BASE_IMAGE, SCRIPT_DIR, *_MACHINE_TYPE, *_IMAGE_SIZE_GB, MAX_IP_ATTEMPTS,
#   MAX_SSH_ATTEMPTS, SUCCESS_DELETE_INSTANCE, DELETE_INSTANCE_CMD
# Exits non-zero on an unrecognized RUNNER_TYPE, when the startup exit code
# cannot be read, or with the startup script's own code when setup failed.
function create_image() {
  if gcloud compute instances describe "${INSTANCE_NAME}" --zone="${ZONE}" > /dev/null 2>&1; then
    echo "Using existing instance '${INSTANCE_NAME}'"
  else
    echo "Creating instance '${INSTANCE_NAME}' for boot disk"
    local machine_type image_size_gb maintenance_policy
    local -a extra_args=()
    case "${RUNNER_TYPE}" in
      cpu)
        machine_type="${X86_64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      arm64)
        machine_type="${ARM64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      gpu)
        machine_type="${GPU_MACHINE_TYPE}"
        image_size_gb="${GPU_IMAGE_SIZE_GB}"
        maintenance_policy=TERMINATE
        extra_args=("--accelerator=count=1,type=nvidia-tesla-t4")
        ;;
      *)
        echo "Unrecognized RUNNER_TYPE=${RUNNER_TYPE}"
        exit 1
        ;;
    esac

    local -a create_instance_cmd=(
      gcloud
      compute
      instances
      create
      "${INSTANCE_NAME}"
      # Use the PROJECT setting rather than a second hard-coded copy of it, so
      # it cannot drift from the disk type path below.
      --project="${PROJECT}"
      --zone="${ZONE}"
      # `address=''` indicates an ephemeral IP. This *shouldn't* be necessary here,
      # as the gcloud docs say that this is the default, but in fact if you leave it
      # off the VM gets no external IP and is impossible to SSH into. This knowledge
      # was hard won.
      --network-interface=network=default,address='',network-tier=PREMIUM
      --provisioning-model=STANDARD
      --no-service-account
      --no-scopes
      --no-shielded-secure-boot
      --shielded-vtpm
      --shielded-integrity-monitoring
      --reservation-affinity=any
      --metadata-from-file=startup-script="${SCRIPT_DIR}/image_setup.sh"
      --maintenance-policy="${maintenance_policy}"
      --metadata="github-runner-type=${RUNNER_TYPE}"
      --machine-type="${machine_type}"
      --create-disk="boot=yes,device-name=${INSTANCE_NAME},image=${BASE_IMAGE},mode=rw,size=${image_size_gb},type=projects/${PROJECT}/zones/${ZONE}/diskTypes/pd-balanced,auto-delete=yes"
      "${extra_args[@]}"
    )
    # Trace the exact command in a subshell so tracing stays scoped to it.
    (set -x; "${create_instance_cmd[@]}")
  fi

  echo "Waiting for instance to start up"
  # We could only use the ssh check below, but it's much nicer to know why an
  # instance isn't responsive and this is something we can check first.
  wait_for_ip "${MAX_IP_ATTEMPTS}"
  wait_for_ssh "${MAX_SSH_ATTEMPTS}"
  echo ""

  # Declaration and assignment are kept separate so that a failing command
  # substitution is not masked by the exit status of `local`.
  local log_file
  log_file="$(mktemp --tmpdir "${INSTANCE_NAME}.XXX.startup.log")"
  echo "Streaming startup logs from instance to stdout and ${log_file}"

  # Get the PID of the startup script
  local startup_pid
  startup_pid="$(instance_ssh --command='systemctl show --property=ExecMainPID --value google-startup-scripts')"

  echo ""
  echo "*******************"
  # -t forces a pseudo-tty which allows us to run tail with a follow. The
  # --pid option makes tail stop once the startup script's process is gone.
  gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}" \
    --no-user-output-enabled --ssh-flag="-t" \
    --command="tail --follow=name --retry --lines=+1 --pid=${startup_pid} /startup.log" \
    | tee "${log_file}"
  echo "*******************"
  echo ""

  # A failed read is deliberately tolerated here: whatever was captured then
  # fails the numeric check below, which reports a clearer error.
  local exit_code
  exit_code="$(instance_ssh --command="cat /startup-exit.txt")" || true
  if [[ "${exit_code}" != +([0-9]) ]]; then
    echo "Failed to retrieve exit code from startup script (got '${exit_code}')."
    exit 1
  fi
  if (( exit_code != 0 )); then
    echo "Image setup failed with code '${exit_code}'. See logs above."
    exit "${exit_code}"
  fi
  echo "Startup finished successfully."

  echo "Deleting remote log file"
  instance_ssh --command="sudo rm /startup.log"

  echo "Shutting down instance"
  # This actually does things synchronously, so we don't need our own loop to
  # wait.
  gcloud compute instances stop "${INSTANCE_NAME}" --zone="${ZONE}"

  echo "Creating disk image"
  gcloud compute images create "${IMAGE_NAME}" \
    --source-disk="${INSTANCE_NAME}" \
    --source-disk-zone="${ZONE}"

  if (( SUCCESS_DELETE_INSTANCE == 1 )); then
    echo "Deleting instance"
    "${DELETE_INSTANCE_CMD[@]}" --quiet
  else
    echo "Not deleting instance because SUCCESS_DELETE_INSTANCE=${SUCCESS_DELETE_INSTANCE}"
    cleanup_reminder
  fi
  echo "Successfully created image: ${IMAGE_NAME}"
}
create_image