|  | #!/bin/bash | 
|  |  | 
|  | # Copyright 2022 The IREE Authors | 
|  | # | 
|  | # Licensed under the Apache License v2.0 with LLVM Exceptions. | 
|  | # See https://llvm.org/LICENSE.txt for license information. | 
|  | # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | 
|  |  | 
# Strict mode for the whole script:
#   -e (errexit)   exit as soon as any command fails
#   -E (errtrace)  make functions and subshells inherit the ERR trap
#   -u (nounset)   treat expansion of an undefined variable as an error
#   pipefail       a pipeline fails if any of its stages fails
set -Eeuo pipefail
|  |  | 
|  |  | 
|  |  | 
# Whether the builder instance is deleted once the script finishes. The rest of
# the script only deletes when the relevant flag equals 1; otherwise the
# instance is kept and a reminder to clean it up is printed.
SUCCESS_DELETE_INSTANCE=1
FAILURE_DELETE_INSTANCE=0

# Kind of runner to build an image for. Overridable from the environment and
# normalized to lower case.
: "${RUNNER_TYPE:=cpu}"
RUNNER_TYPE="${RUNNER_TYPE,,}"

# The timestamp makes the default instance name unique per invocation. The
# image name is the instance name with the first "-template" removed.
TIME_STRING="$(date +%Y-%m-%d-%s)"
: "${INSTANCE_NAME:=github-runner-template-${RUNNER_TYPE}-${TIME_STRING}}"
IMAGE_NAME="${INSTANCE_NAME/-template/}"
: "${ZONE:=us-central1-a}"
PROJECT=iree-oss

# Only arm64 runners need an architecture suffix on the base image name.
BASE_IMAGE_ARCH=""
if [[ "${RUNNER_TYPE}" == "arm64" ]]; then
  BASE_IMAGE_ARCH="-arm64"
fi
: "${BASE_IMAGE:=projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy${BASE_IMAGE_ARCH}-v20231030}"

# We create the image using n1 machines with attached T4 GPUs. This image works
# for the A100 machines as well though.
GPU_MACHINE_TYPE="n1-standard-16"
X86_64_MACHINE_TYPE="e2-medium"
ARM64_MACHINE_TYPE="t2a-standard-8"

# Boot disk sizes. The GPU image needs enough space to fetch the Docker images
# that we test with.
# TODO(gcmn): See if we can make the image smaller, e.g. by resizing after setup
# or using a local ssd for scratch space during setup.
CPU_IMAGE_SIZE_GB=10
GPU_IMAGE_SIZE_GB=100

# It takes a little while for ssh to come up on the instance, and polling is
# the best way found so far to wait for it. These bound the polling loops.
MAX_IP_ATTEMPTS=5
MAX_SSH_ATTEMPTS=10
MAX_SCP_ATTEMPTS=5
|  |  | 
# Command that deletes the builder instance. Kept as an array so that it can be
# both executed and echoed to the user as a copy-pasteable reminder.
DELETE_INSTANCE_CMD=(
  gcloud compute instances delete "${INSTANCE_NAME}" --zone="${ZONE}"
)

# Command that opens an ssh session on the builder instance; echoed to the user
# as a debugging hint.
SSH_CMD=(
  gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}"
  --no-user-output-enabled
)
|  |  | 
# Prints the commands for ssh-ing into the builder instance and for deleting
# it, so a user left with a live instance knows how to debug and clean up.
# Globals: SSH_CMD, DELETE_INSTANCE_CMD, INSTANCE_NAME (all read only).
# Outputs: four lines on stdout.
function cleanup_reminder() {
  # Join array elements with single spaces regardless of the caller's IFS.
  local IFS=' '
  printf '%s\n' \
    "You can ssh in to debug with the following command:" \
    "${SSH_CMD[*]}" \
    "Make sure to delete ${INSTANCE_NAME} when you're done debugging:" \
    "${DELETE_INSTANCE_CMD[*]}"
}
|  |  | 
# Trap handler: exits the script with the status that was current when the
# handler was entered. On a non-zero status it first reports the failure and
# then either deletes the builder instance (FAILURE_DELETE_INSTANCE equal to 1)
# or prints the cleanup reminder.
# Globals: FAILURE_DELETE_INSTANCE, INSTANCE_NAME, DELETE_INSTANCE_CMD (read).
function failure_exit() {
  # Must be the first statement so that $? is still the triggering status.
  local status="$?"
  # Clear the traps so that the exits below do not re-enter this handler.
  trap - INT ERR EXIT

  if (( status == 0 )); then
    exit "${status}"
  fi

  echo "Image creation was not successful."
  if (( FAILURE_DELETE_INSTANCE != 1 )); then
    cleanup_reminder
    exit "${status}"
  fi

  echo "Attempting to delete instance ${INSTANCE_NAME}"
  "${DELETE_INSTANCE_CMD[@]}" --quiet
  exit "${status}"
}
|  |  | 
# Route interrupts, command failures and normal termination through
# failure_exit.
trap failure_exit INT
trap failure_exit ERR
trap failure_exit EXIT

# Directory containing this script, with symlinks resolved.
SCRIPT_DIR="$(dirname -- "$(readlink -f -- "$0")")"
|  |  | 
# Asks gcloud to describe the builder instance, formatted down to the `ip`
# field of the first access config on the first network interface.
# Globals: INSTANCE_NAME, ZONE (read).
# Outputs: whatever gcloud prints for that field (nothing if it is not set).
function get_ip() {
  local -a describe_args=(
    compute instances describe
    "${INSTANCE_NAME}"
    --zone="${ZONE}"
    --format='value(networkInterfaces[0].accessConfigs[0].ip)'
  )
  gcloud "${describe_args[@]}"
}
|  |  | 
# Runs `gcloud compute ssh` against the builder instance with gcloud's own
# user output disabled.
# Globals:   INSTANCE_NAME, ZONE (read).
# Arguments: extra arguments appended to the gcloud command line
#            (e.g. --command=...).
# Returns:   the exit status of gcloud.
function instance_ssh() {
  local -a ssh_args=(
    compute ssh "${INSTANCE_NAME}"
    --zone="${ZONE}"
    --no-user-output-enabled
    "$@"
  )
  gcloud "${ssh_args[@]}"
}
|  |  | 
# Checks ssh connectivity by running the shell no-op `:` on the instance.
# Returns: the exit status of instance_ssh.
function ssh_ping() {
  local -r noop_command=":"
  instance_ssh "--command=${noop_command}"
}
|  |  | 
|  |  | 
# Polls get_ip once per second until it prints a non-empty value, printing a
# '.' for every empty poll.
# Arguments: $1 - number of empty polls tolerated; the poll is retried while
#                 the failure count is <= this value, so up to $1 + 1 polls run.
# Outputs:   progress dots and, on failure, an error message on stdout.
# Exits:     with status 1 if no IP shows up within the allowed polls.
function wait_for_ip() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  # get_ip has to be *invoked* here: comparing the bare word `get_ip` against
  # the empty string is always false and would skip the wait entirely. A
  # failing get_ip prints nothing and is therefore treated as "no IP yet".
  while (( failed_attempts <= max_attempts )) && [[ -z "$(get_ip)" ]]; do
    echo -n '.'
    failed_attempts="$(( failed_attempts + 1 ))"
    sleep 1
  done

  if (( failed_attempts > max_attempts )); then
    echo "Instance was never assigned an external IP. Aborting"
    exit 1
  fi
}
|  |  | 
# Retries ssh_ping until it succeeds, printing a '.' for every failed attempt.
# Arguments: $1 - number of failed pings tolerated; the ping is retried while
#                 the failure count is <= this value, so up to $1 + 1 pings run.
# Outputs:   progress dots and, on failure, an error message followed by the
#            combined stdout/stderr of the last ping, all on stdout.
# Exits:     with status 1 if ssh never becomes reachable.
function wait_for_ssh() {
  local -i max_attempts="$1"
  local -i failed_attempts=0
  # Holds the output of the most recent ping. Declared local so that it does
  # not leak into the caller's scope.
  local ssh_output=""
  while (( failed_attempts <= max_attempts )) \
      && ! ssh_output="$(ssh_ping 2>&1)"; do
    echo -n '.'
    failed_attempts="$(( failed_attempts + 1 ))"
  done

  if (( failed_attempts > max_attempts )); then
    echo "Failed to connect to instance via ssh. Output from ssh command:"
    echo "${ssh_output}"
    exit 1
  fi
}
|  |  | 
# Builds a runner disk image:
#   1. reuses the instance named INSTANCE_NAME if `gcloud compute instances
#      describe` finds it, otherwise creates it with image_setup.sh as its
#      startup script;
#   2. waits for an external IP and for ssh;
#   3. streams /startup.log from the instance until the startup script's
#      process exits, and checks the code stored in /startup-exit.txt;
#   4. stops the instance, creates IMAGE_NAME from its boot disk and, if
#      SUCCESS_DELETE_INSTANCE is 1, deletes the instance.
# Globals (read): INSTANCE_NAME, IMAGE_NAME, ZONE, PROJECT, RUNNER_TYPE,
#   BASE_IMAGE, SCRIPT_DIR, *_MACHINE_TYPE, *_IMAGE_SIZE_GB, MAX_IP_ATTEMPTS,
#   MAX_SSH_ATTEMPTS, SUCCESS_DELETE_INSTANCE, DELETE_INSTANCE_CMD.
# Exits: non-zero if RUNNER_TYPE is unrecognized or the startup script's exit
#   code is missing or non-zero.
function create_image() {
  if gcloud compute instances describe "${INSTANCE_NAME}" --zone="${ZONE}" \
      > /dev/null 2>&1; then
    echo "Using existing instance '${INSTANCE_NAME}'"
  else
    echo "Creating instance '${INSTANCE_NAME}' for boot disk"
    local machine_type=""
    local image_size_gb=""
    local maintenance_policy=""
    local -a extra_args=()
    case "${RUNNER_TYPE}" in
      cpu)
        machine_type="${X86_64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      arm64)
        machine_type="${ARM64_MACHINE_TYPE}"
        image_size_gb="${CPU_IMAGE_SIZE_GB}"
        maintenance_policy=MIGRATE
        ;;
      gpu)
        machine_type="${GPU_MACHINE_TYPE}"
        image_size_gb="${GPU_IMAGE_SIZE_GB}"
        maintenance_policy=TERMINATE
        extra_args=("--accelerator=count=1,type=nvidia-tesla-t4")
        ;;
      *)
        echo "Unrecognized RUNNER_TYPE=${RUNNER_TYPE}"
        exit 1
        ;;
    esac

    local -a create_instance_cmd=(
      gcloud
      compute
      instances
      create
      "${INSTANCE_NAME}"
      --project="${PROJECT}"
      --zone="${ZONE}"
      # `address=''` indicates an ephemeral IP. This *shouldn't* be necessary here,
      # as the gcloud docs say that this is the default, but in fact if you leave it
      # off the VM gets no external IP and is impossible to SSH into. This knowledge
      # was hard won.
      --network-interface=network=default,address='',network-tier=PREMIUM
      --provisioning-model=STANDARD
      --no-service-account
      --no-scopes
      --no-shielded-secure-boot
      --shielded-vtpm
      --shielded-integrity-monitoring
      --reservation-affinity=any
      --metadata-from-file=startup-script="${SCRIPT_DIR}/image_setup.sh"
      --maintenance-policy="${maintenance_policy}"
      --metadata="github-runner-type=${RUNNER_TYPE}"
      --machine-type="${machine_type}"
      --create-disk="boot=yes,device-name=${INSTANCE_NAME},image=${BASE_IMAGE},mode=rw,size=${image_size_gb},type=projects/${PROJECT}/zones/${ZONE}/diskTypes/pd-balanced,auto-delete=yes"
      # The `+` guard keeps an empty array from tripping `nounset` on
      # bash versions older than 4.4.
      ${extra_args[@]+"${extra_args[@]}"}
    )

    # Trace the exact command in a subshell so that tracing stays local to it.
    (set -x; "${create_instance_cmd[@]}")
  fi

  echo "Waiting for instance to start up"
  # We could use only the ssh check below, but it's much nicer to know why an
  # instance isn't responsive and this is something we can check first.
  wait_for_ip "${MAX_IP_ATTEMPTS}"
  wait_for_ssh "${MAX_SSH_ATTEMPTS}"

  echo ""
  # Declarations are kept separate from the command substitutions so that a
  # failing command is not masked by the exit status of `local`.
  local log_file=""
  log_file="$(mktemp --tmpdir "${INSTANCE_NAME}.XXX.startup.log")"
  echo "Streaming startup logs from instance to stdout and ${log_file}"

  # Get the PID of the startup script
  local startup_pid=""
  startup_pid="$(instance_ssh --command='systemctl show --property=ExecMainPID --value google-startup-scripts')"

  echo ""
  echo "*******************"

  # -t forces a pseudo-tty which allows us to run tail with a follow.
  # --pid makes tail terminate once the startup script's process has exited.
  gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}" \
    --no-user-output-enabled --ssh-flag="-t" \
    --command="tail --follow=name --retry --lines=+1 --pid=${startup_pid} /startup.log" \
    | tee "${log_file}"

  echo "*******************"
  echo ""

  # A failed read is deliberately tolerated here: it leaves a non-numeric value
  # that the check below reports with a dedicated message.
  local startup_exit_code=""
  startup_exit_code="$(instance_ssh --command="cat /startup-exit.txt")" || true

  if [[ ! "${startup_exit_code}" =~ ^[0-9]+$ ]]; then
    echo "Failed to retrieve exit code from startup script (got '${startup_exit_code}')."
    exit 1
  fi

  if (( startup_exit_code != 0 )); then
    echo "Image setup failed with code '${startup_exit_code}'. See logs above."
    exit "${startup_exit_code}"
  fi

  echo "Startup finished successfully."

  echo "Deleting remote log file"
  instance_ssh --command="sudo rm /startup.log"

  echo "Shutting down instance"
  # This actually does things synchronously, so we don't need our own loop to
  # wait.
  gcloud compute instances stop "${INSTANCE_NAME}" --zone="${ZONE}"

  echo "Creating disk image"
  gcloud compute images create "${IMAGE_NAME}" \
    --source-disk="${INSTANCE_NAME}" \
    --source-disk-zone="${ZONE}"

  if (( SUCCESS_DELETE_INSTANCE == 1 )); then
    echo "Deleting instance"
    "${DELETE_INSTANCE_CMD[@]}" --quiet
  else
    echo "Not deleting instance because SUCCESS_DELETE_INSTANCE=${SUCCESS_DELETE_INSTANCE}"
    cleanup_reminder
  fi

  echo "Successfully created image: ${IMAGE_NAME}"
}
|  |  | 
|  | create_image |