Script the creation of a new VM image (#10390)

Note that I'm using Ubuntu 22.04 as the base image here. We're currently
using Debian, but I think it's probably better to switch to Ubuntu
because a) that's what GitHub's managed runners use and b) that's what
our Docker images use. This is configurable though.

The image setup script is based on the steps I took when manually
setting up the VM as well as scripts shared by another Googler working
on GitHub runner VMs, who demonstrably knows more about Linux than I do.
I do pretty much understand everything the script is doing, so it's not
just copy-pasta though.

Tested: Created an image using this script and deployed it to test
runners. All jobs succeed:
https://github.com/iree-org/iree/actions/runs/3049238365

skip-ci
diff --git a/build_tools/github_actions/runner/gcp/create_image.sh b/build_tools/github_actions/runner/gcp/create_image.sh
new file mode 100755
index 0000000..a1006e2
--- /dev/null
+++ b/build_tools/github_actions/runner/gcp/create_image.sh
@@ -0,0 +1,163 @@
+#!/bin/bash
+
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set -euo pipefail
+
+# Date plus epoch seconds; keeps the default names below distinct between runs.
+TIME_STRING="$(date +%Y-%m-%d-%s)"
+INSTANCE_NAME="${INSTANCE_NAME:-github-runner-template-cpu-${TIME_STRING}}"
+IMAGE_NAME="${IMAGE_NAME:-github-runner-cpu-${TIME_STRING}}"
+ZONE="${ZONE:-us-central1-a}"
+PROJECT=iree-oss
+BASE_IMAGE="${BASE_IMAGE:-projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20220902}"
+# It takes a little bit to bring up ssh on the instance. I haven't found a
+# better way to wait for this than just polling.
+MAX_IP_ATTEMPTS=5
+MAX_SSH_ATTEMPTS=10
+MAX_SCP_ATTEMPTS=5
+
+SCRIPT_DIR="$(dirname -- "$(readlink -f -- "$0")")"
+
+CREATE_INSTANCE_ARGS=(
+  "${INSTANCE_NAME}"
+  --project="${PROJECT}"
+  --zone="${ZONE}"
+  --machine-type=e2-medium
+  # `address=''` indicates an ephemeral IP. This *shouldn't* be necessary here,
+  # as the gcloud docs say that this is the default, but in fact if you leave it
+  # off the VM gets no external IP and is impossible to SSH into. This knowledge
+  # was hard won.
+  --network-interface=network=default,address='',network-tier=PREMIUM
+  --maintenance-policy=MIGRATE
+  --provisioning-model=STANDARD
+  --no-service-account
+  --no-scopes
+  --create-disk="boot=yes,device-name=${INSTANCE_NAME},image=${BASE_IMAGE},mode=rw,size=10,type=projects/${PROJECT}/zones/${ZONE}/diskTypes/pd-balanced"
+  --no-shielded-secure-boot
+  --shielded-vtpm
+  --shielded-integrity-monitoring
+  --reservation-affinity=any
+  --metadata-from-file=startup-script="${SCRIPT_DIR}/image_setup.sh"
+)
+
+# Prints the external IP of the instance; prints nothing if none is assigned.
+# `natIP` is the field name the Compute Engine API uses for that address.
+function get_ip() {
+  gcloud compute instances describe "${INSTANCE_NAME}" --zone="${ZONE}" \
+    --format='value(networkInterfaces[0].accessConfigs[0].natIP)'
+}
+
+# Runs the no-op `:` on the instance over ssh; the exit status is gcloud's.
+function ssh_ping() {
+  gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}" --command=":"
+}
+
+# Polls once a second until get_ip prints an address. $1 is the number of
+# failed polls tolerated; exits the script with status 1 once that is exceeded.
+function wait_for_ip() {
+  local -i max_attempts="$1"
+  local -i failed_attempts=0
+  while (( failed_attempts <= max_attempts )) && [[ -z "$(get_ip)" ]]; do
+    echo -n '.'
+    failed_attempts="$(( failed_attempts+1 ))"
+    sleep 1
+  done
+  if (( failed_attempts > max_attempts )); then
+    echo "Instance was never assigned an external IP. Aborting"
+    exit 1
+  fi
+}
+
+# Polls ssh_ping every second; past $1 failures, prints its output and exits 1.
+function wait_for_ssh() {
+  local -i max_attempts="$1"
+  local -i failed_attempts=0
+  local ssh_output=""
+  while (( failed_attempts <= max_attempts )) && ! ssh_output="$(ssh_ping 2>&1)"; do
+    echo -n '.'
+    failed_attempts="$(( failed_attempts+1 ))"
+    sleep 1
+  done
+  if (( failed_attempts > max_attempts )); then
+    echo "Failed to connect to instance via ssh. Output from ssh command:"
+    echo "${ssh_output}"
+    exit 1
+  fi
+}
+
+# Boots a temporary instance, waits for its startup script to log
+# "Setup complete", images its boot disk as IMAGE_NAME, and deletes it.
+function create_image() {
+  echo "Creating instance for boot disk"
+  (set -x; gcloud compute instances create "${CREATE_INSTANCE_ARGS[@]}")
+
+  # We could only use the ssh check below, but it's much nicer to know why an
+  # instance isn't responsive and this is something we can check first.
+  echo "Waiting for instance to start up"
+  wait_for_ip "${MAX_IP_ATTEMPTS}"
+  wait_for_ssh "${MAX_SSH_ATTEMPTS}"
+
+  local log_file
+  log_file="$(mktemp)"
+
+  echo ""
+  echo "Streaming startup logs from instance"
+  tail -f "${log_file}" &
+  local -i tail_pid=$!
+  local -i failed_scp_attempts=0
+  local last_line=""
+  local scp_output=""
+  # Is waiting for a certain line in the logs kind of hacky? yes
+  # Is there a better way to do it? probably
+  # Does the better way involve a bunch of fiddling about? also probably
+  while (( failed_scp_attempts < MAX_SCP_ATTEMPTS )) && [[ "${last_line}" != "Setup complete" ]]; do
+    if scp_output="$(gcloud compute scp --zone="${ZONE}" \
+      "${INSTANCE_NAME}:/startup.log" "${log_file}" 2>&1)"; then
+      last_line="$(tail --lines=1 "${log_file}")"
+    else
+      failed_scp_attempts="$(( failed_scp_attempts+1 ))"
+      sleep 1
+    fi
+  done
+
+  # Stop the background tail (fine if it already exited) and drop the local copy.
+  { kill "${tail_pid}" && wait "${tail_pid}"; } 2>/dev/null || true
+  rm -f -- "${log_file}"
+
+  if (( failed_scp_attempts >= MAX_SCP_ATTEMPTS )); then
+    echo "Was unable to copy logs from instance. Output from scp:"
+    echo "${scp_output}"
+    exit 1
+  fi
+  # NOTE(review): unreachable as written; the loop ends only on scp failure or success.
+  if [[ "${last_line}" != "Setup complete" ]]; then
+    echo "Instance did not complete its setup. Please check the logs above."
+    exit 1
+  fi
+
+  echo "Startup finished successfully."
+  echo "Deleting log file"
+  gcloud compute ssh "${INSTANCE_NAME}" --zone="${ZONE}" \
+    --no-user-output-enabled --command="sudo rm /startup.log"
+
+  echo "Shutting down instance"
+  # This actually does things synchronously, so we don't need our own loop to
+  # wait.
+  gcloud compute instances stop "${INSTANCE_NAME}" --zone="${ZONE}"
+
+  echo "Creating disk image"
+  gcloud compute images create "${IMAGE_NAME}" \
+    --source-disk="${INSTANCE_NAME}" --source-disk-zone="${ZONE}"
+
+  echo "Deleting instance"
+  gcloud compute instances delete "${INSTANCE_NAME}" --zone="${ZONE}" --quiet
+
+  echo "Successfully created image: ${IMAGE_NAME}"
+}
+
+create_image
diff --git a/build_tools/github_actions/runner/gcp/image_setup.sh b/build_tools/github_actions/runner/gcp/image_setup.sh
new file mode 100644
index 0000000..f942241
--- /dev/null
+++ b/build_tools/github_actions/runner/gcp/image_setup.sh
@@ -0,0 +1,157 @@
+#!/bin/bash
+
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# This is the series of commands run on a VM from a fresh image in order to
+# set up the disk to be used as a boot image. This script must be run as root.
+
+set -v           # verbose: echo each input line as the shell reads it
+set -x           # xtrace: echo each command after it has been expanded
+set -e           # errexit: abort as soon as a command fails
+set -E           # errtrace: let functions and subshells inherit the ERR trap
+set -o pipefail  # a pipeline fails if any of its stages fails
+set -u           # nounset: expanding an unset variable is an error
+
+
+# Sets up apt, the runner user, and Docker, then scrubs caches and logs.
+function startup() {
+  #################################### APT #####################################
+  # Disable apt prompts
+  export DEBIAN_FRONTEND="noninteractive"
+
+  # Disable automatic updates and upgrades. These are ephemeral machines. We don't
+  # want the latency or inconsistency of automatic updates.
+  systemctl stop apt-daily.timer
+  systemctl disable apt-daily.timer
+  systemctl disable apt-daily.service
+  systemctl stop apt-daily-upgrade.timer
+  systemctl disable apt-daily-upgrade.timer
+  systemctl disable apt-daily-upgrade.service
+
+  # Don't install documentation (except copyrights) since this is a CI system.
+  cat > /etc/dpkg/dpkg.cfg.d/github-actions <<EOF
+force-all
+no-pager
+# don't install docs
+path-exclude /usr/share/doc/*
+path-exclude /usr/share/man/*
+path-exclude /usr/share/groff/*
+path-exclude /usr/share/info/*
+# keep copyright files for legal reasons
+path-include /usr/share/doc/*/copyright
+EOF
+
+  # Provide default apt options like --assume-yes and --quiet since this is
+  # designed to run on CI.
+  cat > /etc/apt/apt.conf.d/github-actions <<EOF
+APT {
+  Install-Recommends "false";
+  HideAutoRemove "true";
+}
+Aptitude {
+  CmdLine {
+    Assume-Yes "true";
+  }
+}
+Acquire {
+  Retries "5";
+}
+DPkg {
+  Use-Pty "0";
+  Options {
+    "--force-confdef";
+    "--force-confnew";
+    "--force-confold";
+  }
+}
+Quiet "2";
+EOF
+
+  # Install apt-fast for parallel apt package installation.
+  add-apt-repository -y ppa:apt-fast/stable
+  apt-get update
+  apt-get install apt-fast
+  apt-get upgrade
+  apt-get dist-upgrade
+  apt-get full-upgrade
+  # Install common deps.
+  apt-get install \
+    apt-transport-https \
+    aria2 \
+    ca-certificates \
+    curl \
+    git \
+    gnupg2 \
+    jq \
+    lsb-release \
+    software-properties-common
+
+  ########################### Create the runner user ###########################
+
+  # GCE "helpfully" creates users for apparently any account that has ever
+  # logged in on any VM. Delete it if it's there.
+  userdel --force --remove runner || true
+  adduser --system --group "runner"
+  groupadd docker
+  usermod --append --groups docker runner
+  usermod --append --groups sudo runner
+  groups runner # Print out the groups of runner to verify this worked
+
+  echo "enabling passwordless sudo for runner user"
+  echo "runner ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/runner
+
+  ############################### Install Docker ###############################
+
+  # Remove Docker stuff that may already be installed, proceeding if they're not.
+  apt-get remove containerd docker docker-engine docker.io moby-engine moby-cli runc || true
+
+  # Install the latest Docker
+  curl -sfSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
+  echo \
+    "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
+    $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list
+  apt-get update
+  apt-get install docker-ce docker-ce-cli containerd.io
+
+  # Enable docker.service. This script already runs as root, so no sudo.
+  systemctl enable docker.service
+  systemctl start docker.service
+  systemctl enable containerd.service
+  systemctl start containerd.service
+
+  # Docker daemon takes time to come up after installing. Poll for up to ~30s.
+  for i in $(seq 1 30); do
+    if docker info; then
+      break
+    fi
+    sleep 1
+  done
+
+  # Make sure the runner user can use docker
+  runuser --user runner -- docker ps
+
+  ################################### Cleanup ####################################
+
+  apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+  rm -rf /var/lib/dhcp/*
+
+  # Delete unnecessary log files
+  find /var/log -type f -regex ".*\.gz$" -delete
+  find /var/log -type f -regex ".*\.[0-9]$" -delete
+
+  # Clear all journal files
+  journalctl --rotate --vacuum-time=1s
+
+  # And clear others
+  find /var/log/ -type f -exec truncate -s 0 {} \;
+
+  # This specific log line is load bearing, as it's referenced in create_image.sh
+  echo "Setup complete"
+}
+
+startup 2>&1 | tee /startup.log