Enable custom scale-down of runner instances (#11688)

Currently, instances only delete themselves when they finish their job.
This doesn't help if the group has scaled up too much and a bunch of
instances are left sitting idle. This introduces a periodic check: if the
runner is idle and the autoscaler thinks the group should scale down, the
runner is stopped.
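
For reviewers, here's the flow in outline (a simplified sketch of the new
`stop_idle_runner.sh` further down in the diff; `nice_curl`, `get_attribute`,
and `get_metadata` are helpers from the existing `functions.sh`):

```bash
# 1. Ask the local health server whether the runner is busy.
#    It now reports "active" or "idle" based on the runner's job log.
runner_status="$(nice_curl localhost:8080)"
[[ "${runner_status}" == "idle" ]] || exit 0

# 2. Ask the instance-deleter service whether the autoscaler wants to shrink
#    the group. This can block for a while as it waits for the MIG to stabilize.
url="$(get_attribute instance-self-deleter-url)"
token="$(get_metadata "instance/service-accounts/default/identity?audience=${url}&format=full")"
should_scale_down="$(nice_curl -X GET --header "Authorization: Bearer ${token}" "${url}")"
[[ "${should_scale_down}" == "true" ]] || exit 0

# 3. Re-check idleness (the previous call may have taken a while), then stop
#    the runner service; its ExecStopPost then deletes the instance.
[[ "$(nice_curl localhost:8080)" == "idle" ]] || exit 0
systemctl stop gh-runner
```

A `stop-idle-runner.timer` unit runs this check 30 minutes after startup and
then every 15 minutes after each completed check.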

There are a few refactors in here. Commits should mostly be reviewable
individually (I think some earlier commits have syntax errors, so perhaps
overlook those until the end). Of note, this adds cloud logging for the
systemd services on these VMs, which is helpful when debugging.

Tested:
Deployed to the test environment, including a test instance-deleter
service. Deliberately over-scaled the instance group and confirmed that it
scaled back down within ~30 minutes.


![image](https://user-images.githubusercontent.com/5732088/210158149-b546c2df-02ae-4c5f-8d91-0d3a29d4e790.png)

Ran jobs against the test environment and confirmed no issues:
https://github.com/iree-org/iree/actions/runs/3810920085/jobs/6488260971
and that the group scaled up and then back down again:


![image](https://user-images.githubusercontent.com/5732088/210158162-87fc59f0-46de-4f73-b278-ba26244a8666.png)

Note that when the presubmit run had finished, the group was still at
seven instances, so it was this new functionality that caused the
scale-down.
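
As a side note, `create_templates.sh` now sends its informational output to
stderr and prints only the new template version to stdout, and there's a
`TESTING_SELF_DELETER` toggle, so a test deployment can capture the version
directly. Roughly (a sketch; the env vars and path are from the diff, the
capture pattern is just one assumed way to use it):

```bash
# Build testing templates that point at the testing instance-self-deleter
# service. Informational output now goes to stderr, so stdout is just the
# new template version and can be captured directly.
VERSION="$(TEMPLATE_TESTING=1 TESTING_SELF_DELETER=1 \
  ./build_tools/github_actions/runner/gcp/create_templates.sh)"
echo "Created template version: ${VERSION}"
```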
diff --git a/build_tools/github_actions/runner/config/gh-runner.service b/build_tools/github_actions/runner/config/gh-runner.service
deleted file mode 100644
index dee56fe..0000000
--- a/build_tools/github_actions/runner/config/gh-runner.service
+++ /dev/null
@@ -1,16 +0,0 @@
-[Unit]
-Description=GitHub Actions Runner
-After=network.target
-
-[Service]
-User=runner
-Group=runner
-ExecStart=/runner-root/config/start.sh
-Restart=no
-KillMode=process
-KillSignal=SIGTERM
-TimeoutStopSec=5min
-ExecStopPost=/runner-root/config/delete_self.sh
-
-[Install]
-WantedBy=multi-user.target
diff --git a/build_tools/github_actions/runner/config/post_job.sh b/build_tools/github_actions/runner/config/google-cloud-ops-agent/config.yaml
old mode 100755
new mode 100644
similarity index 63%
copy from build_tools/github_actions/runner/config/post_job.sh
copy to build_tools/github_actions/runner/config/google-cloud-ops-agent/config.yaml
index 83c8e52..0e9eee5
--- a/build_tools/github_actions/runner/config/post_job.sh
+++ b/build_tools/github_actions/runner/config/google-cloud-ops-agent/config.yaml
@@ -1,13 +1,10 @@
-#!/bin/bash
-
 # Copyright 2022 The IREE Authors
 #
 # Licensed under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-set -euo pipefail
-
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-
-"${SCRIPT_DIR}/cleanup_workdir.sh"
+logging:
+  receivers:
+    systemd:
+      type: systemd_journald
diff --git a/build_tools/github_actions/runner/config/health_server/health_server.py b/build_tools/github_actions/runner/config/health_server/health_server.py
index 5ff85e9..d62df74 100755
--- a/build_tools/github_actions/runner/config/health_server/health_server.py
+++ b/build_tools/github_actions/runner/config/health_server/health_server.py
@@ -18,21 +18,29 @@
 """
 
 import argparse
+import glob
 import http.server
 import subprocess
 from http.client import INTERNAL_SERVER_ERROR, NOT_FOUND, OK
+from typing import Optional
 
 RUNNER_SERVICE_NAME = "gh-runner"
 CHECK_SERVICE_CMD = ["systemctl", "is-active", RUNNER_SERVICE_NAME]
 CHECK_SERVICE_TIMEOUT = 10
+RUNNER_WORK_LOG_PATTERN = "/runner-root/actions-runner/_diag/Worker_*"
 
 
 class HealthCheckHandler(http.server.BaseHTTPRequestHandler):
 
-  def send_success(self):
+  def send_success(self,
+                   *,
+                   msg: Optional[str] = None,
+                   body: Optional[str] = None):
     self.send_response(OK)
     self.send_header("Content-type", "text/html")
     self.end_headers()
+    if body is not None:
+      self.wfile.write(bytes(body, encoding="utf-8"))
 
   def do_GET(self):
     try:
@@ -49,7 +57,14 @@
           NOT_FOUND, f"Runner service not found: '{' '.join(e.cmd)}' returned"
           f" '{e.stdout.strip()}' (exit code {e.returncode})")
 
-    return self.send_success()
+    # The runner writes a log file for each job it runs. In our case it only
+    # runs one, so we glob for anything matching that pattern. Yes that is an
+    # absolutely ludicrous way to get the runner's status. GitHub should really
+    # implement a proper health check so we don't have to hack around like this.
+    if glob.glob(RUNNER_WORK_LOG_PATTERN):
+      return self.send_success(body="active")
+
+    return self.send_success(body="idle")
 
 
 def main(args: argparse.Namespace):
diff --git a/build_tools/github_actions/runner/config/chown_workdir.sh b/build_tools/github_actions/runner/config/hooks/chown_workdir.sh
similarity index 100%
rename from build_tools/github_actions/runner/config/chown_workdir.sh
rename to build_tools/github_actions/runner/config/hooks/chown_workdir.sh
diff --git a/build_tools/github_actions/runner/config/cleanup_workdir.sh b/build_tools/github_actions/runner/config/hooks/cleanup_workdir.sh
similarity index 100%
rename from build_tools/github_actions/runner/config/cleanup_workdir.sh
rename to build_tools/github_actions/runner/config/hooks/cleanup_workdir.sh
diff --git a/build_tools/github_actions/runner/config/post_job.sh b/build_tools/github_actions/runner/config/hooks/post_job.sh
similarity index 73%
rename from build_tools/github_actions/runner/config/post_job.sh
rename to build_tools/github_actions/runner/config/hooks/post_job.sh
index 83c8e52..3bc8d80 100755
--- a/build_tools/github_actions/runner/config/post_job.sh
+++ b/build_tools/github_actions/runner/config/hooks/post_job.sh
@@ -8,6 +8,4 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-
-"${SCRIPT_DIR}/cleanup_workdir.sh"
+/runner-root/config/hooks/cleanup_workdir.sh
diff --git a/build_tools/github_actions/runner/config/pre_job.sh b/build_tools/github_actions/runner/config/hooks/pre_job.sh
similarity index 63%
rename from build_tools/github_actions/runner/config/pre_job.sh
rename to build_tools/github_actions/runner/config/hooks/pre_job.sh
index 96b1965..dbc2bb4 100755
--- a/build_tools/github_actions/runner/config/pre_job.sh
+++ b/build_tools/github_actions/runner/config/hooks/pre_job.sh
@@ -8,11 +8,9 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-
-source "${SCRIPT_DIR}/functions.sh"
+source /runner-root/config/functions.sh
 
 RUNNER_GROUP="$(get_attribute github-runner-group)"
 
-"${SCRIPT_DIR}/validate_trigger.${RUNNER_GROUP}.sh"
-"${SCRIPT_DIR}/chown_workdir.sh"
+"/runner-root/config/hooks/validate_trigger.${RUNNER_GROUP}.sh"
+/runner-root/config/hooks/chown_workdir.sh
diff --git a/build_tools/github_actions/runner/config/validate_trigger.postsubmit.sh b/build_tools/github_actions/runner/config/hooks/validate_trigger.postsubmit.sh
similarity index 88%
rename from build_tools/github_actions/runner/config/validate_trigger.postsubmit.sh
rename to build_tools/github_actions/runner/config/hooks/validate_trigger.postsubmit.sh
index 7554b3d..b027201 100755
--- a/build_tools/github_actions/runner/config/validate_trigger.postsubmit.sh
+++ b/build_tools/github_actions/runner/config/hooks/validate_trigger.postsubmit.sh
@@ -8,9 +8,7 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-
-source "${SCRIPT_DIR}/functions.sh"
+source /runner-root/config/functions.sh
 
 ALLOWED_EVENTS=(
   "push"
diff --git a/build_tools/github_actions/runner/config/validate_trigger.presubmit.sh b/build_tools/github_actions/runner/config/hooks/validate_trigger.presubmit.sh
similarity index 100%
rename from build_tools/github_actions/runner/config/validate_trigger.presubmit.sh
rename to build_tools/github_actions/runner/config/hooks/validate_trigger.presubmit.sh
diff --git a/build_tools/github_actions/runner/config/validate_trigger.releaser.sh b/build_tools/github_actions/runner/config/hooks/validate_trigger.releaser.sh
similarity index 88%
rename from build_tools/github_actions/runner/config/validate_trigger.releaser.sh
rename to build_tools/github_actions/runner/config/hooks/validate_trigger.releaser.sh
index ffda821..c9a0c45 100755
--- a/build_tools/github_actions/runner/config/validate_trigger.releaser.sh
+++ b/build_tools/github_actions/runner/config/hooks/validate_trigger.releaser.sh
@@ -8,9 +8,7 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-
-source "${SCRIPT_DIR}/functions.sh"
+source /runner-root/config/functions.sh
 
 ALLOWED_EVENTS=(
   "workflow_dispatch"
diff --git a/build_tools/github_actions/runner/config/register.sh b/build_tools/github_actions/runner/config/register.sh
index 6eebedb..da5c18b 100755
--- a/build_tools/github_actions/runner/config/register.sh
+++ b/build_tools/github_actions/runner/config/register.sh
@@ -12,8 +12,7 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-source "${SCRIPT_DIR}/functions.sh"
+source /runner-root/config/functions.sh
 
 # These use OS inventory management to fetch information about the VM operating
 # system (https://cloud.google.com/compute/docs/instances/os-inventory-management).
diff --git a/build_tools/github_actions/runner/config/runner.env b/build_tools/github_actions/runner/config/runner.env
index 6be913b..f345dd9 100644
--- a/build_tools/github_actions/runner/config/runner.env
+++ b/build_tools/github_actions/runner/config/runner.env
@@ -1,3 +1,3 @@
 LANG=C.UTF-8
-ACTIONS_RUNNER_HOOK_JOB_STARTED=/runner-root/config/pre_job.sh
-ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/runner-root/config/post_job.sh
+ACTIONS_RUNNER_HOOK_JOB_STARTED=/runner-root/config/hooks/pre_job.sh
+ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/runner-root/config/hooks/post_job.sh
diff --git a/build_tools/github_actions/runner/config/setup.sh b/build_tools/github_actions/runner/config/setup.sh
index ed78aad..2a9929f 100755
--- a/build_tools/github_actions/runner/config/setup.sh
+++ b/build_tools/github_actions/runner/config/setup.sh
@@ -35,6 +35,14 @@
 cp -r "${SCRIPT_DIR}" /runner-root/config
 chown -R runner:runner /runner-root/
 
+echo "Installing ops agent and turning on systemd logging"
+# TODO(gcmn): This should probably be baked into the image.
+curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
+chmod +x add-google-cloud-ops-agent-repo.sh
+./add-google-cloud-ops-agent-repo.sh --also-install
+cp /runner-root/config/google-cloud-ops-agent/config.yaml /etc/google-cloud-ops-agent/config.yaml
+service google-cloud-ops-agent restart
+
 echo "Fetching the runner archive"
 RUNNER_VERSION="$(get_attribute github-runner-version)"
 RUNNER_ARCHIVE="actions-runner-linux-x64-${RUNNER_VERSION}.tar.gz"
@@ -54,23 +62,13 @@
 echo "Registering the runner."
 runuser --user runner /runner-root/config/register.sh
 
-echo "Setting up the deregister service."
-cp /runner-root/config/gh-runner-deregister.service /etc/systemd/system/
-
-echo "Setting up the runner service."
-cp /runner-root/config/gh-runner.service /etc/systemd/system/
-
-echo "Setting up the health check service."
-cp /runner-root/config/health-check.service /etc/systemd/system/
-
-echo "Reloading system service files to reflect changes."
+echo "Loading systemd services"
+cp /runner-root/config/systemd/system/* /etc/systemd/system/
 systemctl daemon-reload
 
-echo "Enabling the deregister service."
-systemctl enable gh-runner-deregister
+echo "Enabling systemd services."
+find /runner-root/config/systemd/system/ -type f -printf "%f\n" \
+  | xargs systemctl enable
 
-echo "Starting the runner service."
-systemctl start gh-runner
-
-echo "Starting the health check service"
-systemctl start health-check
+echo "Starting the runner services"
+systemctl start runner-setup.target
diff --git a/build_tools/github_actions/runner/config/delete_self.sh b/build_tools/github_actions/runner/config/systemd/scripts/delete_self.sh
similarity index 89%
rename from build_tools/github_actions/runner/config/delete_self.sh
rename to build_tools/github_actions/runner/config/systemd/scripts/delete_self.sh
index c688593..7959652 100755
--- a/build_tools/github_actions/runner/config/delete_self.sh
+++ b/build_tools/github_actions/runner/config/systemd/scripts/delete_self.sh
@@ -10,9 +10,7 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-source "${SCRIPT_DIR}/functions.sh"
-
+source /runner-root/config/functions.sh
 
 # If the nice way fails, hard shutdown
 function shutdown_now() {
diff --git a/build_tools/github_actions/runner/config/deregister.sh b/build_tools/github_actions/runner/config/systemd/scripts/deregister.sh
similarity index 87%
rename from build_tools/github_actions/runner/config/deregister.sh
rename to build_tools/github_actions/runner/config/systemd/scripts/deregister.sh
index 3ac442b..0bd16fd 100755
--- a/build_tools/github_actions/runner/config/deregister.sh
+++ b/build_tools/github_actions/runner/config/systemd/scripts/deregister.sh
@@ -10,8 +10,7 @@
 
 set -euo pipefail
 
-SCRIPT_DIR="$(dirname -- "$( readlink -f -- "$0"; )")";
-source "${SCRIPT_DIR}/functions.sh"
+source /runner-root/config/functions.sh
 
 RUNNER_SCOPE="$(get_attribute github-runner-scope)"
 DEREGISTER_TOKEN="$(get_runner_token remove ${RUNNER_SCOPE})"
diff --git a/build_tools/github_actions/runner/config/start.sh b/build_tools/github_actions/runner/config/systemd/scripts/start_runner.sh
similarity index 100%
rename from build_tools/github_actions/runner/config/start.sh
rename to build_tools/github_actions/runner/config/systemd/scripts/start_runner.sh
diff --git a/build_tools/github_actions/runner/config/systemd/scripts/stop_idle_runner.sh b/build_tools/github_actions/runner/config/systemd/scripts/stop_idle_runner.sh
new file mode 100755
index 0000000..fe2a911
--- /dev/null
+++ b/build_tools/github_actions/runner/config/systemd/scripts/stop_idle_runner.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2022 The IREE Authors
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Calls the instance deletion proxy to check whether the autoscaler recommends
+# scaling down and, if the runner is also idle, stops the runner.
+
+set -euo pipefail
+
+source /runner-root/config/functions.sh
+
+function get_runner_status() {
+  nice_curl localhost:8080; ret=$?
+  if (( ret!=0 )); then
+    echo "failed"
+  fi
+}
+
+function get_should_scale_down() {
+  local self_deletion_service_url="$(get_attribute instance-self-deleter-url)"
+  local id_token=$(get_metadata "instance/service-accounts/default/identity?audience=${self_deletion_service_url}&format=full")
+
+  nice_curl -X GET --header "Authorization: Bearer ${id_token}" "${self_deletion_service_url}"
+}
+
+function maybe_stop_runner() {
+  echo "Checking runner status"
+  local runner_status="$(get_runner_status)"
+  echo "runner_status='${runner_status}'"
+  if [[ "${runner_status}" != "idle" ]]; then
+    echo "Exiting"
+    return 0
+  fi
+  echo "Proceeding"
+
+  echo "Checking MIG autoscaling status. This could take a while as it waits" \
+       "for the MIG to stabilize."
+  local should_scale_down="$(get_should_scale_down)"
+  echo "should_scale_down='${should_scale_down}'"
+  if [[ "${should_scale_down}" != "true" ]]; then
+    echo "Exiting"
+    return 0
+  fi
+  echo "Proceeding"
+
+  # Double check that the runner is still idle. The above call can take a while
+  # as it waits random intervals for the MIG to become stable. We definitely
+  # don't want to stop a runner that's in the middle of a job. We did the runner
+  # status check first because it's much faster and an easy chance to
+  # bail out.
+  echo "Rechecking runner status"
+  local runner_status="$(get_runner_status)"
+  echo "runner_status='${runner_status}'"
+  if [[ "${runner_status}" != "idle" ]]; then
+    echo "Exiting"
+    return 0
+  fi
+  echo "Stopping runner"
+  systemctl stop gh-runner
+}
+
+maybe_stop_runner
diff --git a/build_tools/github_actions/runner/config/gh-runner-deregister.service b/build_tools/github_actions/runner/config/systemd/system/gh-runner-deregister.service
similarity index 80%
rename from build_tools/github_actions/runner/config/gh-runner-deregister.service
rename to build_tools/github_actions/runner/config/systemd/system/gh-runner-deregister.service
index 70e0277..f502532 100644
--- a/build_tools/github_actions/runner/config/gh-runner-deregister.service
+++ b/build_tools/github_actions/runner/config/systemd/system/gh-runner-deregister.service
@@ -7,7 +7,7 @@
 User=runner
 Group=runner
 Type=oneshot
-ExecStart=/runner-root/config/deregister.sh
+ExecStart=/runner-root/config/systemd/scripts/deregister.sh
 RemainAfterExit=yes
 
 [Install]
diff --git a/build_tools/github_actions/runner/config/systemd/system/gh-runner.service b/build_tools/github_actions/runner/config/systemd/system/gh-runner.service
new file mode 100644
index 0000000..b52ee60
--- /dev/null
+++ b/build_tools/github_actions/runner/config/systemd/system/gh-runner.service
@@ -0,0 +1,16 @@
+[Unit]
+Description=GitHub Actions Runner
+After=network.target
+
+[Service]
+User=runner
+Group=runner
+ExecStart=/runner-root/config/systemd/scripts/start_runner.sh
+Restart=no
+KillMode=process
+KillSignal=SIGTERM
+TimeoutStopSec=5min
+ExecStopPost=/runner-root/config/systemd/scripts/delete_self.sh
+
+[Install]
+WantedBy=runner-setup.target
diff --git a/build_tools/github_actions/runner/config/health-check.service b/build_tools/github_actions/runner/config/systemd/system/health-check.service
similarity index 75%
rename from build_tools/github_actions/runner/config/health-check.service
rename to build_tools/github_actions/runner/config/systemd/system/health-check.service
index f83c718..eabe9f9 100644
--- a/build_tools/github_actions/runner/config/health-check.service
+++ b/build_tools/github_actions/runner/config/systemd/system/health-check.service
@@ -1,6 +1,6 @@
 [Unit]
 Description=Health Check Server
-After=gh-runner.target
+After=gh-runner.target network.target
 
 [Service]
 User=root
@@ -12,4 +12,4 @@
 TimeoutStopSec=5min
 
 [Install]
-WantedBy=multi-user.target
+WantedBy=runner-setup.target
diff --git a/build_tools/github_actions/runner/config/systemd/system/runner-setup.target b/build_tools/github_actions/runner/config/systemd/system/runner-setup.target
new file mode 100644
index 0000000..8c6a77f
--- /dev/null
+++ b/build_tools/github_actions/runner/config/systemd/system/runner-setup.target
@@ -0,0 +1,5 @@
+[Unit]
+Description=Coordinates startup of all services related to the GitHub runner.
+
+[Install]
+WantedBy=multi-user.target
diff --git a/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.service b/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.service
new file mode 100644
index 0000000..65d9106
--- /dev/null
+++ b/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Stop the gh-runner service if it is idle and the autoscaler recommends scaling down
+After=gh-runner.target network.target
+RefuseManualStart=true
+
+[Service]
+User=root
+Group=root
+Type=oneshot
+ExecStart=/runner-root/config/systemd/scripts/stop_idle_runner.sh
+KillMode=process
+KillSignal=SIGTERM
+TimeoutStopSec=5min
+
+# The [Install] section for this unit mostly just exists so that we can call
+# enable on it without getting an error.
+[Install]
+Also=stop-idle-runner.timer
diff --git a/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.timer b/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.timer
new file mode 100644
index 0000000..87579fd
--- /dev/null
+++ b/build_tools/github_actions/runner/config/systemd/system/stop-idle-runner.timer
@@ -0,0 +1,13 @@
+[Unit]
+Description=Checks whether to stop the runner service at intervals
+
+[Timer]
+# Trigger 30 minutes after activation of the timer (via `systemctl start` on it
+# or a service that `Wants` it) and every 15 minutes after the corresponding
+# service completes. Both timings are subject to systemd AccuracySec (default
+# +/-1 minute).
+OnActiveSec=30m
+OnUnitInactiveSec=15m
+
+[Install]
+WantedBy=timers.target runner-setup.target
diff --git a/build_tools/github_actions/runner/gcp/create_templates.sh b/build_tools/github_actions/runner/gcp/create_templates.sh
index ec5ddf5..154d5f7 100755
--- a/build_tools/github_actions/runner/gcp/create_templates.sh
+++ b/build_tools/github_actions/runner/gcp/create_templates.sh
@@ -15,6 +15,7 @@
 
 TESTING="${TEMPLATE_TESTING:-0}"
 DRY_RUN="${DRY_RUN:-0}"
+TESTING_SELF_DELETER="${TESTING_SELF_DELETER:-0}"
 
 GPU_IMAGE="github-runner-gpu-2022-09-29-1664451806"
 GPU_DISK_SIZE_GB=100
@@ -29,15 +30,15 @@
 TEMPLATE_BASE_NAME="${TEMPLATE_BASE_NAME:-${PROD_TEMPLATE_BASE_NAME}}"
 
 if (( TESTING==0 )) && ! git merge-base --is-ancestor "${TEMPLATE_CONFIG_REF}" main; then
-  echo "Creating testing template because TEMPLATE_CONFIG_REF='${TEMPLATE_CONFIG_REF}' is not on the main branch"
+  echo "Creating testing template because TEMPLATE_CONFIG_REF='${TEMPLATE_CONFIG_REF}' is not on the main branch" >&2
   TESTING=1
 fi
 if (( TESTING==0 )) && [[ "${TEMPLATE_CONFIG_REPO}" != "${PROD_TEMPLATE_CONFIG_REPO}" ]]; then
-  echo "Creating testing template because TEMPLATE_CONFIG_REPO '${TEMPLATE_CONFIG_REPO}'!='${PROD_TEMPLATE_CONFIG_REPO}'"
+  echo "Creating testing template because TEMPLATE_CONFIG_REPO '${TEMPLATE_CONFIG_REPO}'!='${PROD_TEMPLATE_CONFIG_REPO}'" >&2
   TESTING=1
 fi
 if (( TESTING==0 )) && [[ "${TEMPLATE_BASE_NAME}" != "${PROD_TEMPLATE_BASE_NAME}" ]]; then
-  echo "Creating testing template because TEMPLATE_BASE_NAME '${TEMPLATE_BASE_NAME}'!='${PROD_TEMPLATE_BASE_NAME}'"
+  echo "Creating testing template because TEMPLATE_BASE_NAME '${TEMPLATE_BASE_NAME}'!='${PROD_TEMPLATE_BASE_NAME}'" >&2
   TESTING=1
 fi
 
@@ -59,7 +60,12 @@
 GITHUB_RUNNER_VERSION="2.300.2"
 GITHUB_RUNNER_ARCHIVE_DIGEST="147c14700c6cb997421b9a239c012197f11ea9854cd901ee88ead6fe73a72c74"
 GITHUB_TOKEN_PROXY_URL="https://ght-proxy-zbhz5clunq-ue.a.run.app"
-INSTANCE_SELF_DELETER_URL="https://instance-self-deleter-zbhz5clunq-uc.a.run.app"
+
+if (( TESTING_SELF_DELETER==1 )); then
+  INSTANCE_SELF_DELETER_URL="https://instance-self-deleter-testing-zbhz5clunq-uc.a.run.app"
+else
+  INSTANCE_SELF_DELETER_URL="https://instance-self-deleter-zbhz5clunq-uc.a.run.app"
+fi
 
 declare -a METADATA=(
   "github-runner-version=${GITHUB_RUNNER_VERSION}"
@@ -118,6 +124,10 @@
 
   local -a cmd=(
     gcloud compute instance-templates create
+    --quiet
+  )
+
+  cmd+=(
     "${TEMPLATE_BASE_NAME}-${group}-${type}-${VERSION}"
     "${common_args[@]}"
     --service-account="github-runner-${trust}-trust@iree-oss.iam.gserviceaccount.com"
@@ -147,8 +157,9 @@
     # Prefix the command with a noop. It will still be printed by set -x
     cmd=(":" "${cmd[@]}")
   fi
-  (set -x; "${cmd[@]}")
-  echo ''
+
+  (set -x; "${cmd[@]}") >&2
+  echo '' >&2
 }
 
 for group in presubmit postsubmit; do
@@ -156,4 +167,6 @@
     create_template "${group}" "${type}"
   done
 done
-echo "Created new templates for version: ${VERSION}"
+
+echo "Created new templates for version: ${VERSION}" >&2
+echo "${VERSION}"
diff --git a/build_tools/github_actions/runner/gcp/update_autoscaling.sh b/build_tools/github_actions/runner/gcp/update_autoscaling.sh
index aaa2065..05522c3 100755
--- a/build_tools/github_actions/runner/gcp/update_autoscaling.sh
+++ b/build_tools/github_actions/runner/gcp/update_autoscaling.sh
@@ -21,7 +21,7 @@
     --min-num-replicas="${min_size}"
     --max-num-replicas="${max_size}"
     --mode=only-scale-out
-    --target-cpu-utilization=0.6
+    --target-cpu-utilization=0.2
   )
 
   (set -x; gcloud beta compute instance-groups managed set-autoscaling "${autoscaling_args[@]}")
diff --git a/build_tools/github_actions/runner/gcp/update_instance_groups.py b/build_tools/github_actions/runner/gcp/update_instance_groups.py
index ee7372d..534eea7 100755
--- a/build_tools/github_actions/runner/gcp/update_instance_groups.py
+++ b/build_tools/github_actions/runner/gcp/update_instance_groups.py
@@ -20,6 +20,9 @@
 
 CANARY_SIZE = compute.FixedOrPercent(fixed=1)
 
+TESTING_ENV_NAME = "testing"
+PROD_ENV_NAME = "prod"
+
 
 def resource_basename(resource):
   return os.path.basename(urllib.parse.urlparse(resource).path)
@@ -97,7 +100,7 @@
   )
 
   # Prod instances just have the bare name
-  modifier = None if args.env == "prod" else args.env
+  modifier = None if args.env == PROD_ENV_NAME else args.env
   migs = updater.get_migs(region=args.region,
                           type=args.type,
                           group=args.group,
@@ -123,7 +126,7 @@
   for mig in migs:
     region = resource_basename(mig.region)
     if args.command in [DIRECT_UPDATE_COMMAND_NAME, CANARY_COMMAND_NAME]:
-      if "testing" in args.version and args.env != "testing":
+      if "testing" in args.version and args.env != TESTING_ENV_NAME:
         scary_action = (f"using testing template version '{args.version}' in"
                         f" environment '{args.env}'")
         check_scary_action(scary_action, args.skip_confirmation)
@@ -280,18 +283,31 @@
       ))
   subparser_base.add_argument("--env",
                               "--environment",
-                              default="testing",
+                              default=TESTING_ENV_NAME,
                               help="The environment for the MIGs.",
-                              choices=["prod", "testing"])
+                              choices=[PROD_ENV_NAME, TESTING_ENV_NAME])
   subparser_base.add_argument(
       "--dry-run",
       action="store_true",
       default=False,
       help="Print all output but don't actually send the update request.")
-  subparser_base.add_argument("--skip-confirmation",
-                              "--force",
-                              action="store_true",
-                              help="Skip all confirmation prompts. Be careful.")
+
+  # Defaulting to true for testing environment avoids people getting in the
+  # habit of routinely passing --force.
+  skip_confirmation = subparser_base.add_mutually_exclusive_group()
+  skip_confirmation.add_argument(
+      "--skip-confirmation",
+      "--force",
+      action="store_true",
+      default=None,
+      help=("Skip all confirmation prompts. Be careful."
+            " Defaults to True for testing environment"))
+  skip_confirmation.add_argument("--noskip-confirmation",
+                                 "--noforce",
+                                 action="store_false",
+                                 default=None,
+                                 dest="skip_confirmation")
+
   # These shouldn't be set very often, but it's just as easy to make them flags
   # as it is to make them global constants.
   subparser_base.add_argument("--name-prefix",
@@ -337,6 +353,9 @@
 
   args = parser.parse_args()
 
+  if args.skip_confirmation is None:
+    args.skip_confirmation = args.env == TESTING_ENV_NAME
+
   if args.mode is None:
     if args.action == "refresh":
       args.mode = "proactive"
diff --git a/build_tools/github_actions/runner/instance_deleter/main.py b/build_tools/github_actions/runner/instance_deleter/main.py
index f069628..049d581 100644
--- a/build_tools/github_actions/runner/instance_deleter/main.py
+++ b/build_tools/github_actions/runner/instance_deleter/main.py
@@ -60,9 +60,11 @@
 """
 
 import os
+import random
 import re
-from http.client import (BAD_REQUEST, FORBIDDEN, INTERNAL_SERVER_ERROR,
-                         NOT_FOUND, UNAUTHORIZED)
+import time
+from http.client import (BAD_REQUEST, FORBIDDEN, GATEWAY_TIMEOUT,
+                         INTERNAL_SERVER_ERROR, NOT_FOUND, UNAUTHORIZED)
 
 import flask
 import functions_framework
@@ -74,11 +76,14 @@
 from google.oauth2 import id_token
 
 AUTH_HEADER_PREFIX = "Bearer "
+ALLOWED_HTTP_METHODS = ["DELETE", "GET"]
 MIG_METADATA_KEY = "created-by"
 ALLOWED_MIG_PATTERN_ENV_VARIABLE = "ALLOWED_MIG_PATTERN"
+STABILIZE_TIMEOUT_SECONDS = 100
 
 instances_client = compute.InstancesClient()
 migs_client = compute.RegionInstanceGroupManagersClient()
+autoscalers_client = compute.RegionAutoscalersClient()
 session = requests.Session()
 
 print("Server started")
@@ -110,27 +115,114 @@
   return next((item.value for item in items if item.key == key), None)
 
 
+def delete_instance_from_mig(mig_name: str, project: str, region: str,
+                             instance: compute.Instance):
+  try:
+    operation = migs_client.delete_instances(
+        instance_group_manager=mig_name,
+        project=project,
+        region=region,
+        # For some reason we can't just use a list of instance names and need to
+        # build this RhymingRythmicJavaClasses proto. Also, unlike all the other
+        # parameters, the instance has to be a fully-specified URL for the
+        # instance, not just its name.
+        region_instance_group_managers_delete_instances_request_resource=(
+            compute.RegionInstanceGroupManagersDeleteInstancesRequest(
+                instances=[instance.self_link])))
+  except (google.api_core.exceptions.Forbidden,
+          google.api_core.exceptions.Unauthorized,
+          google.api_core.exceptions.NotFound) as e:
+    print(e)
+    return flask.abort(
+        e.code, f"Error requesting that {mig_name} delete {instance.name}.")
+  except Exception as e:
+    # We'll call any other error here a server error.
+    print(e)
+    return flask.abort(
+        INTERNAL_SERVER_ERROR,
+        f"Error requesting that {mig_name} delete {instance.name}.")
+
+  try:
+    # This is actually an extended operation that you have to poll to get its
+    # status, but we just check the status once because it appears that errors
+    # always show up here and all we want is to return success in marking for
+    # deletion. We don't need to wait for the deletion to actually take place.
+    operation.result()
+  except google.api_core.exceptions.ClientError as e:
+    print(e)
+    # Unpack the actual usable error message
+    msg = (
+        f"Error requesting that {mig_name} delete {instance.name}:"
+        "\n" + "\n".join(
+            [f"{err.code}: {err.message}" for err in e.response.error.errors]))
+    print(msg)
+    # We're not actually totally sure whether this is a client or server error
+    # for the overall request, but let's call it a client error (the only client
+    # here is our VM instances, so I think we can be a bit loose).
+    return flask.abort(BAD_REQUEST, msg)
+
+  success_msg = f"{instance.name} has been marked for deletion by {mig_name}."
+  print(success_msg)
+  return success_msg
+
+
+def should_scale_down(mig_name: str, project: str, region: str):
+  start = time.time()
+  print(f"Polling {mig_name} for stability")
+  while time.time() - start < STABILIZE_TIMEOUT_SECONDS:
+    try:
+      mig = migs_client.get(project=project,
+                            region=region,
+                            instance_group_manager=mig_name)
+    except google.api_core.exceptions.NotFound as e:
+      print(e)
+      return flask.abort(
+          e.code,
+          f"Cannot find {mig_name} in region={region}, project={project}")
+    if mig.status.is_stable:
+      break
+    # We sleep for a random amount of time here to avoid synchronizing callers
+    # waiting for the MIG to be stable.
+    sleep_secs = random.randint(1, 15)
+    print(f"{mig_name} is not stable. Retrying in {sleep_secs} seconds")
+    time.sleep(sleep_secs)
+  else:
+    return flask.abort(GATEWAY_TIMEOUT,
+                       "Timed out waiting for the MIG to become stable")
+  autoscaler = autoscalers_client.get(project=project,
+                                      region=region,
+                                      autoscaler=_get_name_from_resource(
+                                          mig.status.autoscaler))
+  response = "true" if autoscaler.recommended_size < mig.target_size else "false"
+  print(
+      f"Autoscaler recommends size {autoscaler.recommended_size} and"
+      f" {mig_name} is targetting size {mig.target_size}. Sending: {response}")
+  return response
+
+
 @functions_framework.http
-def delete_self(request):
-  """HTTP Cloud Function.
+def delete_self(request: flask.Request):
+  """HTTP Cloud Function to delete the instance group making the request.
     Args:
-        request (flask.Request): The request object.
-        <https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data>
+        request: The request object.
+        https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data
     Returns:
         The response text, or any set of values that can be turned into a
         Response object using `make_response`
-        <https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response>.
+        https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response.
     Note:
         For more information on how Flask integrates with Cloud
         Functions, see the `Writing HTTP functions` page.
-        <https://cloud.google.com/functions/docs/writing/http#http_frameworks>
+        https://cloud.google.com/functions/docs/writing/http#http_frameworks
   """
-  if request.method != "DELETE":
+  if request.method not in ALLOWED_HTTP_METHODS:
     return flask.abort(
-        BAD_REQUEST,
-        f"Invalid method {request.method}. Only DELETE is supported.")
+        BAD_REQUEST, f"Invalid method {request.method}."
+        f" Allowed methods: {ALLOWED_HTTP_METHODS}")
 
-  # No path is needed, since the token contains all the information we need.
+  # No path is needed, since the token and method contain all the information we
+  # need. Maybe that design was a mistake, but since the resource being operated
+  # on is always the instance making the call, it seemed handy.
   if request.path != "/":
     return flask.abort(
         BAD_REQUEST,
@@ -167,8 +259,14 @@
 
   project = compute_info["project_id"]
   zone = compute_info["zone"]
+  region = _get_region(zone)
   instance_name = compute_info["instance_name"]
-  print(f"Received request to delete {instance_name}")
+
+  if request.method == "DELETE":
+    print(f"Received request to delete {instance_name}")
+  else:
+    assert request.method == "GET"
+    print(f"Received inquiry whether to delete {instance_name}")
   try:
     instance = instances_client.get(instance=instance_name,
                                     project=project,
@@ -181,22 +279,21 @@
         f"Cannot view {instance_name} in zone={zone}, project={project}")
 
   instance_id = int(compute_info["instance_id"])
-  # Verify it's *actually* the same instance. Names get reused, but IDs
-  # don't. For some reason you can't reference instances by their ID in any
-  # of the APIs.
+  # Verify it's *actually* the same instance. Names get reused, but IDs don't.
+  # For some reason you can't reference anything by ID in the API.
   if instance.id != instance_id:
     return flask.abort(
         BAD_REQUEST,
         f"Existing instance of the same name {instance.name} has a different"
         f" ID {instance.id} than token specifies {instance_id}.")
 
-  mig = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)
+  mig_name = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)
 
-  if mig is None:
+  if mig_name is None:
     return flask.abort(BAD_REQUEST,
                        (f"Instance is not part of a managed instance group."
                         f" Did not find {MIG_METADATA_KEY} in metadata."))
-  mig = _get_name_from_resource(mig)
+  mig_name = _get_name_from_resource(mig_name)
 
   # General good practice would be to compile the regex once, but the only way
   # to do that is to make it a global, which makes this difficult to test and
@@ -204,52 +301,17 @@
   allowed_mig_pattern = os.environ.get(ALLOWED_MIG_PATTERN_ENV_VARIABLE)
   if allowed_mig_pattern is None:
     flask.abort(
-        INTERNAL_SERVER_ERROR,
-        f"Missing required environment variable {ALLOWED_MIG_PATTERN_ENV_VARIABLE}"
-    )
+        INTERNAL_SERVER_ERROR, f"Missing required environment variable"
+        f" {ALLOWED_MIG_PATTERN_ENV_VARIABLE}")
 
-  if not re.fullmatch(allowed_mig_pattern, mig):
-    return flask.abort(FORBIDDEN, f"No access to MIG {mig}")
+  if not re.fullmatch(allowed_mig_pattern, mig_name):
+    return flask.abort(FORBIDDEN, f"No access to MIG {mig_name}")
 
-  try:
-    operation = migs_client.delete_instances(
-        instance_group_manager=mig,
-        project=project,
-        region=_get_region(zone),
-        # For some reason we can't just use a list of instance names and need to
-        # build this RhymingRythmicJavaClasses proto. Also, unlike all the other
-        # parameters, the instance has to be a fully-specified URL for the
-        # instance, not just its name.
-        region_instance_group_managers_delete_instances_request_resource=(
-            compute.RegionInstanceGroupManagersDeleteInstancesRequest(
-                instances=[instance.self_link])))
-  except (google.api_core.exceptions.Forbidden,
-          google.api_core.exceptions.Unauthorized) as e:
-    print(e)
-    return flask.abort(e.code,
-                       f"Error requesting that {mig} delete {instance_name}.")
-  except Exception as e:
-    # We'll call any other error here a server error.
-    print(e)
-    return flask.abort(INTERNAL_SERVER_ERROR,
-                       f"Error requesting that {mig} delete {instance_name}.")
+  if request.method == "DELETE":
+    return delete_instance_from_mig(mig_name=mig_name,
+                                    project=project,
+                                    region=region,
+                                    instance=instance)
 
-  try:
-    # This is actually an extended operation that you have to poll to get its
-    # status, but we just check the status once because it appears that errors
-    # always show up here.
-    operation.result()
-  except google.api_core.exceptions.ClientError as e:
-    print(e)
-    # Unpack the actual usable error message
-    msg = f"Error requesting that {mig} delete {instance_name}:" "\n" + "\n".join(
-        [f"{err.code}: {err.message}" for err in e.response.error.errors])
-    print(msg)
-    # We're not actually totally sure whether this is a client or server error
-    # for the overall request, but let's call it a client error (the only client
-    # here is our VM instances, so I think we can be a bit loose).
-    return flask.abort(BAD_REQUEST, msg)
-
-  success_msg = f"{instance_name} has been marked for deletion by {mig}."
-  print(success_msg)
-  return success_msg
+  assert request.method == "GET"
+  return should_scale_down(mig_name=mig_name, project=project, region=region)
diff --git a/build_tools/github_actions/runner/instance_deleter/main_test.py b/build_tools/github_actions/runner/instance_deleter/main_test.py
index 3b3fbc5..43397d6 100644
--- a/build_tools/github_actions/runner/instance_deleter/main_test.py
+++ b/build_tools/github_actions/runner/instance_deleter/main_test.py
@@ -62,8 +62,16 @@
     os_environ_patcher = mock.patch.dict(
         "os.environ", {main.ALLOWED_MIG_PATTERN_ENV_VARIABLE: ".*"})
     self.environ = os_environ_patcher.start()
+    autoscalers_client_patcher = mock.patch("main.autoscalers_client",
+                                            autospec=True)
+    self.autoscalers_client = autoscalers_client_patcher.start()
+    time_patcher = mock.patch("time.time", autospec=True)
+    self.time = time_patcher.start()
+    self.time.return_value = 0
+    # Just noop sleep
+    mock.patch("time.sleep", autospec=True).start()
 
-  def test_happy_path(self):
+  def test_delete_happy_path(self):
     req = Request({}, populate_request=False, shallow=True)
     req.method = "DELETE"
 
@@ -92,10 +100,6 @@
         ]))
     self.instances_client.get.return_value = instance
 
-    ext_operation = mock.MagicMock(
-        google.api_core.extended_operation.ExtendedOperation)
-    ext_operation.result.return_value = None
-
     response = main.delete_self(req)
 
     self.assertIn(MIG_NAME, response)
@@ -109,6 +113,91 @@
         .RegionInstanceGroupManagersDeleteInstancesRequest(
             instances=[instance.self_link]))
 
+  def test_get_happy_path(self):
+    req = Request({}, populate_request=False, shallow=True)
+    req.method = "GET"
+
+    token = make_token({
+        "google": {
+            "compute_engine": {
+                "project_id": PROJECT,
+                "zone": f"{REGION}-a",
+                "instance_name": INSTANCE_NAME,
+                "instance_id": str(ID1),
+            }
+        }
+    })
+
+    req.headers = {"Authorization": f"Bearer {token}"}
+
+    self_link = f"{INSTANCE_LINK_PREFIX}{INSTANCE_NAME}"
+    instance = compute.Instance(
+        id=ID1,
+        name=INSTANCE_NAME,
+        zone=ZONE,
+        self_link=self_link,
+        metadata=compute.Metadata(items=[
+            compute.Items(key=main.MIG_METADATA_KEY,
+                          value=f"{MIG_PATH_PREFIX}{MIG_NAME}")
+        ]))
+    self.instances_client.get.return_value = instance
+
+    mig = compute.InstanceGroupManager(
+        target_size=5,
+        status={
+            "is_stable": True,
+            "autoscaler": "autoscaler_link/autoscaler_name"
+        })
+    self.migs_client.get.return_value = mig
+
+    autoscaler = compute.Autoscaler(recommended_size=3)
+    self.autoscalers_client.get.return_value = autoscaler
+
+    response = main.delete_self(req)
+
+    self.assertEqual(response, "true")
+
+  def test_get_timeout(self):
+    req = Request({}, populate_request=False, shallow=True)
+    req.method = "GET"
+
+    token = make_token({
+        "google": {
+            "compute_engine": {
+                "project_id": PROJECT,
+                "zone": f"{REGION}-a",
+                "instance_name": INSTANCE_NAME,
+                "instance_id": str(ID1),
+            }
+        }
+    })
+
+    req.headers = {"Authorization": f"Bearer {token}"}
+
+    self_link = f"{INSTANCE_LINK_PREFIX}{INSTANCE_NAME}"
+    instance = compute.Instance(
+        id=ID1,
+        name=INSTANCE_NAME,
+        zone=ZONE,
+        self_link=self_link,
+        metadata=compute.Metadata(items=[
+            compute.Items(key=main.MIG_METADATA_KEY,
+                          value=f"{MIG_PATH_PREFIX}{MIG_NAME}")
+        ]))
+    self.instances_client.get.return_value = instance
+
+    mig = compute.InstanceGroupManager(
+        target_size=5,
+        status={
+            "is_stable": False,
+            "autoscaler": "autoscaler_link/autoscaler_name"
+        })
+    self.migs_client.get.return_value = mig
+    self.time.side_effect = [0, main.STABILIZE_TIMEOUT_SECONDS + 1]
+
+    with self.assertRaises(werkzeug.exceptions.GatewayTimeout):
+      main.delete_self(req)
+
   def test_narrow_allowed_migs(self):
     req = Request({}, populate_request=False, shallow=True)
     req.method = "DELETE"
@@ -159,7 +248,7 @@
 
   def test_bad_method(self):
     req = Request({}, populate_request=False, shallow=True)
-    req.method = "GET"
+    req.method = "POST"
 
     with self.assertRaises(werkzeug.exceptions.BadRequest) as ctx:
       main.delete_self(req)