blob: 651f0a0790d8abad9b2be9fdc6a51b79a71c61b5 [file] [log] [blame]
#!/bin/bash
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# This is the series of commands run on the a VM from a fresh image in order to
# set up the disk to be used as a boot image. This script must be run as root.
set -o verbose # Print all command lines literally as they are read
set -o xtrace # Print all commands after they are expanded
set -o errexit # Exit if any command fails
set -o errtrace # make ERR trap inherit
set -o pipefail # return error if any part of a pipe errors
set -o nounset # error if an undefined variable is used
function save_exit_code() {
local exit_code="$?"
echo "${exit_code}" > /startup-exit.txt
trap - EXIT
exit "${exit_code}"
}
trap save_exit_code EXIT INT TERM
# Copied from build_tools/github_actions/runner/config/functions.sh
function nice_curl() {
curl --silent --fail --show-error --location "$@"
}
get_metadata() {
local url="http://metadata.google.internal/computeMetadata/v1/${1}"
ret=0
nice_curl --header "Metadata-Flavor: Google" "${url}" || ret=$?
if [[ "${ret}" != 0 ]]; then
echo "Failed fetching ${url}" >&2
return "${ret}"
fi
}
get_attribute() {
get_metadata "instance/attributes/${1}"
}
RUNNER_TYPE="$(get_attribute github-runner-type)"
GCLOUD_VERSION=402.0.0
GCLOUD_ARCHIVE_DIGEST=a9902b57d4cba2ebb76d7354570813d3d8199c36b95a1111a1b7fea013beaaf9
function apt_maybe_purge() {
# Remove and purge packages if they are installed and don't error if they're
# not or if they're not findable in the ppa.
local -a to_remove=()
for pkg in "$@"; do
ret=0
if dpkg --status $pkg &> /dev/null; then
to_remove+=("${pkg}")
fi
done
if (( "${#to_remove[@]}" != 0 )); then
apt-get remove --purge --autoremove "${to_remove[@]}"
fi
}
function startup() {
# Shut down in 5 hours. Makes sure this instance doesn't hang around forever
# if setup fails. Someone can cancel the shutdown with `shutdown -c`.
nohup shutdown -h +300 &
cd /
############################# Set Up Environment #############################
# We'll be installing google-cloud-sdk later
PATH="/google-cloud-sdk/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
echo "PATH=\"${PATH}\"" > /etc/environment
########################### Create the runner user ###########################
# GCE "helpfully" creates users for apparently any account that has ever
# logged in on any VM. Delete it if it's there.
userdel --force --remove runner || true
adduser --system --group "runner"
groupadd docker
usermod --append --groups docker runner
usermod --append --groups sudo runner
groups runner # Print out the groups of runner to verify this worked
groups runner | grep docker || (echo "Failed to add runner user to docker group" && exit 1)
groups runner | grep sudo || (echo "Failed to add runner user to sudo group" && exit 1)
echo "enabling passwordless sudo for runner user"
echo "runner ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/99-runner
# Confirm that worked
runuser --user runner -- sudo echo "runner user has passwordless sudo"
#################################### Apt #####################################
# Disable apt prompts
export DEBIAN_FRONTEND="noninteractive"
# Disable automatic updates and upgrades. These are ephemeral machines. We don't
# want the latency or inconsistency of automatic updatees.
systemctl stop apt-daily.timer
systemctl disable apt-daily.timer
systemctl disable apt-daily.service
systemctl stop apt-daily-upgrade.timer
systemctl disable apt-daily-upgrade.timer
systemctl disable apt-daily-upgrade.service
# Don't install documentation (except copyrights) since this is a CI system.
cat > /etc/dpkg/dpkg.cfg.d/99-github-actions <<EOF
force-all
no-pager
# don't install docs
path-exclude /usr/share/doc/*
path-exclude /usr/share/man/*
path-exclude /usr/share/groff/*
path-exclude /usr/share/info/*
# keep copyright files for legal reasons
path-include /usr/share/doc/*/copyright
EOF
# Provide default apt options like --assume-yes and --quiet since this is
# designed to run on CI.
cat > /etc/apt/apt.conf.d/99-github-actions <<EOF
APT {
Install-Recommends "false";
HideAutoRemove "true";
}
Aptitude {
CmdLine {
Assume-Yes "true";
}
}
Acquire {
Retries "5";
}
DPkg {
Use-Pty "0";
Options {
"--force-confdef";
"--force-confnew";
"--force-confold";
}
}
Quiet "2";
EOF
# We install these common deps. This is a subset of what's installed on the
# GitHub managed runners. All our heavy stuff is Dockerized, so basically just
# some utilities.
local apt_packages=(
apt-transport-https
aria2
ca-certificates
curl
git
gnupg2
lsb-release
software-properties-common
# Useful for working with JSON, which is used quite a bit in GitHub actions.
jq
# We need gcc, libc, make, etc for Cuda install
build-essential
)
# Install apt-fast for parallel apt package installation.
add-apt-repository -y ppa:apt-fast/stable
apt-get update
apt-get install apt-fast
apt-get upgrade
apt-get dist-upgrade
apt-get full-upgrade
apt-get install "${apt_packages[@]}"
######################## Fix gcloud Installation Snap ########################
# Snap literally won't let you disable automatic updates. The only thing
# that's installed through snap here is the gcloud CLI, which we definitely
# don't want automatically updating (beyond our general desire to not
# automatically update on ephemeral machines). So we just delete snap entirely
# and install the CLI from a versioned archive.
systemctl stop snapd
apt_maybe_purge snapd gnome-software-plugin-snap
rm -rf /home/*/snap
rm -rf /root/snap
local gcloud_checksum="e0382917353272655959bb650643c5df72c85de326a720b97e562bb6ea4478b1"
nice_curl \
https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-414.0.0-linux-x86_64.tar.gz \
--output gcloud.tar.gz
echo "${gcloud_checksum} *gcloud.tar.gz" | sha256sum --check --strict
tar -xf gcloud.tar.gz
rm gcloud.tar.gz
google-cloud-sdk/install.sh --quiet
# This setting is now enabled by default. It sounds great, but unfortunately
# doing such an upload requires *delete* permissions on the bucket, which we
# deliberately do not give runners. For the life of me, I could not figure out
# how to use `gcloud config set` (the "proper" way to set properties) to work
# on the global properties.
cat <<EOF >> /google-cloud-sdk/properties
[storage]
parallel_composite_upload_enabled = False
EOF
runuser --user runner -- gcloud info
########################### Install the ops agent ############################
nice_curl https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh \
| bash -s -- --also-install --remove-repo --version=2.24.0
cat <<EOF >> /etc/google-cloud-ops-agent/config.yaml
logging:
receivers:
systemd:
type: systemd_journald
EOF
service google-cloud-ops-agent restart
############################### Install Docker ###############################
# Remove Docker stuff that may already be installed by all its various names
apt_maybe_purge containerd docker docker-engine docker.io moby-engine moby-cli runc
# Install the latest Docker
local docker_gpg_file="/usr/share/keyrings/docker-archive-keyring.gpg"
local docker_apt_file="/etc/apt/sources.list.d/docker.list"
nice_curl \
https://download.docker.com/linux/ubuntu/gpg \
| gpg --dearmor -o "${docker_gpg_file}"
echo \
"deb [arch=amd64 signed-by=${docker_gpg_file}] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" \
> "${docker_apt_file}"
apt-get update
apt-get install docker-ce docker-ce-cli containerd.io
# Remove gpg keys and corresponding archives since these expire and we don't
# want later things relying on them.
rm "${docker_gpg_file}" "${docker_apt_file}"
apt-get update
# Enable docker.service.
sudo systemctl enable docker.service
sudo systemctl start docker.service
sudo systemctl enable containerd.service
sudo systemctl start containerd.service
# Docker daemon takes time to come up after installing.
for i in $(seq 1 30); do
if docker info; then
break
fi
done
# Make sure the runner user can use docker
runuser --user runner -- docker ps
#################################### GPU #####################################
if [[ "${RUNNER_TYPE^^}" == GPU ]]; then
local script_dir="$(mktemp --directory --tmpdir scripts.XXX)"
nice_curl \
--remote-name-all \
--output-dir "${script_dir}" \
https://raw.githubusercontent.com/openxla/iree/main/build_tools/scripts/check_vulkan.sh \
https://raw.githubusercontent.com/openxla/iree/main/build_tools/scripts/check_cuda.sh
chmod +x "${script_dir}/check_vulkan.sh" "${script_dir}/check_cuda.sh"
# Doing these all in one command fails, probably because there's a dependency
# between them and apt-fast makes it happen in parallel. Also, it turns out
# that the Vulkan ICD is in libnvidia-gl for some reason.
apt-get install nvidia-headless-530
apt-get install libnvidia-gl-530 nvidia-utils-530 vulkan-tools
"${script_dir}/check_cuda.sh"
"${script_dir}/check_vulkan.sh"
local nvidia_gpg_file="/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
local nvidia_apt_file="/etc/apt/sources.list.d/nvidia-container-toolkit.list"
# Nvidia container toolkit: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html
local distribution="$(source /etc/os-release; echo "${ID}${VERSION_ID}")"
nice_curl \
https://nvidia.github.io/libnvidia-container/gpgkey \
| gpg --dearmor -o "${nvidia_gpg_file}"
nice_curl \
"https://nvidia.github.io/libnvidia-container/${distribution}/libnvidia-container.list" | \
sed "s#deb https://#deb [signed-by=${nvidia_gpg_file}] https://#g" \
> "${nvidia_apt_file}"
apt-get update
apt-get install nvidia-docker2
# Remove gpg keys and corresponding archives since these expire and we don't
# want later things relying on them.
rm "${nvidia_gpg_file}" "${nvidia_apt_file}"
apt-get update
systemctl restart docker
# Check GPU usage with Vulkan and Cuda work
function check_docker() {
local image="$1"
docker run --rm --gpus all --env NVIDIA_DRIVER_CAPABILITIES=all \
--mount="type=bind,source=${script_dir},dst=${script_dir},readonly" \
"${image}" \
bash -c "${script_dir}/check_cuda.sh && ${script_dir}/check_vulkan.sh"
}
check_docker gcr.io/iree-oss/nvidia@sha256:de6e4453614aa48059fd611d7e7255f4d6ac27ac29a47aabdc04191ec1758533
check_docker gcr.io/iree-oss/frontends-nvidia@sha256:5974a2af86926a324bdfe98bea7080212db66189613a10ee19526f761c4c1400
check_docker gcr.io/iree-oss/nvidia-bleeding-edge@sha256:522491c028ec3b4070f23910c70c8162fd9612e11d9cf062a13444df7e88ab70
# Remove the docker images we've fetched. We might want to pre-fetch Docker
# images into the VM image, but that should be a separate decision.
docker system prune --force --all
fi
################################### Cleanup ##################################
apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
rm -rf /var/lib/dhcp/*
# Delete unnecessary log files
find /var/log -type f -regex ".*\.gz$" -delete
find /var/log -type f -regex ".*\.[0-9]$" -delete
# Clear all journal files
journalctl --rotate --vacuum-time=1s
# And clear others
find /var/log/ -type f -exec truncate -s 0 {} \;
echo "Disk usage after setup"
df -h /
echo "Setup complete"
}
time startup 2>&1 | tee /startup.log