blob: e6234c6b73e82489355d2267915fd741c847a1a0 [file] [log] [blame]
# Copyright 2022 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""A Cloud Functions proxy enabling GCE VMs in a Managed Instance Group to delete themselves.
GCE Managed instance groups don't have any good way to handle autoscaling for
long-running workloads. With the autoscaler configured to scale in, instances
get only 90 seconds warning to shut down. So we set the autoscaler to only scale
out and have the VMs tear themselves down when they're done with their work.
This is the approach suggested by the managed instance group team:
https://drive.google.com/file/d/1XlwxF_0T7pUnbzhL5ePDoW-Q3GAaLO11
But anything that brings down the VM other than a delete call to the instance
group manager API makes the VM get considered "unhealthy", which means it gets
recreated in exactly the same configuration, regardless of any update or
autoscaling settings. Making the correct API call requires broad permissions on
the instance group manager, which we don't want to give the VMs. To scope
permissions to individual instances, this proxy service makes use of instance
identity tokens to allow an instance to make a call only to delete itself.
See
https://cloud.google.com/compute/docs/instances/verifying-instance-identity
This makes use of the GCP Cloud Functions serverless offering. It's another
level of abstraction on top of Cloud Run, where you don't even need to create your
own docker container. For local development:
functions-framework --target=delete_self
curl -X DELETE -v --header "Authorization: Bearer $(cat /tmp/token.txt)" localhost:8080
You'll need to get a token that corresponds to an actual instance though or
you'll get an error:
gcloud compute ssh github-runner-testing-presubmit-cpu-us-west1-h58j \
--user-output-enabled=false \
--command "curl -sSfL \
-H 'Metadata-Flavor: Google' \
'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=localhost&format=full'" \
> /tmp/token.txt
To deploy:
gcloud functions deploy instance-self-deleter \
--gen2 \
--runtime=python310 \
--region=us-central1 \
--source=. \
--entry-point=delete_self \
--trigger-http \
--run-service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
--service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
--ingress-settings=internal-only \
--timeout=30s \
--set-env-vars ALLOWED_MIG_PATTERN='github-runner-.*'
See https://cloud.google.com/functions/docs for more details.
"""
import os
import re
from http.client import (BAD_REQUEST, FORBIDDEN, INTERNAL_SERVER_ERROR,
NOT_FOUND, UNAUTHORIZED)
import flask
import functions_framework
import google.api_core.exceptions
import google.auth.exceptions
import requests
from google.auth import transport
from google.cloud import compute
from google.oauth2 import id_token
# Prefix expected at the start of the Authorization header value; the bearer
# token is whatever follows it.
AUTH_HEADER_PREFIX = "Bearer "
# Instance metadata key whose value is read to find the managed instance group
# (MIG) resource the instance belongs to.
MIG_METADATA_KEY = "created-by"
# Name of the environment variable holding the regex that a MIG name must fully
# match before this service will ask that MIG to delete an instance.
ALLOWED_MIG_PATTERN_ENV_VARIABLE = "ALLOWED_MIG_PATTERN"
# The API clients and the HTTP session are created once, when this module is
# imported, and are then used by the functions below.
instances_client = compute.InstancesClient()
migs_client = compute.RegionInstanceGroupManagersClient()
session = requests.Session()
# Emitted once at import time.
print("Server started")
def _verify_token(token: str) -> dict:
  """Check the signature of `token` and hand back its decoded payload.

  The module-level `session` is wrapped in a google-auth transport request so
  the verification call can use it for any HTTP traffic it needs.

  Args:
    token: The raw bearer token string (without the "Bearer " prefix).
  Returns:
    The payload produced by `id_token.verify_oauth2_token`.
  """
  auth_request = transport.requests.Request(session)
  return id_token.verify_oauth2_token(token, request=auth_request)
def _get_region(zone: str) -> str:
"""Extract region name from zone name"""
# Drop the trailing zone identifier to get the region. Yeah it kinda does seem
# like there should be a better way to do this...
region, _ = zone.rsplit("-", maxsplit=1)
return region
def _get_name_from_resource(resource: str) -> str:
"""Extract just the final name component from a fully scoped resource name."""
_, name = resource.rsplit("/", maxsplit=1)
return name
def _get_from_items(items: compute.Items, key: str):
  """Look up `key` in a sequence of key/value entries.

  The entries are scanned in order; each one is expected to expose `.key` and
  `.value` attributes.

  Args:
    items: Iterable of entries with `key` and `value` attributes.
    key: The key to search for.
  Returns:
    The `value` of the first entry whose `key` equals `key`, or None when no
    entry matches.
  """
  # The API hands back a list of entries rather than a mapping, so a linear
  # scan is needed.
  for entry in items:
    if entry.key == key:
      return entry.value
  return None
@functions_framework.http
def delete_self(request):
  """HTTP Cloud Function that lets an instance ask its MIG to delete it.

  The caller must send a DELETE request to the root path with an
  "Authorization: Bearer <token>" header. The token payload supplies the
  project, zone, instance name and instance ID; the instance is looked up,
  its ID is compared against the token, its managed instance group (MIG) is
  read from instance metadata and checked against the allowed pattern, and
  finally the MIG is asked to delete the instance.

  Args:
    request (flask.Request): The request object.
      <https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data>
  Returns:
    The response text, or any set of values that can be turned into a
    Response object using `make_response`
    <https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response>.
  Raises:
    werkzeug.exceptions.HTTPException: Every rejection goes through
      `flask.abort`, which raises rather than returns. The `return` in front
      of each call only makes the control flow explicit to readers.
  Note:
    For more information on how Flask integrates with Cloud
    Functions, see the `Writing HTTP functions` page.
    <https://cloud.google.com/functions/docs/writing/http#http_frameworks>
  """
  if request.method != "DELETE":
    return flask.abort(
        BAD_REQUEST,
        f"Invalid method {request.method}. Only DELETE is supported.")

  # No path is needed, since the token contains all the information we need.
  if request.path != "/":
    return flask.abort(
        BAD_REQUEST,
        f"Invalid request path {request.path}. Only root path is valid.")

  auth_header = request.headers.get("Authorization")
  if auth_header is None:
    return flask.abort(UNAUTHORIZED, "Authorization header is missing")
  if not auth_header.startswith(AUTH_HEADER_PREFIX):
    return flask.abort(
        UNAUTHORIZED,
        "Authorization header does not start with expected string"
        f" {AUTH_HEADER_PREFIX}.")

  token = auth_header[len(AUTH_HEADER_PREFIX):]

  try:
    # We don't verify audience here because Cloud IAM will have already done so
    # and jwt's matching of audiences is exact, which means trailing slashes or
    # http vs https matters and that's pretty brittle.
    token_payload = _verify_token(token)
  except (ValueError, google.auth.exceptions.GoogleAuthError) as e:
    print(e)
    return flask.abort(UNAUTHORIZED, "Decoding bearer token failed.")

  print(f"Token payload: {token_payload}")

  try:
    compute_info = token_payload["google"]["compute_engine"]
  except (KeyError, TypeError):
    # TypeError covers a "google" entry that is present but not a mapping.
    return flask.abort(
        UNAUTHORIZED,
        "Bearer token payload does not have expected field"
        " google.compute_engine")

  # Pull every field we rely on up front so that a token missing any of them
  # is rejected as a client error instead of surfacing as an unhandled
  # exception, and before any API call is made on its behalf.
  try:
    project = compute_info["project_id"]
    zone = compute_info["zone"]
    instance_name = compute_info["instance_name"]
    instance_id = int(compute_info["instance_id"])
  except (KeyError, TypeError, ValueError) as e:
    print(e)
    return flask.abort(
        UNAUTHORIZED,
        "Bearer token payload is missing or has invalid"
        " google.compute_engine fields (project_id, zone, instance_name,"
        " instance_id)")

  try:
    instance = instances_client.get(instance=instance_name,
                                    project=project,
                                    zone=zone)
  except (google.api_core.exceptions.NotFound,
          google.api_core.exceptions.Forbidden) as e:
    print(e)
    return flask.abort(
        e.code,
        f"Cannot view {instance_name} in zone={zone}, project={project}")

  # Verify it's *actually* the same instance. Names get reused, but IDs
  # don't. For some reason you can't reference instances by their ID in any
  # of the APIs.
  if instance.id != instance_id:
    return flask.abort(
        BAD_REQUEST,
        f"Existing instance of the same name {instance.name} has a different"
        f" ID {instance.id} than token specifies {instance_id}.")

  mig = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)
  if mig is None:
    return flask.abort(BAD_REQUEST,
                       ("Instance is not part of a managed instance group."
                        f" Did not find {MIG_METADATA_KEY} in metadata."))
  mig = _get_name_from_resource(mig)

  # General good practice would be to compile the regex once, but the only way
  # to do that is to make it a global, which makes this difficult to test and
  # compiling this regex should not be expensive.
  allowed_mig_pattern = os.environ.get(ALLOWED_MIG_PATTERN_ENV_VARIABLE)
  if allowed_mig_pattern is None:
    return flask.abort(
        INTERNAL_SERVER_ERROR,
        f"Missing required environment variable {ALLOWED_MIG_PATTERN_ENV_VARIABLE}"
    )

  if not re.fullmatch(allowed_mig_pattern, mig):
    return flask.abort(FORBIDDEN, f"No access to MIG {mig}")

  try:
    operation = migs_client.delete_instances(
        instance_group_manager=mig,
        project=project,
        region=_get_region(zone),
        # For some reason we can't just use a list of instance names and need to
        # build this RhymingRythmicJavaClasses proto. Also, unlike all the other
        # parameters, the instance has to be a fully-specified URL for the
        # instance, not just its name.
        region_instance_group_managers_delete_instances_request_resource=(
            compute.RegionInstanceGroupManagersDeleteInstancesRequest(
                instances=[instance.self_link])))
  except (google.api_core.exceptions.Forbidden,
          google.api_core.exceptions.Unauthorized) as e:
    print(e)
    return flask.abort(e.code,
                       f"Error requesting that {mig} delete {instance_name}.")
  except Exception as e:
    # We'll call any other error here a server error.
    print(e)
    return flask.abort(INTERNAL_SERVER_ERROR,
                       f"Error requesting that {mig} delete {instance_name}.")

  try:
    # This is actually an extended operation that you have to poll to get its
    # status, but we just check the status once because it appears that errors
    # always show up here.
    operation.result()
  except google.api_core.exceptions.ClientError as e:
    print(e)
    # Unpack the actual usable error message
    msg = f"Error requesting that {mig} delete {instance_name}:\n" + "\n".join(
        f"{err.code}: {err.message}" for err in e.response.error.errors)
    print(msg)
    # We're not actually totally sure whether this is a client or server error
    # for the overall request, but let's call it a client error (the only client
    # here is our VM instances, so I think we can be a bit loose).
    return flask.abort(BAD_REQUEST, msg)

  return f"{instance_name} has been marked for deletion by {mig}."