build_tools/github_actions/runner/instance_deleter/main.py - 3p/openxla/iree - Git at Google

 # Copyright 2022 The IREE Authors
 #
 # Licensed under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 """A Cloud Functions proxy enabling GCE VMs in a Managed Instance Group to delete themselves.

 GCE Managed instance groups don't have any good way to handle autoscaling for
 long-running workloads. With the autoscaler configured to scale in, instances
 get only 90 seconds warning to shut down. So we set the autoscaler to only scale
 out and have the VMs tear themselves down when they're down with their work.
 This is the approach suggested by the managed instance group team:

 https://drive.google.com/file/d/1XlwxF_0T7pUnbzhL5ePDoW-Q3GAaLO11

 But anything that brings down the VM other than a delete call to the instance
 group manager API makes the VM get considered "unhealthy", which means it gets
 recreated in exactly the same configuration, regardless of any update or
 autoscaling settings. Making the correct API call requires broad permissions on
 the instance group manager, which we don't want to give the VMs. To scope
 permissions to individual instances, this proxy service makes use of instance
 identity tokens to allow an instance to make a call only to delete itself.

 See
 https://cloud.google.com/compute/docs/instances/verifying-instance-identity

 This makes use of the GCP Cloud Functions serverless offering. It's another
 level of abstraction on top of Cloud Run, where you don't even need to create your
 own docker container. For local development:

   functions-framework --target=delete_self
   curl -X DELETE -v --header "Authorization: Bearer $(cat /tmp/token.txt)" localhost:8080

 You'll need to get a token that corresponds to an actual instance though or
 you'll get an error:

   gcloud compute ssh gh-runner-testing-presubmit-cpu-us-west1-h58j \
     --user-output-enabled=false \
     --command "curl -sSfL \
         -H 'Metadata-Flavor: Google' \
         'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=localhost&format=full'" \
     > /tmp/token.txt

 To deploy:
   # Note timeout should be greater than STABILIZE_TIMEOUT_SECONDS
   gcloud functions deploy instance-self-deleter \
     --gen2 \
     --runtime=python310 \
     --region=us-central1 \
     --source=. \
     --entry-point=delete_self \
     --trigger-http \
     --run-service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
     --service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
     --ingress-settings=internal-only \
     --timeout=120s \
     --set-env-vars ALLOWED_MIG_PATTERN='gh-runner-.*'


 See https://cloud.google.com/functions/docs for more details.
 """

 import os
 import random
 import re
 import time
 from http.client import (
     BAD_REQUEST,
     FORBIDDEN,
     GATEWAY_TIMEOUT,
     INTERNAL_SERVER_ERROR,
     NOT_FOUND,
     UNAUTHORIZED,
 )

 import flask
 import functions_framework
 import google.api_core.exceptions
 import google.auth.exceptions
 import requests
 from google.auth import transport
 from google.cloud import compute
 from google.oauth2 import id_token

 AUTH_HEADER_PREFIX = "Bearer "
 ALLOWED_HTTP_METHODS = ["DELETE", "GET"]
 MIG_METADATA_KEY = "created-by"
 ALLOWED_MIG_PATTERN_ENV_VARIABLE = "ALLOWED_MIG_PATTERN"
 # Must be less than timeout configuration for deployment
 STABILIZE_TIMEOUT_SECONDS = 100

 instances_client = compute.InstancesClient()
 migs_client = compute.RegionInstanceGroupManagersClient()
 autoscalers_client = compute.RegionAutoscalersClient()
 session = requests.Session()

 print("Server started")


 def _verify_token(token: str) -> dict:
     """Verify token signature and return the token payload"""
     request = transport.requests.Request(session)
     payload = id_token.verify_oauth2_token(token, request=request)
     return payload


 def _get_region(zone: str) -> str:
     """Extract region name from zone name"""
     # Drop the trailing zone identifier to get the region. Yeah it kinda does seem
     # like there should be a better way to do this...
     region, _ = zone.rsplit("-", maxsplit=1)
     return region


 def _get_name_from_resource(resource: str) -> str:
     """Extract just the final name component from a fully scoped resource name."""
     _, name = resource.rsplit("/", maxsplit=1)
     return name


 def _get_from_items(items: compute.Items, key: str):
     # Why would the GCP Python API return something as silly as a dictionary?
     return next((item.value for item in items if item.key == key), None)


 def delete_instance_from_mig(
     mig_name: str, project: str, region: str, instance: compute.Instance
 ):
     try:
         operation = migs_client.delete_instances(
             instance_group_manager=mig_name,
             project=project,
             region=region,
             # For some reason we can't just use a list of instance names and need to
             # build this RhymingRythmicJavaClasses proto. Also, unlike all the other
             # parameters, the instance has to be a fully-specified URL for the
             # instance, not just its name.
             region_instance_group_managers_delete_instances_request_resource=(
                 compute.RegionInstanceGroupManagersDeleteInstancesRequest(
                     instances=[instance.self_link]
                 )
             ),
         )
     except (
         google.api_core.exceptions.Forbidden,
         google.api_core.exceptions.Unauthorized,
         google.api_core.exceptions.NotFound,
     ) as e:
         print(e)
         return flask.abort(
             e.code, f"Error requesting that {mig_name} delete {instance.name}."
         )
     except Exception as e:
         # We'll call any other error here a server error.
         print(e)
         return flask.abort(
             INTERNAL_SERVER_ERROR,
             f"Error requesting that {mig_name} delete {instance.name}.",
         )

     try:
         # This is actually an extended operation that you have to poll to get its
         # status, but we just check the status once because it appears that errors
         # always show up here and all we just want to return success in marking for
         # deletion. We don't need to wait for the deletion to actually take place.
         operation.result()
     except google.api_core.exceptions.ClientError as e:
         print(e)
         # Unpack the actual usable error message
         msg = (
             f"Error requesting that {mig_name} delete {instance.name}:"
             "\n"
             + "\n".join(
                 [f"{err.code}: {err.message}" for err in e.response.error.errors]
             )
         )
         print(msg)
         # We're not actually totally sure whether this is a client or server error
         # for the overall request, but let's call it a client error (the only client
         # here is our VM instances, so I think we can be a bit loose).
         return flask.abort(BAD_REQUEST, msg)

     success_msg = f"{instance.name} has been marked for deletion by {mig_name}."
     print(success_msg)
     return success_msg


 def should_scale_down(mig_name: str, project: str, region: str):
     start = time.time()
     print(f"Polling {mig_name} for stability")
     while time.time() - start < STABILIZE_TIMEOUT_SECONDS:
         try:
             mig = migs_client.get(
                 project=project, region=region, instance_group_manager=mig_name
             )
         except google.api_core.exceptions.NotFound as e:
             print(e)
             return flask.abort(
                 e.code, f"Cannot find {mig_name} in region={region}, project={project}"
             )
         if mig.status.is_stable:
             break
         # We sleep for a random amount of time here to avoid synchronizing callers
         # waiting for the MIG to be stable.
         sleep_secs = random.randint(1, 15)
         print(f"{mig_name} is not stable. Retrying in {sleep_secs} seconds")
         time.sleep(sleep_secs)
     else:
         return flask.abort(
             GATEWAY_TIMEOUT, "Timed out waiting for the MIG to become stable"
         )
     autoscaler = autoscalers_client.get(
         project=project,
         region=region,
         autoscaler=_get_name_from_resource(mig.status.autoscaler),
     )
     response = "true" if autoscaler.recommended_size < mig.target_size else "false"
     print(
         f"Autoscaler recommends size {autoscaler.recommended_size} and"
         f" {mig_name} is targetting size {mig.target_size}. Sending: {response}"
     )
     return response


 @functions_framework.http
 def delete_self(request: flask.Request):
     """HTTP Cloud Function to delete the instance group making the request.
     Args:
         request: The request object.
         https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data
     Returns:
         The response text, or any set of values that can be turned into a
         Response object using `make_response`
         https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response.
     Note:
         For more information on how Flask integrates with Cloud
         Functions, see the `Writing HTTP functions` page.
         https://cloud.google.com/functions/docs/writing/http#http_frameworks
     """
     if request.method not in ALLOWED_HTTP_METHODS:
         return flask.abort(
             BAD_REQUEST,
             f"Invalid method {request.method}."
             f" Allowed methods: {ALLOWED_HTTP_METHODS}",
         )

     # No path is needed, since the token and method contain all the information we
     # need. Maybe that design was a mistake, but since the resource being operated
     # on is always the instance making the call, it seemed handy.
     if request.path != "/":
         return flask.abort(
             BAD_REQUEST,
             f"Invalid request path {request.path}. Only root path is valid).",
         )

     auth_header = request.headers.get("Authorization")
     if auth_header is None:
         return flask.abort(UNAUTHORIZED, "Authorization header is missing")
     if not auth_header.startswith(AUTH_HEADER_PREFIX):
         return flask.abort(
             UNAUTHORIZED,
             f"Authorization header does not start with expected string"
             f" {AUTH_HEADER_PREFIX}.",
         )

     token = auth_header[len(AUTH_HEADER_PREFIX) :]

     try:
         # We don't verify audience here because Cloud IAM will have already done so
         # and jwt's matching of audiences is exact, which means trailing slashes or
         # http vs https matters and that's pretty brittle.
         token_payload = _verify_token(token)
     except (ValueError, google.auth.exceptions.GoogleAuthError) as e:
         print(e)
         return flask.abort(UNAUTHORIZED, "Decoding bearer token failed.")

     print(f"Token payload: {token_payload}")

     try:
         compute_info = token_payload["google"]["compute_engine"]
     except KeyError:
         return flask.abort(
             UNAUTHORIZED,
             "Bearer token payload does not have expected field google.compute",
         )

     project = compute_info["project_id"]
     zone = compute_info["zone"]
     region = _get_region(zone)
     instance_name = compute_info["instance_name"]

     if request.method == "DELETE":
         print(f"Received request to delete {instance_name}")
     else:
         assert request.method == "GET"
         print(f"Received inquiry whether to delete {instance_name}")
     try:
         instance = instances_client.get(
             instance=instance_name, project=project, zone=zone
         )
     except (
         google.api_core.exceptions.NotFound,
         google.api_core.exceptions.Forbidden,
     ) as e:
         print(e)
         return flask.abort(
             e.code, f"Cannot view {instance_name} in zone={zone}, project={project}"
         )

     instance_id = int(compute_info["instance_id"])
     # Verify it's *actually* the same instance. Names get reused, but IDs don't.
     # For some reason you can't reference anything by ID in the API.
     if instance.id != instance_id:
         return flask.abort(
             BAD_REQUEST,
             f"Existing instance of the same name {instance.name} has a different"
             f" ID {instance.id} than token specifies {instance_id}.",
         )

     mig_name = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)

     if mig_name is None:
         return flask.abort(
             BAD_REQUEST,
             (
                 f"Instance is not part of a managed instance group."
                 f" Did not find {MIG_METADATA_KEY} in metadata."
             ),
         )
     mig_name = _get_name_from_resource(mig_name)

     # General good practice would be to compile the regex once, but the only way
     # to do that is to make it a global, which makes this difficult to test and
     # compiling this regex should not be expensive.
     allowed_mig_pattern = os.environ.get(ALLOWED_MIG_PATTERN_ENV_VARIABLE)
     if allowed_mig_pattern is None:
         flask.abort(
             INTERNAL_SERVER_ERROR,
             f"Missing required environment variable"
             f" {ALLOWED_MIG_PATTERN_ENV_VARIABLE}",
         )

     if not re.fullmatch(allowed_mig_pattern, mig_name):
         return flask.abort(FORBIDDEN, f"No access to MIG {mig_name}")

     if request.method == "DELETE":
         return delete_instance_from_mig(
             mig_name=mig_name, project=project, region=region, instance=instance
         )

     assert request.method == "GET"
     return should_scale_down(mig_name=mig_name, project=project, region=region)
	# Copyright 2022 The IREE Authors
	#
	# Licensed under the Apache License v2.0 with LLVM Exceptions.
	# See https://llvm.org/LICENSE.txt for license information.
	# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	"""A Cloud Functions proxy enabling GCE VMs in a Managed Instance Group to delete themselves.

	GCE Managed instance groups don't have any good way to handle autoscaling for
	long-running workloads. With the autoscaler configured to scale in, instances
	get only 90 seconds warning to shut down. So we set the autoscaler to only scale
	out and have the VMs tear themselves down when they're down with their work.
	This is the approach suggested by the managed instance group team:

	https://drive.google.com/file/d/1XlwxF_0T7pUnbzhL5ePDoW-Q3GAaLO11

	But anything that brings down the VM other than a delete call to the instance
	group manager API makes the VM get considered "unhealthy", which means it gets
	recreated in exactly the same configuration, regardless of any update or
	autoscaling settings. Making the correct API call requires broad permissions on
	the instance group manager, which we don't want to give the VMs. To scope
	permissions to individual instances, this proxy service makes use of instance
	identity tokens to allow an instance to make a call only to delete itself.

	See
	https://cloud.google.com/compute/docs/instances/verifying-instance-identity

	This makes use of the GCP Cloud Functions serverless offering. It's another
	level of abstraction on top of Cloud Run, where you don't even need to create your
	own docker container. For local development:

	functions-framework --target=delete_self
	curl -X DELETE -v --header "Authorization: Bearer $(cat /tmp/token.txt)" localhost:8080

	You'll need to get a token that corresponds to an actual instance though or
	you'll get an error:

	gcloud compute ssh gh-runner-testing-presubmit-cpu-us-west1-h58j \
	--user-output-enabled=false \
	--command "curl -sSfL \
	-H 'Metadata-Flavor: Google' \
	'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=localhost&format=full'" \
	> /tmp/token.txt

	To deploy:
	# Note timeout should be greater than STABILIZE_TIMEOUT_SECONDS
	gcloud functions deploy instance-self-deleter \
	--gen2 \
	--runtime=python310 \
	--region=us-central1 \
	--source=. \
	--entry-point=delete_self \
	--trigger-http \
	--run-service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
	--service-account=managed-instance-deleter@iree-oss.iam.gserviceaccount.com \
	--ingress-settings=internal-only \
	--timeout=120s \
	--set-env-vars ALLOWED_MIG_PATTERN='gh-runner-.*'


	See https://cloud.google.com/functions/docs for more details.
	"""

	import os
	import random
	import re
	import time
	from http.client import (
	BAD_REQUEST,
	FORBIDDEN,
	GATEWAY_TIMEOUT,
	INTERNAL_SERVER_ERROR,
	NOT_FOUND,
	UNAUTHORIZED,
	)

	import flask
	import functions_framework
	import google.api_core.exceptions
	import google.auth.exceptions
	import requests
	from google.auth import transport
	from google.cloud import compute
	from google.oauth2 import id_token

	AUTH_HEADER_PREFIX = "Bearer "
	ALLOWED_HTTP_METHODS = ["DELETE", "GET"]
	MIG_METADATA_KEY = "created-by"
	ALLOWED_MIG_PATTERN_ENV_VARIABLE = "ALLOWED_MIG_PATTERN"
	# Must be less than timeout configuration for deployment
	STABILIZE_TIMEOUT_SECONDS = 100

	instances_client = compute.InstancesClient()
	migs_client = compute.RegionInstanceGroupManagersClient()
	autoscalers_client = compute.RegionAutoscalersClient()
	session = requests.Session()

	print("Server started")


	def _verify_token(token: str) -> dict:
	"""Verify token signature and return the token payload"""
	request = transport.requests.Request(session)
	payload = id_token.verify_oauth2_token(token, request=request)
	return payload


	def _get_region(zone: str) -> str:
	"""Extract region name from zone name"""
	# Drop the trailing zone identifier to get the region. Yeah it kinda does seem
	# like there should be a better way to do this...
	region, _ = zone.rsplit("-", maxsplit=1)
	return region


	def _get_name_from_resource(resource: str) -> str:
	"""Extract just the final name component from a fully scoped resource name."""
	_, name = resource.rsplit("/", maxsplit=1)
	return name


	def _get_from_items(items: compute.Items, key: str):
	# Why would the GCP Python API return something as silly as a dictionary?
	return next((item.value for item in items if item.key == key), None)


	def delete_instance_from_mig(
	mig_name: str, project: str, region: str, instance: compute.Instance
	):
	try:
	operation = migs_client.delete_instances(
	instance_group_manager=mig_name,
	project=project,
	region=region,
	# For some reason we can't just use a list of instance names and need to
	# build this RhymingRythmicJavaClasses proto. Also, unlike all the other
	# parameters, the instance has to be a fully-specified URL for the
	# instance, not just its name.
	region_instance_group_managers_delete_instances_request_resource=(
	compute.RegionInstanceGroupManagersDeleteInstancesRequest(
	instances=[instance.self_link]
	)
	),
	)
	except (
	google.api_core.exceptions.Forbidden,
	google.api_core.exceptions.Unauthorized,
	google.api_core.exceptions.NotFound,
	) as e:
	print(e)
	return flask.abort(
	e.code, f"Error requesting that {mig_name} delete {instance.name}."
	)
	except Exception as e:
	# We'll call any other error here a server error.
	print(e)
	return flask.abort(
	INTERNAL_SERVER_ERROR,
	f"Error requesting that {mig_name} delete {instance.name}.",
	)

	try:
	# This is actually an extended operation that you have to poll to get its
	# status, but we just check the status once because it appears that errors
	# always show up here and all we just want to return success in marking for
	# deletion. We don't need to wait for the deletion to actually take place.
	operation.result()
	except google.api_core.exceptions.ClientError as e:
	print(e)
	# Unpack the actual usable error message
	msg = (
	f"Error requesting that {mig_name} delete {instance.name}:"
	"\n"
	+ "\n".join(
	[f"{err.code}: {err.message}" for err in e.response.error.errors]
	)
	)
	print(msg)
	# We're not actually totally sure whether this is a client or server error
	# for the overall request, but let's call it a client error (the only client
	# here is our VM instances, so I think we can be a bit loose).
	return flask.abort(BAD_REQUEST, msg)

	success_msg = f"{instance.name} has been marked for deletion by {mig_name}."
	print(success_msg)
	return success_msg


	def should_scale_down(mig_name: str, project: str, region: str):
	start = time.time()
	print(f"Polling {mig_name} for stability")
	while time.time() - start < STABILIZE_TIMEOUT_SECONDS:
	try:
	mig = migs_client.get(
	project=project, region=region, instance_group_manager=mig_name
	)
	except google.api_core.exceptions.NotFound as e:
	print(e)
	return flask.abort(
	e.code, f"Cannot find {mig_name} in region={region}, project={project}"
	)
	if mig.status.is_stable:
	break
	# We sleep for a random amount of time here to avoid synchronizing callers
	# waiting for the MIG to be stable.
	sleep_secs = random.randint(1, 15)
	print(f"{mig_name} is not stable. Retrying in {sleep_secs} seconds")
	time.sleep(sleep_secs)
	else:
	return flask.abort(
	GATEWAY_TIMEOUT, "Timed out waiting for the MIG to become stable"
	)
	autoscaler = autoscalers_client.get(
	project=project,
	region=region,
	autoscaler=_get_name_from_resource(mig.status.autoscaler),
	)
	response = "true" if autoscaler.recommended_size < mig.target_size else "false"
	print(
	f"Autoscaler recommends size {autoscaler.recommended_size} and"
	f" {mig_name} is targetting size {mig.target_size}. Sending: {response}"
	)
	return response


	@functions_framework.http
	def delete_self(request: flask.Request):
	"""HTTP Cloud Function to delete the instance group making the request.
	Args:
	request: The request object.
	https://flask.palletsprojects.com/en/1.1.x/api/#incoming-request-data
	Returns:
	The response text, or any set of values that can be turned into a
	Response object using `make_response`
	https://flask.palletsprojects.com/en/1.1.x/api/#flask.make_response.
	Note:
	For more information on how Flask integrates with Cloud
	Functions, see the `Writing HTTP functions` page.
	https://cloud.google.com/functions/docs/writing/http#http_frameworks
	"""
	if request.method not in ALLOWED_HTTP_METHODS:
	return flask.abort(
	BAD_REQUEST,
	f"Invalid method {request.method}."
	f" Allowed methods: {ALLOWED_HTTP_METHODS}",
	)

	# No path is needed, since the token and method contain all the information we
	# need. Maybe that design was a mistake, but since the resource being operated
	# on is always the instance making the call, it seemed handy.
	if request.path != "/":
	return flask.abort(
	BAD_REQUEST,
	f"Invalid request path {request.path}. Only root path is valid).",
	)

	auth_header = request.headers.get("Authorization")
	if auth_header is None:
	return flask.abort(UNAUTHORIZED, "Authorization header is missing")
	if not auth_header.startswith(AUTH_HEADER_PREFIX):
	return flask.abort(
	UNAUTHORIZED,
	f"Authorization header does not start with expected string"
	f" {AUTH_HEADER_PREFIX}.",
	)

	token = auth_header[len(AUTH_HEADER_PREFIX) :]

	try:
	# We don't verify audience here because Cloud IAM will have already done so
	# and jwt's matching of audiences is exact, which means trailing slashes or
	# http vs https matters and that's pretty brittle.
	token_payload = _verify_token(token)
	except (ValueError, google.auth.exceptions.GoogleAuthError) as e:
	print(e)
	return flask.abort(UNAUTHORIZED, "Decoding bearer token failed.")

	print(f"Token payload: {token_payload}")

	try:
	compute_info = token_payload["google"]["compute_engine"]
	except KeyError:
	return flask.abort(
	UNAUTHORIZED,
	"Bearer token payload does not have expected field google.compute",
	)

	project = compute_info["project_id"]
	zone = compute_info["zone"]
	region = _get_region(zone)
	instance_name = compute_info["instance_name"]

	if request.method == "DELETE":
	print(f"Received request to delete {instance_name}")
	else:
	assert request.method == "GET"
	print(f"Received inquiry whether to delete {instance_name}")
	try:
	instance = instances_client.get(
	instance=instance_name, project=project, zone=zone
	)
	except (
	google.api_core.exceptions.NotFound,
	google.api_core.exceptions.Forbidden,
	) as e:
	print(e)
	return flask.abort(
	e.code, f"Cannot view {instance_name} in zone={zone}, project={project}"
	)

	instance_id = int(compute_info["instance_id"])
	# Verify it's actually the same instance. Names get reused, but IDs don't.
	# For some reason you can't reference anything by ID in the API.
	if instance.id != instance_id:
	return flask.abort(
	BAD_REQUEST,
	f"Existing instance of the same name {instance.name} has a different"
	f" ID {instance.id} than token specifies {instance_id}.",
	)

	mig_name = _get_from_items(instance.metadata.items, MIG_METADATA_KEY)

	if mig_name is None:
	return flask.abort(
	BAD_REQUEST,
	(
	f"Instance is not part of a managed instance group."
	f" Did not find {MIG_METADATA_KEY} in metadata."
	),
	)
	mig_name = _get_name_from_resource(mig_name)

	# General good practice would be to compile the regex once, but the only way
	# to do that is to make it a global, which makes this difficult to test and
	# compiling this regex should not be expensive.
	allowed_mig_pattern = os.environ.get(ALLOWED_MIG_PATTERN_ENV_VARIABLE)
	if allowed_mig_pattern is None:
	flask.abort(
	INTERNAL_SERVER_ERROR,
	f"Missing required environment variable"
	f" {ALLOWED_MIG_PATTERN_ENV_VARIABLE}",
	)

	if not re.fullmatch(allowed_mig_pattern, mig_name):
	return flask.abort(FORBIDDEN, f"No access to MIG {mig_name}")

	if request.method == "DELETE":
	return delete_instance_from_mig(
	mig_name=mig_name, project=project, region=region, instance=instance
	)

	assert request.method == "GET"
	return should_scale_down(mig_name=mig_name, project=project, region=region)