| # Copyright 2023 The IREE Authors |
| # |
| # Licensed under the Apache License v2.0 with LLVM Exceptions. |
| # See https://llvm.org/LICENSE.txt for license information. |
| # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| |
| from iree.experimental.runtime.distributed import run_ranks |
| import iree.compiler |
| import tempfile |
| import numpy as np |
| import os |
| |
| """ |
| Example of distributed execution across 2 devices of a small model |
| with just an all-reduce operation. |
| all_reduce([1, 2, 3, 4], [5, 6, 7, 8]) -> [6, 8, 10, 12]. |
| |
| Dependecies at: |
| runtime/bindings/python/iree/runtime/distributed/setup.sh |
| """ |
| mlir = """ |
| func.func @all_reduce_sum(%input : tensor<4xf32>) -> tensor<4xf32> { |
| %out = "stablehlo.all_reduce"(%input) ({ |
| ^bb0(%arg0: tensor<f32>, %arg1: tensor<f32>): |
| %sum = stablehlo.add %arg0, %arg1 : tensor<f32> |
| stablehlo.return %sum : tensor<f32> |
| }) {channel_handle = #stablehlo.channel_handle<handle = 1, type = 1>, |
| replica_groups = dense<[[0, 1]]> : tensor<1x2xi64>, |
| use_global_device_ids} : (tensor<4xf32>) -> tensor<4xf32> |
| return %out : tensor<4xf32> |
| } |
| """ |
| |
| inputs = [ |
| [np.array([1, 2, 3, 4], dtype=np.float32)], |
| [np.array([5, 6, 7, 8], dtype=np.float32)], |
| ] |
| |
| for rank in range(len(inputs)): |
| print(f"Rank {rank} argument = {inputs[rank]}") |
| |
| with tempfile.TemporaryDirectory() as tmp_dir: |
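    # Compile the StableHLO module above to an IREE VM FlatBuffer (.vmfb)
    # targeting the CUDA backend, written into a temporary directory.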
| module_filepath = os.path.join(tmp_dir, "module.vmfb") |
| iree.compiler.tools.compile_str( |
| input_str=mlir, |
| output_file=module_filepath, |
| target_backends=["cuda"], |
| input_type="stablehlo", |
| ) |
| |
| num_ranks = len(inputs) |
    # Ranks are laid out along the 0th axis of `inputs` and `outputs`:
    # inputs[rank] are the arguments passed to rank `rank`, and
    # outputs[rank] are the results it returns.
| outputs = run_ranks( |
| num_ranks=num_ranks, |
| function="all_reduce_sum", |
| driver="cuda", |
| module_filepath=module_filepath, |
| inputs=inputs, |
| ) |
| for rank in range(num_ranks): |
| print(f"Rank {rank} result = {outputs[rank]}") |