// tools/test/iree-run-module-multi.mlir - 3p/openxla/iree - Git at Google

 // Tests that multiple devices are supported through iree-run-module by
 // providing two local thread pools. This is not optimal and not an intended
 // route for multi-device CPU workloads but requires no additional hardware
 // resources for the test and still verifies the compiler/runtime tooling
 // rendezvous of devices as specified on the command line.

 // RUN: (iree-compile %s \
 // RUN:      --iree-execution-model=async-external \
 // RUN:      --iree-hal-target-device=device_a=local[0] \
 // RUN:      --iree-hal-target-device=device_b=local[1] \
 // RUN:      --iree-hal-local-target-device-backends=vmvx | \
 // RUN:  iree-run-module \
 // RUN:      --module=- \
 // RUN:      --function=multi_device_mul \
 // RUN:      --input=4xf32=10,11,12,13 \
 // RUN:      --device=local-task \
 // RUN:      --device=local-task \
 // RUN:      --task_topology_group_count=1) | \
 // RUN: FileCheck %s

 // CHECK: EXEC @multi_device_mul
 // CHECK-NEXT: result[0]: hal.buffer_view
 // CHECK-NEXT: 4xf32=0 55 144 273
 // Multiplies the input elementwise by one constant on device_a and by a
 // second constant on device_b, then hands the product back on device_a.
 func.func public @multi_device_mul(
   // The incoming tensor is declared resident on device_a (per the tooling
   // default of placing inputs on the first device).
   %input_a: tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
 ) -> (
   // The returned tensor is declared resident on device_a as well, although
   // that placement is not strictly required.
   tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
 ) {
   // Stage 1: the input already lives on device_a, so multiply it there.
   %scale_a = arith.constant dense<[0.0, 1.0, 2.0, 3.0]> : tensor<4xf32>
   %product_a = arith.mulf %input_a, %scale_a : tensor<4xf32>
   // Hop device_a -> device_b with the intermediate product.
   %product_on_b = flow.tensor.transfer %product_a : tensor<4xf32> to #hal.device.promise<@device_b>
   // Stage 2: multiply by the second constant on device_b.
   %scale_b = arith.constant dense<[4.0, 5.0, 6.0, 7.0]> : tensor<4xf32>
   %scaled_b = arith.mulf %product_on_b, %scale_b : tensor<4xf32>
   // Hop device_b -> device_a so the value matches the result affinity.
   %output_a = flow.tensor.transfer %scaled_b : tensor<4xf32> to #hal.device.promise<@device_a>
   // Hand back the device_a-resident value, as the ABI attribute requests.
   func.return %output_a : tensor<4xf32>
 }
	// Tests that multiple devices are supported through iree-run-module by
	// providing two local thread pools. This is not optimal and not an intended
	// route for multi-device CPU workloads but requires no additional hardware
	// resources for the test and still verifies the compiler/runtime tooling
	// rendezvous of devices as specified on the command line.

	// RUN: (iree-compile %s \
	// RUN: --iree-execution-model=async-external \
	// RUN: --iree-hal-target-device=device_a=local[0] \
	// RUN: --iree-hal-target-device=device_b=local[1] \
	// RUN: --iree-hal-local-target-device-backends=vmvx | \
	// RUN: iree-run-module \
	// RUN: --module=- \
	// RUN: --function=multi_device_mul \
	// RUN: --input=4xf32=10,11,12,13 \
	// RUN: --device=local-task \
	// RUN: --device=local-task \
	// RUN: --task_topology_group_count=1) | \
	// RUN: FileCheck %s

	// CHECK: EXEC @multi_device_mul
	// CHECK-NEXT: result[0]: hal.buffer_view
	// CHECK-NEXT: 4xf32=0 55 144 273
	// Multiplies the input elementwise by one constant on device_a and by a
	// second constant on device_b, then hands the product back on device_a.
	func.func public @multi_device_mul(
	  // The incoming tensor is declared resident on device_a (per the tooling
	  // default of placing inputs on the first device).
	  %input_a: tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
	) -> (
	  // The returned tensor is declared resident on device_a as well, although
	  // that placement is not strictly required.
	  tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
	) {
	  // Stage 1: the input already lives on device_a, so multiply it there.
	  %scale_a = arith.constant dense<[0.0, 1.0, 2.0, 3.0]> : tensor<4xf32>
	  %product_a = arith.mulf %input_a, %scale_a : tensor<4xf32>
	  // Hop device_a -> device_b with the intermediate product.
	  %product_on_b = flow.tensor.transfer %product_a : tensor<4xf32> to #hal.device.promise<@device_b>
	  // Stage 2: multiply by the second constant on device_b.
	  %scale_b = arith.constant dense<[4.0, 5.0, 6.0, 7.0]> : tensor<4xf32>
	  %scaled_b = arith.mulf %product_on_b, %scale_b : tensor<4xf32>
	  // Hop device_b -> device_a so the value matches the result affinity.
	  %output_a = flow.tensor.transfer %scaled_b : tensor<4xf32> to #hal.device.promise<@device_a>
	  // Hand back the device_a-resident value, as the ABI attribute requests.
	  func.return %output_a : tensor<4xf32>
	}