// Type aliases used throughout this test: an 8x64 f32 input matrix and the
// 8-element f32 vector produced by reducing its inner (64-wide) dimension.
!in_tensor_t = tensor<8x64xf32>
!out_tensor_t = tensor<8xf32>
|  |  | 
// Row-wise sum: reduces the inner (d1, 64-wide) dimension of the 8x64 input,
// yielding one f32 per row. Implemented as a fill of the additive identity
// followed by a linalg.generic reduction.
func.func @reduce(%arg : !in_tensor_t) -> (!out_tensor_t) {
  // -0.0 is the f32 additive identity (safe with respect to signed zeros).
  %identity = arith.constant -0.000000e+00 : f32

  %empty = tensor.empty() : !out_tensor_t
  // Seed the accumulator tensor with the identity element.
  %acc_init = linalg.fill ins(%identity : f32) outs(%empty : !out_tensor_t) -> !out_tensor_t
  // d0 iterates rows in parallel; d1 is the reduced column dimension.
  %result = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                       affine_map<(d0, d1) -> (d0)>],
      iterator_types = ["parallel", "reduction"]}
      ins(%arg : !in_tensor_t) outs(%acc_init : !out_tensor_t) {
    ^bb0(%in: f32, %acc: f32):
      %sum = arith.addf %in, %acc : f32
      linalg.yield %sum : f32
  } -> !out_tensor_t
  return %result : !out_tensor_t
}
|  |  | 
|  | // RUN: iree-compile %s --iree-hal-target-backends=cuda \ | 
|  | // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \ | 
|  | // RUN:     --iree-codegen-transform-dialect-library=%p/reduction_codegen_spec.mlir@codegen | \ | 
|  | // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ | 
|  | // RUN: FileCheck %s --check-prefix=EXEC | 
|  |  | 
/// Note: the transform-dialect JIT path enabled by
/// --iree-codegen-llvmgpu-enable-transform-dialect-jit currently supports
/// exactly this reduction shape and no others.
|  | // RUN: iree-compile %s --iree-hal-target-backends=cuda | \ | 
|  | // RUN: iree-run-module --module=- --function=reduce --device=cuda --input="8x64xf32=1" |\ | 
|  | // RUN: FileCheck %s --check-prefix=EXEC | 
|  |  | 
|  | //      EXEC: result[0]: hal.buffer_view | 
|  | // EXEC-NEXT: 8xf32=64 64 64 64 64 64 64 64 |