tests/transform_dialect/cuda/softmax_partial.mlir - 3p/openxla/iree - Git at Google


 // RUN: iree-opt %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-abi-transformation-pipeline \
 // RUN:     --iree-flow-transformation-pipeline  \
 // RUN:     --iree-stream-transformation-pipeline \
 // RUN:     --iree-hal-configuration-pipeline | \
 // RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \
 // RUN:     --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_partial_codegen_spec.mlir \
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false | \
 // RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE

 // RUN: iree-compile %s --iree-hal-target-backends=cuda \
 // RUN:     --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
 /// Constant JIT'ing must be disabled because the transform-dialect debug
 /// flags leak to the JIT session, which doesn't know what to do with them.
 // RUN:     --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
 // RUN:     --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_partial_codegen_spec.mlir | \
 // RUN: iree-run-module --module=- --function=softmax_partial --device=cuda | \
 // RUN: FileCheck %s

 !tmp_tensor_t = tensor<16x128xf32>
 !out_tensor_t = tensor<16x128x128xf32>

 // Compilation checks that shuffles are produced.
 // CHECK-SHUFFLE: gpu.shuffle  xor

 // Execution only checks that @softmax_partial runs.
 //      CHECK: EXEC @softmax_partial
 //      CHECK: 16x128x128xf32=[
 // CHECK-SAME:                [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

 func.func @softmax_partial() -> !out_tensor_t {
   %cst = arith.constant -3.40282347E+38 : f32
   %cst_0 = arith.constant dense<1121212.000000e+00> : !out_tensor_t
   %cst_1 = arith.constant dense<5.000000e+00> : !out_tensor_t
   %0 = util.optimization_barrier %cst_1 : !out_tensor_t

   %1 = tensor.empty() : !tmp_tensor_t
   %2 = linalg.fill ins(%cst : f32) outs(%1 : !tmp_tensor_t) -> !tmp_tensor_t
   %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                                         affine_map<(d0, d1, d2) -> (d0, d1)>],
                        iterator_types = ["parallel", "parallel", "reduction"]}
   ins(%0 : !out_tensor_t) outs(%2 : !tmp_tensor_t) {
   ^bb0(%arg0: f32, %arg1: f32):
     %8 = arith.maxf %arg0, %arg1 : f32
     linalg.yield %8 : f32
   } -> !tmp_tensor_t

   // This has been fused manually to avoid the fusion on tensors pass and reduce noise atm.
   %4 = tensor.empty() : !out_tensor_t
   %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
                                         affine_map<(d0, d1, d2) -> (d0, d1)>,
                                         affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
                        iterator_types = ["parallel", "parallel", "parallel"]}
   ins(%0, %3 : !out_tensor_t, !tmp_tensor_t) outs(%4 : !out_tensor_t) {
   ^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
     %6 = arith.subf %arg0, %arg1 : f32
     %7 = math.exp %6 : f32
     linalg.yield %7 : f32
   } -> !out_tensor_t

   return %5: !out_tensor_t
 }

	// RUN: iree-opt %s --iree-hal-target-backends=cuda \
	// RUN: --iree-abi-transformation-pipeline \
	// RUN: --iree-flow-transformation-pipeline \
	// RUN: --iree-stream-transformation-pipeline \
	// RUN: --iree-hal-configuration-pipeline \| \
	// RUN: iree-opt --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-lower-executable-target)))' \
	// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_partial_codegen_spec.mlir \
	// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \| \
	// RUN: FileCheck %s --check-prefix=CHECK-SHUFFLE

	// RUN: iree-compile %s --iree-hal-target-backends=cuda \
	// RUN: --iree-opt-const-expr-hoisting=false --iree-opt-const-eval=false \
	/// Constant JIT'ing must be disabled because the transform-dialect debug
	/// flags leak to the JIT session, which doesn't know what to do with them.
	// RUN: --iree-codegen-llvmgpu-enable-transform-dialect-jit=false \
	// RUN: --iree-codegen-llvmgpu-use-transform-dialect=%p/softmax_partial_codegen_spec.mlir \| \
	// RUN: iree-run-module --module=- --function=softmax_partial --device=cuda \| \
	// RUN: FileCheck %s

	!tmp_tensor_t = tensor<16x128xf32>
	!out_tensor_t = tensor<16x128x128xf32>

	// Compilation checks that shuffles are produced.
	// CHECK-SHUFFLE: gpu.shuffle xor

	// Execution only checks that @softmax_partial runs.
	// CHECK: EXEC @softmax_partial
	// CHECK: 16x128x128xf32=[
	// CHECK-SAME: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

	func.func @softmax_partial() -> !out_tensor_t {
	%cst = arith.constant -3.40282347E+38 : f32
	%cst_0 = arith.constant dense<1121212.000000e+00> : !out_tensor_t
	%cst_1 = arith.constant dense<5.000000e+00> : !out_tensor_t
	%0 = util.optimization_barrier %cst_1 : !out_tensor_t

	%1 = tensor.empty() : !tmp_tensor_t
	%2 = linalg.fill ins(%cst : f32) outs(%1 : !tmp_tensor_t) -> !tmp_tensor_t
	%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
	affine_map<(d0, d1, d2) -> (d0, d1)>],
	iterator_types = ["parallel", "parallel", "reduction"]}
	ins(%0 : !out_tensor_t) outs(%2 : !tmp_tensor_t) {
	^bb0(%arg0: f32, %arg1: f32):
	%8 = arith.maxf %arg0, %arg1 : f32
	linalg.yield %8 : f32
	} -> !tmp_tensor_t

	// This has been fused manually to avoid the fusion on tensors pass and reduce noise atm.
	%4 = tensor.empty() : !out_tensor_t
	%5 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
	affine_map<(d0, d1, d2) -> (d0, d1)>,
	affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
	iterator_types = ["parallel", "parallel", "parallel"]}
	ins(%0, %3 : !out_tensor_t, !tmp_tensor_t) outs(%4 : !out_tensor_t) {
	^bb0(%arg0: f32, %arg1: f32, %arg2: f32):
	%6 = arith.subf %arg0, %arg1 : f32
	%7 = math.exp %6 : f32
	linalg.yield %7 : f32
	} -> !out_tensor_t

	return %5: !out_tensor_t
	}