// RUN: iree-compile --iree-preprocessing-pdl-spec-filename=%p/mlp_linalg_spec.pdl.mlir %s --compile-to=preprocessing | FileCheck %s

// RUN: iree-compile --iree-preprocessing-pdl-spec-filename=%p/mlp_linalg_spec.pdl.mlir %s | \
// RUN: iree-run-module --device=local-sync \
// RUN:     --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/mlp_plugin/mlp_plugin$IREE_DYLIB_EXT \
// RUN:     --module=- \
// RUN:     --function=mlp_invocation \
// RUN:     --input="2x4xf32=[[2.0, 2.0, 2.0, 2.0], [-2.0, -2.0, -2.0, -2.0]]" \
// RUN:     --input="4x8xf32=[[3.0, -3.0, 3.0, -3.0, 3.0, -3.0, 3.0, -3.0], [3.0, -3.0, 3.0, -3.0, 3.0, -3.0, 3.0, -3.0], [3.0, -3.0, 3.0, -3.0, 3.0, -3.0, 3.0, -3.0], [3.0, -3.0, 3.0, -3.0, 3.0, -3.0, 3.0, -3.0]]" | \
// RUN: FileCheck %s --check-prefix=OUTPUT
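
// The first RUN command stops after IREE's preprocessing phase
// (--compile-to=preprocessing) and FileChecks the resulting IR against the
// CHECK lines below, verifying that the PDL spec in mlp_linalg_spec.pdl.mlir
// rewrote the matmul into a dispatch to an external function. The second RUN
// command compiles end-to-end, pipes the compiled module into iree-run-module
// (--module=- reads it from stdin), loads the mlp_plugin executable plugin,
// and checks the program's stdout against the OUTPUT lines at the bottom of
// this file.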

// CHECK-LABEL: stream.executable private @mlp_external_f32_f32_f32_i32_i32_i32_i1_executable
// CHECK: stream.executable.export public @mlp_external_entry_point
// CHECK: builtin.module
// CHECK: func.func private @mlp_external
// CHECK-SAME: (memref<f32>, index, memref<f32>, index, memref<f32>, index, i32, i32, i32, i1)
// CHECK-SAME: attributes {llvm.bareptr = [true]}
// CHECK: func.func @mlp_external_entry_point
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: !stream.binding
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: !stream.binding
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: !stream.binding
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9]+]]: i32
// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: i32
// CHECK-SAME: %[[ARG5:[a-zA-Z0-9]+]]: i32
// CHECK-SAME: %[[ARG6:[a-zA-Z0-9]+]]: i1
// CHECK-SAME: %[[ARG7:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG8:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG9:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG10:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG11:[a-zA-Z0-9]+]]: index
// CHECK-SAME: %[[ARG12:[a-zA-Z0-9]+]]: index
// CHECK: %[[C0:.+]] = arith.constant 0 : index
// CHECK: %[[STREAM0:.+]] = stream.binding.subspan %[[ARG0]][%[[C0]]] : !stream.binding -> memref<?x?xf32, strided<[?, 1], offset: ?>>{%[[ARG7]], %[[ARG8]]}
// CHECK-NEXT: %[[STREAM0_BASE:[a-zA-Z0-9_]+]], %[[OFFSET0:[a-zA-Z0-9_]+]],
// CHECK-SAME: = iree_codegen.extract_strided_metadata %[[STREAM0]]
// CHECK: %[[STREAM1:.+]] = stream.binding.subspan %[[ARG1]][%[[C0]]] : !stream.binding -> memref<?x?xf32, strided<[?, 1], offset: ?>>{%[[ARG9]], %[[ARG10]]}
// CHECK-NEXT: %[[STREAM1_BASE:[a-zA-Z0-9_]+]], %[[OFFSET1:[a-zA-Z0-9_]+]],
// CHECK-SAME: = iree_codegen.extract_strided_metadata %[[STREAM1]]
// CHECK: %[[STREAM2:.+]] = stream.binding.subspan %[[ARG2]][%[[C0]]] : !stream.binding -> memref<?x?xf32, strided<[?, 1], offset: ?>>{%[[ARG11]], %[[ARG12]]}
// CHECK-NEXT: %[[STREAM2_BASE:[a-zA-Z0-9_]+]], %[[OFFSET2:[a-zA-Z0-9_]+]],
// CHECK-SAME: = iree_codegen.extract_strided_metadata %[[STREAM2]]
// CHECK: call @mlp_external
// CHECK-SAME: %[[STREAM0_BASE]], %[[OFFSET0]], %[[STREAM1_BASE]], %[[OFFSET1]], %[[STREAM2_BASE]], %[[OFFSET2]], %[[ARG3]], %[[ARG4]], %[[ARG5]], %[[ARG6]]

// CHECK: util.func public @mlp_invocation
// CHECK-SAME: (%[[ARG0:[a-zA-Z0-9]+]]: !hal.buffer_view, %[[ARG1:[a-zA-Z0-9]+]]: !hal.buffer_view)
// CHECK-DAG: %[[DORELU:.+]] = arith.constant false
// CHECK-DAG: %[[MDIM0:.+]] = hal.buffer_view.dim<%[[ARG0]] : !hal.buffer_view>[0] : index
// CHECK-DAG: %[[MDIM1:.+]] = hal.buffer_view.dim<%[[ARG0]] : !hal.buffer_view>[1] : index
// CHECK-DAG: %[[LHS:.+]] = hal.tensor.import %[[ARG0]] "input0" : !hal.buffer_view -> tensor<?x?xf32>{%[[MDIM0]], %[[MDIM1]]}
// CHECK-DAG: %[[NDIM0:.+]] = hal.buffer_view.dim<%[[ARG1]] : !hal.buffer_view>[0] : index
// CHECK-DAG: %[[NDIM1:.+]] = hal.buffer_view.dim<%[[ARG1]] : !hal.buffer_view>[1] : index
// CHECK-DAG: %[[RHS:.+]] = hal.tensor.import %[[ARG1]] "input1" : !hal.buffer_view -> tensor<?x?xf32>{%[[NDIM0]], %[[NDIM1]]}
// CHECK-DAG: %[[M_I32:.+]] = arith.index_cast %[[MDIM0]] : index to i32
// CHECK-DAG: %[[N_I32:.+]] = arith.index_cast %[[NDIM1]] : index to i32
// CHECK-DAG: %[[K_I32:.+]] = arith.index_cast %[[MDIM1]] : index to i32
// CHECK: %[[RESULT:.+]] = flow.dispatch
// CHECK-SAME: @mlp_external_f32_f32_f32_i32_i32_i32_i1_executable::@mlp_external_entry_point
// CHECK-SAME: (%[[LHS]], %[[RHS]], %[[M_I32]], %[[N_I32]], %[[K_I32]], %[[DORELU]], %[[MDIM0]], %[[MDIM1]], %[[NDIM0]], %[[NDIM1]], %[[MDIM0]], %[[NDIM1]])
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESULT]] :
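
// Together the CHECK lines verify that preprocessing (1) created a
// stream.executable whose entry point extracts base pointers and offsets
// from its bindings and calls the external @mlp_external function with the
// M/N/K/doRelu scalars, and (2) rewrote @mlp_invocation to dispatch to that
// executable with doRelu = false, leaving the ReLU (the linalg.generic that
// consumes the dispatch result) and the final negation as ordinary linalg
// ops.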

#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
  native_vector_size = 32 : index,
  target_triple = "x86_64-none-elf"
}>

// The target devices that the program will run on. We can compile and run
// with multiple targets, but this example maintains an implicit requirement
// that the custom kernel being spliced in is supported by the target device,
// so only llvm-cpu is supported here.
#cpu_target = #hal.device.target<"local", [
  #x86_64_target
]> : !hal.device

#map = affine_map<(d0, d1) -> (d0, d1)>
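
// The payload: an MLP-style computation of matmul + ReLU, followed by a
// negation so the ReLU's effect is visible in the output. The PDL spec
// passed via --iree-preprocessing-pdl-spec-filename is expected to match the
// linalg.matmul below and redirect it to @mlp_external in the plugin. As a
// rough, hypothetical sketch (the real pattern lives in
// mlp_linalg_spec.pdl.mlir; the operand handling and the elided rewrite body
// here are assumptions, not its actual contents), such a pattern has the
// shape:
//
//   pdl.pattern @mlp : benefit(1) {
//     %lhs = pdl.operand
//     %rhs = pdl.operand
//     %init = pdl.operand
//     %result_type = pdl.type
//     %matmul = pdl.operation "linalg.matmul"
//         (%lhs, %rhs, %init : !pdl.value, !pdl.value, !pdl.value)
//         -> (%result_type : !pdl.type)
//     pdl.rewrite %matmul {
//       // ... replace the matched op with a dispatch to the external
//       // function, e.g. through a native rewrite registered by the
//       // preprocessing pass ...
//     }
//   }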
module @example attributes {hal.device.targets = [#cpu_target]} {
  func.func @mlp_invocation(%lhs: tensor<?x?xf32>,
                            %rhs: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 0.0 : f32
    %dim0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
    %dim1 = tensor.dim %rhs, %c1 : tensor<?x?xf32>
    %empty = tensor.empty(%dim0, %dim1) : tensor<?x?xf32>
    %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<?x?xf32>) -> tensor<?x?xf32>
    %matmul = linalg.matmul ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
        outs(%fill : tensor<?x?xf32>) -> tensor<?x?xf32>
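    // ReLU: elementwise max(x, 0.0) over the matmul result. With doRelu set
    // to false in the rewritten IR, this op is expected to stay outside the
    // plugin call.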
    %relu = linalg.generic {
        indexing_maps = [#map, #map],
        iterator_types = ["parallel", "parallel"]}
        ins(%matmul : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) {
      ^bb0(%b0 : f32, %b1 : f32):
        %0 = arith.maximumf %b0, %cst : f32
        linalg.yield %0 : f32
    } -> tensor<?x?xf32>
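    // Negate the ReLU output; this op is not matched by the PDL spec and
    // compiles through the normal IREE flow.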
    %neg = linalg.generic {
        indexing_maps = [#map, #map],
        iterator_types = ["parallel", "parallel"]}
        ins(%relu : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) {
      ^bb0(%b0 : f32, %b1 : f32):
        %0 = arith.negf %b0 : f32
        linalg.yield %0 : f32
    } -> tensor<?x?xf32>
    return %neg : tensor<?x?xf32>
  }
} // module

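// Expected output, worked by hand: each column of the 4x8 RHS is all 3.0 or
// all -3.0, so its dot product with an LHS row of all 2.0 (or all -2.0) is
// +/-24. The plugin performs the matmul with doRelu = 0, the linalg ReLU
// clamps the negative entries to 0, and the final negation flips the signs,
// giving alternating -24 / -0 in the first row and -0 / -24 in the second.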
// OUTPUT-LABEL: EXEC @mlp_invocation
// OUTPUT: [Plugin]: M = 2, N = 8, K = 4, doRelu = 0
// OUTPUT: 2x8xf32=[-24 -0 -24 -0 -24 -0 -24 -0][-0 -24 -0 -24 -0 -24 -0 -24]