samples/custom_dispatch/cpu/mlp_plugin/mlp.mlir - 3p/openxla/iree - Git at Google

 // Lit driver. Compiles this file with the transform-dialect preprocessing spec
 // that sits next to it (%p/mlp_spec.mlir), pipes the compiled module into
 // iree-run-module with the MLP plugin loaded, and verifies the printed output
 // with FileCheck against the check directives placed inside the module below.
 // Without the final FileCheck stage those directives would never be evaluated.
 //
 // RUN: iree-compile --iree-preprocessing-transform-spec-filename=%p/mlp_spec.mlir  %s | \
 // RUN: iree-run-module --device=local-sync \
 // RUN:     --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/mlp_plugin/mlp_plugin$IREE_DYLIB_EXT \
 // RUN:     --module=- \
 // RUN:     --function=mlp_invocation \
 // RUN:     --input="2x2xf32=[[2.0, 2.0], [-2.0, -2.0]]" \
 // RUN:     --input="2x2xf32=[[3.0, -3.0], [3.0, -3.0]]" | \
 // RUN: FileCheck %s

 // The implementation of MLP is matched using a transform dialect script and
 // is forwarded to a system plugin.

 // Executable target description: the "llvm-cpu" backend producing the
 // "embedded-elf-x86_64" format for the x86_64-none-elf triple.
 // NOTE(review): native_vector_size is presumably expressed in bytes — confirm
 // against the llvm-cpu target documentation.
 #x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
   data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
   native_vector_size = 32 : index,
   target_triple = "x86_64-none-elf"
 }>

 // The target devices that the program will run on. We can compile and run with
 // multiple targets, but this example is maintaining an implicit requirement
 // that the custom kernel being spliced in is supported by the target device,
 // hence we only support llvm-cpu here.
 #cpu_target = #hal.device.target<"llvm-cpu", [
   #x86_64_target
 ]>

 // 2-D identity map; used as the indexing map for both the input and the output
 // operand of the elementwise linalg.generic ops in the module below.
 #map = affine_map<(d0, d1) -> (d0, d1)>
 module @example attributes {hal.device.targets = [#cpu_target]} {

   // CHECK-LABEL: EXEC @mlp_invocation
   //       CHECK: [Plugin]: M = 2, N = 2, K = 2
   //       CHECK: 2x2xf32=[-12 0][0 -12]

   // Computes -max(lhs x rhs, 0) on dynamically shaped f32 matrices:
   // a zero-initialized matmul, followed by an elementwise ReLU, followed by
   // an elementwise negation. The result has as many rows as %lhs and as many
   // columns as %rhs.
   func.func @mlp_invocation(%lhs: tensor<?x?xf32>,
                             %rhs: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
     // Constants: dimension indices and the f32 zero shared by the fill and
     // the ReLU clamp.
     %idx0 = arith.constant 0 : index
     %idx1 = arith.constant 1 : index
     %zero = arith.constant 0.0 : f32

     // Result shape: rows from dim 0 of %lhs, columns from dim 1 of %rhs.
     %rows = tensor.dim %lhs, %idx0 : tensor<?x?xf32>
     %cols = tensor.dim %rhs, %idx1 : tensor<?x?xf32>
     %init = tensor.empty(%rows, %cols) : tensor<?x?xf32>

     // Zero the accumulator, then accumulate the matrix product into it.
     %acc = linalg.fill
         ins(%zero : f32)
         outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
     %product = linalg.matmul
         ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
         outs(%acc : tensor<?x?xf32>) -> tensor<?x?xf32>

     // ReLU: elementwise maximum of the product and zero. The output block
     // argument is not read; %init only provides the result shape/type.
     %clamped = linalg.generic {
         indexing_maps = [#map, #map],
         iterator_types = ["parallel", "parallel"]}
         ins(%product : tensor<?x?xf32>)
         outs(%init : tensor<?x?xf32>) {
       ^bb0(%in : f32, %out : f32):
         %relu_elem = arith.maximumf %in, %zero : f32
         linalg.yield %relu_elem : f32
       } -> tensor<?x?xf32>

     // Elementwise negation of the ReLU result.
     %negated = linalg.generic {
         indexing_maps = [#map, #map],
         iterator_types = ["parallel", "parallel"]}
         ins(%clamped : tensor<?x?xf32>)
         outs(%init : tensor<?x?xf32>) {
       ^bb0(%in : f32, %out : f32):
         %neg_elem = arith.negf %in : f32
         linalg.yield %neg_elem : f32
       } -> tensor<?x?xf32>

     func.return %negated : tensor<?x?xf32>
   }
 }  // module
	// Lit driver. Compiles this file with the transform-dialect preprocessing spec
	// that sits next to it (%p/mlp_spec.mlir), pipes the compiled module into
	// iree-run-module with the MLP plugin loaded, and verifies the printed output
	// with FileCheck against the check directives placed inside the module below.
	// The pipe must be a real shell pipe (not an escaped one), and without the
	// final FileCheck stage the directives would never be evaluated.
	//
	// RUN: iree-compile --iree-preprocessing-transform-spec-filename=%p/mlp_spec.mlir %s | \
	// RUN: iree-run-module --device=local-sync \
	// RUN: --executable_plugin=$IREE_BINARY_DIR/samples/custom_dispatch/cpu/mlp_plugin/mlp_plugin$IREE_DYLIB_EXT \
	// RUN: --module=- \
	// RUN: --function=mlp_invocation \
	// RUN: --input="2x2xf32=[[2.0, 2.0], [-2.0, -2.0]]" \
	// RUN: --input="2x2xf32=[[3.0, -3.0], [3.0, -3.0]]" | \
	// RUN: FileCheck %s

	// The implementation of MLP is matched using a transform dialect script and
	// is forwarded to a system plugin.

	// Executable target description: the "llvm-cpu" backend producing the
	// "embedded-elf-x86_64" format for the x86_64-none-elf triple.
	// NOTE(review): native_vector_size is presumably expressed in bytes — confirm
	// against the llvm-cpu target documentation.
	#x86_64_target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {
	  data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
	  native_vector_size = 32 : index,
	  target_triple = "x86_64-none-elf"
	}>

	// The target devices that the program will run on. We can compile and run with
	// multiple targets, but this example is maintaining an implicit requirement
	// that the custom kernel being spliced in is supported by the target device,
	// hence we only support llvm-cpu here.
	#cpu_target = #hal.device.target<"llvm-cpu", [
	  #x86_64_target
	]>

	// 2-D identity map; used as the indexing map for both the input and the output
	// operand of the elementwise linalg.generic ops in the module below.
	#map = affine_map<(d0, d1) -> (d0, d1)>
	module @example attributes {hal.device.targets = [#cpu_target]} {

	  // CHECK-LABEL: EXEC @mlp_invocation
	  // CHECK: [Plugin]: M = 2, N = 2, K = 2
	  // CHECK: 2x2xf32=[-12 0][0 -12]

	  // Computes -max(lhs x rhs, 0) on dynamically shaped f32 matrices:
	  // a zero-initialized matmul, followed by an elementwise ReLU, followed by
	  // an elementwise negation. The result has as many rows as %lhs and as many
	  // columns as %rhs.
	  func.func @mlp_invocation(%lhs: tensor<?x?xf32>,
	                            %rhs: tensor<?x?xf32>) -> (tensor<?x?xf32>) {
	    // Constants: dimension indices and the f32 zero shared by the fill and
	    // the ReLU clamp.
	    %idx0 = arith.constant 0 : index
	    %idx1 = arith.constant 1 : index
	    %zero = arith.constant 0.0 : f32

	    // Result shape: rows from dim 0 of %lhs, columns from dim 1 of %rhs.
	    %rows = tensor.dim %lhs, %idx0 : tensor<?x?xf32>
	    %cols = tensor.dim %rhs, %idx1 : tensor<?x?xf32>
	    %init = tensor.empty(%rows, %cols) : tensor<?x?xf32>

	    // Zero the accumulator, then accumulate the matrix product into it.
	    %acc = linalg.fill
	        ins(%zero : f32)
	        outs(%init : tensor<?x?xf32>) -> tensor<?x?xf32>
	    %product = linalg.matmul
	        ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
	        outs(%acc : tensor<?x?xf32>) -> tensor<?x?xf32>

	    // ReLU: elementwise maximum of the product and zero. The output block
	    // argument is not read; %init only provides the result shape/type.
	    %clamped = linalg.generic {
	        indexing_maps = [#map, #map],
	        iterator_types = ["parallel", "parallel"]}
	        ins(%product : tensor<?x?xf32>)
	        outs(%init : tensor<?x?xf32>) {
	      ^bb0(%in : f32, %out : f32):
	        %relu_elem = arith.maximumf %in, %zero : f32
	        linalg.yield %relu_elem : f32
	      } -> tensor<?x?xf32>

	    // Elementwise negation of the ReLU result.
	    %negated = linalg.generic {
	        indexing_maps = [#map, #map],
	        iterator_types = ["parallel", "parallel"]}
	        ins(%clamped : tensor<?x?xf32>)
	        outs(%init : tensor<?x?xf32>) {
	      ^bb0(%in : f32, %out : f32):
	        %neg_elem = arith.negf %in : f32
	        linalg.yield %neg_elem : f32
	      } -> tensor<?x?xf32>

	    func.return %negated : tensor<?x?xf32>
	  }
	} // module