// samples/custom_dispatch/cpu/embedded/example_transform.mlir - 3p/openxla/iree - Git at Google

 // RUN: iree-compile %s \
 // RUN:     --iree-hal-executable-object-search-path=$IREE_BINARY_DIR \
 // RUN:     --iree-preprocessing-transform-spec-filename=%p/example_transform_spec.mlir | \
 // RUN: iree-run-module \
 // RUN:     --device=local-sync \
 // RUN:     --module=- \
 // RUN:     --function=mixed_invocation \
 // RUN:     --input=5xf32=7 \
 // RUN:     --input=5xf32=4 \
 // RUN:     --input=10xf32=-4 \
 // RUN:     --input=10xf32=3 | \
 // RUN: FileCheck %s

 // Executable target configuration.
 // Tells both the compiler and the runtime which binary format is produced and
 // what it requires. Several variants (different formats, architectures, or
 // compilation options) can be embedded side by side and selected at runtime.
 #x86_64_target = #hal.executable.target<
   "llvm-cpu",
   "embedded-elf-x86_64",
   {
     data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
     native_vector_size = 32 : index,
     target_triple = "x86_64-none-elf"
   }
 >

 // Device target the program runs on.
 // Compiling for and running on multiple targets is possible in general, but
 // this example implicitly requires that the spliced-in custom kernel is
 // supported by the target device, so only llvm-cpu is listed.
 #cpu_target = #hal.device.target<"local", [#x86_64_target]> : !hal.device

 // Two-dimensional affine maps: the identity, and the projection onto the
 // first dimension. Neither alias is referenced in the visible part of this
 // file.
 #map = affine_map<(d0, d1) -> (d0, d1)>
 #map1 = affine_map<(d0, d1) -> (d0)>
 module @example attributes {hal.device.targets = [#cpu_target]} {

   // Applies the same elementwise pipeline -- multiply, absolute value, negate --
   // to two pairs of operands: a dynamically sized pair (%lhs, %rhs) and a
   // statically sized 10-element pair (%lhs_static, %rhs_static). The dynamic
   // result is returned first, the static one second.
   //
   // CHECK-LABEL: EXEC @mixed_invocation
   func.func @mixed_invocation(
       %lhs: tensor<?xf32>,
       %rhs: tensor<?xf32>,
       %lhs_static: tensor<10xf32>,
       %rhs_static: tensor<10xf32>
   ) -> (tensor<?xf32>, tensor<10xf32>) {
     // --- Dynamic pipeline ---------------------------------------------------
     // The destination tensor is sized from the runtime extent of %lhs and is
     // shared as the `outs` operand of all three stages.
     %zero = arith.constant 0 : index
     %len = tensor.dim %lhs, %zero : tensor<?xf32>
     %init = tensor.empty(%len) : tensor<?xf32>

     // Stage 1: elementwise product of the two inputs.
     %prod = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%lhs, %rhs : tensor<?xf32>, tensor<?xf32>)
       outs(%init : tensor<?xf32>) {
     ^bb0(%x: f32, %y: f32, %o: f32):
       %v = arith.mulf %x, %y : f32
       linalg.yield %v : f32
     } -> tensor<?xf32>

     // Stage 2: elementwise absolute value of the product.
     %mag = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%prod : tensor<?xf32>)
       outs(%init : tensor<?xf32>) {
     ^bb0(%x: f32, %o: f32):
       %v = math.absf %x : f32
       linalg.yield %v : f32
     } -> tensor<?xf32>

     // Stage 3: elementwise negation.
     %res = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%mag : tensor<?xf32>)
       outs(%init : tensor<?xf32>) {
     ^bb0(%x: f32, %o: f32):
       %v = arith.negf %x : f32
       linalg.yield %v : f32
     } -> tensor<?xf32>

     // --- Static pipeline ----------------------------------------------------
     // Same three stages, but on fixed 10-element tensors.
     %init_static = tensor.empty() : tensor<10xf32>

     // Stage 1: elementwise product of the two inputs.
     %prod_static = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%lhs_static, %rhs_static : tensor<10xf32>, tensor<10xf32>)
       outs(%init_static : tensor<10xf32>) {
     ^bb0(%x: f32, %y: f32, %o: f32):
       %v = arith.mulf %x, %y : f32
       linalg.yield %v : f32
     } -> tensor<10xf32>

     // Stage 2: elementwise absolute value of the product.
     %mag_static = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%prod_static : tensor<10xf32>)
       outs(%init_static : tensor<10xf32>) {
     ^bb0(%x: f32, %o: f32):
       %v = math.absf %x : f32
       linalg.yield %v : f32
     } -> tensor<10xf32>

     // Stage 3: elementwise negation.
     %res_static = linalg.generic {
       indexing_maps = [
         affine_map<(d0) -> (d0)>,
         affine_map<(d0) -> (d0)>
       ],
       iterator_types = ["parallel"]
     } ins(%mag_static : tensor<10xf32>)
       outs(%init_static : tensor<10xf32>) {
     ^bb0(%x: f32, %o: f32):
       %v = arith.negf %x : f32
       linalg.yield %v : f32
     } -> tensor<10xf32>

     // Expected output. Each value is one more than what the ops above would
     // give on their own (7 * 4 -> -28, -4 * 3 -> -12): the custom kernel adds
     // 1 so that the test can tell it was actually executed.
     // CHECK: 5xf32=-27 -27 -27 -27 -27
     // CHECK: 10xf32=-11 -11 -11 -11 -11 -11 -11 -11 -11 -11
     return %res, %res_static : tensor<?xf32>, tensor<10xf32>
   }
 }  // module
	// RUN: iree-compile %s \
	// RUN: --iree-hal-executable-object-search-path=$IREE_BINARY_DIR \
	// RUN: --iree-preprocessing-transform-spec-filename=%p/example_transform_spec.mlir | \
	// RUN: iree-run-module \
	// RUN: --device=local-sync \
	// RUN: --module=- \
	// RUN: --function=mixed_invocation \
	// RUN: --input=5xf32=7 \
	// RUN: --input=5xf32=4 \
	// RUN: --input=10xf32=-4 \
	// RUN: --input=10xf32=3 | \
	// RUN: FileCheck %s

	// Executable target configuration.
	// Tells both the compiler and the runtime which binary format is produced and
	// what it requires. Several variants (different formats, architectures, or
	// compilation options) can be embedded side by side and selected at runtime.
	#x86_64_target = #hal.executable.target<
	  "llvm-cpu",
	  "embedded-elf-x86_64",
	  {
	    data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
	    native_vector_size = 32 : index,
	    target_triple = "x86_64-none-elf"
	  }
	>

	// Device target the program runs on.
	// Compiling for and running on multiple targets is possible in general, but
	// this example implicitly requires that the spliced-in custom kernel is
	// supported by the target device, so only llvm-cpu is listed.
	#cpu_target = #hal.device.target<"local", [#x86_64_target]> : !hal.device

	// Two-dimensional affine maps: the identity, and the projection onto the
	// first dimension. Neither alias is referenced in the visible part of this
	// file.
	#map = affine_map<(d0, d1) -> (d0, d1)>
	#map1 = affine_map<(d0, d1) -> (d0)>
	module @example attributes {hal.device.targets = [#cpu_target]} {

	  // Applies the same elementwise pipeline -- multiply, absolute value, negate --
	  // to two pairs of operands: a dynamically sized pair (%lhs, %rhs) and a
	  // statically sized 10-element pair (%lhs_static, %rhs_static). The dynamic
	  // result is returned first, the static one second.
	  //
	  // CHECK-LABEL: EXEC @mixed_invocation
	  func.func @mixed_invocation(
	      %lhs: tensor<?xf32>,
	      %rhs: tensor<?xf32>,
	      %lhs_static: tensor<10xf32>,
	      %rhs_static: tensor<10xf32>
	  ) -> (tensor<?xf32>, tensor<10xf32>) {
	    // --- Dynamic pipeline ---------------------------------------------------
	    // The destination tensor is sized from the runtime extent of %lhs and is
	    // shared as the `outs` operand of all three stages.
	    %zero = arith.constant 0 : index
	    %len = tensor.dim %lhs, %zero : tensor<?xf32>
	    %init = tensor.empty(%len) : tensor<?xf32>

	    // Stage 1: elementwise product of the two inputs.
	    %prod = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%lhs, %rhs : tensor<?xf32>, tensor<?xf32>)
	      outs(%init : tensor<?xf32>) {
	    ^bb0(%x: f32, %y: f32, %o: f32):
	      %v = arith.mulf %x, %y : f32
	      linalg.yield %v : f32
	    } -> tensor<?xf32>

	    // Stage 2: elementwise absolute value of the product.
	    %mag = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%prod : tensor<?xf32>)
	      outs(%init : tensor<?xf32>) {
	    ^bb0(%x: f32, %o: f32):
	      %v = math.absf %x : f32
	      linalg.yield %v : f32
	    } -> tensor<?xf32>

	    // Stage 3: elementwise negation.
	    %res = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%mag : tensor<?xf32>)
	      outs(%init : tensor<?xf32>) {
	    ^bb0(%x: f32, %o: f32):
	      %v = arith.negf %x : f32
	      linalg.yield %v : f32
	    } -> tensor<?xf32>

	    // --- Static pipeline ----------------------------------------------------
	    // Same three stages, but on fixed 10-element tensors.
	    %init_static = tensor.empty() : tensor<10xf32>

	    // Stage 1: elementwise product of the two inputs.
	    %prod_static = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%lhs_static, %rhs_static : tensor<10xf32>, tensor<10xf32>)
	      outs(%init_static : tensor<10xf32>) {
	    ^bb0(%x: f32, %y: f32, %o: f32):
	      %v = arith.mulf %x, %y : f32
	      linalg.yield %v : f32
	    } -> tensor<10xf32>

	    // Stage 2: elementwise absolute value of the product.
	    %mag_static = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%prod_static : tensor<10xf32>)
	      outs(%init_static : tensor<10xf32>) {
	    ^bb0(%x: f32, %o: f32):
	      %v = math.absf %x : f32
	      linalg.yield %v : f32
	    } -> tensor<10xf32>

	    // Stage 3: elementwise negation.
	    %res_static = linalg.generic {
	      indexing_maps = [
	        affine_map<(d0) -> (d0)>,
	        affine_map<(d0) -> (d0)>
	      ],
	      iterator_types = ["parallel"]
	    } ins(%mag_static : tensor<10xf32>)
	      outs(%init_static : tensor<10xf32>) {
	    ^bb0(%x: f32, %o: f32):
	      %v = arith.negf %x : f32
	      linalg.yield %v : f32
	    } -> tensor<10xf32>

	    // Expected output. Each value is one more than what the ops above would
	    // give on their own (7 * 4 -> -28, -4 * 3 -> -12): the custom kernel adds
	    // 1 so that the test can tell it was actually executed.
	    // CHECK: 5xf32=-27 -27 -27 -27 -27
	    // CHECK: 10xf32=-11 -11 -11 -11 -11 -11 -11 -11 -11 -11
	    return %res, %res_static : tensor<?xf32>, tensor<10xf32>
	  }
	} // module