blob: 2e7dc2c69cd20a5b3733a7cbe8e7dfae4109d54c [file] [log] [blame]
// RUN: iree-compile --compile-mode=hal-executable \
// RUN: --mlir-print-ir-after=iree-hal-serialize-executables \
// RUN: --iree-hal-target-backends=vmvx %s \
// RUN: --o=/dev/null 2>&1 | FileCheck %s
// Each entry point has a layout specification indicating the total number of
// push constants available and the descriptor sets and their bindings.
// Push constants are dense (0..N) while the sets/bindings are sparse and may
// contain unused or omitted entries.
#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
// A single executable source definition is allowed per translation in this mode
// as linking and multi-executable embedding support requires our host-side IR.
hal.executable.source public @executable {
// Exported functions are declared with the layout they use and may optionally
// contain other information - though when hand-authoring that's usually
// omitted.
hal.executable.export public @mul layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%c1 = arith.constant 1 : index
hal.return %c1, %c1, %c1 : index, index, index
}
// The inner module defining the executable. This may have any number of
// private functions and only those with declared entry points will be
// exported.
builtin.module {
func.func @mul() {
// Push constants are loaded by ordinal.
%offset = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
%length = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : index
// Bindings are dereferenced by their set/binding ordinal and may have a
// byte offset from the base of the descriptor. Alignment information when
// available can help code generation emit better loads/stores.
%s0b0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(32) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
%s0b1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(32) offset(%offset) : !flow.dispatch.tensor<readonly:tensor<4xf32>>
%s0b2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(32) : !flow.dispatch.tensor<writeonly:tensor<4xf32>>
// Workgroup information can be queried from the interface.
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_size_x = hal.interface.workgroup.size[0] : index
// Actual program:
%base_index = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%per_step = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg0 = %base_index to %length step %per_step {
%5 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 4)>(%arg0)[%workgroup_size_x]
%6 = flow.dispatch.tensor.load %s0b0, offsets = [%arg0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
%7 = flow.dispatch.tensor.load %s0b1, offsets = [%arg0], sizes = [%5], strides = [1] : !flow.dispatch.tensor<readonly:tensor<4xf32>> -> tensor<?xf32>
%8 = tensor.empty(%5) : tensor<?xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%6, %7 : tensor<?xf32>, tensor<?xf32>) outs(%8 : tensor<?xf32>) attrs = {name = "mul.1"} {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
%s0b10 = arith.mulf %arg1, %arg2 : f32
linalg.yield %s0b10 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %9, %s0b2, offsets = [%arg0], sizes = [%5], strides = [1] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:tensor<4xf32>>
}
return
}
}
}
// Just check that there's the expected flatbuffers prefix bytes.
// CHECK: hal.executable.binary public @vmvx_bytecode_fb attributes {data = dense<{{.+}}> : vector<{{.+}}xi8>, format = "vmvx-bytecode-fb", mime_type = "application/x-flatbuffers"}