// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --iree-codegen-reorder-workgroups-strategy=transpose \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-OUT
// Check that applying the `ReorderWorkgroups*` pipeline options enables or disables workgroup
// reordering: the OPT-OUT invocation above passes --iree-codegen-reorder-workgroups-strategy=transpose
// on the command line, while the OPT-IN invocation below does not, so each test case has to opt out of
// or opt in to reordering through its translation_info attribute.
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-IN
// Check that applying the `no_reduce_shared_memory_bank_conflicts` pipeline option attribute disables shared memory padding.
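// For reference (see the later cases in this file): with bank-conflict reduction left enabled, the
// shared-memory tiles are padded to memref<128x36xf16, #gpu.address_space<workgroup>>; with the
// attribute applied here, both check prefixes expect the unpadded memref<128x32xf16,
// #gpu.address_space<workgroup>> allocations instead.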
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// OPT-IN-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32]}>
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
hal.executable public @main_0_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
// OPT-OUT: memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
// OPT-OUT: memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
// OPT-OUT-DAG: %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
// OPT-OUT-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
// OPT-OUT-DAG: arith.muli %[[WG_Y]], %{{.+}} : index
// OPT-OUT-DAG: arith.addi %{{.+}}, %[[WG_X]] : index
// OPT-OUT: scf.for
// OPT-IN-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
// OPT-IN: memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
// OPT-IN: memref.alloc() : memref<128x32xf16, #gpu.address_space<workgroup>>
// OPT-IN-DAG: %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
// OPT-IN-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
// OPT-IN: scf.for
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true> // Disable the 'reduceSharedMemoryBankConflicts' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
%5 = tensor.empty() : tensor<2048x10240xf32>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
lowering_config = #config}
ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>)
outs(%6 : tensor<2048x10240xf32>) {
^bb0(%in: f16, %in_0: f16, %out: f32):
%8 = arith.extf %in : f16 to f32
%9 = arith.extf %in_0 : f16 to f32
%10 = arith.mulf %8, %9 : f32
%11 = arith.addf %out, %10 : f32
linalg.yield %11 : f32
} -> tensor<2048x10240xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
return
}
}
}
}
// -----
// Check that applying the `reorder_workgroups_strategy = Transpose` pipeline option attribute enables workgroup reordering.
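// The reordering is observable below as extra index arithmetic on the workgroup IDs before the main
// loop: an arith.muli on the Y workgroup ID and an arith.addi involving the X workgroup ID (the other
// operands are left unconstrained by the checks).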
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
// OPT-IN: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// OPT-IN-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
// OPT-IN-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32]}>
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
hal.executable public @main_0_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
// OPT-OUT: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-OUT: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-OUT-DAG: %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
// OPT-OUT-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
// OPT-OUT-DAG: arith.muli %[[WG_Y]], %{{.+}} : index
// OPT-OUT-DAG: arith.addi %{{.+}}, %[[WG_X]] : index
// OPT-OUT: scf.for
// OPT-IN-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
// OPT-IN: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-IN: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-IN-DAG: %[[WG_Y:.+]] = hal.interface.workgroup.id[1] : index
// OPT-IN-DAG: %[[WG_X:.+]] = hal.interface.workgroup.id[0] : index
// OPT-IN-DAG: arith.muli %[[WG_Y]], %{{.+}} : index
// OPT-IN-DAG: arith.addi %{{.+}}, %[[WG_X]] : index
// OPT-IN: scf.for
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose> // Enable the 'reorderWorkgroups' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
%5 = tensor.empty() : tensor<2048x10240xf32>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
lowering_config = #config}
ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>)
outs(%6 : tensor<2048x10240xf32>) {
^bb0(%in: f16, %in_0: f16, %out: f32):
%8 = arith.extf %in : f16 to f32
%9 = arith.extf %in_0 : f16 to f32
%10 = arith.mulf %8, %9 : f32
%11 = arith.addf %out, %10 : f32
linalg.yield %11 : f32
} -> tensor<2048x10240xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
return
}
}
}
}
// -----
// Check that applying the `reorder_workgroups_strategy = None` pipeline option disables workgroup reordering.
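// The `None` strategy is expected to override the --iree-codegen-reorder-workgroups-strategy=transpose
// flag passed in the OPT-OUT invocation: the workgroup IDs feed straight into the loop with no
// muli/addi re-linearization. There are no OPT-IN checks here, presumably because that invocation does
// not reorder workgroups by default anyway.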
// OPT-OUT: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
// OPT-OUT-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>
// OPT-OUT-SAME: mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
#config = #iree_gpu.lowering_config<{workgroup = [128, 128, 0], reduction = [0, 0, 32]}>
#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
hal.executable public @main_0_dispatch_0 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
hal.executable.export public @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
// OPT-OUT-LABEL: func.func @main_0_dispatch_0_matmul_transpose_b
// OPT-OUT: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-OUT: memref.alloc() : memref<128x36xf16, #gpu.address_space<workgroup>>
// OPT-OUT-DAG: hal.interface.workgroup.id[1] : index
// OPT-OUT-DAG: hal.interface.workgroup.id[0] : index
// OPT-OUT-NEXT: scf.for
func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None> // Disable the 'reorderWorkgroups' pass.
}>} {
%cst = arith.constant 0.000000e+00 : f16
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1280xf16>> -> tensor<2048x1280xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [10240, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<10240x1280xf16>> -> tensor<10240x1280xf16>
%5 = tensor.empty() : tensor<2048x10240xf32>
%6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2048x10240xf32>) -> tensor<2048x10240xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"],
lowering_config = #config}
ins(%3, %4 : tensor<2048x1280xf16>, tensor<10240x1280xf16>)
outs(%6 : tensor<2048x10240xf32>) {
^bb0(%in: f16, %in_0: f16, %out: f32):
%8 = arith.extf %in : f16 to f32
%9 = arith.extf %in_0 : f16 to f32
%10 = arith.mulf %8, %9 : f32
%11 = arith.addf %out, %10 : f32
linalg.yield %11 : f32
} -> tensor<2048x10240xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 10240], strides = [1, 1] : tensor<2048x10240xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x10240xf32>>
return
}
}
}
}