blob: 9f3ee9cc8132eeb16073112fd2711a94c1165244 [file]
// RUN: iree-opt -split-input-file -pass-pipeline="hal.executable(hal.executable.target(iree-codegen-spirv-linalg-tile-and-distribute,iree-codegen-linalg-tile-and-fuse,canonicalize,cse))" -iree-spirv-enable-vectorization %s | IreeFileCheck %s
// RUN: iree-opt -split-input-file -pass-pipeline="hal.executable(hal.executable.target(iree-codegen-spirv-linalg-tile-and-distribute,iree-codegen-linalg-tile-and-fuse,canonicalize,cse))" -iree-spirv-enable-vectorization -iree-spirv-use-workgroup-memory %s | IreeFileCheck %s -check-prefix=PROMOTE
hal.executable @matmul_static_shape attributes {sym_visibility = "private"} {
hal.interface @legacy_io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
hal.executable.target @vulkan, filter="dylib*" {
hal.executable.entry_point @matmul_static_shape attributes {
interface = @legacy_io, ordinal = 0 : i32,
signature = (!flow.dispatch.tensor<readonly:?x?xf32>, !flow.dispatch.tensor<readonly:?x?xf32>,
!flow.dispatch.tensor<writeonly:?x?xf32>) -> ()}
module attributes {
spv.target_env =
#spv.target_env<#spv.vce<v1.5,
[Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess,
StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess,
UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform,
GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot,
GroupNonUniformShuffle, GroupNonUniformShuffleRelative, VariablePointers,
VariablePointersStorageBuffer, CooperativeMatrixNV],
[SPV_KHR_16bit_storage, SPV_KHR_8bit_storage,
SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers,
SPV_NV_cooperative_matrix]>, NVIDIA:DiscreteGPU,
{cooperative_matrix_properties_nv = [
{a_type = i8, b_type = i8, c_type = i32, k_size = 32 : i32,
m_size = 8 : i32, n_size = 8 : i32, result_type = i32, scope = 3 : i32},
{a_type = f16, b_type = f16, c_type = f16, k_size = 16 : i32,
m_size = 16 : i32, n_size = 16 : i32, result_type = f16,
scope = 3 : i32},
{a_type = f16, b_type = f16, c_type = f32, k_size = 16 : i32,
m_size = 16 : i32, n_size = 16 : i32, result_type = f32,
scope = 3 : i32}],
max_compute_shared_memory_size = 49152 : i32,
max_compute_workgroup_invocations = 1024 : i32,
max_compute_workgroup_size = dense<[2147483647, 65535, 65535]> : vector<3xi32>,
subgroup_size = 32 : i32}>} {
func @matmul_static_shape()
attributes {vkspv.num_workgroups_fn = @matmul_static_shape__num_workgroups__} {
%arg0 = iree.placeholder for "interface buffer"
{binding = @legacy_io::@arg0, operand_result_num = 0 : i32} : memref<4096x4096xf16>
%arg1 = iree.placeholder for "interface buffer"
{binding = @legacy_io::@arg1, operand_result_num = 1 : i32} : memref<4096x4096xf16>
%ret0 = iree.placeholder for "interface buffer"
{binding = @legacy_io::@ret0, operand_result_num = 2 : i32} : memref<4096x4096xf16>
linalg.matmul ins(%arg0, %arg1 : memref<4096x4096xf16>, memref<4096x4096xf16>)
outs(%ret0 : memref<4096x4096xf16>)
return
}
func private @matmul_static_shape__num_workgroups__
(!shapex.ranked_shape<[4096, 4096]>, !shapex.ranked_shape<[4096, 4096]>,
!shapex.ranked_shape<[4096, 4096]>) -> (index, index, index)
hal.interface @legacy_io attributes {sym_visibility = "private"} {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
}
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<()[s0] -> (s0 * 64)>
// CHECK: func @matmul_static_shape
// CHECK-DAG: %[[ARG0:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@arg1
// CHECK-DAG: %[[RET0:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@ret0
// CHECK-DAG: %[[C0:.+]] = constant 0 : index
// CHECK-DAG: %[[CST:.+]] = constant 0.0
// CHECK-DAG: %[[C16:.+]] = constant 16 : index
// CHECK-DAG: %[[C32:.+]] = constant 32 : index
// CHECK-DAG: %[[C48:.+]] = constant 48 : index
// CHECK: %[[BIDX:.+]] = "gpu.block_id"() {dimension = "x"}
// CHECK: %[[BIDY:.+]] = "gpu.block_id"() {dimension = "y"}
// CHECK: %[[BOFFSET_Y:.+]] = affine.apply #[[MAP0]]()[%[[BIDY]]]
// CHECK: %[[BOFFSET_X:.+]] = affine.apply #[[MAP0]]()[%[[BIDX]]]
// CHECK: %[[SUBVIEW_RESULT:.+]] = subview %[[RET0]]
// CHECK-SAME: [%[[BOFFSET_Y]], %[[BOFFSET_X]]] [64, 64]
// CHECK-DAG: %[[READ_INIT_0_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C0]], %[[C0]]]
// CHECK-DAG: %[[READ_INIT_0_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C0]], %[[C16]]]
// CHECK-DAG: %[[READ_INIT_0_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C0]], %[[C32]]]
// CHECK-DAG: %[[READ_INIT_0_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C0]], %[[C48]]]
// CHECK-DAG: %[[READ_INIT_1_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C16]], %[[C0]]]
// CHECK-DAG: %[[READ_INIT_1_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C16]], %[[C16]]]
// CHECK-DAG: %[[READ_INIT_1_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C16]], %[[C32]]]
// CHECK-DAG: %[[READ_INIT_1_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C16]], %[[C48]]]
// CHECK-DAG: %[[READ_INIT_2_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C32]], %[[C0]]]
// CHECK-DAG: %[[READ_INIT_2_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C32]], %[[C16]]]
// CHECK-DAG: %[[READ_INIT_2_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C32]], %[[C32]]]
// CHECK-DAG: %[[READ_INIT_2_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C32]], %[[C48]]]
// CHECK-DAG: %[[READ_INIT_3_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C48]], %[[C0]]]
// CHECK-DAG: %[[READ_INIT_3_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C48]], %[[C16]]]
// CHECK-DAG: %[[READ_INIT_3_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C48]], %[[C32]]]
// CHECK-DAG: %[[READ_INIT_3_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RESULT]][%[[C48]], %[[C48]]]
// CHECK: %[[FOR_RES:.+]]:16 = scf.for %[[IV0:.+]] = {{.*}} to
// CHECK-SAME: iter_args(%[[ACC_0_0:.+]] = %[[READ_INIT_0_0]],
// CHECK-SAME: %[[ACC_0_1:.+]] = %[[READ_INIT_0_1]],
// CHECK-SAME: %[[ACC_0_2:.+]] = %[[READ_INIT_0_2]],
// CHECK-SAME: %[[ACC_0_3:.+]] = %[[READ_INIT_0_3]],
// CHECK-SAME: %[[ACC_1_0:.+]] = %[[READ_INIT_1_0]],
// CHECK-SAME: %[[ACC_1_1:.+]] = %[[READ_INIT_1_1]],
// CHECK-SAME: %[[ACC_1_2:.+]] = %[[READ_INIT_1_2]],
// CHECK-SAME: %[[ACC_1_3:.+]] = %[[READ_INIT_1_3]],
// CHECK-SAME: %[[ACC_2_0:.+]] = %[[READ_INIT_2_0]],
// CHECK-SAME: %[[ACC_2_1:.+]] = %[[READ_INIT_2_1]],
// CHECK-SAME: %[[ACC_2_2:.+]] = %[[READ_INIT_2_2]],
// CHECK-SAME: %[[ACC_2_3:.+]] = %[[READ_INIT_2_3]],
// CHECK-SAME: %[[ACC_3_0:.+]] = %[[READ_INIT_3_0]],
// CHECK-SAME: %[[ACC_3_1:.+]] = %[[READ_INIT_3_1]],
// CHECK-SAME: %[[ACC_3_2:.+]] = %[[READ_INIT_3_2]],
// CHECK-SAME: %[[ACC_3_3:.+]] = %[[READ_INIT_3_3]])
// CHECK: %[[SUBVIEW_LHS:.+]] = subview %[[ARG0]]
// CHECK-SAME: [%[[BOFFSET_Y]], %[[IV0]]] [64, 32]
// CHECK: %[[SUBVIEW_RHS:.+]] = subview %[[ARG1]]
// CHECK-SAME: [%[[IV0]], %[[BOFFSET_X]]] [32, 64]
// CHECK-DAG: %[[READ_LHS_0_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C0]], %[[C0]]]
// CHECK-DAG: %[[READ_LHS_0_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C0]], %[[C16]]]
// CHECK-DAG: %[[READ_LHS_1_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C16]], %[[C0]]]
// CHECK-DAG: %[[READ_LHS_1_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C16]], %[[C16]]]
// CHECK-DAG: %[[READ_LHS_2_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C32]], %[[C0]]]
// CHECK-DAG: %[[READ_LHS_2_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C32]], %[[C16]]]
// CHECK-DAG: %[[READ_LHS_3_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C48]], %[[C0]]]
// CHECK-DAG: %[[READ_LHS_3_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_LHS]][%[[C48]], %[[C16]]]
// CHECK-DAG: %[[READ_RHS_0_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C0]], %[[C0]]]
// CHECK-DAG: %[[READ_RHS_0_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C0]], %[[C16]]]
// CHECK-DAG: %[[READ_RHS_0_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C0]], %[[C32]]]
// CHECK-DAG: %[[READ_RHS_0_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C0]], %[[C48]]]
// CHECK-DAG: %[[READ_RHS_1_0:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C16]], %[[C0]]]
// CHECK-DAG: %[[READ_RHS_1_1:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C16]], %[[C16]]]
// CHECK-DAG: %[[READ_RHS_1_2:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C16]], %[[C32]]]
// CHECK-DAG: %[[READ_RHS_1_3:.+]] = vector.transfer_read
// CHECK-SAME: %[[SUBVIEW_RHS]][%[[C16]], %[[C48]]]
// CHECK: %[[CONTRACT_0_0_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_0]], %[[ACC_0_0]]
// CHECK: %[[CONTRACT_0_0:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_0]], %[[CONTRACT_0_0_1]]
// CHECK: %[[CONTRACT_0_1_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_1]], %[[ACC_0_1]]
// CHECK: %[[CONTRACT_0_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_1]], %[[CONTRACT_0_1_1]]
// CHECK: %[[CONTRACT_0_2_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_2]], %[[ACC_0_2]]
// CHECK: %[[CONTRACT_0_2:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_2]], %[[CONTRACT_0_2_1]]
// CHECK: %[[CONTRACT_0_3_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_3]], %[[ACC_0_3]]
// CHECK: %[[CONTRACT_0_3:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_3]], %[[CONTRACT_0_3_1]]
// CHECK: %[[CONTRACT_1_0_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_0]], %[[ACC_1_0]]
// CHECK: %[[CONTRACT_1_0:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_0]], %[[CONTRACT_1_0_1]]
// CHECK: %[[CONTRACT_1_1_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_1]], %[[ACC_1_1]]
// CHECK: %[[CONTRACT_1_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_1]], %[[CONTRACT_1_1_1]]
// CHECK: %[[CONTRACT_1_2_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_2]], %[[ACC_1_2]]
// CHECK: %[[CONTRACT_1_2:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_2]], %[[CONTRACT_1_2_1]]
// CHECK: %[[CONTRACT_1_3_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_3]], %[[ACC_1_3]]
// CHECK: %[[CONTRACT_1_3:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_3]], %[[CONTRACT_1_3_1]]
// CHECK: %[[CONTRACT_2_0_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_0]], %[[ACC_2_0]]
// CHECK: %[[CONTRACT_2_0:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_0]], %[[CONTRACT_2_0_1]]
// CHECK: %[[CONTRACT_2_1_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_1]], %[[ACC_2_1]]
// CHECK: %[[CONTRACT_2_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_1]], %[[CONTRACT_2_1_1]]
// CHECK: %[[CONTRACT_2_2_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_2]], %[[ACC_2_2]]
// CHECK: %[[CONTRACT_2_2:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_2]], %[[CONTRACT_2_2_1]]
// CHECK: %[[CONTRACT_2_3_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_3]], %[[ACC_2_3]]
// CHECK: %[[CONTRACT_2_3:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_3]], %[[CONTRACT_2_3_1]]
// CHECK: %[[CONTRACT_3_0_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_0]], %[[ACC_3_0]]
// CHECK: %[[CONTRACT_3_0:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_0]], %[[CONTRACT_3_0_1]]
// CHECK: %[[CONTRACT_3_1_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_1]], %[[ACC_3_1]]
// CHECK: %[[CONTRACT_3_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_1]], %[[CONTRACT_3_1_1]]
// CHECK: %[[CONTRACT_3_2_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_2]], %[[ACC_3_2]]
// CHECK: %[[CONTRACT_3_2:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_2]], %[[CONTRACT_3_2_1]]
// CHECK: %[[CONTRACT_3_3_1:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_3]], %[[ACC_3_3]]
// CHECK: %[[CONTRACT_3_3:.+]] = vector.contract
// CHECK-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_3]], %[[CONTRACT_3_3_1]]
// CHECK: scf.yield %[[CONTRACT_0_0]], %[[CONTRACT_0_1]],
// CHECK-SAME: %[[CONTRACT_0_2]], %[[CONTRACT_0_3]], %[[CONTRACT_1_0]],
// CHECK-SAME: %[[CONTRACT_1_1]], %[[CONTRACT_1_2]], %[[CONTRACT_1_3]],
// CHECK-SAME: %[[CONTRACT_2_0]], %[[CONTRACT_2_1]], %[[CONTRACT_2_2]],
// CHECK-SAME: %[[CONTRACT_2_3]], %[[CONTRACT_3_0]], %[[CONTRACT_3_1]],
// CHECK-SAME: %[[CONTRACT_3_2]], %[[CONTRACT_3_3]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#0, %[[SUBVIEW_RESULT]][%[[C0]], %[[C0]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#1, %[[SUBVIEW_RESULT]][%[[C0]], %[[C16]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#2, %[[SUBVIEW_RESULT]][%[[C0]], %[[C32]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#3, %[[SUBVIEW_RESULT]][%[[C0]], %[[C48]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#4, %[[SUBVIEW_RESULT]][%[[C16]], %[[C0]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#5, %[[SUBVIEW_RESULT]][%[[C16]], %[[C16]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#6, %[[SUBVIEW_RESULT]][%[[C16]], %[[C32]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#7, %[[SUBVIEW_RESULT]][%[[C16]], %[[C48]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#8, %[[SUBVIEW_RESULT]][%[[C32]], %[[C0]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#9, %[[SUBVIEW_RESULT]][%[[C32]], %[[C16]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#10, %[[SUBVIEW_RESULT]][%[[C32]], %[[C32]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#11, %[[SUBVIEW_RESULT]][%[[C32]], %[[C48]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#12, %[[SUBVIEW_RESULT]][%[[C48]], %[[C0]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#13, %[[SUBVIEW_RESULT]][%[[C48]], %[[C16]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#14, %[[SUBVIEW_RESULT]][%[[C48]], %[[C32]]]
// CHECK-DAG: vector.transfer_write %[[FOR_RES]]#15, %[[SUBVIEW_RESULT]][%[[C48]], %[[C48]]]
// PROMOTE-DAG: #[[MAP4:.+]] = affine_map<()[s0] -> (s0 * 64 - (s0 floordiv 2) * 128)>
// PROMOTE: func @matmul_static_shape
// PROMOTE-DAG: %[[ARG0:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@arg0
// PROMOTE-DAG: %[[ARG1:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@arg1
// PROMOTE-DAG: %[[RET0:.+]] = iree.placeholder {{.*}} {binding = @legacy_io::@ret0
// PROMOTE-DAG: %[[C0:.+]] = constant 0 : index
// PROMOTE-DAG: %[[C2:.+]] = constant 2
// PROMOTE-DAG: %[[C16:.+]] = constant 16
// PROMOTE-DAG: %[[C32:.+]] = constant 32
// PROMOTE-DAG: %[[C48:.+]] = constant 48
// PROMOTE-DAG: %[[ALLOC1:.+]] = alloc() : memref<128x32xf16, 3>
// PROMOTE-DAG: %[[ALLOC2:.+]] = alloc() : memref<32x128xf16, 3>
// PROMOTE: %[[RESULT_SUBVIEW:.+]] = subview %[[RET0]]
// PROMOTE: %[[WGMEM_LHS_SUBVIEW:.+]] = subview %[[ALLOC1]][0, 0] [128, 32] [1, 1]
// PROMOTE: %[[WGMEM_RHS_SUBVIEW:.+]] = subview %[[ALLOC2]][0, 0] [32, 128] [1, 1]
// PROMOTE: %[[SG_X:.+]] = gpu.subgroup_id
// PROMOTE: %[[SG_Y:.+]] = divi_signed %[[SG_X]], %[[C2]]
// PROMOTE: %[[SGOFFSET_Y:.+]] = affine.apply #[[MAP4]]()[%[[SG_Y]]]
// PROMOTE: %[[SG_LHS_SUBVIEW:.+]] = subview %[[WGMEM_LHS_SUBVIEW]][%[[SGOFFSET_Y]], 0]
// PROMOTE: %[[SGOFFSET_X:.+]] = affine.apply #[[MAP4]]()[%[[SG_X]]]
// PROMOTE: %[[SG_RHS_SUBVIEW:.+]] = subview %[[WGMEM_RHS_SUBVIEW]][0, %[[SGOFFSET_X]]]
// PROMOTE: %[[SG_RESULT_SUBVIEW:.+]] = subview %[[RESULT_SUBVIEW]][%[[SGOFFSET_Y]], %[[SGOFFSET_X]]]
// PROMOTE-DAG: %[[READ_INIT_0_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C0]]]
// PROMOTE-DAG: %[[READ_INIT_0_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C16]]]
// PROMOTE-DAG: %[[READ_INIT_0_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C32]]]
// PROMOTE-DAG: %[[READ_INIT_0_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C48]]]
// PROMOTE-DAG: %[[READ_INIT_1_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C0]]]
// PROMOTE-DAG: %[[READ_INIT_1_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C16]]]
// PROMOTE-DAG: %[[READ_INIT_1_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C32]]]
// PROMOTE-DAG: %[[READ_INIT_1_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C48]]]
// PROMOTE-DAG: %[[READ_INIT_2_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C0]]]
// PROMOTE-DAG: %[[READ_INIT_2_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C16]]]
// PROMOTE-DAG: %[[READ_INIT_2_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C32]]]
// PROMOTE-DAG: %[[READ_INIT_2_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C48]]]
// PROMOTE-DAG: %[[READ_INIT_3_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C0]]]
// PROMOTE-DAG: %[[READ_INIT_3_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C16]]]
// PROMOTE-DAG: %[[READ_INIT_3_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C32]]]
// PROMOTE-DAG: %[[READ_INIT_3_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C48]]]
// PROMOTE: %[[FOR_RES:.+]]:16 = scf.for %[[IV0:.+]] = {{.*}} to
// PROMOTE-SAME: iter_args(%[[ACC_0_0:.+]] = %[[READ_INIT_0_0]],
// PROMOTE-SAME: %[[ACC_0_1:.+]] = %[[READ_INIT_0_1]],
// PROMOTE-SAME: %[[ACC_0_2:.+]] = %[[READ_INIT_0_2]],
// PROMOTE-SAME: %[[ACC_0_3:.+]] = %[[READ_INIT_0_3]],
// PROMOTE-SAME: %[[ACC_1_0:.+]] = %[[READ_INIT_1_0]],
// PROMOTE-SAME: %[[ACC_1_1:.+]] = %[[READ_INIT_1_1]],
// PROMOTE-SAME: %[[ACC_1_2:.+]] = %[[READ_INIT_1_2]],
// PROMOTE-SAME: %[[ACC_1_3:.+]] = %[[READ_INIT_1_3]],
// PROMOTE-SAME: %[[ACC_2_0:.+]] = %[[READ_INIT_2_0]],
// PROMOTE-SAME: %[[ACC_2_1:.+]] = %[[READ_INIT_2_1]],
// PROMOTE-SAME: %[[ACC_2_2:.+]] = %[[READ_INIT_2_2]],
// PROMOTE-SAME: %[[ACC_2_3:.+]] = %[[READ_INIT_2_3]],
// PROMOTE-SAME: %[[ACC_3_0:.+]] = %[[READ_INIT_3_0]],
// PROMOTE-SAME: %[[ACC_3_1:.+]] = %[[READ_INIT_3_1]],
// PROMOTE-SAME: %[[ACC_3_2:.+]] = %[[READ_INIT_3_2]],
// PROMOTE-SAME: %[[ACC_3_3:.+]] = %[[READ_INIT_3_3]])
// PROMOTE: %[[LHS_SUBVIEW:.+]] = subview %[[ARG0]]
// PROMOTE: %[[RHS_SUBVIEW:.+]] = subview %[[ARG1]]
// PROMOTE: linalg.copy(%[[LHS_SUBVIEW]], %[[WGMEM_LHS_SUBVIEW]])
// PROMOTE: linalg.copy(%[[RHS_SUBVIEW]], %[[WGMEM_RHS_SUBVIEW]])
// PROMOTE-DAG: %[[READ_LHS_0_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C0]], %[[C0]]]
// PROMOTE-DAG: %[[READ_LHS_0_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C0]], %[[C16]]]
// PROMOTE-DAG: %[[READ_LHS_1_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C16]], %[[C0]]]
// PROMOTE-DAG: %[[READ_LHS_1_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C16]], %[[C16]]]
// PROMOTE-DAG: %[[READ_LHS_2_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C32]], %[[C0]]]
// PROMOTE-DAG: %[[READ_LHS_2_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C32]], %[[C16]]]
// PROMOTE-DAG: %[[READ_LHS_3_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C48]], %[[C0]]]
// PROMOTE-DAG: %[[READ_LHS_3_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_LHS_SUBVIEW]][%[[C48]], %[[C16]]]
// PROMOTE-DAG: %[[READ_RHS_0_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C0]], %[[C0]]]
// PROMOTE-DAG: %[[READ_RHS_0_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C0]], %[[C16]]]
// PROMOTE-DAG: %[[READ_RHS_0_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C0]], %[[C32]]]
// PROMOTE-DAG: %[[READ_RHS_0_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C0]], %[[C48]]]
// PROMOTE-DAG: %[[READ_RHS_1_0:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C16]], %[[C0]]]
// PROMOTE-DAG: %[[READ_RHS_1_1:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C16]], %[[C16]]]
// PROMOTE-DAG: %[[READ_RHS_1_2:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C16]], %[[C32]]]
// PROMOTE-DAG: %[[READ_RHS_1_3:.+]] = vector.transfer_read
// PROMOTE-SAME: %[[SG_RHS_SUBVIEW]][%[[C16]], %[[C48]]]
// PROMOTE: %[[CONTRACT_0_0_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_0]], %[[ACC_0_0]]
// PROMOTE: %[[CONTRACT_0_0:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_0]], %[[CONTRACT_0_0_1]]
// PROMOTE: %[[CONTRACT_0_1_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_1]], %[[ACC_0_1]]
// PROMOTE: %[[CONTRACT_0_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_1]], %[[CONTRACT_0_1_1]]
// PROMOTE: %[[CONTRACT_0_2_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_2]], %[[ACC_0_2]]
// PROMOTE: %[[CONTRACT_0_2:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_2]], %[[CONTRACT_0_2_1]]
// PROMOTE: %[[CONTRACT_0_3_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_0]], %[[READ_RHS_0_3]], %[[ACC_0_3]]
// PROMOTE: %[[CONTRACT_0_3:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_0_1]], %[[READ_RHS_1_3]], %[[CONTRACT_0_3_1]]
// PROMOTE: %[[CONTRACT_1_0_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_0]], %[[ACC_1_0]]
// PROMOTE: %[[CONTRACT_1_0:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_0]], %[[CONTRACT_1_0_1]]
// PROMOTE: %[[CONTRACT_1_1_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_1]], %[[ACC_1_1]]
// PROMOTE: %[[CONTRACT_1_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_1]], %[[CONTRACT_1_1_1]]
// PROMOTE: %[[CONTRACT_1_2_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_2]], %[[ACC_1_2]]
// PROMOTE: %[[CONTRACT_1_2:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_2]], %[[CONTRACT_1_2_1]]
// PROMOTE: %[[CONTRACT_1_3_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_0]], %[[READ_RHS_0_3]], %[[ACC_1_3]]
// PROMOTE: %[[CONTRACT_1_3:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_1_1]], %[[READ_RHS_1_3]], %[[CONTRACT_1_3_1]]
// PROMOTE: %[[CONTRACT_2_0_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_0]], %[[ACC_2_0]]
// PROMOTE: %[[CONTRACT_2_0:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_0]], %[[CONTRACT_2_0_1]]
// PROMOTE: %[[CONTRACT_2_1_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_1]], %[[ACC_2_1]]
// PROMOTE: %[[CONTRACT_2_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_1]], %[[CONTRACT_2_1_1]]
// PROMOTE: %[[CONTRACT_2_2_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_2]], %[[ACC_2_2]]
// PROMOTE: %[[CONTRACT_2_2:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_2]], %[[CONTRACT_2_2_1]]
// PROMOTE: %[[CONTRACT_2_3_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_0]], %[[READ_RHS_0_3]], %[[ACC_2_3]]
// PROMOTE: %[[CONTRACT_2_3:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_2_1]], %[[READ_RHS_1_3]], %[[CONTRACT_2_3_1]]
// PROMOTE: %[[CONTRACT_3_0_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_0]], %[[ACC_3_0]]
// PROMOTE: %[[CONTRACT_3_0:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_0]], %[[CONTRACT_3_0_1]]
// PROMOTE: %[[CONTRACT_3_1_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_1]], %[[ACC_3_1]]
// PROMOTE: %[[CONTRACT_3_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_1]], %[[CONTRACT_3_1_1]]
// PROMOTE: %[[CONTRACT_3_2_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_2]], %[[ACC_3_2]]
// PROMOTE: %[[CONTRACT_3_2:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_2]], %[[CONTRACT_3_2_1]]
// PROMOTE: %[[CONTRACT_3_3_1:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_0]], %[[READ_RHS_0_3]], %[[ACC_3_3]]
// PROMOTE: %[[CONTRACT_3_3:.+]] = vector.contract
// PROMOTE-SAME: %[[READ_LHS_3_1]], %[[READ_RHS_1_3]], %[[CONTRACT_3_3_1]]
// PROMOTE: scf.yield %[[CONTRACT_0_0]], %[[CONTRACT_0_1]],
// PROMOTE-SAME: %[[CONTRACT_0_2]], %[[CONTRACT_0_3]], %[[CONTRACT_1_0]],
// PROMOTE-SAME: %[[CONTRACT_1_1]], %[[CONTRACT_1_2]], %[[CONTRACT_1_3]],
// PROMOTE-SAME: %[[CONTRACT_2_0]], %[[CONTRACT_2_1]], %[[CONTRACT_2_2]],
// PROMOTE-SAME: %[[CONTRACT_2_3]], %[[CONTRACT_3_0]], %[[CONTRACT_3_1]],
// PROMOTE-SAME: %[[CONTRACT_3_2]], %[[CONTRACT_3_3]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#0, %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C0]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#1, %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C16]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#2, %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C32]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#3, %[[SG_RESULT_SUBVIEW]][%[[C0]], %[[C48]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#4, %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C0]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#5, %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C16]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#6, %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C32]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#7, %[[SG_RESULT_SUBVIEW]][%[[C16]], %[[C48]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#8, %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C0]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#9, %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C16]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#10, %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C32]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#11, %[[SG_RESULT_SUBVIEW]][%[[C32]], %[[C48]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#12, %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C0]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#13, %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C16]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#14, %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C32]]]
// PROMOTE-DAG: vector.transfer_write %[[FOR_RES]]#15, %[[SG_RESULT_SUBVIEW]][%[[C48]], %[[C48]]]