// RUN: iree-opt %s --iree-codegen-linalg-bufferize -canonicalize -cse -split-input-file | IreeFileCheck %s
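// Tests that a tiled matmul reading all operands through flow.dispatch.tensor.load
// bufferizes to memref.subviews of the bound buffers, with the read-only init tile
// copied into the result buffer before the matmul accumulates into it.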
func @tile_from_tensor_load() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
%9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %9, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_tensor_load()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
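// Variant where the init operand lives in the readwrite result binding:
// bufferization updates the buffer in place and the init copy disappears.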
func @tile_from_tensor_load_inplace() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
%9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %9, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_tensor_load_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
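// The result is stored both back into the readwrite init binding and into a
// separate writeonly binding: the matmul updates the former in place and the
// tile is then copied to the latter.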
func @tile_from_tensor_load_inplace_and_copy() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
%9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %9, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
flow.dispatch.tensor.store %9, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_tensor_load_inplace_and_copy()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[RESULT1:.+]] = memref.subview %[[RETURN1]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT1]]
// CHECK: %[[RESULT2:.+]] = memref.subview %[[RETURN2]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[RESULT1]], %[[RESULT2]])
// -----
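// A pointwise producer feeding the matmul LHS has no buffer to reuse, so
// bufferization materializes a memref.alloc for its result.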
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_lhs() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%shape = linalg.init_tensor [1, 3] : tensor<1x3xf32>
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
ins(%6 : tensor<1x3xf32>) outs(%shape : tensor<1x3xf32>) {
^bb0(%arg2: f32, %s: f32): // no predecessors
linalg.yield %arg2 : f32
} -> tensor<1x3xf32>
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
%10 = linalg.matmul ins(%8, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_pointwise_lhs()
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x3xf32>
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]] :
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
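// Same pointwise LHS producer, but with an in-place result: the temporary
// allocation remains while the init copy is gone.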
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_lhs_inplace() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%shape = linalg.init_tensor [1, 3] : tensor<1x3xf32>
%8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
ins(%6 : tensor<1x3xf32>) outs(%shape : tensor<1x3xf32>) {
^bb0(%arg2: f32, %s: f32): // no predecessors
linalg.yield %arg2 : f32
} -> tensor<1x3xf32>
%9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
%10 = linalg.matmul ins(%8, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_pointwise_lhs_inplace()
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x3xf32>
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]] :
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
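// A pointwise op feeding the matmul outs operand writes directly into the
// result buffer, so no temporary allocation is needed.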
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_outs() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
%shape = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
ins(%8 : tensor<1x1xf32>) outs(%shape : tensor<1x1xf32>) {
^bb0(%arg2: f32, %s: f32): // no predecessors
linalg.yield %arg2 : f32
} -> tensor<1x1xf32>
%10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_pointwise_outs()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[INIT]] :
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
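// In-place variant of the pointwise-outs case: the generic writes straight
// into the readwrite binding.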
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_outs_inplace() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
%shape = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
ins(%8 : tensor<1x1xf32>) outs(%shape : tensor<1x1xf32>) {
^bb0(%arg2: f32, %s: f32): // no predecessors
linalg.yield %arg2 : f32
} -> tensor<1x1xf32>
%10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_pointwise_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]
// -----
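// Chained matmuls where the second accumulates onto the first's result: after
// the init copy, both run on the same result subview.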
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_matmul_outs() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
%shape = linalg.init_tensor [1, 1] : tensor<1x1xf32>
%9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
%10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_matmul_outs()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// -----
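// In-place variant of the chained matmuls: both accumulate directly into the
// readwrite binding, with no copy.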
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_matmul_outs_inplace() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%M = hal.interface.load.constant offset = 0 : index
%N = hal.interface.load.constant offset = 1 : index
%K = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
%1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
%2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
%4 = hal.interface.workgroup.id[0] : index
%5 = hal.interface.workgroup.id[1] : index
scf.for %arg0 = %5 to %c2 step %c2 {
scf.for %arg1 = %4 to %c4 step %c4 {
%6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
%7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
%8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
%9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
%10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_matmul_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// -----
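// Dynamic shapes: the bindings bufferize to dynamically sized memrefs shaped
// by the loaded constants, and the tile sizes computed by affine.min carry
// over to the subviews.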
func @bufferize_dynamic() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%4 = hal.interface.load.constant offset = 0 : index
%5 = hal.interface.load.constant offset = 1 : index
%6 = hal.interface.load.constant offset = 2 : index
%7 = hal.interface.load.constant offset = 3 : index
%8 = hal.interface.load.constant offset = 4 : index
%9 = hal.interface.load.constant offset = 5 : index
%10 = hal.interface.load.constant offset = 6 : index
%11 = hal.interface.load.constant offset = 7 : index
%LHS = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
%RHS = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%6, %7}
%INIT = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%8, %9}
%RET = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%10, %11}
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%20 = arith.muli %workgroup_size_y, %workgroup_id_y : index
%21 = arith.muli %workgroup_size_y, %workgroup_count_y : index
scf.for %arg0 = %20 to %4 step %21 {
%22 = arith.muli %workgroup_size_x, %workgroup_id_x : index
%23 = arith.muli %workgroup_size_x, %workgroup_count_x : index
scf.for %arg1 = %22 to %7 step %23 {
%24 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %workgroup_size_y]
%25 = flow.dispatch.tensor.load %LHS, offsets = [%arg0, %c0], sizes = [%24, %5], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%26 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%7, %workgroup_size_x]
%27 = flow.dispatch.tensor.load %RHS, offsets = [%c0, %arg1], sizes = [%6, %26], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%28 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %8]
%29 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %9]
%30 = flow.dispatch.tensor.load %INIT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%31 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%25, %27 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%30 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %31, %RET, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
// CHECK: func @bufferize_dynamic()
// CHECK: %[[DIM0:.+]] = hal.interface.load.constant offset = 0 : index
// CHECK: %[[DIM1:.+]] = hal.interface.load.constant offset = 1 : index
// CHECK: %[[DIM2:.+]] = hal.interface.load.constant offset = 2 : index
// CHECK: %[[DIM3:.+]] = hal.interface.load.constant offset = 3 : index
// CHECK: %[[DIM4:.+]] = hal.interface.load.constant offset = 4 : index
// CHECK: %[[DIM5:.+]] = hal.interface.load.constant offset = 5 : index
// CHECK: %[[DIM6:.+]] = hal.interface.load.constant offset = 6 : index
// CHECK: %[[DIM7:.+]] = hal.interface.load.constant offset = 7 : index
// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<?x?xf32>{%[[DIM0]], %[[DIM1]]}
// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%{{.+}}] : memref<?x?xf32>{%[[DIM2]], %[[DIM3]]}
// CHECK: %[[INIT:.+]] = hal.interface.binding.subspan @io::@arg2[%{{.+}}] : memref<?x?xf32>{%[[DIM4]], %[[DIM5]]}
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@ret0[%{{.+}}] : memref<?x?xf32>{%[[DIM6]], %[[DIM7]]}
// CHECK-DAG: %[[WGSIZE_X:.+]] = hal.interface.workgroup.size[0]
// CHECK-DAG: %[[WGSIZE_Y:.+]] = hal.interface.workgroup.size[1]
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM0]], %[[WGSIZE_Y]]]
// CHECK: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILE_M]], %[[DIM1]]]
// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV1]])[%[[DIM3]], %[[WGSIZE_X]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[DIM2]], %[[TILE_N]]]
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[WGSIZE_Y]], %[[DIM4]]]
// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[WGSIZE_X]], %[[DIM5]]]
// CHECK-DAG: %[[INIT_TILE:.+]] = memref.subview %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK: linalg.copy(%[[INIT_TILE]], %[[RESULT_TILE]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
// -----
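// Dynamic-shape variant storing back into the readwrite binding, so the init
// copy is elided.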
func @bufferize_dynamic_inplace() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%4 = hal.interface.load.constant offset = 0 : index
%5 = hal.interface.load.constant offset = 1 : index
%6 = hal.interface.load.constant offset = 2 : index
%7 = hal.interface.load.constant offset = 3 : index
%8 = hal.interface.load.constant offset = 4 : index
%9 = hal.interface.load.constant offset = 5 : index
%LHS = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
%RHS = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%6, %7}
%OUT = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%8, %9}
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%20 = arith.muli %workgroup_size_y, %workgroup_id_y : index
%21 = arith.muli %workgroup_size_y, %workgroup_count_y : index
scf.for %arg0 = %20 to %4 step %21 {
%22 = arith.muli %workgroup_size_x, %workgroup_id_x : index
%23 = arith.muli %workgroup_size_x, %workgroup_count_x : index
scf.for %arg1 = %22 to %7 step %23 {
%24 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %workgroup_size_y]
%25 = flow.dispatch.tensor.load %LHS, offsets = [%arg0, %c0], sizes = [%24, %5], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%26 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%7, %workgroup_size_x]
%27 = flow.dispatch.tensor.load %RHS, offsets = [%c0, %arg1], sizes = [%6, %26], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%28 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %8]
%29 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %9]
%30 = flow.dispatch.tensor.load %OUT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<?x?xf32>
%31 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%25, %27 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%30 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %31, %OUT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
// CHECK: func @bufferize_dynamic_inplace()
// CHECK: %[[DIM0:.+]] = hal.interface.load.constant offset = 0 : index
// CHECK: %[[DIM1:.+]] = hal.interface.load.constant offset = 1 : index
// CHECK: %[[DIM2:.+]] = hal.interface.load.constant offset = 2 : index
// CHECK: %[[DIM3:.+]] = hal.interface.load.constant offset = 3 : index
// CHECK: %[[DIM4:.+]] = hal.interface.load.constant offset = 4 : index
// CHECK: %[[DIM5:.+]] = hal.interface.load.constant offset = 5 : index
// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<?x?xf32>{%[[DIM0]], %[[DIM1]]}
// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%{{.+}}] : memref<?x?xf32>{%[[DIM2]], %[[DIM3]]}
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@arg2[%{{.+}}] : memref<?x?xf32>{%[[DIM4]], %[[DIM5]]}
// CHECK-DAG: %[[WGSIZE_X:.+]] = hal.interface.workgroup.size[0]
// CHECK-DAG: %[[WGSIZE_Y:.+]] = hal.interface.workgroup.size[1]
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM0]], %[[WGSIZE_Y]]]
// CHECK: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILE_M]], %[[DIM1]]]
// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV1]])[%[[DIM3]], %[[WGSIZE_X]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[DIM2]], %[[TILE_N]]]
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[WGSIZE_Y]], %[[DIM4]]]
// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[WGSIZE_X]], %[[DIM5]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
// -----
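// A standalone tensor expand_shape bufferizes to memref.expand_shape followed
// by a copy into the result buffer.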
func @reshape_simple() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c12 = arith.constant 12 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
%3 = linalg.tensor_expand_shape %2 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_simple()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.copy(%[[RESHAPE]], %[[RET0]])
// -----
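// An expand_shape feeding a pointwise op folds into its ins operand as
// memref.expand_shape, and the generic writes directly to the result buffer.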
func @reshape_fused_source() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c12 = arith.constant 12 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
%3 = linalg.tensor_expand_shape %2 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
%4 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
%5 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%3 : tensor<3x4xi32>) outs(%4 : tensor<3x4xi32>) {
^bb0(%arg0 : i32, %arg1 : i32):
%6 = arith.addi %arg0, %arg0 : i32
linalg.yield %6 : i32
} -> tensor<3x4xi32>
flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_source()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<12xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<3x4xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RET0]] : memref<3x4xi32>)
// -----
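// The reshaped value is both consumed by the generic and copied out verbatim
// to a second binding.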
func @reshape_fused_source_and_copyout() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c12 = arith.constant 12 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
%2 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
%4 = linalg.tensor_expand_shape %3 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
%5 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
%6 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%4 : tensor<3x4xi32>) outs(%5 : tensor<3x4xi32>) {
^bb0(%arg0 : i32, %arg1 : i32):
%7 = arith.addi %arg0, %arg0 : i32
linalg.yield %7 : i32
} -> tensor<3x4xi32>
flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
flow.dispatch.tensor.store %4, %2, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_source_and_copyout()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<12xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<3x4xi32>
// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@ret1[%[[C0]]] : memref<3x4xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RET0]] : memref<3x4xi32>)
// CHECK: linalg.copy(%[[RESHAPE]], %[[RET1]])
// -----
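// A collapse_shape on the stored value bufferizes as memref.expand_shape of
// the destination buffer, so the generic writes into the reshaped view
// directly.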
func @reshape_fused_target() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%c4 = arith.constant 4 : index
%c12 = arith.constant 12 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:3x4xi32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:12xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x4xi32> -> tensor<3x4xi32>
%3 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
%4 = linalg.generic {
indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
iterator_types = ["parallel", "parallel"]}
ins(%2 : tensor<3x4xi32>) outs(%3 : tensor<3x4xi32>) {
^bb0(%arg0 : i32, %arg1 : i32):
%5 = arith.addi %arg0, %arg0 : i32
linalg.yield %5 : i32
} -> tensor<3x4xi32>
%5 = linalg.tensor_collapse_shape %4 [[0, 1]] : tensor<3x4xi32> into tensor<12xi32>
flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<12xi32> -> !flow.dispatch.tensor<writeonly:12xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_target()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<3x4xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<12xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[RET0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RESHAPE]] : memref<3x4xi32>)
// -----
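// A dot_general-style dispatch: the LHS is collapsed with
// memref.collapse_shape and the fill + matmul run directly on the result
// subview.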
func @dot_general_lowering() {
%cst = arith.constant 0.000000e+00 : f32
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x1x2xf32>
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:2x3xf32>
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:1x3xf32>
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x1x2xf32> -> tensor<1x1x2xf32>
%4 = linalg.tensor_collapse_shape %3 [[0, 1], [2]] : tensor<1x1x2xf32> into tensor<1x2xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%5 = arith.muli %workgroup_size_y, %workgroup_id_y : index
%6 = arith.muli %workgroup_size_y, %workgroup_count_y : index
scf.for %arg0 = %5 to %c1 step %6 {
%7 = arith.muli %workgroup_size_x, %workgroup_id_x : index
%8 = arith.muli %workgroup_size_x, %workgroup_count_x : index
scf.for %arg1 = %7 to %c3 step %8 {
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_y]
%10 = tensor.extract_slice %4[%arg0, 0] [%9, 2] [1, 1] : tensor<1x2xf32> to tensor<?x2xf32>
%11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_x]
%12 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c2, %11], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x?xf32>
%13 = linalg.init_tensor [%9, %11] : tensor<?x?xf32>
%14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%15 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%10, %12 : tensor<?x2xf32>, tensor<2x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %11], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @dot_general_lowering()
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RESHAPE_LHS:.+]] = memref.collapse_shape %[[LHS]]
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[RESHAPE_LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]
// -----
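// tensor.extract_slice on a loaded tensor bufferizes to memref.subview plus a
// copy to the output.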
func @slice() {
%c0 = arith.constant 0 : index
%2 = hal.interface.load.constant offset = 0 : index
%3 = hal.interface.load.constant offset = 1 : index
%4 = hal.interface.load.constant offset = 2 : index
%5 = hal.interface.load.constant offset = 3 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%2, %3}
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%4, %5}
%6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%7 = tensor.extract_slice %6[%2, %3] [%4, %5] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])
// -----
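// A rank-reducing extract_slice likewise maps to a memref.subview followed by
// a copy.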
func @slice_rank_reducing() {
%c0 = arith.constant 0 : index
%2 = hal.interface.load.constant offset = 0 : index
%3 = hal.interface.load.constant offset = 1 : index
%4 = hal.interface.load.constant offset = 2 : index
%5 = hal.interface.load.constant offset = 3 : index
%8 = hal.interface.load.constant offset = 4 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?x?xi32>{%8, %8, %8}
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%4, %5}
%6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?x?xi32> -> tensor<?x?x?xi32>
%7 = tensor.extract_slice %6[%2, %2, %3] [%4, 1, %5] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice_rank_reducing()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])
// -----
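// Two slices of one source yield two subviews and two copies, the
// rank-reducing one going through a subview of its destination.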
func @slice_multiple_copy() {
%c0 = arith.constant 0 : index
%3 = hal.interface.load.constant offset = 0 : index
%4 = hal.interface.load.constant offset = 1 : index
%5 = hal.interface.load.constant offset = 2 : index
%6 = hal.interface.load.constant offset = 3 : index
%7 = hal.interface.load.constant offset = 4 : index
%8 = hal.interface.load.constant offset = 5 : index
%12 = hal.interface.load.constant offset = 6 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?x?xi32>{%12, %12, %12}
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?x?xi32>{%6, %7, %8}
%2 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%6, %8}
%9 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?x?xi32> -> tensor<?x?x?xi32>
%10 = tensor.extract_slice %9[%3, %4, %5] [%6, %7, %8] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?x?xi32>
%11 = tensor.extract_slice %9[%3, %4, %5] [%6, 1, %8] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?xi32>
flow.dispatch.tensor.store %10, %1, offsets = [], sizes = [], strides = [] : tensor<?x?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?x?xi32>
flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%6, %8], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice_multiple_copy()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan @io::@ret1
// CHECK-DAG: %[[SIZE1:.+]] = hal.interface.load.constant offset = 3 : index
// CHECK-DAG: %[[SIZE2:.+]] = hal.interface.load.constant offset = 4 : index
// CHECK-DAG: %[[SIZE3:.+]] = hal.interface.load.constant offset = 5 : index
// CHECK: %[[SUBVIEW1:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], %[[SIZE2]], %[[SIZE3]]]
// CHECK: linalg.copy(%[[SUBVIEW1]], %[[RETURN1]])
// CHECK-DAG: %[[SUBVIEW2:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], 1, %[[SIZE3]]]
// CHECK-DAG: %[[RETURNVIEW:.+]] = memref.subview %[[RETURN2]]
// CHECK: linalg.copy(%[[SUBVIEW2]], %[[RETURNVIEW]])
// -----
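// Loading and storing the same readwrite binding unchanged must not introduce
// any copy.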
func @slice_in_place() {
%c0 = arith.constant 0 : index
%2 = hal.interface.load.constant offset = 0 : index
%3 = hal.interface.load.constant offset = 1 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readwrite:?x?xi32>{%2, %3}
%6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:?x?xi32> -> tensor<?x?xi32>
flow.dispatch.tensor.store %6, %0, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @slice_in_place()
// CHECK-NOT: linalg.copy
// -----
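// An extract_slice with unit strides and static offsets still bufferizes to a
// subview of the input plus a copy to the output.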
func @slice_whole_stride_dispatch_0() {
%c0 = arith.constant 0 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%dim3 = hal.interface.load.constant offset = 3 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%dim2, %dim3}
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%3 = tensor.extract_slice %2[1, 0] [1, 4] [1, 1] : tensor<?x?xi32> to tensor<1x4xi32>
flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<1x4xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice_whole_stride_dispatch_0()
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]]
// CHECK: linalg.copy(%[[SUBVIEW]], %[[OUTPUT]])
// -----
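// tensor.insert_slice: the destination tensor is first copied into the result
// buffer, then the inserted tile is copied into a subview of it.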
func @subtensor_insert() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%dim3 = hal.interface.load.constant offset = 3 : index
%dim4 = hal.interface.load.constant offset = 4 : index
%dim5 = hal.interface.load.constant offset = 5 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim2, %dim3}
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%dim4, %dim5}
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%4 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%5 = tensor.dim %3, %c0 : tensor<?x?xi32>
%6 = tensor.dim %3, %c1 : tensor<?x?xi32>
%7 = tensor.insert_slice %3 into %4[3, 4] [%5, %6] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32>
flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @subtensor_insert()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[C1:.+]] = arith.constant 1
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[D0:.+]] = memref.dim %[[ARG0]], %[[C0]]
// CHECK-DAG: %[[D1:.+]] = memref.dim %[[ARG0]], %[[C1]]
// CHECK: linalg.copy(%[[ARG1]], %[[RET0]])
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET0]][3, 4] [%[[D0]], %[[D1]]] [1, 1]
// CHECK: linalg.copy(%[[ARG0]], %[[SUBVIEW]])
// -----
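// Tests that tensor.extract from a zero-rank tensor lowers to a scalar memref.load feeding linalg.fill.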
func @tensor_extract() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:i32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x9xi32>
%2 = linalg.init_tensor [3, 9] : tensor<3x9xi32>
%3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%4 = tensor.extract %3[] : tensor<i32>
%5 = linalg.fill(%4, %2) : i32, tensor<3x9xi32> -> tensor<3x9xi32>
flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<3x9xi32> -> !flow.dispatch.tensor<writeonly:3x9xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tensor_extract()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[LOAD:.+]] = memref.load %[[ARG0]]
// CHECK: linalg.fill(%[[LOAD]], %[[RET0]])
// -----
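// Tests that a dispatch that only forwards its input lowers to a single linalg.copy between the two bindings.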
func @load_to_store() {
%c0 = arith.constant 0 : index
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
%2 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:3x4xi32>
%3 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x4xi32> -> tensor<3x4xi32>
flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @load_to_store()
// CHECK: %[[OUT:.+]] = hal.interface.binding.subspan @io::@ret0[%c0] : memref<3x4xi32>
// CHECK: %[[IN:.+]] = hal.interface.binding.subspan @io::@arg0[%c0] : memref<3x4xi32>
// CHECK: linalg.copy(%[[IN]], %[[OUT]]) : memref<3x4xi32>, memref<3x4xi32>
// -----
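// Tests that a non-splat constant result is materialized with memref.buffer_cast and copied into the output binding.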
func @constant() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]> : tensor<2x2x3xi32>
%0 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:2x2x3xi32>
flow.dispatch.tensor.store %cst, %0, offsets = [], sizes = [], strides = [] : tensor<2x2x3xi32> -> !flow.dispatch.tensor<writeonly:2x2x3xi32>
return
}
// CHECK-LABEL: func @constant()
// CHECK: %[[CST:.+]] = arith.constant {{.+}} : tensor<2x2x3xi32>
// CHECK: %[[MEMREF:.+]] = memref.buffer_cast %[[CST]] : memref<2x2x3xi32>
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: linalg.copy(%[[MEMREF]], %[[RESULT]])
// -----
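// Tests that a non-splat constant matmul operand is buffer_cast once and tiled with memref.subview inside the workgroup loops.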
func @rhs_non_splat_constant() {
%c0 = arith.constant 0 : index
%cst = arith.constant dense<[[0.706495285, -0.567672312, 0.483717591, 0.522725761, 0.7563259], [-0.0899272263, -0.283501834, -0.350822538, -0.351515919, -0.337136656], [-0.451804549, 0.372324884, -0.620518147, 0.235451385, 0.851095855]]> : tensor<3x5xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x5x3x1xf32>
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:5x5xf32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x5x3x1xf32> -> tensor<1x5x3x1xf32>
%3 = linalg.tensor_collapse_shape %2 [[0, 1], [2, 3]] : tensor<1x5x3x1xf32> into tensor<5x3xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %4 to %c5 step %5 {
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %6 to %c5 step %7 {
%8 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg0)[%workgroup_size_y]
%9 = tensor.extract_slice %3[%arg0, 0] [%8, 3] [1, 1] : tensor<5x3xf32> to tensor<?x3xf32>
%10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg1)[%workgroup_size_x]
%11 = tensor.extract_slice %cst[0, %arg1] [3, %10] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32>
%12 = linalg.init_tensor [%8, %10] : tensor<?x?xf32>
%13 = linalg.fill(%cst_0, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %10], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @rhs_non_splat_constant
// CHECK-DAG: %[[CONSTANT:.+]] = arith.constant {{.+}} : tensor<3x5xf32>
// CHECK-DAG: %[[RHS:.+]] = memref.buffer_cast %[[CONSTANT]]
// CHECK-DAG: %[[LHS_INPUT:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<1x5x3x1xf32>
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0[%{{.+}}] : memref<5x5xf32>
// CHECK: %[[LHS:.+]] = memref.collapse_shape %[[LHS_INPUT]]
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK-DAG: %[[LHS_SUBVIEW:.+]] = memref.subview %[[LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RHS_SUBVIEW:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
// CHECK-DAG: %[[RESULT_SUBVIEW:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
// CHECK: linalg.fill(%{{.+}}, %[[RESULT_SUBVIEW]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW]], %[[RHS_SUBVIEW]]
// CHECK-SAME: outs(%[[RESULT_SUBVIEW]]
// -----
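// Tests that a gather written as tensor.extract inside a linalg.generic body bufferizes the extract to a memref.load from the source buffer.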
func @gather() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%dim3 = hal.interface.load.constant offset = 3 : index
%dim4 = hal.interface.load.constant offset = 4 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xi32>{%dim2}
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim3, %dim4}
%4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?xi32> -> tensor<?xi32>
%d0 = tensor.dim %5, %c0 : tensor<?xi32>
%d1 = tensor.dim %4, %c1 : tensor<?x?xf32>
%3 = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<?xi32>) outs(%3 : tensor<?x?xf32>) {
^bb0(%arg2: i32, %arg3: f32): // no predecessors
%iv1 = linalg.index 1 : index
%8 = arith.index_cast %arg2 : i32 to index
%9 = tensor.extract %4[%8, %iv1] : tensor<?x?xf32>
linalg.yield %9 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @gather()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: linalg.generic
// CHECK: %[[VAL:.+]] = memref.load %[[ARG0]]
// CHECK: linalg.yield %[[VAL]]
// -----
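// Tests that the pooling window, which has no backing binding, gets a memref.alloc, while the init scalar is loaded and filled directly into the output.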
func @pooling_nhwc_sum() {
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:f32>
%1 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:1x4x6x1xf32>
%2 = hal.interface.binding.subspan @io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x2x2x1xf32>
%3 = linalg.init_tensor [2, 3] : tensor<2x3xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%5 = tensor.extract %4[] : tensor<f32>
%6 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x4x6x1xf32> -> tensor<1x4x6x1xf32>
%7 = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32>
%8 = linalg.fill(%5, %7) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32>
%9 = linalg.pooling_nhwc_sum {
dilations = dense<1> : vector<2xi64>,
strides = dense<[2, 3]> : vector<2xi64>
} ins(%6, %3 : tensor<1x4x6x1xf32>, tensor<2x3xf32>)
outs(%8 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32>
flow.dispatch.tensor.store %9, %2, offsets = [], sizes = [], strides = [] : tensor<1x2x2x1xf32> -> !flow.dispatch.tensor<writeonly:1x2x2x1xf32>
return
}
hal.interface private @io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @pooling_nhwc_sum
// CHECK: %[[WINDOW:.+]] = memref.alloc() : memref<2x3xf32>
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro1[%c0] : memref<1x4x6x1xf32>
// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<f32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo2[%c0] : memref<1x2x2x1xf32>
// CHECK: %[[INIT_VAL:.+]] = memref.load %[[INIT]][] : memref<f32>
// CHECK: linalg.fill(%[[INIT_VAL]], %[[RET0]]) : f32, memref<1x2x2x1xf32>
// CHECK: linalg.pooling_nhwc_sum
// CHECK-SAME: dilations = dense<1> : vector<2xi64>
// CHECK-SAME: strides = dense<[2, 3]> : vector<2xi64>
// CHECK-SAME: ins(%[[INPUT]], %[[WINDOW]] : memref<1x4x6x1xf32>, memref<2x3xf32>)
// CHECK-SAME: outs(%[[RET0]] : memref<1x2x2x1xf32>)
// -----
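// Tests that repeated read-only slices of the same tensor reuse a single memref.subview per source instead of separate buffers.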
func @read_only_subtensor() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%pc0 = hal.interface.load.constant offset = 0 : index
%pc1 = hal.interface.load.constant offset = 1 : index
%pc2 = hal.interface.load.constant offset = 2 : index
%pc3 = hal.interface.load.constant offset = 3 : index
%pc4 = hal.interface.load.constant offset = 4 : index
%pc5 = hal.interface.load.constant offset = 5 : index
%0 = hal.interface.binding.subspan @io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%pc0, %pc1}
%1 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%pc2, %pc3}
%2 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%3 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%pc4, %pc5}
%4 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
%dim0 = tensor.dim %2, %c0 : tensor<?x?xf32>
%dim1 = tensor.dim %2, %c1 : tensor<?x?xf32>
scf.for %arg0 = %5 to %dim0 step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %7 to %dim1 step %8 {
%9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0]
%10 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1]
%11 = tensor.extract_slice %2[%arg0, %arg1] [%9, %10] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0]
%13 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1]
%14 = tensor.extract_slice %2[%arg0, %arg1] [%12, %13] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%15 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0]
%16 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1]
%17 = tensor.extract_slice %4[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%18 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0]
%19 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1]
%20 = tensor.extract_slice %4[%arg0, %arg1] [%18, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%21 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0]
%22 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1]
%23 = linalg.init_tensor [%21, %22] : tensor<?x?xf32>
%24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11, %14, %17, %20 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%25 = arith.mulf %arg4, %arg5 : f32
%26 = arith.mulf %arg2, %arg3 : f32
%27 = arith.addf %26, %25 : f32
%28 = math.sqrt %27 : f32
linalg.yield %28 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %24, %0, offsets = [%arg0, %arg1], sizes = [%21, %22], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @read_only_subtensor
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<?x?xf32>
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@ro1[%c0] : memref<?x?xf32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo2[%c0] : memref<?x?xf32>
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[SV1:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[SV2:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[SV3:.+]] = memref.subview %[[RET0]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[SV1]], %[[SV1]], %[[SV2]], %[[SV2]] :
// CHECK-SAME: outs(%[[SV3]] :
// -----
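// Tests that linalg.tensor_collapse_shape on a read-only operand lowers to memref.collapse_shape on the input binding.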
func @reshape_read_only() {
%c0 = arith.constant 0 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@wo0[%c0] : !flow.dispatch.tensor<writeonly:?xf32>{%dim2}
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%3 = linalg.tensor_collapse_shape %2 [[0, 1]]
: tensor<?x?xf32> into tensor<?xf32>
%4 = tensor.dim %3, %c0 : tensor<?xf32>
%5 = linalg.init_tensor [%4] : tensor<?xf32>
%6 = linalg.generic {
indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
iterator_types = ["parallel"]}
ins(%3 : tensor<?xf32>) outs(%5 : tensor<?xf32>) {
^bb0(%arg0 : f32, %arg1 : f32):
%7 = arith.addf %arg0, %arg0 : f32
linalg.yield %7 : f32
} -> tensor<?xf32>
flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:?xf32>
return
}
// CHECK-LABEL: func @reshape_read_only
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro0
// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo0
// CHECK: %[[RESHAPE:.+]] = memref.collapse_shape %[[INPUT]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<?xf32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<?xf32>)
// -----
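// Tests that when the init tensor has no other use, the output binding is reused for the intermediate convolution result and no allocation is created.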
func @use_buffer_for_operand_when_output_tensor_not_used() {
%c0 = arith.constant 0 : index
%input_subspan = hal.interface.binding.subspan @interface_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x16xf32>
%filter_subspan = hal.interface.binding.subspan @interface_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x16x32xf32>
%offset_subspan = hal.interface.binding.subspan @interface_io::@ro2[%c0] : !flow.dispatch.tensor<readonly:32xf32>
%output_subspan = hal.interface.binding.subspan @interface_io::@wo3[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%input = flow.dispatch.tensor.load %input_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> -> tensor<1x225x225x16xf32>
%filter = flow.dispatch.tensor.load %filter_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> -> tensor<3x3x16x32xf32>
%offset = flow.dispatch.tensor.load %offset_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<32xf32>
%cst = arith.constant 0.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
outs(%1 : tensor<1x112x112x32xf32>)
-> tensor<1x112x112x32xf32>
%3 = linalg.generic {
indexing_maps = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
ins(%2, %offset: tensor<1x112x112x32xf32>, tensor<32xf32>)
outs(%0 : tensor<1x112x112x32xf32>) {
^bb0(%a: f32, %b: f32, %c: f32):
%sub = arith.subf %a, %b : f32
linalg.yield %sub : f32
} -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %3, %output_subspan, offsets = [], sizes = [], strides = [] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
return
}
hal.interface private @interface_io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ro2, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @wo3, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @use_buffer_for_operand_when_output_tensor_not_used()
// CHECK-NOT: memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @interface_io::@wo3
// CHECK: linalg.fill(%{{.+}}, %[[OUTPUT]])
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
// CHECK-NEXT: linalg.generic
// CHECK-SAME: ins(%[[OUTPUT]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
// -----
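// Tests that when the init tensor is reused by a second fill, the intermediate convolution result needs a fresh memref.alloc.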
func @dont_use_buffer_for_operand_when_output_tensor_used() {
%c0 = arith.constant 0 : index
%input_subspan = hal.interface.binding.subspan @interface_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x16xf32>
%filter_subspan = hal.interface.binding.subspan @interface_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x16x32xf32>
%offset_subspan = hal.interface.binding.subspan @interface_io::@ro2[%c0] : !flow.dispatch.tensor<readonly:32xf32>
%output_subspan = hal.interface.binding.subspan @interface_io::@wo3[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%input = flow.dispatch.tensor.load %input_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> -> tensor<1x225x225x16xf32>
%filter = flow.dispatch.tensor.load %filter_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> -> tensor<3x3x16x32xf32>
%offset = flow.dispatch.tensor.load %offset_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<32xf32>
%cst0 = arith.constant 0.0 : f32
%cst1 = arith.constant 1.0 : f32
%0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32>
%1 = linalg.fill(%cst0, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%2 = linalg.conv_2d_nhwc_hwcf
{dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>)
outs(%1 : tensor<1x112x112x32xf32>)
-> tensor<1x112x112x32xf32>
%3 = linalg.fill(%cst1, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32>
%4 = linalg.generic {
indexing_maps = [
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>,
affine_map<(d0, d1, d2, d3) -> (d3)>,
affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>],
iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
ins(%2, %offset: tensor<1x112x112x32xf32>, tensor<32xf32>)
outs(%3 : tensor<1x112x112x32xf32>) {
^bb0(%a: f32, %b: f32, %c: f32):
%sub = arith.subf %a, %b : f32
%add = arith.addf %sub, %c : f32
linalg.yield %add : f32
} -> tensor<1x112x112x32xf32>
flow.dispatch.tensor.store %4, %output_subspan, offsets = [], sizes = [], strides = [] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
return
}
// CHECK-LABEL: func @dont_use_buffer_for_operand_when_output_tensor_used()
// CHECK: %[[ALLOC:.+]] = memref.alloc
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @interface_io::@wo3
// CHECK: linalg.fill(%{{.+}}, %[[ALLOC]])
// CHECK-NEXT: linalg.conv_2d_nhwc_hwcf
// CHECK-SAME: outs(%[[ALLOC]] : memref<1x112x112x32xf32>)
// CHECK-NEXT: linalg.fill(%{{.+}}, %[[OUTPUT]])
// CHECK-NEXT: linalg.generic
// CHECK-SAME: ins(%[[ALLOC]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>)
// -----
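// Tests that a constant reduction init is buffer_cast and copied into the result binding so the generic can update it in place.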
func @bufferize_cst_output_tensor() {
%c0 = arith.constant 0 : index
%cst1 = arith.constant dense<-2147483648> : tensor<i32>
%zero = arith.constant 0.000000e+00 : f32
%cst5 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
%input = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:5xf32>
%output = hal.interface.binding.subspan @io::@wo1[%c0] : !flow.dispatch.tensor<writeonly:i32>
%1 = flow.dispatch.tensor.load %input, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32>
%2 = linalg.generic {
indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>],
iterator_types = ["reduction"]}
ins(%1, %cst5 : tensor<5xf32>, tensor<5xi32>)
outs(%cst1 : tensor<i32>) {
^bb0(%arg0: f32, %arg1: i32, %arg2: i32):
%8 = arith.cmpf oeq, %arg0, %zero : f32
%9 = arith.extui %8 : i1 to i32
%10 = arith.muli %9, %arg1 : i32
%11 = arith.cmpi sgt, %10, %arg2 : i32
%12 = select %11, %10, %arg2 : i32
linalg.yield %12 : i32
} -> tensor<i32>
flow.dispatch.tensor.store %2, %output, offsets = [], sizes = [], strides = [] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32>
return
}
hal.interface private @io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @wo1, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @bufferize_cst_output_tensor()
// CHECK-DAG: %[[CST1:.+]] = arith.constant dense<-2147483648> : tensor<i32>
// CHECK-DAG: %[[CST5:.+]] = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32>
// CHECK: %[[CAST1:.+]] = memref.buffer_cast %[[CST1]] : memref<i32>
// CHECK: %[[CAST5:.+]] = memref.buffer_cast %[[CST5]] : memref<5xi32>
// CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<5xf32>
// CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo1[%c0] : memref<i32>
// CHECK: linalg.copy(%[[CAST1]], %[[OUTPUT]])
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[INPUT]], %[[CAST5]] : memref<5xf32>, memref<5xi32>)
// CHECK-SAME: outs(%[[OUTPUT]] : memref<i32>)
// -----
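// Tests that a tensor.cast between the matmul and the store does not prevent writing the result directly to the output subview.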
func @cast_followed_by_store() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c64 = arith.constant 64 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x1024xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x1024x64xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x32x64xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z {
%3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y]
scf.for %arg1 = %3 to %c32 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x]
scf.for %arg2 = %5 to %c64 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32>
%9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32>
%10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32>
%11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32>
%12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32>
}
}
}
return
}
// CHECK-LABEL: func @cast_followed_by_store()
// CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<4x32x1024xf32>
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<4x1024x64xf32>
// CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<4x32x64xf32>
// CHECK: %[[LHSV:.+]] = memref.subview %[[LHS]]
// CHECK: %[[RHSV:.+]] = memref.subview %[[RHS]]
// CHECK: %[[RESULTV:.+]] = memref.subview %[[RESULT]]
// CHECK: linalg.fill(%[[ZERO]], %[[RESULTV]])
// CHECK: linalg.batch_matmul {{.*}} ins(%[[LHSV]], %[[RHSV]] : {{.*}}) outs(%[[RESULTV]]
// -----
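// Tests that a rank-reducing tensor.insert_slice into a read-write binding lowers to a linalg.copy into the matching subview.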
func @rank_reduced_subtensor_insert() {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%dim3 = hal.interface.load.constant offset = 3 : index
%dim4 = hal.interface.load.constant offset = 4 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<readwrite:?x?x?xf32>{%dim2, %dim3, %dim4}
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:?x?x?xf32> -> tensor<?x?x?xf32>
%4 = tensor.dim %3, %c1 : tensor<?x?x?xf32>
%5 = tensor.dim %3, %c2 : tensor<?x?x?xf32>
%6 = tensor.insert_slice %2 into %3[0, 0, 0] [1, %4, %5] [1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?xf32>
flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?x?xf32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @rank_reduced_subtensor_insert()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET]]
// CHECK: linalg.copy(%[[ARG]], %[[SUBVIEW]])
// -----
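// Tests that vector.transfer_read/transfer_write on tensors rewrite to their memref forms, with the read-only init operand copied into the output subview first.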
#map0 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func @bufferize_transfer_op() {
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:2x3xf32>
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x4xf32>
%2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:2x4xf32>
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:2x4xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:3x4xf32> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x4xf32> -> tensor<2x1xf32>
%7 = vector.transfer_read %4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%8 = vector.transfer_read %4[%c0, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%9 = vector.transfer_read %4[%c0, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%10 = vector.transfer_read %4[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%11 = vector.transfer_read %4[%c1, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%12 = vector.transfer_read %4[%c1, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%13 = vector.transfer_read %5[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%14 = vector.transfer_read %5[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%15 = vector.transfer_read %5[%c2, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%16 = vector.transfer_read %6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32>
%17 = vector.transfer_read %6[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32>
%18 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %7, %13, %16 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%19 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %8, %14, %18 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%20 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %9, %15, %19 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%21 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %10, %13, %17 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%22 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %11, %14, %21 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%23 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %12, %15, %22 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%24 = vector.transfer_write %20, %6[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32>
%25 = vector.transfer_write %23, %24[%c1, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32>
flow.dispatch.tensor.store %25, %3, offsets = [%c0, %c0], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:2x4xf32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @bufferize_transfer_op()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[ARG2:.+]] = hal.interface.binding.subspan @io::@arg2
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[ARG2V:.+]] = memref.subview %[[ARG2]]
// CHECK-COUNT-6: vector.transfer_read %[[ARG0V]]
// CHECK-COUNT-3: vector.transfer_read %[[ARG1V]]
// CHECK-COUNT-2: vector.transfer_read %[[ARG2V]]
// CHECK: %[[RET0V:.+]] = memref.subview %[[RET0]]
// CHECK: linalg.copy(%[[ARG2V]], %[[RET0V]])
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// -----
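// Tests the in-place variant: with a read-write result binding, the transfers go straight through the output subview and no copy is inserted.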
#map0 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func @bufferize_transfer_op_inplace() {
%c3 = arith.constant 3 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:2x3xf32>
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x4xf32>
%3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<readwrite:2x4xf32>
%4 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x3xf32>
%5 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:3x4xf32> -> tensor<3x1xf32>
%6 = flow.dispatch.tensor.load %3, offsets = [%c0, %c0], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:2x4xf32> -> tensor<2x1xf32>
%7 = vector.transfer_read %4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%8 = vector.transfer_read %4[%c0, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%9 = vector.transfer_read %4[%c0, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%10 = vector.transfer_read %4[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%11 = vector.transfer_read %4[%c1, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%12 = vector.transfer_read %4[%c1, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32>
%13 = vector.transfer_read %5[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%14 = vector.transfer_read %5[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%15 = vector.transfer_read %5[%c2, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32>
%16 = vector.transfer_read %6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32>
%17 = vector.transfer_read %6[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32>
%18 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %7, %13, %16 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%19 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %8, %14, %18 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%20 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %9, %15, %19 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%21 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %10, %13, %17 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%22 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %11, %14, %21 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%23 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %12, %15, %22 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32>
%24 = vector.transfer_write %20, %6[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32>
%25 = vector.transfer_write %23, %24[%c1, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32>
flow.dispatch.tensor.store %25, %3, offsets = [%c0, %c0], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<2x1xf32> -> !flow.dispatch.tensor<readwrite:2x4xf32>
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @bufferize_transfer_op_inplace()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]]
// CHECK-COUNT-6: vector.transfer_read %[[ARG0V]]
// CHECK-COUNT-3: vector.transfer_read %[[ARG1V]]
// CHECK-COUNT-2: vector.transfer_read %[[RET0V]]
// CHECK-NOT: linalg.copy
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// CHECK: vector.transfer_write %{{.+}}, %[[RET0V]]
// -----
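// Tests that a two-result linalg.generic writes each result into its own output binding subview.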
#map = affine_map<(d0, d1) -> (d0, d1)>
func @multi_result() {
%c0 = arith.constant 0 : index
%c2 = arith.constant 2 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
%c3 = arith.constant 3 : index
%dim0 = hal.interface.load.constant offset = 0 : index
%dim1 = hal.interface.load.constant offset = 1 : index
%dim2 = hal.interface.load.constant offset = 2 : index
%dim3 = hal.interface.load.constant offset = 3 : index
%dim4 = hal.interface.load.constant offset = 4 : index
%dim5 = hal.interface.load.constant offset = 5 : index
%dim6 = hal.interface.load.constant offset = 6 : index
%dim7 = hal.interface.load.constant offset = 7 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim2, %dim3}
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim4, %dim5}
%3 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim6, %dim7}
%4 = hal.interface.load.constant offset = 8 : index
%5 = hal.interface.load.constant offset = 9 : index
%6 = hal.interface.load.constant offset = 10 : index
%7 = hal.interface.load.constant offset = 11 : index
%8 = hal.interface.workgroup.id[0] : index
%9 = hal.interface.workgroup.id[1] : index
%10 = hal.interface.workgroup.count[0] : index
%11 = hal.interface.workgroup.count[1] : index
%12 = hal.interface.workgroup.size[0] : index
%13 = hal.interface.workgroup.size[1] : index
%14 = arith.muli %9, %13 : index
%15 = arith.muli %11, %13 : index
%16 = arith.muli %8, %12 : index
%17 = arith.muli %10, %12 : index
scf.for %arg0 = %14 to %4 step %15 {
scf.for %arg1 = %16 to %5 step %17 {
%18 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %13]
%19 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%5, %12]
%20 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%21 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%shape = linalg.init_tensor [%18, %19] : tensor<?x?xf32>
%22:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]}
ins(%20, %21 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%shape, %shape : tensor<?x?xf32>, tensor<?x?xf32>) {
^bb0(%arg2: f32, %arg3 : f32, %arg4 : f32, %arg5 : f32): // no predecessors
%23 = arith.mulf %arg2, %arg3 : f32
%24 = arith.addf %arg2, %arg3 : f32
linalg.yield %23, %24 : f32, f32
} -> (tensor<?x?xf32>, tensor<?x?xf32>)
flow.dispatch.tensor.store %22#0, %2, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
flow.dispatch.tensor.store %22#1, %3, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @multi_result()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@ret1
// CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]]
// CHECK-DAG: %[[RET1V:.+]] = memref.subview %[[RET1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0V]], %[[ARG1V]]
// CHECK-SAME: outs(%[[RET0V]], %[[RET1V]]
// -----
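// Tests that linalg.pad_tensor operands are materialized as fill plus copy into allocated buffers that feed the matmul.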
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<()[s0] -> (s0 * 16)>
module {
func @padded_matmul() {
%c0 = arith.constant 0 : index
%c12544 = arith.constant 12544 : index
%c64 = arith.constant 64 : index
%c16 = arith.constant 16 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:12544x27xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:27x16xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:12544x16xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y]
%4 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %3 to %c12544 step %4 {
%5 = affine.apply #map1()[%workgroup_id_x]
%6 = affine.apply #map1()[%workgroup_count_x]
scf.for %arg1 = %5 to %c16 step %6 {
%7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 27], strides = [1, 1] : !flow.dispatch.tensor<readonly:12544x27xf32> -> tensor<64x27xf32>
%8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [27, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:27x16xf32> -> tensor<27x16xf32>
%9 = linalg.init_tensor [64, 16] : tensor<64x16xf32>
%10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<64x16xf32> -> tensor<64x16xf32>
%11 = linalg.pad_tensor %7 low[0, 0] high[0, 5] {
^bb0(%arg2: index, %arg3: index): // no predecessors
linalg.yield %cst : f32
} : tensor<64x27xf32> to tensor<64x32xf32>
%12 = linalg.pad_tensor %8 low[0, 0] high[5, 0] {
^bb0(%arg2: index, %arg3: index): // no predecessors
linalg.yield %cst : f32
} : tensor<27x16xf32> to tensor<32x16xf32>
%13 = linalg.matmul ins(%11, %12 : tensor<64x32xf32>, tensor<32x16xf32>) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32>
%14 = tensor.cast %13 : tensor<64x16xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %14, %2, offsets = [%arg0, %arg1], sizes = [%c64, %c16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:12544x16xf32>
}
}
return
}
}
// CHECK-LABEL: func @padded_matmul()
// CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32>
// CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32>
// CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<12544x27xf32>
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<27x16xf32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<12544x16xf32>
// CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1]
// CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1]
// CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1]
// CHECK: linalg.fill(%[[C0]], %[[DST_V]])
// CHECK: linalg.fill(%[[C0]], %[[LHS_PADDED]]) : f32, memref<64x32xf32>
// CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1]
// CHECK: linalg.copy(%[[LHS_V]], %[[LHS_PADDED_INTER]])
// CHECK: linalg.fill(%[[C0]], %[[RHS_PADDED]]) : f32, memref<32x16xf32>
// CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1]
// CHECK: linalg.copy(%[[RHS_V]], %[[RHS_PADDED_INTER]])
// CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>)
// -----
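// Tests padding with dynamic shapes: both operands and the result go through allocated pad buffers, and the result is copied back out through a subview.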
func @dot_general_padded() {
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%c1 = arith.constant 1 : index
%cst = arith.constant 0.000000e+00 : f32
%m = hal.interface.load.constant offset = 0 : index
%n = hal.interface.load.constant offset = 1 : index
%k = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n}
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg0 = %3 to %m step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg1 = %5 to %n step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg0)[%m]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x2xf32>
%9 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg1)[%n]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [2, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<2x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg0)[%m]
%12 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg1)[%n]
%13 = linalg.pad_tensor %8 low[0, 0] high[1, 2] {
^bb0(%arg2: index, %arg3: index): // no predecessors
linalg.yield %cst : f32
} : tensor<?x2xf32> to tensor<4x4xf32>
%14 = linalg.pad_tensor %10 low[0, 0] high[2, 3] {
^bb0(%arg2: index, %arg3: index): // no predecessors
linalg.yield %cst : f32
} : tensor<2x?xf32> to tensor<4x4xf32>
%15 = linalg.init_tensor [4, 4] : tensor<4x4xf32>
%16 = linalg.fill(%cst, %15) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
%17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %14 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%16 : tensor<4x4xf32>) -> tensor<4x4xf32>
%18 = tensor.extract_slice %17[0, 0] [%7, %9] [1, 1] : tensor<4x4xf32> to tensor<?x?xf32>
flow.dispatch.tensor.store %18, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0)>
// CHECK: func @dot_general_padded
// CHECK-DAG: %[[ALLOC_RET0:.+]] = memref.alloc
// CHECK-DAG: %[[ALLOC_ARG1:.+]] = memref.alloc
// CHECK-DAG: %[[ALLOC_ARG0:.+]] = memref.alloc
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[{{.*}}] : memref<?x?xf32>
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1[{{.*}}] : memref<?x?xf32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[{{.*}}] : memref<?x?xf32>
// CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1
// CHECK: scf.for %[[IV0:.+]] = %{{.+}} to %[[M]]
// CHECK: scf.for %[[IV1:.+]] = %{{.+}} to %[[N]]
// CHECK-DAG: %[[TILE_M:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[M]]]
// CHECK-DAG: %[[TILE_N:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[N]]]
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG0]]
// CHECK: %[[ALLOC_ARG0_SV:.+]] = memref.subview %[[ALLOC_ARG0]]
// CHECK: linalg.copy(%[[ARG0_SV]], %[[ALLOC_ARG0_SV]])
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG1]]
// CHECK: linalg.copy(%[[ARG1_SV]]
// CHECK: linalg.fill(%{{.*}}, %[[ALLOC_RET0]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC_ARG0]], %[[ALLOC_ARG1]]
// CHECK-SAME: outs(%[[ALLOC_RET0]]
// CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
// CHECK-DAG: %[[ALLOC_RET0_SV:.+]] = memref.subview
// CHECK: linalg.copy(%[[ALLOC_RET0_SV]], %[[RET0_SV]])
// -----
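// Tests an im2col dispatch: the generic and the collapse_shape reshapes run on allocated intermediates, and the matmul result is copied to the output subview.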
func @im2col() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%c112 = arith.constant 112 : index
%c32 = arith.constant 32 : index
%c16 = arith.constant 16 : index
%c4 = arith.constant 4 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x8xf32>
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x3x8x32xf32>
%2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_id_z = hal.interface.workgroup.id[2] : index
%workgroup_count_z = hal.interface.workgroup.count[2] : index
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_z]
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_z]
scf.for %arg0 = %3 to %c112 step %4 {
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_y]
scf.for %arg1 = %5 to %c112 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg2 = %7 to %c32 step %8 {
%9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0)
%10 = affine.min affine_map<(d0) -> (33, d0 * -2 + 225)>(%arg0)
%11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1)
%12 = affine.min affine_map<(d0) -> (33, d0 * -2 + 225)>(%arg1)
%13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x8xf32> -> tensor<1x?x?x8xf32>
%14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x8x32xf32> -> tensor<3x3x8x4xf32>
%15 = linalg.init_tensor [1, 16, 16, 4] : tensor<1x16x16x4xf32>
%16 = linalg.fill(%cst, %15) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x16x16x4xf32> -> tensor<1x16x16x4xf32>
%17 = linalg.init_tensor [1, 16, 16, 3, 3, 8] : tensor<1x16x16x3x3x8xf32>
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 * 2 + d3, d2 * 2 + d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x?x?x8xf32>) outs(%17 : tensor<1x16x16x3x3x8xf32>) {
^bb0(%arg3: f32, %arg4: f32): // no predecessors
linalg.yield %arg3 : f32
} -> tensor<1x16x16x3x3x8xf32>
%19 = linalg.tensor_collapse_shape %18 [[0, 1, 2], [3, 4, 5]] : tensor<1x16x16x3x3x8xf32> into tensor<256x72xf32>
%20 = linalg.tensor_collapse_shape %14 [[0, 1, 2], [3]] : tensor<3x3x8x4xf32> into tensor<72x4xf32>
%21 = linalg.tensor_collapse_shape %16 [[0, 1, 2], [3]] : tensor<1x16x16x4xf32> into tensor<256x4xf32>
%22 = linalg.matmul ins(%19, %20 : tensor<256x72xf32>, tensor<72x4xf32>) outs(%21 : tensor<256x4xf32>) -> tensor<256x4xf32>
%23 = linalg.tensor_expand_shape %22 [[0, 1, 2], [3]] : tensor<256x4xf32> into tensor<1x16x16x4xf32>
%24 = tensor.cast %23 : tensor<1x16x16x4xf32> to tensor<1x?x?x?xf32>
flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %c16, %c16, %c4], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
}
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @im2col
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK-DAG: %[[ALLOC_ARG0:.+]] = memref.alloc() : memref<1x16x16x3x3x8xf32>
// CHECK-DAG: %[[ALLOC_ARG1:.+]] = memref.alloc() : memref<3x3x8x4xf32>
// CHECK-DAG: %[[ALLOC_RET0:.+]] = memref.alloc() : memref<1x16x16x4xf32>
// CHECK: scf.for
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: linalg.copy(%[[ARG1_SV]], %[[ALLOC_ARG1]])
// CHECK-DAG: linalg.fill(%{{.*}}, %[[ALLOC_RET0]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0_SV]]
// CHECK-SAME: outs(%[[ALLOC_ARG0]]
// CHECK-DAG: %[[ALLOC_ARG0_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_ARG0]]
// CHECK-DAG: %[[ALLOC_ARG1_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_ARG1]]
// CHECK-DAG: %[[ALLOC_RET0_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_RET0]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC_ARG0_RESHAPE]], %[[ALLOC_ARG1_RESHAPE]]
// CHECK-SAME: outs(%[[ALLOC_RET0_RESHAPE]]
// CHECK: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
// CHECK: linalg.copy(%[[ALLOC_RET0]], %[[RET0_SV]])
// -----
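// Tests a two-result reduction: both linalg.fill inits and the multi-result
// linalg.generic should bufferize directly onto subviews of the two
// write-only output bindings, without intermediate allocations.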
func @multi_result_reduce() {
%c0 = arith.constant 0 : index
%c0_i32 = arith.constant 0 : i32
%c-2147483648_i32 = arith.constant -2147483648 : i32
%c2 = arith.constant 2 : index
%d0 = hal.interface.load.constant offset = 0 : index
%d1 = hal.interface.load.constant offset = 1 : index
%d2 = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1}
%1 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1}
%2 = hal.interface.binding.subspan @io::@wo0[%c0] : !flow.dispatch.tensor<writeonly:?xi32>{%d2}
%3 = hal.interface.binding.subspan @io::@wo1[%c0] : !flow.dispatch.tensor<writeonly:?xi32>{%d2}
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x]
%5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x]
scf.for %arg0 = %4 to %d1 step %5 {
%6 = affine.min affine_map<(d0)[s0] -> (128, -d0 + s0)>(%arg0)[%d1]
%7 = flow.dispatch.tensor.load %0, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%9 = flow.dispatch.tensor.load %1, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%13 = linalg.init_tensor [%6] : tensor<?xi32>
%14 = linalg.fill(%c-2147483648_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
%17 = linalg.fill(%c0_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32>
%18:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %9 : tensor<?x?xi32>, tensor<?x?xi32>) outs(%14, %17 : tensor<?xi32>, tensor<?xi32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} {
^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): // no predecessors
%19 = arith.cmpi sge, %arg1, %arg3 : i32
%20 = select %19, %arg1, %arg3 : i32
%21 = arith.cmpi eq, %arg1, %arg3 : i32
%22 = arith.cmpi slt, %arg2, %arg4 : i32
%23 = select %22, %arg2, %arg4 : i32
%24 = select %19, %arg2, %arg4 : i32
%25 = select %21, %23, %24 : i32
linalg.yield %20, %25 : i32, i32
} -> (tensor<?xi32>, tensor<?xi32>)
flow.dispatch.tensor.store %18#0, %2, offsets = [%arg0], sizes = [%6], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32>
flow.dispatch.tensor.store %18#1, %3, offsets = [%arg0], sizes = [%6], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32>
}
return
}
hal.interface private @io {
hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
hal.interface.binding @wo1, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @multi_result_reduce
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@ro0
// CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@ro1
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo0
// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@wo1
// CHECK: scf.for
// CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]]
// CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]]
// CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]]
// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET0_SV]]
// CHECK-DAG: %[[RET1_SV:.+]] = memref.subview %[[RET1]]
// CHECK-DAG: linalg.fill(%{{.*}}, %[[RET1_SV]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0_SV]], %[[ARG1_SV]]
// CHECK-SAME: outs(%[[RET0_SV]], %[[RET1_SV]]
// -----
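// Tests a workgroup- and L1-tiled matmul whose accumulator is seeded from a
// separate read-only init buffer: the init tile must be copied into the
// destination tile once, and the L1 matmuls then update it in place.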
#config0 = {tileSizes = [[64, 64]]}
#config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]}
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0) -> (64, -d0 + 250)>
#map2 = affine_map<(d0) -> (64, -d0 + 370)>
#map3 = affine_map<(d0) -> (32, -d0 + 250)>
#map4 = affine_map<(d0) -> (24, -d0 + 144)>
#map5 = affine_map<(d0) -> (32, -d0 + 370)>
#map6 = affine_map<(d0, d1) -> (32, d0 - d1)>
module {
func @l1_tiled_matmul_no_fill() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c24 = arith.constant 24 : index
%c144 = arith.constant 144 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c250 = arith.constant 250 : index
%c370 = arith.constant 370 : index
%0 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:250x144xf32>
%1 = hal.interface.binding.subspan @io::@ro2[%c0] : !flow.dispatch.tensor<readonly:144x370xf32>
%init = hal.interface.binding.subspan @io::@ro3[%c0] : !flow.dispatch.tensor<readonly:250x370xf32>
%2 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<writeonly:250x370xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y]
%4 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %3 to %c250 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x]
%6 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %5 to %c370 step %6 {
%7 = affine.min #map1(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32>
%9 = affine.min #map2(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32>
%11 = flow.dispatch.tensor.load %init, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x370xf32> -> tensor<?x?xf32>
%13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor<?x?xf32>) {
%14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
%16 = affine.min #map3(%arg2)
%17 = affine.min #map4(%arg6)
%18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32>
%19 = affine.min #map5(%arg4)
%20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32>
%21 = tensor.dim %arg7, %c0 : tensor<?x?xf32>
%22 = affine.min #map6(%21, %arg2)
%23 = tensor.dim %arg7, %c1 : tensor<?x?xf32>
%24 = affine.min #map6(%23, %arg4)
%25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32>
%27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %27 : tensor<?x?xf32>
}
scf.yield %15 : tensor<?x?xf32>
}
scf.yield %14 : tensor<?x?xf32>
}
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:250x370xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @ro1, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro2, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @ro3, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @wo, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
}
// CHECK-LABEL: func @l1_tiled_matmul_no_fill()
// CHECK-DAG: %[[M:.+]] = arith.constant 250 : index
// CHECK-DAG: %[[N:.+]] = arith.constant 370 : index
// CHECK-DAG: %[[K:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index
// CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@ro1[%{{.*}}] : memref<250x144xf32>
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@ro2[%{{.*}}] : memref<144x370xf32>
// CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan @io::@ro3[%{{.*}}] : memref<250x370xf32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@wo[%{{.*}}] : memref<250x370xf32>
// CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} {
// CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} {
// CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]])
// CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32
// CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]])
// CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32
// CHECK-DAG: %[[INIT_WORKGROUP_TILE:.+]] = memref.subview %[[INIT]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
// CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
// CHECK: linalg.copy(%[[INIT_WORKGROUP_TILE]], %[[DST_WORKGROUP_TILE]])
// CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] {
// CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]]
// CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]]
// CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]])
// CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]])
// CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]]
// CHECK-SAME: outs(%[[DST_L1_TILE]]
// -----
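// Same as above, but the accumulator is loaded from the read-write output
// binding itself, so no initial copy into the destination tile is needed.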
#config0 = {tileSizes = [[64, 64]]}
#config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]}
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0) -> (64, -d0 + 250)>
#map2 = affine_map<(d0) -> (64, -d0 + 370)>
#map3 = affine_map<(d0) -> (32, -d0 + 250)>
#map4 = affine_map<(d0) -> (24, -d0 + 144)>
#map5 = affine_map<(d0) -> (32, -d0 + 370)>
#map6 = affine_map<(d0, d1) -> (32, d0 - d1)>
module {
func @l1_tiled_matmul_no_fill_readwrite() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c24 = arith.constant 24 : index
%c144 = arith.constant 144 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c250 = arith.constant 250 : index
%c370 = arith.constant 370 : index
%0 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:250x144xf32>
%1 = hal.interface.binding.subspan @io::@ro2[%c0] : !flow.dispatch.tensor<readonly:144x370xf32>
%2 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<readwrite:250x370xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y]
%4 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %3 to %c250 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x]
%6 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %5 to %c370 step %6 {
%7 = affine.min #map1(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32>
%9 = affine.min #map2(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32>
%11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor<readwrite:250x370xf32> -> tensor<?x?xf32>
%13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor<?x?xf32>) {
%14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
%16 = affine.min #map3(%arg2)
%17 = affine.min #map4(%arg6)
%18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32>
%19 = affine.min #map5(%arg4)
%20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32>
%21 = tensor.dim %arg7, %c0 : tensor<?x?xf32>
%22 = affine.min #map6(%21, %arg2)
%23 = tensor.dim %arg7, %c1 : tensor<?x?xf32>
%24 = affine.min #map6(%23, %arg4)
%25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32>
%27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %27 : tensor<?x?xf32>
}
scf.yield %15 : tensor<?x?xf32>
}
scf.yield %14 : tensor<?x?xf32>
}
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:250x370xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @ro1, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @ro2, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @wo, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
}
// CHECK-LABEL: func @l1_tiled_matmul_no_fill_readwrite()
// CHECK-DAG: %[[M:.+]] = arith.constant 250 : index
// CHECK-DAG: %[[N:.+]] = arith.constant 370 : index
// CHECK-DAG: %[[K:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index
// CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@ro1[%{{.*}}] : memref<250x144xf32>
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@ro2[%{{.*}}] : memref<144x370xf32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@wo[%{{.*}}] : memref<250x370xf32>
// CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} {
// CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} {
// CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]])
// CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32
// CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]])
// CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32
// CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
// CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] {
// CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]]
// CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]]
// CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]])
// CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]])
// CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]]
// CHECK-SAME: outs(%[[DST_L1_TILE]]
// -----
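// Tests the workgroup- and L1-tiled matmul with a fill-initialized
// accumulator: all tiles should map onto subviews of the write-only
// destination buffer.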
#config0 = {tileSizes = [[64, 64]]}
#config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]}
#map0 = affine_map<()[s0] -> (s0 * 64)>
#map1 = affine_map<(d0) -> (64, -d0 + 250)>
#map2 = affine_map<(d0) -> (64, -d0 + 370)>
#map3 = affine_map<(d0) -> (32, -d0 + 250)>
#map4 = affine_map<(d0) -> (24, -d0 + 144)>
#map5 = affine_map<(d0) -> (32, -d0 + 370)>
#map6 = affine_map<(d0, d1) -> (32, d0 - d1)>
module {
func @l1_tiled_matmul() {
%cst = arith.constant 0.000000e+00 : f32
%c32 = arith.constant 32 : index
%c24 = arith.constant 24 : index
%c144 = arith.constant 144 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c250 = arith.constant 250 : index
%c370 = arith.constant 370 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:250x144xf32>
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:144x370xf32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:250x370xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply #map0()[%workgroup_id_y]
%4 = affine.apply #map0()[%workgroup_count_y]
scf.for %arg0 = %3 to %c250 step %4 {
%5 = affine.apply #map0()[%workgroup_id_x]
%6 = affine.apply #map0()[%workgroup_count_x]
scf.for %arg1 = %5 to %c370 step %6 {
%7 = affine.min #map1(%arg0)
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32>
%9 = affine.min #map2(%arg1)
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32>
%11 = linalg.init_tensor [%7, %9] : tensor<?x?xf32>
%12 = linalg.fill(%cst, %11) {__internal_linalg_transform__ = "workgroup", lowering.config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) {
%14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) {
%16 = affine.min #map3(%arg2)
%17 = affine.min #map4(%arg6)
%18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32>
%19 = affine.min #map5(%arg4)
%20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32>
%21 = tensor.dim %arg7, %c0 : tensor<?x?xf32>
%22 = affine.min #map6(%21, %arg2)
%23 = tensor.dim %arg7, %c1 : tensor<?x?xf32>
%24 = affine.min #map6(%23, %arg4)
%25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32>
%27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %27 : tensor<?x?xf32>
}
scf.yield %15 : tensor<?x?xf32>
}
scf.yield %14 : tensor<?x?xf32>
}
flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:250x370xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
}
// CHECK-LABEL: func @l1_tiled_matmul()
// CHECK-DAG: %[[M:.+]] = arith.constant 250 : index
// CHECK-DAG: %[[N:.+]] = arith.constant 370 : index
// CHECK-DAG: %[[K:.+]] = arith.constant 144 : index
// CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index
// CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.*}}] : memref<250x144xf32>
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%{{.*}}] : memref<144x370xf32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.*}}] : memref<250x370xf32>
// CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} {
// CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} {
// CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]])
// CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32
// CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]])
// CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32
// CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]]
// CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] {
// CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] {
// CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]]
// CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]]
// CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]])
// CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]])
// CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]]
// CHECK-SAME: outs(%[[DST_L1_TILE]]
// -----
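// Tests a scalar bubble-sort written with tensor.extract/tensor.insert inside
// nested scf.for loops: the input is copied to the output buffer once, and
// the element swaps lower to memref.load/memref.store on the output.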
func @sort1D() {
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@ro[%c0] : !flow.dispatch.tensor<readonly:4xi32>
%1 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<writeonly:4xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:4xi32> -> tensor<4xi32>
%3 = scf.for %arg0 = %c0 to %c4 step %c1 iter_args(%arg1 = %2) -> (tensor<4xi32>) {
%4 = scf.for %arg2 = %c0 to %c3 step %c1 iter_args(%arg3 = %arg1) -> (tensor<4xi32>) {
%5 = arith.addi %arg2, %c1 : index
%6 = tensor.extract %arg3[%arg2] : tensor<4xi32>
%7 = tensor.extract %arg3[%5] : tensor<4xi32>
%8 = arith.cmpi sgt, %6, %7 : i32
%11 = scf.if %8 -> (tensor<4xi32>) {
%12 = tensor.insert %6 into %arg3[%5] : tensor<4xi32>
%13 = tensor.insert %7 into %12[%arg2] : tensor<4xi32>
scf.yield %13 : tensor<4xi32>
} else {
scf.yield %arg3 : tensor<4xi32>
}
scf.yield %11 : tensor<4xi32>
}
scf.yield %4 : tensor<4xi32>
}
flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<4xi32> -> !flow.dispatch.tensor<writeonly:4xi32>
return
}
hal.interface private @io {
hal.interface.binding @ro, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @wo, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @sort1D()
// CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro
// CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo
// CHECK: linalg.copy(%[[INPUT]], %[[OUTPUT]])
// CHECK: scf.for %[[ARG0:.+]] =
// CHECK: scf.for %[[ARG1:.+]] =
// CHECK-DAG: %[[P1:.+]] = arith.addi %[[ARG1]]
// CHECK-DAG: %[[V1:.+]] = memref.load %[[OUTPUT]][%[[ARG1]]]
// CHECK-DAG: %[[V2:.+]] = memref.load %[[OUTPUT]][%[[P1]]]
// CHECK: scf.if
// CHECK-DAG: memref.store %[[V1]], %[[OUTPUT]][%[[P1]]]
// CHECK-DAG: memref.store %[[V2]], %[[OUTPUT]][%[[ARG1]]]
// -----
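// In-place variant of the sort above on a read-write binding: no copy is
// expected and the loads/stores operate directly on the in/out buffer.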
func @sort1D_inplace() {
%c4 = arith.constant 4 : index
%c3 = arith.constant 3 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan @io::@rw[%c0] : !flow.dispatch.tensor<readwrite:4xi32>
%2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:4xi32> -> tensor<4xi32>
%3 = scf.for %arg0 = %c0 to %c4 step %c1 iter_args(%arg1 = %2) -> (tensor<4xi32>) {
%4 = scf.for %arg2 = %c0 to %c3 step %c1 iter_args(%arg3 = %arg1) -> (tensor<4xi32>) {
%5 = arith.addi %arg2, %c1 : index
%6 = tensor.extract %arg3[%arg2] : tensor<4xi32>
%7 = tensor.extract %arg3[%5] : tensor<4xi32>
%8 = arith.cmpi sgt, %6, %7 : i32
%11 = scf.if %8 -> (tensor<4xi32>) {
%12 = tensor.insert %6 into %arg3[%5] : tensor<4xi32>
%13 = tensor.insert %7 into %12[%arg2] : tensor<4xi32>
scf.yield %13 : tensor<4xi32>
} else {
scf.yield %arg3 : tensor<4xi32>
}
scf.yield %11 : tensor<4xi32>
}
scf.yield %4 : tensor<4xi32>
}
flow.dispatch.tensor.store %3, %0, offsets = [], sizes = [], strides = [] : tensor<4xi32> -> !flow.dispatch.tensor<readwrite:4xi32>
return
}
hal.interface private @io {
hal.interface.binding @rw, set=0, binding=0, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @sort1D_inplace()
// CHECK-DAG: %[[INOUT:.+]] = hal.interface.binding.subspan @io::@rw
// CHECK: scf.for %[[ARG0:.+]] =
// CHECK: scf.for %[[ARG1:.+]] =
// CHECK-DAG: %[[P1:.+]] = arith.addi %[[ARG1]]
// CHECK-DAG: %[[V1:.+]] = memref.load %[[INOUT]][%[[ARG1]]]
// CHECK-DAG: %[[V2:.+]] = memref.load %[[INOUT]][%[[P1]]]
// CHECK: scf.if
// CHECK-DAG: memref.store %[[V1]], %[[INOUT]][%[[P1]]]
// CHECK-DAG: memref.store %[[V2]], %[[INOUT]][%[[ARG1]]]
// -----
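// Tests that linalg_ext.sort on a read-write binding bufferizes in place,
// folding away the dispatch tensor load/store pair.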
func @linalg_ext_sort_1d() {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan @io::@rw[%c0] : !flow.dispatch.tensor<readwrite:128xi32>
%1 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:128xi32> -> tensor<128xi32>
%2 = linalg_ext.sort dimension(0) outs(%1 : tensor<128xi32>) {
^bb0(%arg0: i32, %arg1: i32): // no predecessors
%3 = arith.cmpi sgt, %arg0, %arg1 : i32
linalg_ext.yield %3 : i1
} -> tensor<128xi32>
flow.dispatch.tensor.store %2, %0, offsets = [], sizes = [], strides = [] : tensor<128xi32> -> !flow.dispatch.tensor<readwrite:128xi32>
return
}
hal.interface private @io {
hal.interface.binding @rw, set=0, binding=0, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @linalg_ext_sort_1d()
// CHECK-DAG: %[[INOUT:.+]] = hal.interface.binding.subspan @io::@rw
// CHECK: linalg_ext.sort
// CHECK-SAME: dimension(0)
// CHECK-SAME: outs(%[[INOUT]] : memref<128xi32>)
// -----
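// Tests a tiled tensor-insert-slice pattern: each source tile is copied into
// a destination subview whose indices are offset by push-constant values.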
builtin.func @tensor_insert_slice() {
%c0 = arith.constant 0 : index
%1 = hal.interface.load.constant offset = 0 : index
%2 = hal.interface.load.constant offset = 1 : index
%d0 = hal.interface.load.constant offset = 2 : index
%d1 = hal.interface.load.constant offset = 3 : index
%d2 = hal.interface.load.constant offset = 4 : index
%d3 = hal.interface.load.constant offset = 5 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1}
%3 = hal.interface.binding.subspan @io::@s0b1_xw_external[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%d2, %d3}
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_y, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_y, %workgroup_count_y]
scf.for %arg0 = %4 to %d0 step %5 {
%6 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %d0]
%7 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_x, %workgroup_count_x]
scf.for %arg1 = %7 to %d1 step %8 {
%9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %d1]
%10 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [%6, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
%11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg0)[%1]
%12 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%2]
flow.dispatch.tensor.store %10, %3, offsets = [%11, %12], sizes = [%6, %9], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
}
}
return
}
hal.interface @io attributes {push_constants = 6 : index, sym_visibility = "private"} {
hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @s0b1_xw_external, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)>
// CHECK: func @tensor_insert_slice()
// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.+}}] : memref<?x?xi32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b1_xw_external[%{{.+}}] : memref<?x?xi32>
// CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.load.constant offset = 1
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: scf.for %[[IV1:.+]] =
// CHECK-DAG: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]], %[[IV1]]]
// CHECK-DAG: %[[DST_IDX_Y:.+]] = affine.apply #[[MAP]](%[[IV0]])[%[[OFFSET_Y]]]
// CHECK-DAG: %[[DST_IDX_X:.+]] = affine.apply #[[MAP]](%[[IV1]])[%[[OFFSET_X]]]
// CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][%[[DST_IDX_Y]], %[[DST_IDX_X]]]
// CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]])
// -----
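// Tests a dynamic-update-slice pattern: the destination offset is computed
// from a clamped value read out of a tensor<i32> binding, and the store is
// rank-reducing into a unit outer dimension.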
builtin.func @dynamic_update_slice() {
%c0 = arith.constant 0 : index
%c3 = arith.constant 3 : index
%c0_i32 = arith.constant 0 : i32
%d0 = hal.interface.load.constant offset = 0 : index
%d1 = hal.interface.load.constant offset = 1 : index
%0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:?xi32>{%d0}
%1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:i32>
%2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%d1, %d0}
%3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32>
%4 = tensor.extract %3[] : tensor<i32>
%5 = arith.cmpi slt, %4, %c0_i32 : i32
%6 = select %5, %4, %c0_i32 : i32
%7 = arith.cmpi sgt, %6, %c0_i32 : i32
%8 = select %7, %6, %c0_i32 : i32
%9 = arith.index_cast %8 : i32 to index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x]
%11 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x]
scf.for %arg0 = %10 to %d0 step %11 {
%12 = affine.min affine_map<(d0)[s0] -> (64, -d0 + s0)>(%arg0)[%d0]
%13 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [%12], strides = [1] : !flow.dispatch.tensor<readonly:?xi32> -> tensor<?xi32>
%14 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg0)[%9]
flow.dispatch.tensor.store %13, %2, offsets = [0, %14], sizes = [1, %12], strides = [1, 1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
}
return
}
hal.interface @io attributes {sym_visibility = "private"} {
hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @dynamic_update_slice()
// CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.+}}] : memref<?xi32>
// CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.+}}] : memref<?x?xi32>
// CHECK-DAG: %[[D0:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[D1:.+]] = hal.interface.load.constant offset = 1
// CHECK: scf.for %[[IV0:.+]] =
// CHECK: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]]]
// CHECK-SAME: : memref<?xi32> to memref<?xi32, #{{.+}}>
// CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][0, %{{.+}}] [1, %{{.+}}]
// CHECK-SAME: : memref<?x?xi32> to memref<?xi32, #{{.+}}>
// CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]])
// -----
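// Tests multi-level tiling (workgroup plus two L1 levels) of a matmul with a
// fused elementwise consumer: the fill, the innermost matmuls, and the
// trailing linalg.generic should all reuse subviews of the output buffer.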
func @multi_level_tile_fuse() {
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant 0.000000e+00 : f32
%m = hal.interface.load.constant offset = 0 : index
%n = hal.interface.load.constant offset = 1 : index
%k = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n}
%2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:f32>
%3 = hal.interface.binding.subspan @io::@arg3[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
%4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %5 to %m step %6 {
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %7 to %n step %8 {
%9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %m]
%10 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %n]
%11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
%13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%9, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [%k, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%16 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
%17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%18 = scf.for %arg2 = %c0 to %9 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) {
%20 = scf.for %arg4 = %c0 to %10 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) {
%21 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%9, %arg2)
%22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%10, %arg4)
%23 = tensor.extract_slice %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%24 = scf.for %arg6 = %c0 to %21 step %c4 iter_args(%arg7 = %23) -> (tensor<?x?xf32>) {
%26 = scf.for %arg8 = %c0 to %22 step %c4 iter_args(%arg9 = %arg7) -> (tensor<?x?xf32>) {
%27 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%21, %arg6)
%28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg2)
%29 = tensor.extract_slice %13[%28, 0] [%27, %k] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%30 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%22, %arg8)
%31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg8, %arg4)
%32 = tensor.extract_slice %15[0, %31] [%k, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%33 = tensor.extract_slice %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%34 = linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[], [4, 4, 4], [4, 4, 4]]}} ins(%29, %32 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%33 : tensor<?x?xf32>) -> tensor<?x?xf32>
%35 = tensor.insert_slice %34 into %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %35 : tensor<?x?xf32>
}
scf.yield %26 : tensor<?x?xf32>
}
%25 = tensor.insert_slice %24 into %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %25 : tensor<?x?xf32>
}
scf.yield %20 : tensor<?x?xf32>
}
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %18 : tensor<f32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%20 = arith.addf %arg2, %arg3 : f32
linalg.yield %20 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %19, %3, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @arg3, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @multi_level_tile_fuse()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1
// CHECK-DAG: %[[K:.+]] = hal.interface.load.constant offset = 2
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[K]]}
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%[[C0]]] : memref<?x?xf32>{%[[K]], %[[N]]}
// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan @io::@arg2[%[[C0]]] : memref<f32>
// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan @io::@arg3[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[N]]}
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
// CHECK: scf.for
// CHECK: scf.for
// CHECK: %[[OUT_SUBVIEW2:.+]] = memref.subview %[[OUT_SUBVIEW1]]
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[LHS_SUBVIEW2:.+]] = memref.subview %[[LHS_SUBVIEW1]]
// CHECK-DAG: %[[RHS_SUBVIEW2:.+]] = memref.subview %[[RHS_SUBVIEW1]]
// CHECK-DAG: %[[OUT_SUBVIEW3:.+]] = memref.subview %[[OUT_SUBVIEW2]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW2]], %[[RHS_SUBVIEW2]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW3]] :
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[SCALAR]], %[[OUT_SUBVIEW1]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// -----
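// Tests operand fusion: the matmul result feeding the elementwise consumer
// and the consumer's own init should bufferize onto the same output subview,
// so fill, matmul, and generic all use one destination view.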
func @operand_fusion() {
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%cst = arith.constant 0.000000e+00 : f32
%m = hal.interface.load.constant offset = 0 : index
%n = hal.interface.load.constant offset = 1 : index
%k = hal.interface.load.constant offset = 2 : index
%0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k}
%1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n}
%2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:f32>
%3 = hal.interface.binding.subspan @io::@arg3[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n}
%4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg0 = %5 to %c2 step %6 {
%7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x]
scf.for %arg1 = %7 to %c1 step %8 {
%9 = affine.min affine_map<(d0) -> (4, -d0 + 2)>(%arg0)
%10 = affine.min affine_map<(d0) -> (4, -d0 + 1)>(%arg1)
%11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32>
%12 = affine.min affine_map<(d0) -> (-d0 + 2, 4)>(%arg0)
%13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%12, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%14 = affine.min affine_map<(d0) -> (-d0 + 1, 4)>(%arg1)
%15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %14], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
%16 = linalg.init_tensor [%12, %14] : tensor<?x?xf32>
%17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"}
ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>)
outs(%17: tensor<?x?xf32>) -> tensor<?x?xf32>
%19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %18 : tensor<f32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%20 = arith.addf %arg2, %arg3 : f32
linalg.yield %20 : f32
} -> tensor<?x?xf32>
flow.dispatch.tensor.store %19, %3, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
}
}
return
}
hal.interface private @io {
hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
hal.interface.binding @arg3, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @operand_fusion()
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1
// CHECK-DAG: %[[K:.+]] = hal.interface.load.constant offset = 2
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[K]]}
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%[[C0]]] : memref<?x?xf32>{%[[K]], %[[N]]}
// CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan @io::@arg2[%[[C0]]] : memref<f32>
// CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan @io::@arg3[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[N]]}
// CHECK: scf.for
// CHECK: scf.for
// CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]]
// CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]]
// CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]]
// CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_SUBVIEW1]], %[[RHS_SUBVIEW1]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[SCALAR]], %[[OUT_SUBVIEW1]] :
// CHECK-SAME: outs(%[[OUT_SUBVIEW1]] :