// RUN: iree-opt %s --iree-codegen-linalg-bufferize --canonicalize --cse --split-input-file | IreeFileCheck %s

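// Matmul with the init tensor in a read-only binding and the result in a
// separate binding: bufferization must copy the init tile into the result
// subview before the matmul updates it in place.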
func @tile_from_tensor_load() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
  %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
      %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %9, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}

hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_tensor_load()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

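// Same matmul, but the init binding is read-write and also receives the
// store: bufferization reuses it directly and no copy is needed.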
func @tile_from_tensor_load_inplace() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
      %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %9, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
    }
  }
  return
}

hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_tensor_load_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

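// The result is stored both back into the read-write init binding and into a
// second write-only binding: the first store is in place, the second becomes
// a copy.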
func @tile_from_tensor_load_inplace_and_copy() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
  %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
      %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %9, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
      flow.dispatch.tensor.store %9, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}

hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_tensor_load_inplace_and_copy()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[RESULT1:.+]] = memref.subview %[[RETURN1]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT1]]
// CHECK: %[[RESULT2:.+]] = memref.subview %[[RETURN2]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[RESULT1]], %[[RESULT2]])

// -----

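// A pointwise op produces the matmul LHS; its result is not backed by any
// binding, so bufferization must allocate a scratch buffer for it.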
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_lhs() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
  %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %shape = linalg.init_tensor [1, 3] : tensor<1x3xf32>
      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
          ins(%6 : tensor<1x3xf32>) outs(%shape : tensor<1x3xf32>) {
      ^bb0(%arg2: f32, %s: f32): // no predecessors
        linalg.yield %arg2 : f32
      } -> tensor<1x3xf32>
      %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%8, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_pointwise_lhs()
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x3xf32>
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]] :
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

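// Same pointwise LHS producer, with the matmul updating the read-write init
// binding in place.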
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_lhs_inplace() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %shape = linalg.init_tensor [1, 3] : tensor<1x3xf32>
      %8 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
          ins(%6 : tensor<1x3xf32>) outs(%shape : tensor<1x3xf32>) {
      ^bb0(%arg2: f32, %s: f32): // no predecessors
        linalg.yield %arg2 : f32
      } -> tensor<1x3xf32>
      %9 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%8, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
    }
  }
  return
}

hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_pointwise_lhs_inplace()
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<1x3xf32>
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[LHS]] :
// CHECK-SAME: outs(%[[ALLOC]]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[ALLOC]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

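// A pointwise op produces the matmul outs operand; its result is written
// directly into the result subview, so no allocation is needed.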
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_outs() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
  %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
      %shape = linalg.init_tensor [1, 1] : tensor<1x1xf32>
      %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
          ins(%8 : tensor<1x1xf32>) outs(%shape : tensor<1x1xf32>) {
      ^bb0(%arg2: f32, %s: f32): // no predecessors
        linalg.yield %arg2 : f32
      } -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_pointwise_outs()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[INIT]] :
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

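// Same pointwise outs producer, reusing the read-write init binding as the
// result buffer.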
#map = affine_map<(d0, d1) -> (d0, d1)>
func @tile_from_pointwise_outs_inplace() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
      %shape = linalg.init_tensor [1, 1] : tensor<1x1xf32>
      %9 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]}
          ins(%8 : tensor<1x1xf32>) outs(%shape : tensor<1x1xf32>) {
      ^bb0(%arg2: f32, %s: f32): // no predecessors
        linalg.yield %arg2 : f32
      } -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_pointwise_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.generic
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS]], %[[RHS]]
// CHECK-SAME: outs(%[[RESULT]]

// -----

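// One matmul feeds the outs operand of a second: both update the same result
// subview after a single copy of the init tile.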
func @tile_from_matmul_outs() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %N}
  %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x1xf32>
      %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %3, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @tile_from_matmul_outs()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[TENSOR_INIT:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK-DAG: %[[INIT:.+]] = memref.subview %[[TENSOR_INIT]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK: linalg.copy(%[[INIT]], %[[RESULT]])
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]

// -----

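// Same chained matmuls, updating the read-write init binding in place with
// no copy.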
func @tile_from_matmul_outs_inplace() {
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c4 = arith.constant 4 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %M = hal.interface.load.constant offset = 0 : index
  %N = hal.interface.load.constant offset = 1 : index
  %K = hal.interface.load.constant offset = 2 : index
  %0 = hal.interface.binding.subspan @io::@TENSOR_LHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%M, %K}
  %1 = hal.interface.binding.subspan @io::@TENSOR_RHS[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%K, %N}
  %2 = hal.interface.binding.subspan @io::@TENSOR_INIT[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%M, %N}
  %4 = hal.interface.workgroup.id[0] : index
  %5 = hal.interface.workgroup.id[1] : index
  scf.for %arg0 = %5 to %c2 step %c2 {
    scf.for %arg1 = %4 to %c4 step %c4 {
      %6 = flow.dispatch.tensor.load %0, offsets = [%arg0, %c0], sizes = [%c1, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<1x3xf32>
      %7 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<3x1xf32>
      %8 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<1x1xf32>
      %9 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
      %10 = linalg.matmul ins(%6, %7 : tensor<1x3xf32>, tensor<3x1xf32>) outs(%9 : tensor<1x1xf32>) -> tensor<1x1xf32>
      flow.dispatch.tensor.store %10, %2, offsets = [%arg0, %arg1], sizes = [%c1, %c1], strides = [%c1, %c1] : tensor<1x1xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @TENSOR_LHS, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_RHS, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @TENSOR_INIT, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-LABEL: func @tile_from_matmul_outs_inplace()
// CHECK-DAG: %[[TENSOR_LHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_LHS
// CHECK-DAG: %[[TENSOR_RHS:.+]] = hal.interface.binding.subspan @io::@TENSOR_RHS
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@TENSOR_INIT
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[RESULT:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] [1, 1] [1, 1]
// CHECK-DAG: %[[LHS:.+]] = memref.subview %[[TENSOR_LHS]][%[[IV0]], 0] [1, 3] [1, 1]
// CHECK-DAG: %[[RHS:.+]] = memref.subview %[[TENSOR_RHS]][0, %[[IV1]]] [3, 1] [1, 1]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]
// CHECK: linalg.matmul
// CHECK-SAME: outs(%[[RESULT]]

// -----

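// Dynamically shaped matmul: the dimension push constants must be threaded
// through to the bufferized bindings, subviews, and affine.min tile-size
// computations.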
func @bufferize_dynamic() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %4 = hal.interface.load.constant offset = 0 : index
  %5 = hal.interface.load.constant offset = 1 : index
  %6 = hal.interface.load.constant offset = 2 : index
  %7 = hal.interface.load.constant offset = 3 : index
  %8 = hal.interface.load.constant offset = 4 : index
  %9 = hal.interface.load.constant offset = 5 : index
  %10 = hal.interface.load.constant offset = 6 : index
  %11 = hal.interface.load.constant offset = 7 : index
  %LHS = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
  %RHS = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%6, %7}
  %INIT = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%8, %9}
  %RET = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%10, %11}
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %20 = arith.muli %workgroup_size_y, %workgroup_id_y : index
  %21 = arith.muli %workgroup_size_y, %workgroup_count_y : index
  scf.for %arg0 = %20 to %4 step %21 {
    %22 = arith.muli %workgroup_size_x, %workgroup_id_x : index
    %23 = arith.muli %workgroup_size_x, %workgroup_count_x : index
    scf.for %arg1 = %22 to %7 step %23 {
      %24 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %workgroup_size_y]
      %25 = flow.dispatch.tensor.load %LHS, offsets = [%arg0, %c0], sizes = [%24, %5], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
      %26 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%7, %workgroup_size_x]
      %27 = flow.dispatch.tensor.load %RHS, offsets = [%c0, %arg1], sizes = [%6, %26], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
      %28 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %8]
      %29 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %9]
      %30 = flow.dispatch.tensor.load %INIT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
      %31 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%25, %27 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%30 : tensor<?x?xf32>) -> tensor<?x?xf32>
      flow.dispatch.tensor.store %31, %RET, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard"
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
// CHECK: func @bufferize_dynamic()
// CHECK: %[[DIM0:.+]] = hal.interface.load.constant offset = 0 : index
// CHECK: %[[DIM1:.+]] = hal.interface.load.constant offset = 1 : index
// CHECK: %[[DIM2:.+]] = hal.interface.load.constant offset = 2 : index
// CHECK: %[[DIM3:.+]] = hal.interface.load.constant offset = 3 : index
// CHECK: %[[DIM4:.+]] = hal.interface.load.constant offset = 4 : index
// CHECK: %[[DIM5:.+]] = hal.interface.load.constant offset = 5 : index
// CHECK: %[[DIM6:.+]] = hal.interface.load.constant offset = 6 : index
// CHECK: %[[DIM7:.+]] = hal.interface.load.constant offset = 7 : index
// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<?x?xf32>{%[[DIM0]], %[[DIM1]]}
// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%{{.+}}] : memref<?x?xf32>{%[[DIM2]], %[[DIM3]]}
// CHECK: %[[INIT:.+]] = hal.interface.binding.subspan @io::@arg2[%{{.+}}] : memref<?x?xf32>{%[[DIM4]], %[[DIM5]]}
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@ret0[%{{.+}}] : memref<?x?xf32>{%[[DIM6]], %[[DIM7]]}
// CHECK-DAG: %[[WGSIZE_X:.+]] = hal.interface.workgroup.size[0]
// CHECK-DAG: %[[WGSIZE_Y:.+]] = hal.interface.workgroup.size[1]
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM0]], %[[WGSIZE_Y]]]
// CHECK: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILE_M]], %[[DIM1]]]
// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV1]])[%[[DIM3]], %[[WGSIZE_X]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[DIM2]], %[[TILE_N]]]
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[WGSIZE_Y]], %[[DIM4]]]
// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[WGSIZE_X]], %[[DIM5]]]
// CHECK-DAG: %[[INIT_TILE:.+]] = memref.subview %[[INIT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK: linalg.copy(%[[INIT_TILE]], %[[RESULT_TILE]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]

// -----

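// Dynamic-shape variant with a read-write init binding: the matmul updates
// it in place and no copy is emitted.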
func @bufferize_dynamic_inplace() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %4 = hal.interface.load.constant offset = 0 : index
  %5 = hal.interface.load.constant offset = 1 : index
  %6 = hal.interface.load.constant offset = 2 : index
  %7 = hal.interface.load.constant offset = 3 : index
  %8 = hal.interface.load.constant offset = 4 : index
  %9 = hal.interface.load.constant offset = 5 : index
  %LHS = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%4, %5}
  %RHS = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%6, %7}
  %OUT = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readwrite:?x?xf32>{%8, %9}
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %20 = arith.muli %workgroup_size_y, %workgroup_id_y : index
  %21 = arith.muli %workgroup_size_y, %workgroup_count_y : index
  scf.for %arg0 = %20 to %4 step %21 {
    %22 = arith.muli %workgroup_size_x, %workgroup_id_x : index
    %23 = arith.muli %workgroup_size_x, %workgroup_count_x : index
    scf.for %arg1 = %22 to %7 step %23 {
      %24 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %workgroup_size_y]
      %25 = flow.dispatch.tensor.load %LHS, offsets = [%arg0, %c0], sizes = [%24, %5], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
      %26 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%7, %workgroup_size_x]
      %27 = flow.dispatch.tensor.load %RHS, offsets = [%c0, %arg1], sizes = [%6, %26], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
      %28 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %8]
      %29 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %9]
      %30 = flow.dispatch.tensor.load %OUT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:?x?xf32> -> tensor<?x?xf32>
      %31 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%25, %27 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%30 : tensor<?x?xf32>) -> tensor<?x?xf32>
      flow.dispatch.tensor.store %31, %OUT, offsets = [%arg0, %arg1], sizes = [%28, %29], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read|Write"
}
// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>
// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>
// CHECK: func @bufferize_dynamic_inplace()
// CHECK: %[[DIM0:.+]] = hal.interface.load.constant offset = 0 : index
// CHECK: %[[DIM1:.+]] = hal.interface.load.constant offset = 1 : index
// CHECK: %[[DIM2:.+]] = hal.interface.load.constant offset = 2 : index
// CHECK: %[[DIM3:.+]] = hal.interface.load.constant offset = 3 : index
// CHECK: %[[DIM4:.+]] = hal.interface.load.constant offset = 4 : index
// CHECK: %[[DIM5:.+]] = hal.interface.load.constant offset = 5 : index
// CHECK: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<?x?xf32>{%[[DIM0]], %[[DIM1]]}
// CHECK: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%{{.+}}] : memref<?x?xf32>{%[[DIM2]], %[[DIM3]]}
// CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@arg2[%{{.+}}] : memref<?x?xf32>{%[[DIM4]], %[[DIM5]]}
// CHECK-DAG: %[[WGSIZE_X:.+]] = hal.interface.workgroup.size[0]
// CHECK-DAG: %[[WGSIZE_Y:.+]] = hal.interface.workgroup.size[1]
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[DIM0]], %[[WGSIZE_Y]]]
// CHECK: %[[LHS_TILE:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] [%[[TILE_M]], %[[DIM1]]]
// CHECK: %[[TILE_N:.+]] = affine.min #[[MAP0]](%[[IV1]])[%[[DIM3]], %[[WGSIZE_X]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] [%[[DIM2]], %[[TILE_N]]]
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP2]](%[[IV0]])[%[[WGSIZE_Y]], %[[DIM4]]]
// CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[WGSIZE_X]], %[[DIM5]]]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RESULT]][%[[IV0]], %[[IV1]]] [%[[TILE_M_2]], %[[TILE_N_2]]]
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]

// -----

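// A tensor expand_shape between load and store becomes a memref.expand_shape
// of the source binding plus a copy into the result binding.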
func @reshape_simple() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index
  %c12 = arith.constant 12 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
  %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
  %3 = linalg.tensor_expand_shape %2 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
  flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_simple()
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.copy(%[[RESHAPE]], %[[RET0]])

// -----

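// The expand_shape result feeds a pointwise op, so the reshaped memref is
// consumed directly as the generic's input and no copy is needed.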
func @reshape_fused_source() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index
  %c12 = arith.constant 12 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
  %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
  %3 = linalg.tensor_expand_shape %2 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
  %4 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
  %5 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%3 : tensor<3x4xi32>) outs(%4 : tensor<3x4xi32>) {
  ^bb0(%arg0 : i32, %arg1 : i32):
    %6 = arith.addi %arg0, %arg0 : i32
    linalg.yield %6 : i32
  } -> tensor<3x4xi32>
  flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_source()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<12xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<3x4xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RET0]] : memref<3x4xi32>)

// -----

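// The reshaped value is both consumed by a pointwise op and stored to a
// second binding, so an extra copy is emitted for the second store.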
func @reshape_fused_source_and_copyout() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index
  %c12 = arith.constant 12 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:12xi32>
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
  %2 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32>
  %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:12xi32> -> tensor<12xi32>
  %4 = linalg.tensor_expand_shape %3 [[0, 1]] : tensor<12xi32> into tensor<3x4xi32>
  %5 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
  %6 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%4 : tensor<3x4xi32>) outs(%5 : tensor<3x4xi32>) {
  ^bb0(%arg0 : i32, %arg1 : i32):
    %7 = arith.addi %arg0, %arg0 : i32
    linalg.yield %7 : i32
  } -> tensor<3x4xi32>
  flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
  flow.dispatch.tensor.store %4, %2, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
  hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_source_and_copyout()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<12xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<3x4xi32>
// CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@ret1[%[[C0]]] : memref<3x4xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[ARG0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[RESHAPE]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RET0]] : memref<3x4xi32>)
// CHECK: linalg.copy(%[[RESHAPE]], %[[RET1]])

// -----

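// A collapse_shape on the stored value bufferizes by expanding the result
// buffer instead, letting the generic write into it directly.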
func @reshape_fused_target() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index
  %c12 = arith.constant 12 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:3x4xi32>
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:12xi32>
  %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x4xi32> -> tensor<3x4xi32>
  %3 = linalg.init_tensor [3, 4] : tensor<3x4xi32>
  %4 = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]}
      ins(%2 : tensor<3x4xi32>) outs(%3 : tensor<3x4xi32>) {
  ^bb0(%arg0 : i32, %arg1 : i32):
    %5 = arith.addi %arg0, %arg0 : i32
    linalg.yield %5 : i32
  } -> tensor<3x4xi32>
  %5 = linalg.tensor_collapse_shape %4 [[0, 1]] : tensor<3x4xi32> into tensor<12xi32>
  flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<12xi32> -> !flow.dispatch.tensor<writeonly:12xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK: func @reshape_fused_target()
// CHECK: %[[C0:.+]] = arith.constant 0
// CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<3x4xi32>
// CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[%[[C0]]] : memref<12xi32>
// CHECK: %[[RESHAPE:.+]] = memref.expand_shape %[[RET0]] {{\[}}[0, 1]]
// CHECK: linalg.generic
// CHECK-SAME: ins(%[[ARG0]] : memref<3x4xi32>)
// CHECK-SAME: outs(%[[RESHAPE]] : memref<3x4xi32>)

// -----

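// Rank-reducing collapse_shape on the LHS followed by a tiled fill + matmul;
// both the fill and the matmul write directly into the result subview.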
func @dot_general_lowering() {
  %cst = arith.constant 0.000000e+00 : f32
  %c3 = arith.constant 3 : index
  %c0 = arith.constant 0 : index
  %c2 = arith.constant 2 : index
  %c1 = arith.constant 1 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x1x2xf32>
  %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:2x3xf32>
  %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:1x3xf32>
  %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x1x2xf32> -> tensor<1x1x2xf32>
  %4 = linalg.tensor_collapse_shape %3 [[0, 1], [2]] : tensor<1x1x2xf32> into tensor<1x2xf32>
  %workgroup_size_x = hal.interface.workgroup.size[0] : index
  %workgroup_size_y = hal.interface.workgroup.size[1] : index
  %workgroup_id_x = hal.interface.workgroup.id[0] : index
  %workgroup_count_x = hal.interface.workgroup.count[0] : index
  %workgroup_id_y = hal.interface.workgroup.id[1] : index
  %workgroup_count_y = hal.interface.workgroup.count[1] : index
  %5 = arith.muli %workgroup_size_y, %workgroup_id_y : index
  %6 = arith.muli %workgroup_size_y, %workgroup_count_y : index
  scf.for %arg0 = %5 to %c1 step %6 {
    %7 = arith.muli %workgroup_size_x, %workgroup_id_x : index
    %8 = arith.muli %workgroup_size_x, %workgroup_count_x : index
    scf.for %arg1 = %7 to %c3 step %8 {
      %9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 1)>(%arg0)[%workgroup_size_y]
      %10 = tensor.extract_slice %4[%arg0, 0] [%9, 2] [1, 1] : tensor<1x2xf32> to tensor<?x2xf32>
      %11 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 3)>(%arg1)[%workgroup_size_x]
      %12 = flow.dispatch.tensor.load %1, offsets = [%c0, %arg1], sizes = [%c2, %11], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x?xf32>
      %13 = linalg.init_tensor [%9, %11] : tensor<?x?xf32>
      %14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
      %15 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%10, %12 : tensor<?x2xf32>, tensor<2x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
      flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%9, %11], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:1x3xf32>
    }
  }
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @dot_general_lowering()
// CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1
// CHECK-DAG: %[[RESHAPE_LHS:.+]] = memref.collapse_shape %[[LHS]]
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: scf.for %[[IV0:.+]] = {{.+}} {
// CHECK: scf.for %[[IV1:.+]] = {{.+}} {
// CHECK-DAG: %[[LHS_TILE:.+]] = memref.subview %[[RESHAPE_LHS]][%[[IV0]], 0]
// CHECK-DAG: %[[RESULT_TILE:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]]
// CHECK-DAG: %[[RHS_TILE:.+]] = memref.subview %[[RHS]][0, %[[IV1]]]
// CHECK: linalg.fill(%{{.+}}, %[[RESULT_TILE]])
// CHECK: linalg.matmul
// CHECK-SAME: ins(%[[LHS_TILE]], %[[RHS_TILE]]
// CHECK-SAME: outs(%[[RESULT_TILE]]

// -----

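// tensor.extract_slice of a loaded tensor becomes a memref.subview plus a
// copy into the result binding.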
func @slice() {
  %c0 = arith.constant 0 : index
  %2 = hal.interface.load.constant offset = 0 : index
  %3 = hal.interface.load.constant offset = 1 : index
  %4 = hal.interface.load.constant offset = 2 : index
  %5 = hal.interface.load.constant offset = 3 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%2, %3}
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%4, %5}
  %6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32>
  %7 = tensor.extract_slice %6[%2, %3] [%4, %5] [1, 1] : tensor<?x?xi32> to tensor<?x?xi32>
  flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])

// -----

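// Rank-reducing extract_slice becomes a rank-reducing subview plus a copy.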
func @slice_rank_reducing() {
  %c0 = arith.constant 0 : index
  %2 = hal.interface.load.constant offset = 0 : index
  %3 = hal.interface.load.constant offset = 1 : index
  %4 = hal.interface.load.constant offset = 2 : index
  %5 = hal.interface.load.constant offset = 3 : index
  %8 = hal.interface.load.constant offset = 4 : index
  %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?x?xi32>{%8, %8, %8}
  %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%4, %5}
  %6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?x?xi32> -> tensor<?x?x?xi32>
  %7 = tensor.extract_slice %6[%2, %2, %3] [%4, 1, %5] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?xi32>
  flow.dispatch.tensor.store %7, %1, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32>
  return
}
hal.interface private @io {
  hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read"
  hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard"
}
// CHECK-LABEL: func @slice_rank_reducing()
// CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0
// CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0
// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[ARG]]
// CHECK: linalg.copy(%[[SUBVIEW]], %[[RETURN]])

// -----

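// Two slices of the same source: each becomes a subview with its own copy
// into the corresponding result binding.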
| func @slice_multiple_copy() { |
| %c0 = arith.constant 0 : index |
| %3 = hal.interface.load.constant offset = 0 : index |
| %4 = hal.interface.load.constant offset = 1 : index |
| %5 = hal.interface.load.constant offset = 2 : index |
| %6 = hal.interface.load.constant offset = 3 : index |
| %7 = hal.interface.load.constant offset = 4 : index |
| %8 = hal.interface.load.constant offset = 5 : index |
| %12 = hal.interface.load.constant offset = 6 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?x?xi32>{%12, %12, %12} |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?x?xi32>{%6, %7, %8} |
| %2 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%6, %8} |
| %9 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?x?xi32> -> tensor<?x?x?xi32> |
| %10 = tensor.extract_slice %9[%3, %4, %5] [%6, %7, %8] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?x?xi32> |
| %11 = tensor.extract_slice %9[%3, %4, %5] [%6, 1, %8] [1, 1, 1] : tensor<?x?x?xi32> to tensor<?x?xi32> |
| flow.dispatch.tensor.store %10, %1, offsets = [], sizes = [], strides = [] : tensor<?x?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?x?xi32> |
| flow.dispatch.tensor.store %11, %2, offsets = [%3, %5], sizes = [%6, %8], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| hal.interface.binding @ret1, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @slice_multiple_copy() |
| // CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[RETURN1:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[RETURN2:.+]] = hal.interface.binding.subspan @io::@ret1 |
| // CHECK-DAG: %[[SIZE1:.+]] = hal.interface.load.constant offset = 3 : index |
| // CHECK-DAG: %[[SIZE2:.+]] = hal.interface.load.constant offset = 4 : index |
| // CHECK-DAG: %[[SIZE3:.+]] = hal.interface.load.constant offset = 5 : index |
| // CHECK: %[[SUBVIEW1:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], %[[SIZE2]], %[[SIZE3]]] |
| // CHECK: linalg.copy(%[[SUBVIEW1]], %[[RETURN1]]) |
| // CHECK-DAG: %[[SUBVIEW2:.+]] = memref.subview %[[ARG]][%{{.+}}, %{{.+}}, %{{.+}}] [%[[SIZE1]], 1, %[[SIZE3]]] |
| // CHECK-DAG: %[[RETURNVIEW:.+]] = memref.subview %[[RETURN2]] |
| // CHECK: linalg.copy(%[[SUBVIEW2]], %[[RETURNVIEW]]) |
| |
| // ----- |
| |
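// Round-tripping a readwrite binding through load/store is a no-op after
// bufferization; no linalg.copy is expected.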
| func @slice_in_place() { |
| %c0 = arith.constant 0 : index |
| %2 = hal.interface.load.constant offset = 0 : index |
| %3 = hal.interface.load.constant offset = 1 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readwrite:?x?xi32>{%2, %3} |
| %6 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:?x?xi32> -> tensor<?x?xi32> |
| flow.dispatch.tensor.store %6, %0, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<readwrite:?x?xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read|Write" |
| } |
| // CHECK-LABEL: func @slice_in_place() |
// CHECK-NOT: linalg.copy

// -----
| |
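// A statically sized slice of a dynamically shaped input still bufferizes to a
// subview copied into the output buffer.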
| func @slice_whole_stride_dispatch_0() { |
| %c0 = arith.constant 0 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %dim3 = hal.interface.load.constant offset = 3 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%dim2, %dim3} |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %3 = tensor.extract_slice %2[1, 0] [1, 4] [1, 1] : tensor<?x?xi32> to tensor<1x4xi32> |
| flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<1x4xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @slice_whole_stride_dispatch_0() |
| // CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]] |
| // CHECK: linalg.copy(%[[SUBVIEW]], %[[OUTPUT]]) |
| |
| // ----- |
| |
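// tensor.insert_slice: the destination tensor is first copied to the output
// buffer, then the inserted values are copied into a subview of it.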
| func @subtensor_insert() { |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %dim3 = hal.interface.load.constant offset = 3 : index |
| %dim4 = hal.interface.load.constant offset = 4 : index |
| %dim5 = hal.interface.load.constant offset = 5 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%dim2, %dim3} |
| %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%dim4, %dim5} |
| %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %4 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %5 = tensor.dim %3, %c0 : tensor<?x?xi32> |
| %6 = tensor.dim %3, %c1 : tensor<?x?xi32> |
| %7 = tensor.insert_slice %3 into %4[3, 4] [%5, %6] [1, 1] : tensor<?x?xi32> into tensor<?x?xi32> |
| flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @subtensor_insert() |
| // CHECK-DAG: %[[C0:.+]] = arith.constant 0 |
| // CHECK-DAG: %[[C1:.+]] = arith.constant 1 |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[D0:.+]] = memref.dim %[[ARG0]], %[[C0]] |
| // CHECK-DAG: %[[D1:.+]] = memref.dim %[[ARG0]], %[[C1]] |
| // CHECK: linalg.copy(%[[ARG1]], %[[RET0]]) |
| // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET0]][3, 4] [%[[D0]], %[[D1]]] [1, 1] |
| // CHECK: linalg.copy(%[[ARG0]], %[[SUBVIEW]]) |
| |
| // ----- |
| |
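// tensor.extract of a zero-ranked tensor bufferizes to a scalar memref.load that
// feeds the linalg.fill directly on the output buffer.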
| func @tensor_extract() { |
| %c0 = arith.constant 0 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:i32> |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x9xi32> |
| %2 = linalg.init_tensor [3, 9] : tensor<3x9xi32> |
| %3 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32> |
| %4 = tensor.extract %3[] : tensor<i32> |
| %5 = linalg.fill(%4, %2) : i32, tensor<3x9xi32> -> tensor<3x9xi32> |
| flow.dispatch.tensor.store %5, %1, offsets = [], sizes = [], strides = [] : tensor<3x9xi32> -> !flow.dispatch.tensor<writeonly:3x9xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @tensor_extract() |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK: %[[LOAD:.+]] = memref.load %[[ARG0]] |
| // CHECK: linalg.fill(%[[LOAD]], %[[RET0]]) |
| |
| // ----- |
| |
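// A dispatch tensor load immediately stored to another binding collapses to a
// single linalg.copy between the two buffers.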
| func @load_to_store() { |
| %c0 = arith.constant 0 : index |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:3x4xi32> |
| %2 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:3x4xi32> |
| %3 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x4xi32> -> tensor<3x4xi32> |
| flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<3x4xi32> -> !flow.dispatch.tensor<writeonly:3x4xi32> |
| return |
| } |
| |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| |
| // CHECK-LABEL: func @load_to_store() |
| // CHECK: %[[OUT:.+]] = hal.interface.binding.subspan @io::@ret0[%c0] : memref<3x4xi32> |
| // CHECK: %[[IN:.+]] = hal.interface.binding.subspan @io::@arg0[%c0] : memref<3x4xi32> |
| // CHECK: linalg.copy(%[[IN]], %[[OUT]]) : memref<3x4xi32>, memref<3x4xi32> |
| |
| // ----- |
| |
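// Storing a non-splat constant: the constant is materialized with
// memref.buffer_cast and copied to the output buffer.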
| func @constant() { |
| %c0 = arith.constant 0 : index |
| %cst = arith.constant dense<[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]> : tensor<2x2x3xi32> |
| %0 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:2x2x3xi32> |
| flow.dispatch.tensor.store %cst, %0, offsets = [], sizes = [], strides = [] : tensor<2x2x3xi32> -> !flow.dispatch.tensor<writeonly:2x2x3xi32> |
| return |
| } |
| // CHECK-LABEL: func @constant() |
| // CHECK: %[[CST:.+]] = arith.constant {{.+}} : tensor<2x2x3xi32> |
| // CHECK: %[[MEMREF:.+]] = memref.buffer_cast %[[CST]] : memref<2x2x3xi32> |
| // CHECK: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK: linalg.copy(%[[MEMREF]], %[[RESULT]]) |
| |
| // ----- |
| |
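// A non-splat constant used as a matmul operand is materialized via
// memref.buffer_cast and sliced with subviews inside the tiled loops.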
| func @rhs_non_splat_constant() { |
| %c0 = arith.constant 0 : index |
| %cst = arith.constant dense<[[0.706495285, -0.567672312, 0.483717591, 0.522725761, 0.7563259], [-0.0899272263, -0.283501834, -0.350822538, -0.351515919, -0.337136656], [-0.451804549, 0.372324884, -0.620518147, 0.235451385, 0.851095855]]> : tensor<3x5xf32> |
| %cst_0 = arith.constant 0.000000e+00 : f32 |
| %c5 = arith.constant 5 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x5x3x1xf32> |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:5x5xf32> |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x5x3x1xf32> -> tensor<1x5x3x1xf32> |
| %3 = linalg.tensor_collapse_shape %2 [[0, 1], [2, 3]] : tensor<1x5x3x1xf32> into tensor<5x3xf32> |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] |
| scf.for %arg0 = %4 to %c5 step %5 { |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] |
| scf.for %arg1 = %6 to %c5 step %7 { |
| %8 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg0)[%workgroup_size_y] |
| %9 = tensor.extract_slice %3[%arg0, 0] [%8, 3] [1, 1] : tensor<5x3xf32> to tensor<?x3xf32> |
| %10 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 5)>(%arg1)[%workgroup_size_x] |
| %11 = tensor.extract_slice %cst[0, %arg1] [3, %10] [1, 1] : tensor<3x5xf32> to tensor<3x?xf32> |
| %12 = linalg.init_tensor [%8, %10] : tensor<?x?xf32> |
| %13 = linalg.fill(%cst_0, %12) : f32, tensor<?x?xf32> -> tensor<?x?xf32> |
| %14 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%9, %11 : tensor<?x3xf32>, tensor<3x?xf32>) outs(%13 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| flow.dispatch.tensor.store %14, %1, offsets = [%arg0, %arg1], sizes = [%8, %10], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:5x5xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @rhs_non_splat_constant |
| // CHECK-DAG: %[[CONSTANT:.+]] = arith.constant {{.+}} : tensor<3x5xf32> |
| // CHECK-DAG: %[[RHS:.+]] = memref.buffer_cast %[[CONSTANT]] |
| // CHECK-DAG: %[[LHS_INPUT:.+]] = hal.interface.binding.subspan @io::@arg0[%{{.+}}] : memref<1x5x3x1xf32> |
| // CHECK-DAG: %[[RETURN:.+]] = hal.interface.binding.subspan @io::@ret0[%{{.+}}] : memref<5x5xf32> |
| // CHECK: %[[LHS:.+]] = memref.collapse_shape %[[LHS_INPUT]] |
| // CHECK: scf.for %[[IV0:.+]] = |
| // CHECK: scf.for %[[IV1:.+]] = |
| // CHECK-DAG: %[[LHS_SUBVIEW:.+]] = memref.subview %[[LHS]][%[[IV0]], 0] |
| // CHECK-DAG: %[[RHS_SUBVIEW:.+]] = memref.subview %[[RHS]][0, %[[IV1]]] |
| // CHECK-DAG: %[[RESULT_SUBVIEW:.+]] = memref.subview %[[RETURN]][%[[IV0]], %[[IV1]]] |
| // CHECK: linalg.fill(%{{.+}}, %[[RESULT_SUBVIEW]]) |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_SUBVIEW]], %[[RHS_SUBVIEW]] |
| // CHECK-SAME: outs(%[[RESULT_SUBVIEW]] |
| |
| // ----- |
| |
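// A tensor.extract inside a linalg.generic body becomes a memref.load from the
// bufferized gather source.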
| func @gather() { |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %dim3 = hal.interface.load.constant offset = 3 : index |
| %dim4 = hal.interface.load.constant offset = 4 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?xi32>{%dim2} |
| %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim3, %dim4} |
%4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32>
| %5 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?xi32> -> tensor<?xi32> |
| %d0 = tensor.dim %5, %c0 : tensor<?xi32> |
| %d1 = tensor.dim %4, %c1 : tensor<?x?xf32> |
| %3 = linalg.init_tensor [%d0, %d1] : tensor<?x?xf32> |
| %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<?xi32>) outs(%3 : tensor<?x?xf32>) { |
^bb0(%arg2: i32, %arg3: f32): // no predecessors
| %iv1 = linalg.index 1 : index |
| %8 = arith.index_cast %arg2 : i32 to index |
| %9 = tensor.extract %4[%8, %iv1] : tensor<?x?xf32> |
| linalg.yield %9 : f32 |
| } -> tensor<?x?xf32> |
| flow.dispatch.tensor.store %7, %2, offsets = [], sizes = [], strides = [] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @gather() |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK: linalg.generic |
| // CHECK: %[[VAL:.+]] = memref.load %[[ARG0]] |
| // CHECK: linalg.yield %[[VAL]] |
| |
| // ----- |
| |
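// The pooling window init_tensor has no backing binding, so it bufferizes to a
// memref.alloc used as the window operand.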
| func @pooling_nhwc_sum() { |
| %c2 = arith.constant 2 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:f32> |
| %1 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:1x4x6x1xf32> |
| %2 = hal.interface.binding.subspan @io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:1x2x2x1xf32> |
| %3 = linalg.init_tensor [2, 3] : tensor<2x3xf32> |
| %4 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32> |
| %5 = tensor.extract %4[] : tensor<f32> |
| %6 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x4x6x1xf32> -> tensor<1x4x6x1xf32> |
| %7 = linalg.init_tensor [1, 2, 2, 1] : tensor<1x2x2x1xf32> |
| %8 = linalg.fill(%5, %7) : f32, tensor<1x2x2x1xf32> -> tensor<1x2x2x1xf32> |
| %9 = linalg.pooling_nhwc_sum { |
| dilations = dense<1> : vector<2xi64>, |
| strides = dense<[2, 3]> : vector<2xi64> |
| } ins(%6, %3 : tensor<1x4x6x1xf32>, tensor<2x3xf32>) |
| outs(%8 : tensor<1x2x2x1xf32>) -> tensor<1x2x2x1xf32> |
| flow.dispatch.tensor.store %9, %2, offsets = [], sizes = [], strides = [] : tensor<1x2x2x1xf32> -> !flow.dispatch.tensor<writeonly:1x2x2x1xf32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @pooling_nhwc_sum |
| // CHECK: %[[WINDOW:.+]] = memref.alloc() : memref<2x3xf32> |
| // CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro1[%c0] : memref<1x4x6x1xf32> |
| // CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<f32> |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo2[%c0] : memref<1x2x2x1xf32> |
| // CHECK: %[[INIT_VAL:.+]] = memref.load %[[INIT]][] : memref<f32> |
| // CHECK: linalg.fill(%[[INIT_VAL]], %[[RET0]]) : f32, memref<1x2x2x1xf32> |
| // CHECK: linalg.pooling_nhwc_sum |
| // CHECK-SAME: dilations = dense<1> : vector<2xi64> |
| // CHECK-SAME: strides = dense<[2, 3]> : vector<2xi64> |
| // CHECK-SAME: ins(%[[INPUT]], %[[WINDOW]] : memref<1x4x6x1xf32>, memref<2x3xf32>) |
| // CHECK-SAME: outs(%[[RET0]] : memref<1x2x2x1xf32>) |
| |
| // ----- |
| |
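// Identical read-only slices of the same tensor should CSE to a single subview
// that is reused for all the generic's input operands.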
| func @read_only_subtensor() { |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %pc0 = hal.interface.load.constant offset = 0 : index |
| %pc1 = hal.interface.load.constant offset = 1 : index |
| %pc2 = hal.interface.load.constant offset = 2 : index |
| %pc3 = hal.interface.load.constant offset = 3 : index |
| %pc4 = hal.interface.load.constant offset = 4 : index |
| %pc5 = hal.interface.load.constant offset = 5 : index |
| %0 = hal.interface.binding.subspan @io::@wo2[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%pc0, %pc1} |
| %1 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%pc2, %pc3} |
| %2 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %3 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%pc4, %pc5} |
| %4 = flow.dispatch.tensor.load %3, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] |
| %dim0 = tensor.dim %2, %c0 : tensor<?x?xf32> |
| %dim1 = tensor.dim %2, %c1 : tensor<?x?xf32> |
| scf.for %arg0 = %5 to %dim0 step %6 { |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] |
| scf.for %arg1 = %7 to %dim1 step %8 { |
| %9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0] |
| %10 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1] |
| %11 = tensor.extract_slice %2[%arg0, %arg1] [%9, %10] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %12 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0] |
| %13 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1] |
| %14 = tensor.extract_slice %2[%arg0, %arg1] [%12, %13] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %15 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0] |
| %16 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1] |
| %17 = tensor.extract_slice %4[%arg0, %arg1] [%15, %16] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %18 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0] |
| %19 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1] |
| %20 = tensor.extract_slice %4[%arg0, %arg1] [%18, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %21 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %dim0] |
| %22 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %dim1] |
| %23 = linalg.init_tensor [%21, %22] : tensor<?x?xf32> |
| %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11, %14, %17, %20 : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) outs(%23 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32, %arg5: f32, %arg6: f32): // no predecessors |
| %25 = arith.mulf %arg4, %arg5 : f32 |
| %26 = arith.mulf %arg2, %arg3 : f32 |
| %27 = arith.addf %26, %25 : f32 |
| %28 = math.sqrt %27 : f32 |
| linalg.yield %28 : f32 |
| } -> tensor<?x?xf32> |
| flow.dispatch.tensor.store %24, %0, offsets = [%arg0, %arg1], sizes = [%21, %22], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| } |
| } |
| return |
}
hal.interface private @io {
| hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo2, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @read_only_subtensor |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<?x?xf32> |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@ro1[%c0] : memref<?x?xf32> |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo2[%c0] : memref<?x?xf32> |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK-DAG: %[[SV1:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[SV2:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: %[[SV3:.+]] = memref.subview %[[RET0]] |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[SV1]], %[[SV1]], %[[SV2]], %[[SV2]] : |
| // CHECK-SAME: outs(%[[SV3]] : |
| |
| // ----- |
| |
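// linalg.tensor_collapse_shape on a loaded tensor bufferizes to
// memref.collapse_shape directly on the input binding.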
| func @reshape_read_only() { |
| %c0 = arith.constant 0 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@wo0[%c0] : !flow.dispatch.tensor<writeonly:?xf32>{%dim2} |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %3 = linalg.tensor_collapse_shape %2 [[0, 1]] |
| : tensor<?x?xf32> into tensor<?xf32> |
| %4 = tensor.dim %3, %c0 : tensor<?xf32> |
| %5 = linalg.init_tensor [%4] : tensor<?xf32> |
| %6 = linalg.generic { |
| indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], |
| iterator_types = ["parallel"]} |
| ins(%3 : tensor<?xf32>) outs(%5 : tensor<?xf32>) { |
| ^bb0(%arg0 : f32, %arg1 : f32): |
| %7 = arith.addf %arg0, %arg0 : f32 |
| linalg.yield %7 : f32 |
| } -> tensor<?xf32> |
flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<?xf32> -> !flow.dispatch.tensor<writeonly:?xf32>
| return |
| } |
| // CHECK-LABEL: func @reshape_read_only |
| // CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro0 |
| // CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo0 |
| // CHECK: %[[RESHAPE:.+]] = memref.collapse_shape %[[INPUT]] |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[RESHAPE]] : memref<?xf32>) |
| // CHECK-SAME: outs(%[[OUTPUT]] : memref<?xf32>) |
| |
| // ----- |
| |
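// The generic never reads its output tensor operand, so the conv result can be
// placed directly in the output buffer and updated in place; no memref.alloc is
// expected.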
| func @use_buffer_for_operand_when_output_tensor_not_used() { |
| %c0 = arith.constant 0 : index |
| |
| %input_subspan = hal.interface.binding.subspan @interface_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> |
| %filter_subspan = hal.interface.binding.subspan @interface_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> |
| %offset_subspan = hal.interface.binding.subspan @interface_io::@ro2[%c0] : !flow.dispatch.tensor<readonly:32xf32> |
| %output_subspan = hal.interface.binding.subspan @interface_io::@wo3[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> |
| |
| %input = flow.dispatch.tensor.load %input_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> -> tensor<1x225x225x16xf32> |
| %filter = flow.dispatch.tensor.load %filter_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> -> tensor<3x3x16x32xf32> |
| %offset = flow.dispatch.tensor.load %offset_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<32xf32> |
| |
| %cst = arith.constant 0.0 : f32 |
| %0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32> |
| %1 = linalg.fill(%cst, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32> |
| %2 = linalg.conv_2d_nhwc_hwcf |
| {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} |
| ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) |
| outs(%1 : tensor<1x112x112x32xf32>) |
| -> tensor<1x112x112x32xf32> |
| %3 = linalg.generic { |
| indexing_maps = [ |
| affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, |
| affine_map<(d0, d1, d2, d3) -> (d3)>, |
| affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], |
| iterator_types = ["parallel", "parallel", "parallel", "parallel"]} |
| ins(%2, %offset: tensor<1x112x112x32xf32>, tensor<32xf32>) |
| outs(%0 : tensor<1x112x112x32xf32>) { |
| ^bb0(%a: f32, %b: f32, %c: f32): |
| %sub = arith.subf %a, %b : f32 |
| linalg.yield %sub : f32 |
| } -> tensor<1x112x112x32xf32> |
| flow.dispatch.tensor.store %3, %output_subspan, offsets = [], sizes = [], strides = [] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> |
| return |
| } |
| |
| hal.interface private @interface_io { |
| hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro2, set=0, binding=2, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo3, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| |
// CHECK-LABEL: func @use_buffer_for_operand_when_output_tensor_not_used()
| |
| // CHECK-NOT: memref.alloc |
| // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @interface_io::@wo3 |
| // CHECK: linalg.fill(%{{.+}}, %[[OUTPUT]]) |
| // CHECK-NEXT: linalg.conv_2d_nhwc_hwcf |
| // CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>) |
| // CHECK-NEXT: linalg.generic |
| // CHECK-SAME: ins(%[[OUTPUT]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>) |
| // CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>) |
| |
| // ----- |
| |
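// Here the generic does read its output tensor (filled with a different value),
// so the conv result needs its own memref.alloc and cannot reuse the output
// binding.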
| func @dont_use_buffer_for_operand_when_output_tensor_used() { |
| %c0 = arith.constant 0 : index |
| |
| %input_subspan = hal.interface.binding.subspan @interface_io::@ro0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> |
| %filter_subspan = hal.interface.binding.subspan @interface_io::@ro1[%c0] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> |
| %offset_subspan = hal.interface.binding.subspan @interface_io::@ro2[%c0] : !flow.dispatch.tensor<readonly:32xf32> |
| %output_subspan = hal.interface.binding.subspan @interface_io::@wo3[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> |
| |
| %input = flow.dispatch.tensor.load %input_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:1x225x225x16xf32> -> tensor<1x225x225x16xf32> |
| %filter = flow.dispatch.tensor.load %filter_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:3x3x16x32xf32> -> tensor<3x3x16x32xf32> |
| %offset = flow.dispatch.tensor.load %offset_subspan, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:32xf32> -> tensor<32xf32> |
| |
| %cst0 = arith.constant 0.0 : f32 |
| %cst1 = arith.constant 1.0 : f32 |
| %0 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xf32> |
| %1 = linalg.fill(%cst0, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32> |
| %2 = linalg.conv_2d_nhwc_hwcf |
| {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} |
| ins(%input, %filter : tensor<1x225x225x16xf32>, tensor<3x3x16x32xf32>) |
| outs(%1 : tensor<1x112x112x32xf32>) |
| -> tensor<1x112x112x32xf32> |
| %3 = linalg.fill(%cst1, %0) : f32, tensor<1x112x112x32xf32> -> tensor<1x112x112x32xf32> |
| %4 = linalg.generic { |
| indexing_maps = [ |
| affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, |
| affine_map<(d0, d1, d2, d3) -> (d3)>, |
| affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], |
| iterator_types = ["parallel", "parallel", "parallel", "parallel"]} |
| ins(%2, %offset: tensor<1x112x112x32xf32>, tensor<32xf32>) |
| outs(%3 : tensor<1x112x112x32xf32>) { |
| ^bb0(%a: f32, %b: f32, %c: f32): |
| %sub = arith.subf %a, %b : f32 |
| %add = arith.addf %sub, %c : f32 |
| linalg.yield %add : f32 |
| } -> tensor<1x112x112x32xf32> |
flow.dispatch.tensor.store %4, %output_subspan, offsets = [], sizes = [], strides = [] : tensor<1x112x112x32xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32>
| return |
| } |
| |
| // CHECK-LABEL: func @dont_use_buffer_for_operand_when_output_tensor_used() |
| // CHECK: %[[ALLOC:.+]] = memref.alloc |
| // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @interface_io::@wo3 |
| // CHECK: linalg.fill(%{{.+}}, %[[ALLOC]]) |
| // CHECK-NEXT: linalg.conv_2d_nhwc_hwcf |
| // CHECK-SAME: outs(%[[ALLOC]] : memref<1x112x112x32xf32>) |
| // CHECK-NEXT: linalg.fill(%{{.+}}, %[[OUTPUT]]) |
| // CHECK-NEXT: linalg.generic |
| // CHECK-SAME: ins(%[[ALLOC]], %{{.+}} : memref<1x112x112x32xf32>, memref<32xf32>) |
| // CHECK-SAME: outs(%[[OUTPUT]] : memref<1x112x112x32xf32>) |
| |
| // ----- |
| |
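// A constant used as the reduction init is materialized with memref.buffer_cast
// and copied into the output buffer, which the generic then updates in place.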
| func @bufferize_cst_output_tensor() { |
| %c0 = arith.constant 0 : index |
| %cst1 = arith.constant dense<-2147483648> : tensor<i32> |
| %zero = arith.constant 0.000000e+00 : f32 |
| %cst5 = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32> |
| %input = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:5xf32> |
| %output = hal.interface.binding.subspan @io::@wo1[%c0] : !flow.dispatch.tensor<writeonly:i32> |
| %1 = flow.dispatch.tensor.load %input, offsets=[], sizes=[], strides=[] : !flow.dispatch.tensor<readonly:5xf32> -> tensor<5xf32> |
| %2 = linalg.generic { |
| indexing_maps = [affine_map<(d0) -> (-d0 + 4)>, affine_map<(d0) -> (d0)>, affine_map<(d0) -> ()>], |
| iterator_types = ["reduction"]} |
| ins(%1, %cst5 : tensor<5xf32>, tensor<5xi32>) |
| outs(%cst1 : tensor<i32>) { |
| ^bb0(%arg0: f32, %arg1: i32, %arg2: i32): |
| %8 = arith.cmpf oeq, %arg0, %zero : f32 |
| %9 = arith.extui %8 : i1 to i32 |
| %10 = arith.muli %9, %arg1 : i32 |
| %11 = arith.cmpi sgt, %10, %arg2 : i32 |
| %12 = select %11, %10, %arg2 : i32 |
| linalg.yield %12 : i32 |
| } -> tensor<i32> |
| flow.dispatch.tensor.store %2, %output, offsets=[], sizes=[], strides=[] : tensor<i32> -> !flow.dispatch.tensor<writeonly:i32> |
| return |
| } |
| |
hal.interface private @io {
| hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo1, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| |
| // CHECK-LABEL: func @bufferize_cst_output_tensor() |
| |
| // CHECK-DAG: %[[CST1:.+]] = arith.constant dense<-2147483648> : tensor<i32> |
| // CHECK-DAG: %[[CST5:.+]] = arith.constant dense<[1, 2, 3, 4, 5]> : tensor<5xi32> |
| // CHECK: %[[CAST1:.+]] = memref.buffer_cast %[[CST1]] : memref<i32> |
| // CHECK: %[[CAST5:.+]] = memref.buffer_cast %[[CST5]] : memref<5xi32> |
| // CHECK: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro0[%c0] : memref<5xf32> |
| // CHECK: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo1[%c0] : memref<i32> |
| // CHECK: linalg.copy(%[[CAST1]], %[[OUTPUT]]) |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[INPUT]], %[[CAST5]] : memref<5xf32>, memref<5xi32>) |
| // CHECK-SAME: outs(%[[OUTPUT]] : memref<i32>) |
| |
| // ----- |
| |
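// A tensor.cast between the root batch_matmul and the store should be looked
// through, so the matmul writes straight into a subview of the output buffer.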
func @cast_followed_by_store() {
| %c0 = arith.constant 0 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c4 = arith.constant 4 : index |
| %c64 = arith.constant 64 : index |
| %c1 = arith.constant 1 : index |
| %c32 = arith.constant 32 : index |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x32x1024xf32> |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:4x1024x64xf32> |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:4x32x64xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %workgroup_id_z = hal.interface.workgroup.id[2] : index |
| %workgroup_count_z = hal.interface.workgroup.count[2] : index |
| scf.for %arg0 = %workgroup_id_z to %c4 step %workgroup_count_z { |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y] |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_y] |
| scf.for %arg1 = %3 to %c32 step %4 { |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x] |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_count_x] |
| scf.for %arg2 = %5 to %c64 step %6 { |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1, 0], sizes = [%c1, %c32, 1024], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x32x1024xf32> -> tensor<?x?x1024xf32> |
| %8 = flow.dispatch.tensor.load %1, offsets = [%arg0, 0, %arg2], sizes = [%c1, 1024, %c32], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:4x1024x64xf32> -> tensor<?x1024x?xf32> |
| %9 = linalg.init_tensor [1, 32, 32] : tensor<1x32x32xf32> |
| %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x32x32xf32> -> tensor<1x32x32xf32> |
| %11 = linalg.batch_matmul {__internal_linalg_transform__ = "workgroup", is_root_op} ins(%7, %8 : tensor<?x?x1024xf32>, tensor<?x1024x?xf32>) outs(%10 : tensor<1x32x32xf32>) -> tensor<1x32x32xf32> |
| %12 = tensor.cast %11 : tensor<1x32x32xf32> to tensor<?x?x?xf32> |
| flow.dispatch.tensor.store %12, %2, offsets = [%arg0, %arg1, %arg2], sizes = [%c1, %c32, %c32], strides = [1, 1, 1] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<writeonly:4x32x64xf32> |
| } |
| } |
| } |
| return |
| } |
| |
// CHECK-LABEL: func @cast_followed_by_store()
| // CHECK-DAG: %[[ZERO:.+]] = arith.constant 0.000000e+00 : f32 |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<4x32x1024xf32> |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<4x1024x64xf32> |
| // CHECK-DAG: %[[RESULT:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<4x32x64xf32> |
| // CHECK: %[[LHSV:.+]] = memref.subview %[[LHS]] |
| // CHECK: %[[RHSV:.+]] = memref.subview %[[RHS]] |
| // CHECK: %[[RESULTV:.+]] = memref.subview %[[RESULT]] |
| // CHECK: linalg.fill(%[[ZERO]], %[[RESULTV]]) |
| // CHECK: linalg.batch_matmul {{.*}} ins(%[[LHSV]], %[[RHSV]] : {{.*}}) outs(%[[RESULTV]] |
| |
| // ----- |
| |
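// Inserting a rank-reduced slice into a tensor loaded from the same readwrite
// binding needs only a copy into the matching subview, not a full-buffer copy.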
| func @rank_reduced_subtensor_insert() { |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c2 = arith.constant 2 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %dim3 = hal.interface.load.constant offset = 3 : index |
| %dim4 = hal.interface.load.constant offset = 4 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<readwrite:?x?x?xf32>{%dim2, %dim3, %dim4} |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:?x?x?xf32> -> tensor<?x?x?xf32> |
| %4 = tensor.dim %3, %c1 : tensor<?x?x?xf32> |
| %5 = tensor.dim %3, %c2 : tensor<?x?x?xf32> |
| %6 = tensor.insert_slice %2 into %3[0, 0, 0] [1, %4, %5] [1, 1, 1] : tensor<?x?xf32> into tensor<?x?x?xf32> |
| flow.dispatch.tensor.store %6, %1, offsets = [], sizes = [], strides = [] : tensor<?x?x?xf32> -> !flow.dispatch.tensor<readwrite:?x?x?xf32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=1, type="StorageBuffer", access="Read|Write" |
| } |
| // CHECK-LABEL: func @rank_reduced_subtensor_insert() |
| // CHECK-DAG: %[[ARG:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[RET:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[RET]] |
| // CHECK: linalg.copy(%[[ARG]], %[[SUBVIEW]]) |
| |
| // ----- |
| |
| #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> |
| #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> |
| #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> |
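// vector.transfer_read/write on tensors bufferize to the same ops on subviews;
// since the write destination was loaded from a different binding, it is first
// copied into the output subview.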
| func @bufferize_transfer_op() { |
| %c3 = arith.constant 3 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c0 = arith.constant 0 : index |
| %c2 = arith.constant 2 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:2x3xf32> |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x4xf32> |
| %2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:2x4xf32> |
| %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:2x4xf32> |
%4 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [%c2, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x3xf32>
| %5 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:3x4xf32> -> tensor<3x1xf32> |
%6 = flow.dispatch.tensor.load %2, offsets = [%c0, %c0], sizes = [%c2, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x4xf32> -> tensor<2x1xf32>
| %7 = vector.transfer_read %4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %8 = vector.transfer_read %4[%c0, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %9 = vector.transfer_read %4[%c0, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %10 = vector.transfer_read %4[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %11 = vector.transfer_read %4[%c1, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %12 = vector.transfer_read %4[%c1, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %13 = vector.transfer_read %5[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %14 = vector.transfer_read %5[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %15 = vector.transfer_read %5[%c2, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %16 = vector.transfer_read %6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32> |
| %17 = vector.transfer_read %6[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32> |
| %18 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %7, %13, %16 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %19 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %8, %14, %18 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %20 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %9, %15, %19 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %21 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %10, %13, %17 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %22 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %11, %14, %21 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %23 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %12, %15, %22 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %24 = vector.transfer_write %20, %6[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32> |
| %25 = vector.transfer_write %23, %24[%c1, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32> |
flow.dispatch.tensor.store %25, %3, offsets = [%c0, %c0], sizes = [%c2, %c1], strides = [%c1, %c1] : tensor<2x1xf32> -> !flow.dispatch.tensor<writeonly:2x4xf32>
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @bufferize_transfer_op() |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[ARG2:.+]] = hal.interface.binding.subspan @io::@arg2 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: %[[ARG2V:.+]] = memref.subview %[[ARG2]] |
| // CHECK-COUNT-6: vector.transfer_read %[[ARG0V]] |
| // CHECK-COUNT-3: vector.transfer_read %[[ARG1V]] |
| // CHECK-COUNT-2: vector.transfer_read %[[ARG2V]] |
| // CHECK: %[[RET0V:.+]] = memref.subview %[[RET0]] |
| // CHECK: linalg.copy(%[[ARG2V]], %[[RET0V]]) |
| // CHECK: vector.transfer_write %{{.+}}, %[[RET0V]] |
| // CHECK: vector.transfer_write %{{.+}}, %[[RET0V]] |
| |
| // ----- |
| |
| #map0 = affine_map<(d0, d1, d2) -> (d0, d2)> |
| #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> |
| #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> |
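// Same as above, but the destination tensor comes from the readwrite output
// binding itself, so the transfer writes happen in place with no copy.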
| func @bufferize_transfer_op_inplace() { |
| %c3 = arith.constant 3 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c0 = arith.constant 0 : index |
| %c2 = arith.constant 2 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:2x3xf32> |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x4xf32> |
| %3 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<readwrite:2x4xf32> |
%4 = flow.dispatch.tensor.load %0, offsets = [%c0, %c0], sizes = [%c2, %c3], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:2x3xf32> -> tensor<2x3xf32>
| %5 = flow.dispatch.tensor.load %1, offsets = [%c0, %c0], sizes = [%c3, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:3x4xf32> -> tensor<3x1xf32> |
%6 = flow.dispatch.tensor.load %3, offsets = [%c0, %c0], sizes = [%c2, %c1], strides = [%c1, %c1] : !flow.dispatch.tensor<readwrite:2x4xf32> -> tensor<2x1xf32>
| %7 = vector.transfer_read %4[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %8 = vector.transfer_read %4[%c0, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %9 = vector.transfer_read %4[%c0, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %10 = vector.transfer_read %4[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %11 = vector.transfer_read %4[%c1, %c1], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %12 = vector.transfer_read %4[%c1, %c2], %cst {in_bounds = [true, true]} : tensor<2x3xf32>, vector<1x1xf32> |
| %13 = vector.transfer_read %5[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %14 = vector.transfer_read %5[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %15 = vector.transfer_read %5[%c2, %c0], %cst {in_bounds = [true, true]} : tensor<3x1xf32>, vector<1x1xf32> |
| %16 = vector.transfer_read %6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32> |
| %17 = vector.transfer_read %6[%c1, %c0], %cst {in_bounds = [true, true]} : tensor<2x1xf32>, vector<1x1xf32> |
| %18 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %7, %13, %16 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %19 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %8, %14, %18 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %20 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %9, %15, %19 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %21 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %10, %13, %17 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %22 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %11, %14, %21 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %23 = vector.contract {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} %12, %15, %22 : vector<1x1xf32>, vector<1x1xf32> into vector<1x1xf32> |
| %24 = vector.transfer_write %20, %6[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32> |
| %25 = vector.transfer_write %23, %24[%c1, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<2x1xf32> |
flow.dispatch.tensor.store %25, %3, offsets = [%c0, %c0], sizes = [%c2, %c1], strides = [%c1, %c1] : tensor<2x1xf32> -> !flow.dispatch.tensor<readwrite:2x4xf32>
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
hal.interface.binding @ret0, set=0, binding=3, type="StorageBuffer", access="Read|Write"
| } |
| // CHECK-LABEL: func @bufferize_transfer_op_inplace() |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]] |
| // CHECK-COUNT-6: vector.transfer_read %[[ARG0V]] |
| // CHECK-COUNT-3: vector.transfer_read %[[ARG1V]] |
| // CHECK-COUNT-2: vector.transfer_read %[[RET0V]] |
| // CHECK-NOT: linalg.copy |
| // CHECK: vector.transfer_write %{{.+}}, %[[RET0V]] |
| // CHECK: vector.transfer_write %{{.+}}, %[[RET0V]] |
| |
| // ----- |
| |
| #map = affine_map<(d0, d1) -> (d0, d1)> |
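// Both results of a multi-result linalg.generic should bufferize to subviews of
// their respective output bindings.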
| func @multi_result() { |
| %c0 = arith.constant 0 : index |
| %c2 = arith.constant 2 : index |
| %c4 = arith.constant 4 : index |
| %c1 = arith.constant 1 : index |
| %c3 = arith.constant 3 : index |
| %dim0 = hal.interface.load.constant offset = 0 : index |
| %dim1 = hal.interface.load.constant offset = 1 : index |
| %dim2 = hal.interface.load.constant offset = 2 : index |
| %dim3 = hal.interface.load.constant offset = 3 : index |
| %dim4 = hal.interface.load.constant offset = 4 : index |
| %dim5 = hal.interface.load.constant offset = 5 : index |
| %dim6 = hal.interface.load.constant offset = 6 : index |
| %dim7 = hal.interface.load.constant offset = 7 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim0, %dim1} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%dim2, %dim3} |
| %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim4, %dim5} |
| %3 = hal.interface.binding.subspan @io::@ret1[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%dim6, %dim7} |
| %4 = hal.interface.load.constant offset = 8 : index |
| %5 = hal.interface.load.constant offset = 9 : index |
| %6 = hal.interface.load.constant offset = 10 : index |
| %7 = hal.interface.load.constant offset = 11 : index |
| %8 = hal.interface.workgroup.id[0] : index |
| %9 = hal.interface.workgroup.id[1] : index |
| %10 = hal.interface.workgroup.count[0] : index |
| %11 = hal.interface.workgroup.count[1] : index |
| %12 = hal.interface.workgroup.size[0] : index |
| %13 = hal.interface.workgroup.size[1] : index |
| %14 = arith.muli %9, %13 : index |
| %15 = arith.muli %11, %13 : index |
| %16 = arith.muli %8, %12 : index |
| %17 = arith.muli %10, %12 : index |
| scf.for %arg0 = %14 to %4 step %15 { |
| scf.for %arg1 = %16 to %5 step %17 { |
| %18 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg0)[%4, %13] |
| %19 = affine.min affine_map<(d0)[s0, s1] -> (s1, -d0 + s0)>(%arg1)[%5, %12] |
| %20 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %21 = flow.dispatch.tensor.load %1, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %shape = linalg.init_tensor [%18, %19] : tensor<?x?xf32> |
| %22:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} |
| ins(%20, %21 : tensor<?x?xf32>, tensor<?x?xf32>) |
| outs(%shape, %shape : tensor<?x?xf32>, tensor<?x?xf32>) { |
| ^bb0(%arg2: f32, %arg3 : f32, %arg4 : f32, %arg5 : f32): // no predecessors |
| %23 = arith.mulf %arg2, %arg3 : f32 |
| %24 = arith.addf %arg2, %arg3 : f32 |
| linalg.yield %23, %24 : f32, f32 |
| } -> (tensor<?x?xf32>, tensor<?x?xf32>) |
| flow.dispatch.tensor.store %22#0, %2, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| flow.dispatch.tensor.store %22#1, %3, offsets = [%arg0, %arg1], sizes = [%18, %19], strides = [%c1, %c1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| hal.interface.binding @ret1, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @multi_result() |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@ret1 |
| // CHECK-DAG: %[[ARG0V:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1V:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: %[[RET0V:.+]] = memref.subview %[[RET0]] |
| // CHECK-DAG: %[[RET1V:.+]] = memref.subview %[[RET1]] |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[ARG0V]], %[[ARG1V]] |
| // CHECK-SAME: outs(%[[RET0V]], %[[RET1V]] |
| |
| // ----- |
| |
| #map0 = affine_map<()[s0] -> (s0 * 64)> |
| #map1 = affine_map<()[s0] -> (s0 * 16)> |
| module { |
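// linalg.pad_tensor bufferizes to a fill of an allocated padded buffer plus a
// copy of the unpadded data into its subview; the matmul then reads the padded
// allocations.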
| func @padded_matmul() { |
| %c0 = arith.constant 0 : index |
| %c12544 = arith.constant 12544 : index |
| %c64 = arith.constant 64 : index |
| %c16 = arith.constant 16 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:12544x27xf32> |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:27x16xf32> |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:12544x16xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %3 = affine.apply #map0()[%workgroup_id_y] |
| %4 = affine.apply #map0()[%workgroup_count_y] |
| scf.for %arg0 = %3 to %c12544 step %4 { |
| %5 = affine.apply #map1()[%workgroup_id_x] |
| %6 = affine.apply #map1()[%workgroup_count_x] |
| scf.for %arg1 = %5 to %c16 step %6 { |
| %7 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [64, 27], strides = [1, 1] : !flow.dispatch.tensor<readonly:12544x27xf32> -> tensor<64x27xf32> |
| %8 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [27, 16], strides = [1, 1] : !flow.dispatch.tensor<readonly:27x16xf32> -> tensor<27x16xf32> |
| %9 = linalg.init_tensor [64, 16] : tensor<64x16xf32> |
| %10 = linalg.fill(%cst, %9) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<64x16xf32> -> tensor<64x16xf32> |
| %11 = linalg.pad_tensor %7 low[0, 0] high[0, 5] { |
| ^bb0(%arg2: index, %arg3: index): // no predecessors |
| linalg.yield %cst : f32 |
| } : tensor<64x27xf32> to tensor<64x32xf32> |
| %12 = linalg.pad_tensor %8 low[0, 0] high[5, 0] { |
| ^bb0(%arg2: index, %arg3: index): // no predecessors |
| linalg.yield %cst : f32 |
| } : tensor<27x16xf32> to tensor<32x16xf32> |
| %13 = linalg.matmul ins(%11, %12 : tensor<64x32xf32>, tensor<32x16xf32>) outs(%10 : tensor<64x16xf32>) -> tensor<64x16xf32> |
| %14 = tensor.cast %13 : tensor<64x16xf32> to tensor<?x?xf32> |
| flow.dispatch.tensor.store %14, %2, offsets = [%arg0, %arg1], sizes = [%c64, %c16], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:12544x16xf32> |
| } |
| } |
| return |
| } |
| } |
| |
| // CHECK-LABEL: func @padded_matmul() |
| // CHECK-DAG: %[[LHS_PADDED:.+]] = memref.alloc() : memref<64x32xf32> |
| // CHECK-DAG: %[[RHS_PADDED:.+]] = memref.alloc() : memref<32x16xf32> |
| // CHECK-DAG: %[[C0:.+]] = arith.constant 0.000000e+00 : f32 |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : memref<12544x27xf32> |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : memref<27x16xf32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : memref<12544x16xf32> |
| // CHECK-DAG: %[[LHS_V:.+]] = memref.subview %[[LHS]][%{{.*}}, 0] [64, 27] [1, 1] |
| // CHECK-DAG: %[[RHS_V:.+]] = memref.subview %[[RHS]][0, %{{.*}}] [27, 16] [1, 1] |
| // CHECK-DAG: %[[DST_V:.+]] = memref.subview %[[DST]][%{{.*}}, %{{.*}}] [64, 16] [1, 1] |
| // CHECK: linalg.fill(%[[C0]], %[[DST_V]]) |
| // CHECK: linalg.fill(%[[C0]], %[[LHS_PADDED]]) : f32, memref<64x32xf32> |
| // CHECK: %[[LHS_PADDED_INTER:.+]] = memref.subview %[[LHS_PADDED]][0, 0] [64, 27] [1, 1] |
| // CHECK: linalg.copy(%[[LHS_V]], %[[LHS_PADDED_INTER]]) |
| // CHECK: linalg.fill(%[[C0]], %[[RHS_PADDED]]) : f32, memref<32x16xf32> |
| // CHECK: %[[RHS_PADDED_INTER:.+]] = memref.subview %[[RHS_PADDED]][0, 0] [27, 16] [1, 1] |
| // CHECK: linalg.copy(%[[RHS_V]], %[[RHS_PADDED_INTER]]) |
| // CHECK: linalg.matmul ins(%[[LHS_PADDED]], %[[RHS_PADDED]] : memref<64x32xf32>, memref<32x16xf32>) |
| |
| // ----- |
| |
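// Dynamic-shape version of the padding pattern: both operand tiles are padded into
// static 4x4 allocations, the matmul runs on the static buffers, and the valid
// result slice is copied back to the output subview.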
| func @dot_general_padded() { |
| %c0 = arith.constant 0 : index |
| %c3 = arith.constant 3 : index |
| %c1 = arith.constant 1 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %m = hal.interface.load.constant offset = 0 : index |
| %n = hal.interface.load.constant offset = 1 : index |
| %k = hal.interface.load.constant offset = 2 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} |
| %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n} |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y] |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y] |
| scf.for %arg0 = %3 to %m step %4 { |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] |
| scf.for %arg1 = %5 to %n step %6 { |
| %7 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg0)[%m] |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x2xf32> |
| %9 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg1)[%n] |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [2, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<2x?xf32> |
| %11 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg0)[%m] |
| %12 = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%arg1)[%n] |
| %13 = linalg.pad_tensor %8 low[0, 0] high[1, 2] { |
| ^bb0(%arg2: index, %arg3: index): // no predecessors |
| linalg.yield %cst : f32 |
| } : tensor<?x2xf32> to tensor<4x4xf32> |
| %14 = linalg.pad_tensor %10 low[0, 0] high[2, 3] { |
| ^bb0(%arg2: index, %arg3: index): // no predecessors |
| linalg.yield %cst : f32 |
| } : tensor<2x?xf32> to tensor<4x4xf32> |
| %15 = linalg.init_tensor [4, 4] : tensor<4x4xf32> |
| %16 = linalg.fill(%cst, %15) : f32, tensor<4x4xf32> -> tensor<4x4xf32> |
| %17 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} ins(%13, %14 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%16 : tensor<4x4xf32>) -> tensor<4x4xf32> |
| %18 = tensor.extract_slice %17[0, 0] [%7, %9] [1, 1] : tensor<4x4xf32> to tensor<?x?xf32> |
| flow.dispatch.tensor.store %18, %2, offsets = [%arg0, %arg1], sizes = [%11, %12], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0] -> (4, -d0 + s0)> |
| // CHECK: func @dot_general_padded |
| // CHECK-DAG: %[[ALLOC_RET0:.+]] = memref.alloc |
| // CHECK-DAG: %[[ALLOC_ARG1:.+]] = memref.alloc |
| // CHECK-DAG: %[[ALLOC_ARG0:.+]] = memref.alloc |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0[{{.*}}] : memref<?x?xf32> |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1[{{.*}}] : memref<?x?xf32> |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0[{{.*}}] : memref<?x?xf32> |
| // CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0 |
| // CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1 |
| // CHECK: scf.for %[[IV0:.+]] = %{{.+}} to %[[M]] |
| // CHECK: scf.for %[[IV1:.+]] = %{{.+}} to %[[N]] |
| // CHECK-DAG: %[[TILE_M:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[M]]] |
| // CHECK-DAG: %[[TILE_N:.+]] = affine.min #[[MAP1]](%[[IV1]])[%[[N]]] |
| // CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]] |
| // CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG0]] |
| // CHECK: %[[ALLOC_ARG0_SV:.+]] = memref.subview %[[ALLOC_ARG0]] |
| // CHECK: linalg.copy(%[[ARG0_SV]], %[[ALLOC_ARG0_SV]]) |
| // CHECK: linalg.fill(%{{.*}}, %[[ALLOC_ARG1]] |
| // CHECK: linalg.copy(%[[ARG1_SV]] |
| // CHECK: linalg.fill(%{{.*}}, %[[ALLOC_RET0]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[ALLOC_ARG0]], %[[ALLOC_ARG1]] |
| // CHECK-SAME: outs(%[[ALLOC_RET0]] |
| // CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]] |
| // CHECK-DAG: %[[ALLOC_RET0_SV:.+]] = memref.subview |
| // CHECK: linalg.copy(%[[ALLOC_RET0_SV]], %[[RET0_SV]]) |
| |
| // ----- |
| |
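// im2col-style convolution dispatch: the linalg.generic gathers the input window
// into a scratch allocation, the tensor reshapes become memref.collapse_shape on
// the allocations, and the matmul result is copied to the output subview.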
| func @im2col() { |
| %c0 = arith.constant 0 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c112 = arith.constant 112 : index |
| %c32 = arith.constant 32 : index |
| %c16 = arith.constant 16 : index |
| %c4 = arith.constant 4 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:1x225x225x8xf32> |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:3x3x8x32xf32> |
| %2 = hal.interface.binding.subspan @io::@ret0[%c0] : !flow.dispatch.tensor<writeonly:1x112x112x32xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %workgroup_id_z = hal.interface.workgroup.id[2] : index |
| %workgroup_count_z = hal.interface.workgroup.count[2] : index |
| %3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_z] |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_z] |
| scf.for %arg0 = %3 to %c112 step %4 { |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_id_y] |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%workgroup_count_y] |
| scf.for %arg1 = %5 to %c112 step %6 { |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] |
| scf.for %arg2 = %7 to %c32 step %8 { |
| %9 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg0) |
| %10 = affine.min affine_map<(d0) -> (33, d0 * -2 + 225)>(%arg0) |
| %11 = affine.apply affine_map<(d0) -> (d0 * 2)>(%arg1) |
| %12 = affine.min affine_map<(d0) -> (33, d0 * -2 + 225)>(%arg1) |
| %13 = flow.dispatch.tensor.load %0, offsets = [0, %9, %11, 0], sizes = [1, %10, %12, 8], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x8xf32> -> tensor<1x?x?x8xf32> |
| %14 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, %arg2], sizes = [3, 3, 8, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x8x32xf32> -> tensor<3x3x8x4xf32> |
| %15 = linalg.init_tensor [1, 16, 16, 4] : tensor<1x16x16x4xf32> |
| %16 = linalg.fill(%cst, %15) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<1x16x16x4xf32> -> tensor<1x16x16x4xf32> |
| %17 = linalg.init_tensor [1, 16, 16, 3, 3, 8] : tensor<1x16x16x3x3x8xf32> |
| %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 * 2 + d3, d2 * 2 + d4, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x?x?x8xf32>) outs(%17 : tensor<1x16x16x3x3x8xf32>) { |
| ^bb0(%arg3: f32, %arg4: f32): // no predecessors |
| linalg.yield %arg3 : f32 |
| } -> tensor<1x16x16x3x3x8xf32> |
| %19 = linalg.tensor_collapse_shape %18 [[0, 1, 2], [3, 4, 5]] : tensor<1x16x16x3x3x8xf32> into tensor<256x72xf32> |
| %20 = linalg.tensor_collapse_shape %14 [[0, 1, 2], [3]] : tensor<3x3x8x4xf32> into tensor<72x4xf32> |
| %21 = linalg.tensor_collapse_shape %16 [[0, 1, 2], [3]] : tensor<1x16x16x4xf32> into tensor<256x4xf32> |
| %22 = linalg.matmul ins(%19, %20 : tensor<256x72xf32>, tensor<72x4xf32>) outs(%21 : tensor<256x4xf32>) -> tensor<256x4xf32> |
| %23 = linalg.tensor_expand_shape %22 [[0, 1, 2], [3]] : tensor<256x4xf32> into tensor<1x16x16x4xf32> |
| %24 = tensor.cast %23 : tensor<1x16x16x4xf32> to tensor<1x?x?x?xf32> |
| flow.dispatch.tensor.store %24, %2, offsets = [0, %arg0, %arg1, %arg2], sizes = [1, %c16, %c16, %c4], strides = [1, 1, 1, 1] : tensor<1x?x?x?xf32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xf32> |
| } |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ret0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @im2col |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@arg0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@arg1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@ret0 |
| // CHECK-DAG: %[[ALLOC_ARG0:.+]] = memref.alloc() : memref<1x16x16x3x3x8xf32> |
| // CHECK-DAG: %[[ALLOC_ARG1:.+]] = memref.alloc() : memref<3x3x8x4xf32> |
| // CHECK-DAG: %[[ALLOC_RET0:.+]] = memref.alloc() : memref<1x16x16x4xf32> |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: linalg.copy(%[[ARG1_SV]], %[[ALLOC_ARG1]]) |
| // CHECK-DAG: linalg.fill(%{{.*}}, %[[ALLOC_RET0]] |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[ARG0_SV]] |
| // CHECK-SAME: outs(%[[ALLOC_ARG0]] |
| // CHECK-DAG: %[[ALLOC_ARG0_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_ARG0]] |
| // CHECK-DAG: %[[ALLOC_ARG1_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_ARG1]] |
| // CHECK-DAG: %[[ALLOC_RET0_RESHAPE:.+]] = memref.collapse_shape %[[ALLOC_RET0]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[ALLOC_ARG0_RESHAPE]], %[[ALLOC_ARG1_RESHAPE]] |
| // CHECK-SAME: outs(%[[ALLOC_RET0_RESHAPE]] |
| // CHECK: %[[RET0_SV:.+]] = memref.subview %[[RET0]] |
| // CHECK: linalg.copy(%[[ALLOC_RET0]], %[[RET0_SV]]) |
| |
| // ----- |
| |
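// Reduction with two results: each init fill lands directly on the corresponding
// output subview, and the linalg.generic writes both subviews in place.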
| func @multi_result_reduce() { |
| %c0 = arith.constant 0 : index |
| %c0_i32 = arith.constant 0 : i32 |
| %c-2147483648_i32 = arith.constant -2147483648 : i32 |
| %c2 = arith.constant 2 : index |
| %d0 = hal.interface.load.constant offset = 0 : index |
| %d1 = hal.interface.load.constant offset = 1 : index |
| %d2 = hal.interface.load.constant offset = 2 : index |
| %0 = hal.interface.binding.subspan @io::@ro0[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} |
| %1 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} |
| %2 = hal.interface.binding.subspan @io::@wo0[%c0] : !flow.dispatch.tensor<writeonly:?xi32>{%d2} |
| %3 = hal.interface.binding.subspan @io::@wo1[%c0] : !flow.dispatch.tensor<writeonly:?xi32>{%d2} |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %4 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_id_x] |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 128)>()[%workgroup_count_x] |
| scf.for %arg0 = %4 to %d1 step %5 { |
| %6 = affine.min affine_map<(d0)[s0] -> (128, -d0 + s0)>(%arg0)[%d1] |
| %7 = flow.dispatch.tensor.load %0, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %9 = flow.dispatch.tensor.load %1, offsets = [0, %arg0], sizes = [%d0, %6], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %13 = linalg.init_tensor [%6] : tensor<?xi32> |
| %14 = linalg.fill(%c-2147483648_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32> |
| %17 = linalg.fill(%c0_i32, %13) {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} : i32, tensor<?xi32> -> tensor<?xi32> |
| %18:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"]} ins(%7, %9 : tensor<?x?xi32>, tensor<?x?xi32>) outs(%14, %17 : tensor<?xi32>, tensor<?xi32>) attrs = {__internal_linalg_transform__ = "workgroup", lowering.config = {tileSizes = [[128]]}} { |
| ^bb0(%arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32): // no predecessors |
| %19 = arith.cmpi sge, %arg1, %arg3 : i32 |
| %20 = select %19, %arg1, %arg3 : i32 |
| %21 = arith.cmpi eq, %arg1, %arg3 : i32 |
| %22 = arith.cmpi slt, %arg2, %arg4 : i32 |
| %23 = select %22, %arg2, %arg4 : i32 |
| %24 = select %19, %arg2, %arg4 : i32 |
| %25 = select %21, %23, %24 : i32 |
| linalg.yield %20, %25 : i32, i32 |
| } -> (tensor<?xi32>, tensor<?xi32>) |
| flow.dispatch.tensor.store %18#0, %2, offsets = [%arg0], sizes = [%6], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32> |
| flow.dispatch.tensor.store %18#1, %3, offsets = [%arg0], sizes = [%6], strides = [1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?xi32> |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @ro0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo0, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| hal.interface.binding @wo1, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @multi_result_reduce |
| // CHECK-DAG: %[[ARG0:.+]] = hal.interface.binding.subspan @io::@ro0 |
| // CHECK-DAG: %[[ARG1:.+]] = hal.interface.binding.subspan @io::@ro1 |
| // CHECK-DAG: %[[RET0:.+]] = hal.interface.binding.subspan @io::@wo0 |
| // CHECK-DAG: %[[RET1:.+]] = hal.interface.binding.subspan @io::@wo1 |
| // CHECK: scf.for |
| // CHECK-DAG: %[[ARG0_SV:.+]] = memref.subview %[[ARG0]] |
| // CHECK-DAG: %[[ARG1_SV:.+]] = memref.subview %[[ARG1]] |
| // CHECK-DAG: %[[RET0_SV:.+]] = memref.subview %[[RET0]] |
| // CHECK-DAG: linalg.fill(%{{.*}}, %[[RET0_SV]] |
| // CHECK-DAG: %[[RET1_SV:.+]] = memref.subview %[[RET1]] |
| // CHECK-DAG: linalg.fill(%{{.*}}, %[[RET1_SV]] |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[ARG0_SV]], %[[ARG1_SV]] |
| // CHECK-SAME: outs(%[[RET0_SV]], %[[RET1_SV]] |
| |
| // ----- |
| |
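// L1-tiled matmul seeded from a read-only init binding: the init tile is copied to
// the destination once, and the inner scf.for nest then updates destination
// subviews in place.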
| #config0 = {tileSizes = [[64, 64]]} |
| #config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]} |
| #map0 = affine_map<()[s0] -> (s0 * 64)> |
| #map1 = affine_map<(d0) -> (64, -d0 + 250)> |
| #map2 = affine_map<(d0) -> (64, -d0 + 370)> |
| #map3 = affine_map<(d0) -> (32, -d0 + 250)> |
| #map4 = affine_map<(d0) -> (24, -d0 + 144)> |
| #map5 = affine_map<(d0) -> (32, -d0 + 370)> |
| #map6 = affine_map<(d0, d1) -> (32, d0 - d1)> |
| module { |
| func @l1_tiled_matmul_no_fill() { |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c32 = arith.constant 32 : index |
| %c24 = arith.constant 24 : index |
| %c144 = arith.constant 144 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c250 = arith.constant 250 : index |
| %c370 = arith.constant 370 : index |
| %0 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:250x144xf32> |
| %1 = hal.interface.binding.subspan @io::@ro2[%c0] : !flow.dispatch.tensor<readonly:144x370xf32> |
| %init = hal.interface.binding.subspan @io::@ro3[%c0] : !flow.dispatch.tensor<readonly:250x370xf32> |
| %2 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<writeonly:250x370xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %3 = affine.apply #map0()[%workgroup_id_y] |
| %4 = affine.apply #map0()[%workgroup_count_y] |
| scf.for %arg0 = %3 to %c250 step %4 { |
| %5 = affine.apply #map0()[%workgroup_id_x] |
| %6 = affine.apply #map0()[%workgroup_count_x] |
| scf.for %arg1 = %5 to %c370 step %6 { |
| %7 = affine.min #map1(%arg0) |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32> |
| %9 = affine.min #map2(%arg1) |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32> |
| %11 = flow.dispatch.tensor.load %init, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x370xf32> -> tensor<?x?xf32> |
| %13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor<?x?xf32>) { |
| %14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) { |
| %15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) { |
| %16 = affine.min #map3(%arg2) |
| %17 = affine.min #map4(%arg6) |
| %18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32> |
| %19 = affine.min #map5(%arg4) |
| %20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32> |
| %21 = tensor.dim %arg7, %c0 : tensor<?x?xf32> |
| %22 = affine.min #map6(%21, %arg2) |
| %23 = tensor.dim %arg7, %c1 : tensor<?x?xf32> |
| %24 = affine.min #map6(%23, %arg4) |
| %25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| %27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| scf.yield %27 : tensor<?x?xf32> |
| } |
| scf.yield %15 : tensor<?x?xf32> |
| } |
| scf.yield %14 : tensor<?x?xf32> |
| } |
| flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:250x370xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @ro1, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro2, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro3, set=0, binding=2, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| } |
| |
// CHECK-LABEL: func @l1_tiled_matmul_no_fill()
| // CHECK-DAG: %[[M:.+]] = arith.constant 250 : index |
| // CHECK-DAG: %[[N:.+]] = arith.constant 370 : index |
| // CHECK-DAG: %[[K:.+]] = arith.constant 144 : index |
| // CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index |
| // CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@ro1[%{{.*}}] : memref<250x144xf32> |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@ro2[%{{.*}}] : memref<144x370xf32> |
| // CHECK-DAG: %[[INIT:.+]] = hal.interface.binding.subspan @io::@ro3[%{{.*}}] : memref<250x370xf32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@wo[%{{.*}}] : memref<250x370xf32> |
| // CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} { |
| // CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} { |
| // CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]]) |
| // CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32 |
| // CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]]) |
| // CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32 |
| // CHECK-DAG: %[[INIT_WORKGROUP_TILE:.+]] = memref.subview %[[INIT]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]] |
| // CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]] |
| // CHECK: linalg.copy(%[[INIT_WORKGROUP_TILE]], %[[DST_WORKGROUP_TILE]]) |
| // CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] { |
| // CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]] |
| // CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]] |
| // CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]]) |
| // CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]]) |
| // CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]] |
// CHECK-SAME: outs(%[[DST_L1_TILE]]

| // ----- |
| |
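// Same pattern with a read-write destination binding: the initial tile is loaded
// from the destination itself, so no linalg.copy is needed before the L1 loop nest.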
| #config0 = {tileSizes = [[64, 64]]} |
| #config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]} |
| #map0 = affine_map<()[s0] -> (s0 * 64)> |
| #map1 = affine_map<(d0) -> (64, -d0 + 250)> |
| #map2 = affine_map<(d0) -> (64, -d0 + 370)> |
| #map3 = affine_map<(d0) -> (32, -d0 + 250)> |
| #map4 = affine_map<(d0) -> (24, -d0 + 144)> |
| #map5 = affine_map<(d0) -> (32, -d0 + 370)> |
| #map6 = affine_map<(d0, d1) -> (32, d0 - d1)> |
| module { |
| func @l1_tiled_matmul_no_fill_readwrite() { |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c32 = arith.constant 32 : index |
| %c24 = arith.constant 24 : index |
| %c144 = arith.constant 144 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c250 = arith.constant 250 : index |
| %c370 = arith.constant 370 : index |
| %0 = hal.interface.binding.subspan @io::@ro1[%c0] : !flow.dispatch.tensor<readonly:250x144xf32> |
| %1 = hal.interface.binding.subspan @io::@ro2[%c0] : !flow.dispatch.tensor<readonly:144x370xf32> |
| %2 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<readwrite:250x370xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %3 = affine.apply #map0()[%workgroup_id_y] |
| %4 = affine.apply #map0()[%workgroup_count_y] |
| scf.for %arg0 = %3 to %c250 step %4 { |
| %5 = affine.apply #map0()[%workgroup_id_x] |
| %6 = affine.apply #map0()[%workgroup_count_x] |
| scf.for %arg1 = %5 to %c370 step %6 { |
| %7 = affine.min #map1(%arg0) |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32> |
| %9 = affine.min #map2(%arg1) |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32> |
| %11 = flow.dispatch.tensor.load %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : !flow.dispatch.tensor<readwrite:250x370xf32> -> tensor<?x?xf32> |
| %13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %11) -> (tensor<?x?xf32>) { |
| %14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) { |
| %15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) { |
| %16 = affine.min #map3(%arg2) |
| %17 = affine.min #map4(%arg6) |
| %18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32> |
| %19 = affine.min #map5(%arg4) |
| %20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32> |
| %21 = tensor.dim %arg7, %c0 : tensor<?x?xf32> |
| %22 = affine.min #map6(%21, %arg2) |
| %23 = tensor.dim %arg7, %c1 : tensor<?x?xf32> |
| %24 = affine.min #map6(%23, %arg4) |
| %25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| %27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| scf.yield %27 : tensor<?x?xf32> |
| } |
| scf.yield %15 : tensor<?x?xf32> |
| } |
| scf.yield %14 : tensor<?x?xf32> |
| } |
| flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<readwrite:250x370xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @ro1, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @ro2, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo, set=0, binding=2, type="StorageBuffer", access="Read|Write" |
| } |
| } |
| |
// CHECK-LABEL: func @l1_tiled_matmul_no_fill_readwrite()
| // CHECK-DAG: %[[M:.+]] = arith.constant 250 : index |
| // CHECK-DAG: %[[N:.+]] = arith.constant 370 : index |
| // CHECK-DAG: %[[K:.+]] = arith.constant 144 : index |
| // CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index |
| // CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@ro1[%{{.*}}] : memref<250x144xf32> |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@ro2[%{{.*}}] : memref<144x370xf32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@wo[%{{.*}}] : memref<250x370xf32> |
| // CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} { |
| // CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} { |
| // CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]]) |
| // CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32 |
| // CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]]) |
| // CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32 |
| // CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]] |
| // CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] { |
| // CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]] |
| // CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]] |
| // CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]]) |
| // CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]]) |
| // CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]] |
| // CHECK-SAME: outs(%[[DST_L1_TILE]] |
| |
| // ----- |
| |
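// Variant where the workgroup tile is produced by init_tensor + fill instead of
// being loaded from an init binding; the L1 loop nest otherwise matches the cases
// above.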
| #config0 = {tileSizes = [[64, 64]]} |
| #config1 = {nativeVectorSize = [4, 4, 4], tileSizes = [[64, 64], [32, 32, 24], [4, 4, 4]]} |
| #map0 = affine_map<()[s0] -> (s0 * 64)> |
| #map1 = affine_map<(d0) -> (64, -d0 + 250)> |
| #map2 = affine_map<(d0) -> (64, -d0 + 370)> |
| #map3 = affine_map<(d0) -> (32, -d0 + 250)> |
| #map4 = affine_map<(d0) -> (24, -d0 + 144)> |
| #map5 = affine_map<(d0) -> (32, -d0 + 370)> |
| #map6 = affine_map<(d0, d1) -> (32, d0 - d1)> |
| module { |
| func @l1_tiled_matmul() { |
| %cst = arith.constant 0.000000e+00 : f32 |
| %c32 = arith.constant 32 : index |
| %c24 = arith.constant 24 : index |
| %c144 = arith.constant 144 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c250 = arith.constant 250 : index |
| %c370 = arith.constant 370 : index |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:250x144xf32> |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:144x370xf32> |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:250x370xf32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %3 = affine.apply #map0()[%workgroup_id_y] |
| %4 = affine.apply #map0()[%workgroup_count_y] |
| scf.for %arg0 = %3 to %c250 step %4 { |
| %5 = affine.apply #map0()[%workgroup_id_x] |
| %6 = affine.apply #map0()[%workgroup_count_x] |
| scf.for %arg1 = %5 to %c370 step %6 { |
| %7 = affine.min #map1(%arg0) |
| %8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 144], strides = [1, 1] : !flow.dispatch.tensor<readonly:250x144xf32> -> tensor<?x144xf32> |
| %9 = affine.min #map2(%arg1) |
| %10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [144, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:144x370xf32> -> tensor<144x?xf32> |
| %11 = linalg.init_tensor [%7, %9] : tensor<?x?xf32> |
| %12 = linalg.fill(%cst, %11) {__internal_linalg_transform__ = "workgroup", lowering.config = #config0} : f32, tensor<?x?xf32> -> tensor<?x?xf32> |
| %13 = scf.for %arg2 = %c0 to %c250 step %c32 iter_args(%arg3 = %12) -> (tensor<?x?xf32>) { |
| %14 = scf.for %arg4 = %c0 to %c370 step %c32 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) { |
| %15 = scf.for %arg6 = %c0 to %c144 step %c24 iter_args(%arg7 = %arg5) -> (tensor<?x?xf32>) { |
| %16 = affine.min #map3(%arg2) |
| %17 = affine.min #map4(%arg6) |
| %18 = tensor.extract_slice %8[%arg2, %arg6] [%16, %17] [1, 1] : tensor<?x144xf32> to tensor<?x?xf32> |
| %19 = affine.min #map5(%arg4) |
| %20 = tensor.extract_slice %10[%arg6, %arg4] [%17, %19] [1, 1] : tensor<144x?xf32> to tensor<?x?xf32> |
| %21 = tensor.dim %arg7, %c0 : tensor<?x?xf32> |
| %22 = affine.min #map6(%21, %arg2) |
| %23 = tensor.dim %arg7, %c1 : tensor<?x?xf32> |
| %24 = affine.min #map6(%23, %arg4) |
| %25 = tensor.extract_slice %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %26 = linalg.matmul {__internal_linalg_transform__ = "workgroup_l1_tile", lowering.config = #config1} ins(%18, %20 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%25 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| %27 = tensor.insert_slice %26 into %arg7[%arg2, %arg4] [%22, %24] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| scf.yield %27 : tensor<?x?xf32> |
| } |
| scf.yield %15 : tensor<?x?xf32> |
| } |
| scf.yield %14 : tensor<?x?xf32> |
| } |
| flow.dispatch.tensor.store %13, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:250x370xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| } |
| |
// CHECK-LABEL: func @l1_tiled_matmul()
| // CHECK-DAG: %[[M:.+]] = arith.constant 250 : index |
| // CHECK-DAG: %[[N:.+]] = arith.constant 370 : index |
| // CHECK-DAG: %[[K:.+]] = arith.constant 144 : index |
| // CHECK-DAG: %[[L1_MN_SIZE:.+]] = arith.constant 32 : index |
| // CHECK-DAG: %[[L1_K_SIZE:.+]] = arith.constant 24 : index |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.*}}] : memref<250x144xf32> |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@s0b1_ro_external[%{{.*}}] : memref<144x370xf32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.*}}] : memref<250x370xf32> |
| // CHECK: scf.for %[[WORKGROUP_I:.+]] = %{{.*}} to %[[M]] step %{{.*}} { |
| // CHECK: scf.for %[[WORKGROUP_J:.+]] = %{{.*}} to %[[N]] step %{{.*}} { |
| // CHECK-DAG: %[[WORKGROUP_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I]]) |
| // CHECK-DAG: %[[LHS_WORKGROUP_TILE:.+]] = memref.subview %[[LHS]][%[[WORKGROUP_I]], 0] [%[[WORKGROUP_I_SIZE]], 144] [1, 1] : memref<250x144xf32> to memref<?x144xf32 |
| // CHECK-DAG: %[[WORKGROUP_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J]]) |
| // CHECK-DAG: %[[RHS_WORKGROUP_TILE:.+]] = memref.subview %[[RHS]][0, %[[WORKGROUP_J]]] [144, %[[WORKGROUP_J_SIZE]]] [1, 1] : memref<144x370xf32> to memref<144x?xf32 |
| // CHECK-DAG: %[[DST_WORKGROUP_TILE:.+]] = memref.subview %[[DST]][%[[WORKGROUP_I]], %[[WORKGROUP_J]]] [%[[WORKGROUP_I_SIZE]], %[[WORKGROUP_J_SIZE]]] |
| // CHECK: scf.for %[[L1_I:.+]] = %{{.*}} to %[[M]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_J:.+]] = %{{.*}} to %[[N]] step %[[L1_MN_SIZE]] { |
| // CHECK: scf.for %[[L1_K:.+]] = %{{.*}} to %[[K]] step %[[L1_K_SIZE]] { |
| // CHECK-DAG: %[[LHS_L1_TILE:.+]] = memref.subview %[[LHS_WORKGROUP_TILE]][%[[L1_I]], %[[L1_K]]] |
| // CHECK-DAG: %[[RHS_L1_TILE:.+]] = memref.subview %[[RHS_WORKGROUP_TILE]][%[[L1_K]], %[[L1_J]]] |
| // CHECK-DAG: %[[L1_I_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_I_SIZE]], %[[L1_I]]) |
| // CHECK-DAG: %[[L1_J_SIZE:.+]] = affine.min #{{.*}}(%[[WORKGROUP_J_SIZE]], %[[L1_J]]) |
| // CHECK-DAG: %[[DST_L1_TILE:.+]] = memref.subview %[[DST_WORKGROUP_TILE]][%[[L1_I]], %[[L1_J]]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_L1_TILE]], %[[RHS_L1_TILE]] |
| // CHECK-SAME: outs(%[[DST_L1_TILE]] |
| |
| // ----- |
| |
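// Bubble sort expressed with tensors carried through scf.for iter_args: the
// loop-carried value bufferizes to the output binding, so the input is copied there
// once and each extract/insert pair becomes a load/store on that buffer.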
| func @sort1D() { |
| %c4 = arith.constant 4 : index |
| %c3 = arith.constant 3 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@ro[%c0] : !flow.dispatch.tensor<readonly:4xi32> |
| %1 = hal.interface.binding.subspan @io::@wo[%c0] : !flow.dispatch.tensor<writeonly:4xi32> |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:4xi32> -> tensor<4xi32> |
| %3 = scf.for %arg0 = %c0 to %c4 step %c1 iter_args(%arg1 = %2) -> (tensor<4xi32>) { |
| %4 = scf.for %arg2 = %c0 to %c3 step %c1 iter_args(%arg3 = %arg1) -> (tensor<4xi32>) { |
| %5 = arith.addi %arg2, %c1 : index |
| %6 = tensor.extract %arg3[%arg2] : tensor<4xi32> |
| %7 = tensor.extract %arg3[%5] : tensor<4xi32> |
| %8 = arith.cmpi sgt, %6, %7 : i32 |
| %11 = scf.if %8 -> (tensor<4xi32>) { |
| %12 = tensor.insert %6 into %arg3[%5] : tensor<4xi32> |
| %13 = tensor.insert %7 into %12[%arg2] : tensor<4xi32> |
| scf.yield %13 : tensor<4xi32> |
| } else { |
| scf.yield %arg3 : tensor<4xi32> |
| } |
| scf.yield %11 : tensor<4xi32> |
| } |
| scf.yield %4 : tensor<4xi32> |
| } |
| flow.dispatch.tensor.store %3, %1, offsets = [], sizes = [], strides = [] : tensor<4xi32> -> !flow.dispatch.tensor<writeonly:4xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @ro, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @wo, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @sort1D() |
| // CHECK-DAG: %[[INPUT:.+]] = hal.interface.binding.subspan @io::@ro |
| // CHECK-DAG: %[[OUTPUT:.+]] = hal.interface.binding.subspan @io::@wo |
| // CHECK: linalg.copy(%[[INPUT]], %[[OUTPUT]]) |
| // CHECK: scf.for %[[ARG0:.+]] = |
| // CHECK: scf.for %[[ARG1:.+]] = |
| // CHECK-DAG: %[[P1:.+]] = arith.addi %[[ARG1]] |
| // CHECK-DAG: %[[V1:.+]] = memref.load %[[OUTPUT]][%[[ARG1]]] |
| // CHECK-DAG: %[[V2:.+]] = memref.load %[[OUTPUT]][%[[P1]]] |
| // CHECK: scf.if |
| // CHECK-DAG: memref.store %[[V1]], %[[OUTPUT]][%[[P1]]] |
// CHECK-DAG: memref.store %[[V2]], %[[OUTPUT]][%[[ARG1]]]

| // ----- |
| |
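// In-place sort on a single read-write binding: the loop nest loads and stores
// directly on that buffer, with no initial copy.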
| func @sort1D_inplace() { |
| %c4 = arith.constant 4 : index |
| %c3 = arith.constant 3 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %0 = hal.interface.binding.subspan @io::@rw[%c0] : !flow.dispatch.tensor<readwrite:4xi32> |
| %2 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:4xi32> -> tensor<4xi32> |
| %3 = scf.for %arg0 = %c0 to %c4 step %c1 iter_args(%arg1 = %2) -> (tensor<4xi32>) { |
| %4 = scf.for %arg2 = %c0 to %c3 step %c1 iter_args(%arg3 = %arg1) -> (tensor<4xi32>) { |
| %5 = arith.addi %arg2, %c1 : index |
| %6 = tensor.extract %arg3[%arg2] : tensor<4xi32> |
| %7 = tensor.extract %arg3[%5] : tensor<4xi32> |
| %8 = arith.cmpi sgt, %6, %7 : i32 |
| %11 = scf.if %8 -> (tensor<4xi32>) { |
| %12 = tensor.insert %6 into %arg3[%5] : tensor<4xi32> |
| %13 = tensor.insert %7 into %12[%arg2] : tensor<4xi32> |
| scf.yield %13 : tensor<4xi32> |
| } else { |
| scf.yield %arg3 : tensor<4xi32> |
| } |
| scf.yield %11 : tensor<4xi32> |
| } |
| scf.yield %4 : tensor<4xi32> |
| } |
| flow.dispatch.tensor.store %3, %0, offsets = [], sizes = [], strides = [] : tensor<4xi32> -> !flow.dispatch.tensor<readwrite:4xi32> |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @rw, set=0, binding=0, type="StorageBuffer", access="Read|Write" |
| } |
| // CHECK-LABEL: func @sort1D_inplace() |
| // CHECK-DAG: %[[INOUT:.+]] = hal.interface.binding.subspan @io::@rw |
| // CHECK: scf.for %[[ARG0:.+]] = |
| // CHECK: scf.for %[[ARG1:.+]] = |
| // CHECK-DAG: %[[P1:.+]] = arith.addi %[[ARG1]] |
| // CHECK-DAG: %[[V1:.+]] = memref.load %[[INOUT]][%[[ARG1]]] |
| // CHECK-DAG: %[[V2:.+]] = memref.load %[[INOUT]][%[[P1]]] |
| // CHECK: scf.if |
| // CHECK-DAG: memref.store %[[V1]], %[[INOUT]][%[[P1]]] |
| // CHECK-DAG: memref.store %[[V2]], %[[INOUT]][%[[ARG1]]] |
| |
| // ----- |
| |
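// linalg_ext.sort on a read-write binding bufferizes in place, taking the buffer
// directly as its outs operand.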
| func @linalg_ext_sort_1d() { |
| %c0 = arith.constant 0 : index |
| %0 = hal.interface.binding.subspan @io::@rw[%c0] : !flow.dispatch.tensor<readwrite:128xi32> |
| %1 = flow.dispatch.tensor.load %0, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readwrite:128xi32> -> tensor<128xi32> |
| %2 = linalg_ext.sort dimension(0) outs(%1 : tensor<128xi32>) { |
| ^bb0(%arg0: i32, %arg1: i32): // no predecessors |
| %3 = arith.cmpi sgt, %arg0, %arg1 : i32 |
| linalg_ext.yield %3 : i1 |
| } -> tensor<128xi32> |
| flow.dispatch.tensor.store %2, %0, offsets = [], sizes = [], strides = [] : tensor<128xi32> -> !flow.dispatch.tensor<readwrite:128xi32> |
| return |
| } |
| // CHECK-LABEL: func @linalg_ext_sort_1d() |
| // CHECK-DAG: %[[INOUT:.+]] = hal.interface.binding.subspan @io::@rw |
| // CHECK: linalg_ext.sort |
| // CHECK-SAME: dimension(0) |
| // CHECK-SAME: outs(%[[INOUT]] : memref<128xi32>) |
| |
| // ----- |
| |
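// Pure data movement: the load/store pair with push-constant offsets lowers to a
// memref.subview of the destination at the computed indices plus a linalg.copy.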
func @tensor_insert_slice() {
| %c0 = arith.constant 0 : index |
| %1 = hal.interface.load.constant offset = 0 : index |
| %2 = hal.interface.load.constant offset = 1 : index |
| %d0 = hal.interface.load.constant offset = 2 : index |
| %d1 = hal.interface.load.constant offset = 3 : index |
| %d2 = hal.interface.load.constant offset = 4 : index |
| %d3 = hal.interface.load.constant offset = 5 : index |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:?x?xi32>{%d0, %d1} |
| %3 = hal.interface.binding.subspan @io::@s0b1_xw_external[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%d2, %d3} |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %4 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_y, %workgroup_id_y] |
| %5 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_y, %workgroup_count_y] |
| scf.for %arg0 = %4 to %d0 step %5 { |
| %6 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %d0] |
| %7 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_x, %workgroup_id_x] |
| %8 = affine.apply affine_map<()[s0, s1] -> (s1 * s0)>()[%workgroup_size_x, %workgroup_count_x] |
| scf.for %arg1 = %7 to %d1 step %8 { |
| %9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %d1] |
| %10 = flow.dispatch.tensor.load %0, offsets = [%arg0, %arg1], sizes = [%6, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xi32> -> tensor<?x?xi32> |
| %11 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg0)[%1] |
| %12 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg1)[%2] |
| flow.dispatch.tensor.store %10, %3, offsets = [%11, %12], sizes = [%6, %9], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32> |
| } |
| } |
| return |
| } |
| hal.interface @io attributes {push_constants = 2 : index, sym_visibility = "private"} { |
| hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @s0b1_xw_external, set=0, binding=1, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK: #[[MAP:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> |
| // CHECK: func @tensor_insert_slice() |
| // CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.+}}] : memref<?x?xi32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b1_xw_external[%{{.+}}] : memref<?x?xi32> |
| // CHECK-DAG: %[[OFFSET_Y:.+]] = hal.interface.load.constant offset = 0 |
| // CHECK-DAG: %[[OFFSET_X:.+]] = hal.interface.load.constant offset = 1 |
| // CHECK: scf.for %[[IV0:.+]] = |
| // CHECK: scf.for %[[IV1:.+]] = |
| // CHECK-DAG: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]], %[[IV1]]] |
| // CHECK-DAG: %[[DST_IDX_Y:.+]] = affine.apply #[[MAP]](%[[IV0]])[%[[OFFSET_Y]]] |
| // CHECK-DAG: %[[DST_IDX_X:.+]] = affine.apply #[[MAP]](%[[IV1]])[%[[OFFSET_X]]] |
| // CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][%[[DST_IDX_Y]], %[[DST_IDX_X]]] |
// CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]])

| // ----- |
| |
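// Update-slice with a data-dependent offset read from a scalar binding; the store
// becomes a linalg.copy into a rank-reduced subview of the destination.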
func @dynamic_update_slice() {
| %c0 = arith.constant 0 : index |
| %c3 = arith.constant 3 : index |
| %c0_i32 = arith.constant 0 : i32 |
| %d0 = hal.interface.load.constant offset = 0 : index |
| %d1 = hal.interface.load.constant offset = 1 : index |
| %0 = hal.interface.binding.subspan @io::@s0b0_ro_external[%c0] : !flow.dispatch.tensor<readonly:?xi32>{%d0} |
| %1 = hal.interface.binding.subspan @io::@s0b1_ro_external[%c0] : !flow.dispatch.tensor<readonly:i32> |
| %2 = hal.interface.binding.subspan @io::@s0b2_xw_external[%c0] : !flow.dispatch.tensor<writeonly:?x?xi32>{%d1, %d0} |
| %3 = flow.dispatch.tensor.load %1, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:i32> -> tensor<i32> |
| %4 = tensor.extract %3[] : tensor<i32> |
| %5 = arith.cmpi slt, %4, %c0_i32 : i32 |
| %6 = select %5, %4, %c0_i32 : i32 |
| %7 = arith.cmpi sgt, %6, %c0_i32 : i32 |
| %8 = select %7, %6, %c0_i32 : i32 |
| %9 = arith.index_cast %8 : i32 to index |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %10 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_id_x] |
| %11 = affine.apply affine_map<()[s0] -> (s0 * 64)>()[%workgroup_count_x] |
| scf.for %arg0 = %10 to %d0 step %11 { |
| %12 = affine.min affine_map<(d0)[s0] -> (64, -d0 + s0)>(%arg0)[%d0] |
| %13 = flow.dispatch.tensor.load %0, offsets = [%arg0], sizes = [%12], strides = [1] : !flow.dispatch.tensor<readonly:?xi32> -> tensor<?xi32> |
| %14 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)>(%arg0)[%9] |
| flow.dispatch.tensor.store %13, %2, offsets = [0, %14], sizes = [1, %12], strides = [1, 1] : tensor<?xi32> -> !flow.dispatch.tensor<writeonly:?x?xi32> |
| } |
| return |
| } |
| hal.interface @io attributes {sym_visibility = "private"} { |
| hal.interface.binding @s0b0_ro_external, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @s0b1_ro_external, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @s0b2_xw_external, set=0, binding=2, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @dynamic_update_slice() |
| // CHECK-DAG: %[[SRC:.+]] = hal.interface.binding.subspan @io::@s0b0_ro_external[%{{.+}}] : memref<?xi32> |
| // CHECK-DAG: %[[DST:.+]] = hal.interface.binding.subspan @io::@s0b2_xw_external[%{{.+}}] : memref<?x?xi32> |
// CHECK-DAG: %[[D0:.+]] = hal.interface.load.constant offset = 0
// CHECK-DAG: %[[D1:.+]] = hal.interface.load.constant offset = 1
| // CHECK: scf.for %[[IV0:.+]] = |
| // CHECK: %[[SRC_VIEW:.+]] = memref.subview %[[SRC]][%[[IV0]]] |
| // CHECK-SAME: : memref<?xi32> to memref<?xi32, #{{.+}}> |
| // CHECK: %[[DST_VIEW:.+]] = memref.subview %[[DST]][0, %{{.+}}] [1, %{{.+}}] |
| // CHECK-SAME: : memref<?x?xi32> to memref<?xi32, #{{.+}}> |
| // CHECK: linalg.copy(%[[SRC_VIEW]], %[[DST_VIEW]]) |
| |
| // ----- |
| |
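// Multi-level tiling with a fused elementwise consumer: the fill and the trailing
// linalg.generic both target the same output subview, so the matmul accumulates in
// place and no temporary allocation is created.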
| func @multi_level_tile_fuse() { |
| %c4 = arith.constant 4 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c2 = arith.constant 2 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %m = hal.interface.load.constant offset = 0 : index |
| %n = hal.interface.load.constant offset = 1 : index |
| %k = hal.interface.load.constant offset = 2 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} |
| %2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:f32> |
| %3 = hal.interface.binding.subspan @io::@arg3[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n} |
| %4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_size_x = hal.interface.workgroup.size[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %workgroup_size_y = hal.interface.workgroup.size[1] : index |
| %5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y] |
| %6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y] |
| scf.for %arg0 = %5 to %m step %6 { |
| %7 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x] |
| %8 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x] |
| scf.for %arg1 = %7 to %n step %8 { |
| %9 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg0)[%workgroup_size_y, %m] |
| %10 = affine.min affine_map<(d0)[s0, s1] -> (s0, -d0 + s1)>(%arg1)[%workgroup_size_x, %n] |
| %11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32> |
| %13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%9, %k], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [%k, %10], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %16 = linalg.init_tensor [%9, %10] : tensor<?x?xf32> |
| %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32> |
| %18 = scf.for %arg2 = %c0 to %9 step %c4 iter_args(%arg3 = %17) -> (tensor<?x?xf32>) { |
| %20 = scf.for %arg4 = %c0 to %10 step %c4 iter_args(%arg5 = %arg3) -> (tensor<?x?xf32>) { |
| %21 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%9, %arg2) |
| %22 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%10, %arg4) |
| %23 = tensor.extract_slice %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %24 = scf.for %arg6 = %c0 to %21 step %c4 iter_args(%arg7 = %23) -> (tensor<?x?xf32>) { |
| %26 = scf.for %arg8 = %c0 to %22 step %c4 iter_args(%arg9 = %arg7) -> (tensor<?x?xf32>) { |
| %27 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%21, %arg6) |
| %28 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg6, %arg2) |
| %29 = tensor.extract_slice %13[%28, 0] [%27, %k] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %30 = affine.min affine_map<(d0, d1) -> (4, d0 - d1)>(%22, %arg8) |
| %31 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%arg8, %arg4) |
| %32 = tensor.extract_slice %15[0, %31] [%k, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %33 = tensor.extract_slice %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32> |
| %34 = linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering.config = {nativeVectorSize = [4, 4, 4], tileSizes = [[], [4, 4, 4], [4, 4, 4]]}} ins(%29, %32 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%33 : tensor<?x?xf32>) -> tensor<?x?xf32> |
| %35 = tensor.insert_slice %34 into %arg9[%arg6, %arg8] [%27, %30] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| scf.yield %35 : tensor<?x?xf32> |
| } |
| scf.yield %26 : tensor<?x?xf32> |
| } |
| %25 = tensor.insert_slice %24 into %arg5[%arg2, %arg4] [%21, %22] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32> |
| scf.yield %25 : tensor<?x?xf32> |
| } |
| scf.yield %20 : tensor<?x?xf32> |
| } |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %18 : tensor<f32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors |
| %20 = arith.addf %arg2, %arg3 : f32 |
| linalg.yield %20 : f32 |
| } -> tensor<?x?xf32> |
| flow.dispatch.tensor.store %19, %3, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg3, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @multi_level_tile_fuse() |
| // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index |
| // CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0 |
| // CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1 |
| // CHECK-DAG: %[[K:.+]] = hal.interface.load.constant offset = 2 |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[K]]} |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%[[C0]]] : memref<?x?xf32>{%[[K]], %[[N]]} |
| // CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan @io::@arg2[%[[C0]]] : memref<f32> |
| // CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan @io::@arg3[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[N]]} |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]] |
| // CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]] |
| // CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]] |
| // CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]]) |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK: %[[OUT_SUBVIEW2:.+]] = memref.subview %[[OUT_SUBVIEW1]] |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK-DAG: %[[LHS_SUBVIEW2:.+]] = memref.subview %[[LHS_SUBVIEW1]] |
| // CHECK-DAG: %[[RHS_SUBVIEW2:.+]] = memref.subview %[[RHS_SUBVIEW1]] |
| // CHECK-DAG: %[[OUT_SUBVIEW3:.+]] = memref.subview %[[OUT_SUBVIEW2]] |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_SUBVIEW2]], %[[RHS_SUBVIEW2]] : |
| // CHECK-SAME: outs(%[[OUT_SUBVIEW3]] : |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[SCALAR]], %[[OUT_SUBVIEW1]] : |
| // CHECK-SAME: outs(%[[OUT_SUBVIEW1]] : |
| |
| // ----- |
| |
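// Single-level counterpart of the fusion case above: fill, matmul, and the consumer
// generic all reuse one output subview.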
| func @operand_fusion() { |
| %c4 = arith.constant 4 : index |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c2 = arith.constant 2 : index |
| %cst = arith.constant 0.000000e+00 : f32 |
| %m = hal.interface.load.constant offset = 0 : index |
| %n = hal.interface.load.constant offset = 1 : index |
| %k = hal.interface.load.constant offset = 2 : index |
| %0 = hal.interface.binding.subspan @io::@arg0[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%m, %k} |
| %1 = hal.interface.binding.subspan @io::@arg1[%c0] : !flow.dispatch.tensor<readonly:?x?xf32>{%k, %n} |
| %2 = hal.interface.binding.subspan @io::@arg2[%c0] : !flow.dispatch.tensor<readonly:f32> |
| %3 = hal.interface.binding.subspan @io::@arg3[%c0] : !flow.dispatch.tensor<writeonly:?x?xf32>{%m, %n} |
| %4 = flow.dispatch.tensor.load %2, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:f32> -> tensor<f32> |
| %workgroup_id_x = hal.interface.workgroup.id[0] : index |
| %workgroup_count_x = hal.interface.workgroup.count[0] : index |
| %workgroup_id_y = hal.interface.workgroup.id[1] : index |
| %workgroup_count_y = hal.interface.workgroup.count[1] : index |
| %5 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y] |
| %6 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y] |
| scf.for %arg0 = %5 to %c2 step %6 { |
| %7 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_x] |
| %8 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_x] |
| scf.for %arg1 = %7 to %c1 step %8 { |
| %9 = affine.min affine_map<(d0) -> (4, -d0 + 2)>(%arg0) |
| %10 = affine.min affine_map<(d0) -> (4, -d0 + 1)>(%arg1) |
| %11 = linalg.init_tensor [%9, %10] : tensor<?x?xf32> |
| %12 = affine.min affine_map<(d0) -> (-d0 + 2, 4)>(%arg0) |
| %13 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%12, 3], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %14 = affine.min affine_map<(d0) -> (-d0 + 1, 4)>(%arg1) |
| %15 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [3, %14], strides = [1, 1] : !flow.dispatch.tensor<readonly:?x?xf32> -> tensor<?x?xf32> |
| %16 = linalg.init_tensor [%12, %14] : tensor<?x?xf32> |
| %17 = linalg.fill(%cst, %16) {__internal_linalg_transform__ = "workgroup"} : f32, tensor<?x?xf32> -> tensor<?x?xf32> |
| %18 = linalg.matmul {__internal_linalg_transform__ = "workgroup"} |
| ins(%13, %15 : tensor<?x?xf32>, tensor<?x?xf32>) |
| outs(%17: tensor<?x?xf32>) -> tensor<?x?xf32> |
| %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> ()>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%4, %18 : tensor<f32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) attrs = {__internal_linalg_transform__ = "workgroup"} { |
| ^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors |
| %20 = arith.addf %arg2, %arg3 : f32 |
| linalg.yield %20 : f32 |
| } -> tensor<?x?xf32> |
| flow.dispatch.tensor.store %19, %3, offsets = [%arg0, %arg1], sizes = [%9, %10], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:?x?xf32> |
| } |
| } |
| return |
| } |
| hal.interface private @io { |
| hal.interface.binding @arg0, set=0, binding=0, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg1, set=0, binding=1, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg2, set=0, binding=2, type="StorageBuffer", access="Read" |
| hal.interface.binding @arg3, set=0, binding=3, type="StorageBuffer", access="Write|Discard" |
| } |
| // CHECK-LABEL: func @operand_fusion() |
| // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index |
| // CHECK-DAG: %[[M:.+]] = hal.interface.load.constant offset = 0 |
| // CHECK-DAG: %[[N:.+]] = hal.interface.load.constant offset = 1 |
| // CHECK-DAG: %[[K:.+]] = hal.interface.load.constant offset = 2 |
| // CHECK-DAG: %[[LHS:.+]] = hal.interface.binding.subspan @io::@arg0[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[K]]} |
| // CHECK-DAG: %[[RHS:.+]] = hal.interface.binding.subspan @io::@arg1[%[[C0]]] : memref<?x?xf32>{%[[K]], %[[N]]} |
| // CHECK-DAG: %[[SCALAR:.+]] = hal.interface.binding.subspan @io::@arg2[%[[C0]]] : memref<f32> |
| // CHECK-DAG: %[[OUT:.+]] = hal.interface.binding.subspan @io::@arg3[%[[C0]]] : memref<?x?xf32>{%[[M]], %[[N]]} |
| // CHECK: scf.for |
| // CHECK: scf.for |
| // CHECK-DAG: %[[LHS_SUBVIEW1:.+]] = memref.subview %[[LHS]] |
| // CHECK-DAG: %[[RHS_SUBVIEW1:.+]] = memref.subview %[[RHS]] |
| // CHECK-DAG: %[[OUT_SUBVIEW1:.+]] = memref.subview %[[OUT]] |
| // CHECK: linalg.fill(%{{.+}}, %[[OUT_SUBVIEW1]]) |
| // CHECK: linalg.matmul |
| // CHECK-SAME: ins(%[[LHS_SUBVIEW1]], %[[RHS_SUBVIEW1]] : |
| // CHECK-SAME: outs(%[[OUT_SUBVIEW1]] : |
| // CHECK: linalg.generic |
| // CHECK-SAME: ins(%[[SCALAR]], %[[OUT_SUBVIEW1]] : |
| // CHECK-SAME: outs(%[[OUT_SUBVIEW1]] : |