// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-propagate-reshapes-by-expansion))" \
// RUN: --split-input-file %s --mlir-print-local-scope | FileCheck %s
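
// Propagating the collapse_shape should turn the plain linalg.copy into a
// linalg.generic on the expanded 3x4 source, while the copy that carries a
// lowering_config keeps consuming the collapsed tensor.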
func.func @reshape_and_lowering_config(%src: tensor<3x4xf16>, %dest: tensor<12xf16>, %dest2: tensor<12xf16>) -> tensor<12xf16> {
  %collapse = tensor.collapse_shape %src [[0, 1]] : tensor<3x4xf16> into tensor<12xf16>
  %copy = linalg.copy ins(%collapse : tensor<12xf16>) outs(%dest: tensor<12xf16>) -> tensor<12xf16>
  %copy2 = linalg.copy {lowering_config = #iree_gpu.derived_thread_config} ins(%copy : tensor<12xf16>) outs(%dest2: tensor<12xf16>) -> tensor<12xf16>
  return %copy2: tensor<12xf16>
}

// CHECK-LABEL: func @reshape_and_lowering_config
// CHECK-SAME: %[[SRC:[A-Za-z0-9]+]]: tensor<3x4xf16>
// CHECK: %[[COPY1:.+]] = linalg.generic {{.*}} ins(%[[SRC]]
// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[COPY1]]
// CHECK: linalg.copy
// CHECK-SAME: lowering_config = #iree_gpu.derived_thread_config
// CHECK-SAME: ins(%[[COLLAPSE]]

// -----
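
// The collapse_shape should be folded into the binding: the subspan and the
// flow.dispatch.tensor.load are rewritten to the collapsed ?x32 shape, with
// the dynamic extent recomputed as 2 * the loaded constant.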
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">], flags = Indirect>
func.func @fold_collapse_into_loads_dynamic() -> tensor<?x32xf32> {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x?x32xf32>>{%0}
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2, %0, 32], strides = [1, 1, 1]
      : !flow.dispatch.tensor<readonly:tensor<2x?x32xf32>>{%0} -> tensor<2x?x32xf32>
  %3 = tensor.collapse_shape %2 [[0, 1], [2]] : tensor<2x?x32xf32> into tensor<?x32xf32>
  return %3 : tensor<?x32xf32>
}
// CHECK-LABEL: func @fold_collapse_into_loads_dynamic()
// CHECK: %[[CONST:.+]] = hal.interface.constant.load
// CHECK: %[[SHAPE:.+]] = affine.apply affine_map<()[s0] -> (s0 * 2)>()[%[[CONST]]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK-SAME: !flow.dispatch.tensor<readonly:tensor<?x32xf32>>{%[[SHAPE]]}
// CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]]
// CHECK-SAME: offsets = [0, 0], sizes = [%[[SHAPE]], 32], strides = [1, 1]
// CHECK-SAME: !flow.dispatch.tensor<readonly:tensor<?x32xf32>>{%[[SHAPE]]}

// -----
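
// The expand_shape should be folded into the binding: the subspan and the load
// are rewritten to the expanded 2x?x16x32 shape, with the dynamic extent
// derived by dividing the loaded constant by 16.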
#pipeline_layout = #hal.pipeline.layout<constants = 2, bindings = [
    #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">], flags = Indirect>
func.func @fold_expand_into_loads_dynamic() -> tensor<2x?x16x32xf32> {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x?x32xf32>>{%0}
  %2 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [2, %0, 32], strides = [1, 1, 1]
      : !flow.dispatch.tensor<readonly:tensor<2x?x32xf32>>{%0} -> tensor<2x?x32xf32>
  %3 = affine.apply affine_map<()[s0] -> (s0 floordiv 16)>()[%0]
  %4 = tensor.expand_shape %2 [[0], [1, 2], [3]] output_shape [2, %3, 16, 32] : tensor<2x?x32xf32> into tensor<2x?x16x32xf32>
  return %4 : tensor<2x?x16x32xf32>
}
// CHECK-LABEL: func @fold_expand_into_loads_dynamic()
// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : index
// CHECK-DAG: %[[CONST:.+]] = hal.interface.constant.load
// CHECK: %[[SHAPE:.+]] = arith.divui %[[CONST]], %[[C16]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK-SAME: !flow.dispatch.tensor<readonly:tensor<2x?x16x32xf32>>{%[[SHAPE]]}
// CHECK: %[[LOAD:.+]] = flow.dispatch.tensor.load %[[SUBSPAN]]
// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, %[[SHAPE]], 16, 32], strides = [1, 1, 1, 1]
// CHECK-SAME: !flow.dispatch.tensor<readonly:tensor<2x?x16x32xf32>>{%[[SHAPE]]}

// -----
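
// The collapse_shape feeding the store should be folded into the binding: the
// subspan and the flow.dispatch.tensor.store are rewritten to the expanded
// 2x?x32 shape, with the dynamic extent derived by dividing the loaded
// constant by 2.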
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @fold_collapse_into_stores_dynamic(%arg0 : tensor<2x?x32xf32>) {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x32xf32>>{%0}
  %2 = tensor.collapse_shape %arg0 [[0, 1], [2]] : tensor<2x?x32xf32> into tensor<?x32xf32>
  flow.dispatch.tensor.store %2, %1, offsets = [0, 0], sizes = [%0, 32], strides = [1, 1]
      : tensor<?x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x32xf32>>{%0}
  return
}
// CHECK-LABEL: func @fold_collapse_into_stores_dynamic(
// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index
// CHECK: %[[CONST:.+]] = hal.interface.constant.load
// CHECK: %[[SHAPE:.+]] = arith.divui %[[CONST]], %[[C2]]
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<2x?x32xf32>>{%[[SHAPE]]}
// CHECK: flow.dispatch.tensor.store %{{.+}}, %[[SUBSPAN]]
// CHECK-SAME: offsets = [0, 0, 0], sizes = [2, %[[SHAPE]], 32], strides = [1, 1, 1]
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<2x?x32xf32>>{%[[SHAPE]]}

// -----
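
// The expand_shape inside the scf.forall should be hoisted into the loop
// destination: the tensor.empty, the shared_outs, the extract/parallel_insert
// slices, the subspan, and the final store are all rewritten to the expanded
// ?x64x4x4x2 shape.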
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @expand_dest_forall() {
  %cst = arith.constant 0.000000e+00 : f16
  %c0 = arith.constant 0 : index
  %index = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x64x32xf32>>{%index}
  %1 = tensor.empty(%index) : tensor<?x64x32xf32>
  %extra = tensor.empty() : tensor<32x32xf32>
  %2 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (16, 16)
      shared_outs(%arg2 = %1) -> (tensor<?x64x32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%c0, %arg0, %arg1] [1, 16, 16] [1, 1, 1]
        : tensor<?x64x32xf32> to tensor<1x16x16xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0], [1], [2, 3, 4]]
        output_shape [1, 16, 2, 4, 2] : tensor<1x16x16xf32> into tensor<1x16x2x4x2xf32>
    %expanded_barrier = util.optimization_barrier %expanded : tensor<1x16x2x4x2xf32>
    %collapsed = tensor.collapse_shape %expanded_barrier [[0], [1], [2, 3, 4]] : tensor<1x16x2x4x2xf32> into tensor<1x16x16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%c0, %arg0, %arg1] [1, 16, 16] [1, 1, 1]
          : tensor<1x16x16xf32> into tensor<?x64x32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [%index, 64, 32], strides = [1, 1, 1]
      : tensor<?x64x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x64x32xf32>>{%index}
  return
}

// CHECK-LABEL: func @expand_dest_forall(
// CHECK: %[[LOAD_CONST:.+]] = hal.interface.constant.load
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[LOAD_CONST]]) : tensor<?x64x4x4x2xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) = (0, 0)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<?x64x4x4x2xf32>) {
// CHECK-DAG: %[[OFFSET:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 8)>()[%[[ARG1]]]
// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[ARG2]]
// CHECK-SAME: [0, %[[ARG0]], %[[OFFSET]], 0, 0] [1, 16, 2, 4, 2] [1, 1, 1, 1, 1]
// CHECK-SAME: tensor<?x64x4x4x2xf32> to tensor<1x16x2x4x2xf32>
// CHECK: %[[BARRIER:.+]] = util.optimization_barrier %[[EXTRACT]] : tensor<1x16x2x4x2xf32>
// CHECK: tensor.parallel_insert_slice %[[BARRIER]] into %[[ARG2]]
// CHECK-SAME: [0, %[[ARG0]], %[[OFFSET]], 0, 0] [1, 16, 2, 4, 2] [1, 1, 1, 1, 1]
// CHECK-SAME: tensor<1x16x2x4x2xf32> into tensor<?x64x4x4x2xf32>
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [0, 0, 0, 0, 0], sizes = [%[[LOAD_CONST]], 64, 4, 4, 2], strides = [1, 1, 1, 1, 1]
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<?x64x4x4x2xf32>>{%[[LOAD_CONST]]}

// -----
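
// Same as above, but with two forall results; only the result whose slices get
// reshaped (the tensor<32xf32> one) is expanded to tensor<4x8xf32>, while the
// other shared output is left untouched.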
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>,
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @expand_dest_forall_multiresult() {
  %cst = arith.constant 0.000000e+00 : f16
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64)
      offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32x32xf32>>
  %2 = tensor.empty() : tensor<32xf32>
  %3 = tensor.empty() : tensor<32x32xf32>
  %4:2 = scf.forall (%arg0) = (0) to (32) step (16)
      shared_outs(%arg1 = %3, %arg2 = %2) -> (tensor<32x32xf32>, tensor<32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0] [16] [1] : tensor<32xf32> to tensor<16xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] output_shape [2, 8]
        : tensor<16xf32> into tensor<2x8xf32>
    %5 = util.optimization_barrier %expanded : tensor<2x8xf32>
    %collapsed = tensor.collapse_shape %5 [[0, 1]] : tensor<2x8xf32> into tensor<16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %arg1 into %arg1[%c0, %c0] [32, 32] [1, 1]
          : tensor<32x32xf32> into tensor<32x32xf32>
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0] [16] [1]
          : tensor<16xf32> into tensor<32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>]}
  flow.dispatch.tensor.store %4#1, %0, offsets = [0], sizes = [32], strides = [1]
      : tensor<32xf32> -> !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  flow.dispatch.tensor.store %4#0, %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1]
      : tensor<32x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<32x32xf32>>
  return
}

// CHECK-LABEL: func @expand_dest_forall_multiresult(
// CHECK: %[[SUBSPAN0:.+]] = hal.interface.binding.subspan
// CHECK: %[[SUBSPAN1:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY0:.+]] = tensor.empty() : tensor<32x32xf32>
// CHECK: %[[EMPTY1:.+]] = tensor.empty() : tensor<4x8xf32>
// CHECK: %[[SCFFORALL:.+]]:2 = scf.forall (%[[ARG0:.+]]) = (0) to (32) step (16)
// CHECK-SAME: shared_outs(%[[ARG1:.+]] = %[[EMPTY0]], %[[ARG2:.+]] = %[[EMPTY1]])
// CHECK-SAME: -> (tensor<32x32xf32>, tensor<4x8xf32>) {
// CHECK-DAG: %[[OFFSET:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 8)>()[%[[ARG0]]]
// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[ARG2]]
// CHECK-SAME: [%[[OFFSET]], 0] [2, 8] [1, 1]
// CHECK-SAME: tensor<4x8xf32> to tensor<2x8xf32>
// CHECK: %[[BARRIER:.+]] = util.optimization_barrier %[[EXTRACT]] : tensor<2x8xf32>
// CHECK: tensor.parallel_insert_slice %[[ARG1]] into %[[ARG1]]
// CHECK-SAME: tensor<32x32xf32> into tensor<32x32xf32>
// CHECK: tensor.parallel_insert_slice %[[BARRIER]] into %[[ARG2]]
// CHECK-SAME: [%[[OFFSET]], 0] [2, 8] [1, 1]
// CHECK-SAME: tensor<2x8xf32> into tensor<4x8xf32>
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]]#1, %[[SUBSPAN0]]
// CHECK-SAME: offsets = [0, 0], sizes = [4, 8], strides = [1, 1]
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<4x8xf32>>
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]]#0, %[[SUBSPAN1]]
// CHECK-SAME: offsets = [0, 0], sizes = [32, 32], strides = [1, 1]
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<32x32xf32>>

// -----
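
// No expansion is expected here: the slice and the expand_shape inside the loop
// have dynamic sizes, so the destination cannot be reshaped statically.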
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @noexpand_dest_forall_dynamicpacked() {
  %index1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %index2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %index3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %cst = arith.constant 0.000000e+00 : f16
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  %2 = tensor.empty() : tensor<32xf32>
  %4 = scf.forall (%arg0) = (0) to (32) step (16)
      shared_outs(%arg2 = %2) -> (tensor<32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0] [%index1] [1] : tensor<32xf32> to tensor<?xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] output_shape [%index2, %index3]
        : tensor<?xf32> into tensor<?x?xf32>
    %5 = util.optimization_barrier %expanded : tensor<?x?xf32>
    %collapsed = tensor.collapse_shape %5 [[0, 1]] : tensor<?x?xf32> into tensor<?xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0] [%index1] [1]
          : tensor<?xf32> into tensor<32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>]}
  flow.dispatch.tensor.store %4, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
      -> !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  return
}

// CHECK-LABEL: func @noexpand_dest_forall_dynamicpacked(
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]]) = (0) to (32) step (16)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<32xf32>) {
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<32xf32>>

// -----
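
// No expansion is expected here: the extracted slice is consumed by arith.negf
// rather than directly by a tensor.expand_shape, so the forall destination is
// left as tensor<32xf32>.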
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @expand_dest_forall_unsupporteduse() {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  %2 = tensor.empty() : tensor<32xf32>
  %4 = scf.forall (%arg0) = (0) to (32) step (16)
      shared_outs(%arg2 = %2) -> (tensor<32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0] [16] [1] : tensor<32xf32> to tensor<16xf32>
    %arith_op = arith.negf %extracted_slice : tensor<16xf32>
    %expanded = tensor.expand_shape %arith_op [[0, 1]] output_shape [2, 8]
        : tensor<16xf32> into tensor<2x8xf32>
    %5 = util.optimization_barrier %expanded : tensor<2x8xf32>
    %collapsed = tensor.collapse_shape %5 [[0, 1]] : tensor<2x8xf32> into tensor<16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0] [16] [1]
          : tensor<16xf32> into tensor<32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>]}
  flow.dispatch.tensor.store %4, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
      -> !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  return
}

// CHECK-LABEL: func @expand_dest_forall_unsupporteduse(
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]]) = (0) to (32) step (16)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<32xf32>) {
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<32xf32>>

// -----
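
// No expansion is expected here: the scf.forall has no workgroup mapping, so
// the destination is left unchanged.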
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @noexpand_dest_forall_nomapping() {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  %2 = tensor.empty() : tensor<32xf32>
  %4 = scf.forall (%arg0) = (0) to (32) step (16)
      shared_outs(%arg2 = %2) -> (tensor<32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0] [16] [1] : tensor<32xf32> to tensor<16xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] output_shape [2, 8]
        : tensor<16xf32> into tensor<2x8xf32>
    %5 = util.optimization_barrier %expanded : tensor<2x8xf32>
    %collapsed = tensor.collapse_shape %5 [[0, 1]] : tensor<2x8xf32> into tensor<16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0] [16] [1]
          : tensor<16xf32> into tensor<32xf32>
    }
  }
  flow.dispatch.tensor.store %4, %0, offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
      -> !flow.dispatch.tensor<writeonly:tensor<32xf32>>
  return
}

// CHECK-LABEL: func @noexpand_dest_forall_nomapping(
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]]) = (0) to (32) step (16)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<32xf32>) {
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [0], sizes = [32], strides = [1] : tensor<32xf32>
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<32xf32>>

// -----
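
// No expansion is expected here: the forall result is stored at offset 1 into a
// larger tensor<34xf32> binding, i.e. not as a full slice, so the destination
// cannot be expanded.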
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @noexpand_dest_forall_notfullslicestore() {
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<34xf32>>
  %2 = tensor.empty() : tensor<32xf32>
  %4 = scf.forall (%arg0) = (0) to (32) step (16)
      shared_outs(%arg2 = %2) -> (tensor<32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%arg0] [16] [1] : tensor<32xf32> to tensor<16xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0, 1]] output_shape [2, 8]
        : tensor<16xf32> into tensor<2x8xf32>
    %5 = util.optimization_barrier %expanded : tensor<2x8xf32>
    %collapsed = tensor.collapse_shape %5 [[0, 1]] : tensor<2x8xf32> into tensor<16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed into %arg2[%arg0] [16] [1]
          : tensor<16xf32> into tensor<32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>]}
  flow.dispatch.tensor.store %4, %0, offsets = [1], sizes = [32], strides = [1] : tensor<32xf32>
      -> !flow.dispatch.tensor<writeonly:tensor<34xf32>>
  return
}

// CHECK-LABEL: func @noexpand_dest_forall_notfullslicestore(
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]]) = (0) to (32) step (16)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<32xf32>) {
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [1], sizes = [32], strides = [1] : tensor<32xf32>
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<34xf32>>

// -----
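
// Chained expand_shapes inside the loop compose: the destination, the slices,
// and the final store are rewritten to the fully expanded ?x32x2x4x4x2 shape,
// with the slice offsets rescaled by floordiv.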
#pipeline_layout = #hal.pipeline.layout<constants = 1, bindings = [
    #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
func.func @expand_dest_forall_chained() {
  %cst = arith.constant 0.000000e+00 : f16
  %c0 = arith.constant 0 : index
  %index = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : index
  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0)
      flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<?x64x32xf32>>{%index}
  %1 = tensor.empty(%index) : tensor<?x64x32xf32>
  %extra = tensor.empty() : tensor<32x32xf32>
  %2 = scf.forall (%arg0, %arg1) = (0, 0) to (64, 32) step (16, 16)
      shared_outs(%arg2 = %1) -> (tensor<?x64x32xf32>) {
    %extracted_slice = tensor.extract_slice %arg2[%c0, %arg0, %arg1] [1, 16, 16] [1, 1, 1]
        : tensor<?x64x32xf32> to tensor<1x16x16xf32>
    %expanded = tensor.expand_shape %extracted_slice [[0], [1], [2, 3, 4]]
        output_shape [1, 16, 2, 4, 2] : tensor<1x16x16xf32> into tensor<1x16x2x4x2xf32>
    %expanded2 = tensor.expand_shape %expanded [[0], [1, 2], [3], [4], [5]]
        output_shape [1, 8, 2, 2, 4, 2] : tensor<1x16x2x4x2xf32> into tensor<1x8x2x2x4x2xf32>
    %expanded_barrier = util.optimization_barrier %expanded2 : tensor<1x8x2x2x4x2xf32>
    %collapsed = tensor.collapse_shape %expanded_barrier [[0], [1, 2], [3], [4], [5]] : tensor<1x8x2x2x4x2xf32> into tensor<1x16x2x4x2xf32>
    %collapsed2 = tensor.collapse_shape %collapsed [[0], [1], [2, 3, 4]] : tensor<1x16x2x4x2xf32> into tensor<1x16x16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %collapsed2 into %arg2[%c0, %arg0, %arg1] [1, 16, 16] [1, 1, 1]
          : tensor<1x16x16xf32> into tensor<?x64x32xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %2, %0, offsets = [0, 0, 0], sizes = [%index, 64, 32], strides = [1, 1, 1]
      : tensor<?x64x32xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x64x32xf32>>{%index}
  return
}

// CHECK-LABEL: func @expand_dest_forall_chained(
// CHECK: %[[LOAD_CONST:.+]] = hal.interface.constant.load
// CHECK: %[[SUBSPAN:.+]] = hal.interface.binding.subspan
// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[LOAD_CONST]]) : tensor<?x32x2x4x4x2xf32>
// CHECK: %[[SCFFORALL:.+]] = scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) = (0, 0)
// CHECK-SAME: shared_outs(%[[ARG2:.+]] = %[[EMPTY]]) -> (tensor<?x32x2x4x4x2xf32>) {
// CHECK-DAG: %[[OFFSET0:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 8)>()[%[[ARG1]]]
// CHECK-DAG: %[[OFFSET1:.+]] = affine.apply affine_map<()[s0] -> (s0 floordiv 2)>()[%[[ARG0]]]
// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[ARG2]]
// CHECK-SAME: [0, %[[OFFSET1]], 0, %[[OFFSET0]], 0, 0] [1, 8, 2, 2, 4, 2] [1, 1, 1, 1, 1, 1]
// CHECK-SAME: tensor<?x32x2x4x4x2xf32> to tensor<1x8x2x2x4x2xf32>
// CHECK: %[[BARRIER:.+]] = util.optimization_barrier %[[EXTRACT]] : tensor<1x8x2x2x4x2xf32>
// CHECK: tensor.parallel_insert_slice %[[BARRIER]] into %[[ARG2]]
// CHECK-SAME: [0, %[[OFFSET1]], 0, %[[OFFSET0]], 0, 0] [1, 8, 2, 2, 4, 2] [1, 1, 1, 1, 1, 1]
// CHECK-SAME: tensor<1x8x2x2x4x2xf32> into tensor<?x32x2x4x4x2xf32>
// CHECK: flow.dispatch.tensor.store %[[SCFFORALL]], %[[SUBSPAN]]
// CHECK-SAME: offsets = [0, 0, 0, 0, 0, 0], sizes = [%[[LOAD_CONST]], 32, 2, 4, 4, 2], strides = [1, 1, 1, 1, 1, 1]
// CHECK-SAME: !flow.dispatch.tensor<writeonly:tensor<?x32x2x4x4x2xf32>>{%[[LOAD_CONST]]}