// RUN: iree-opt --iree-transform-dialect-interpreter --split-input-file --canonicalize --cse %s | FileCheck %s

#layout_col_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [1, 2],
  outer_tile = [1, 1],
  thread_tile = [4, 8],
  element_tile = [4, 1],

  subgroup_strides = [1, 1],
  thread_strides = [8, 1]
>
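// Each dimension of the undistributed 16x16 type factors as
// subgroup * batch * outer * thread * element:
//   dim 0: 1 * 1 * 1 * 4 * 4 = 16, dim 1: 1 * 2 * 1 * 8 * 1 = 16.
// The per-thread (SIMT) value keeps only the batch, outer, and element tiles,
// i.e. vector<1x2x1x1x4x1xf16> in the checks below.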

// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 4 - ((s0 floordiv 8) floordiv 4) * 16)>
// CHECK: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)>
// CHECK-LABEL: @distribute_transfer_read_col_major
func.func @distribute_transfer_read_col_major(%arg0: memref<32x32xf16>) -> vector<16x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0], %cst {in_bounds = [true, true]}
    : memref<32x32xf16>, vector<16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16>
  func.return %rootl : vector<16x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[X:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK: %[[Y:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
// CHECK: %[[RD00:.+]] = vector.transfer_read %arg0[%[[X]], %[[Y]]], {{.*}} : memref<32x32xf16>, vector<4x1xf16>
// CHECK: vector.insert_strided_slice %[[RD00]], %{{.*}} {offsets = [0, 0, 0, 0, 0, 0], strides = [1, 1]} : vector<4x1xf16> into vector<1x2x1x1x4x1xf16>
// CHECK: %[[Y_PLUS_BATCH:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]]
// CHECK: vector.transfer_read %arg0[%[[X]], %[[Y_PLUS_BATCH]]], %{{.*}} {in_bounds = [true, true]} : memref<32x32xf16>, vector<4x1xf16>
// CHECK: vector.insert_strided_slice {{.*}} {offsets = [0, 1, 0, 0, 0, 0]
// CHECK: iree_vector_ext.to_simd %{{.*}} : vector<1x2x1x1x4x1xf16> -> vector<16x16xf16>

// -----

#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>
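// 16x16 factors per dimension as subgroup * batch * outer * thread * element:
//   dim 0: 1 * 2 * 1 * 8 * 1 = 16, dim 1: 1 * 2 * 1 * 1 * 8 = 16,
// so each of the 8 threads owns a 2x2 batch grid of contiguous 1x8 row slices.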

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)>

func.func @distribute_transfer_read_row_major_with_nontrivial_index(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>}
    : memref<32x32x32x32xf16>, vector<16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
  func.return %rootl : vector<16x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}
// CHECK-LABEL: @distribute_transfer_read_row_major_with_nontrivial_index
// CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[OFF0:.+]] = affine.apply #[[$MAP]]()[%[[I0]], %[[IDX]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF0]], %[[I1]]]
// CHECK: %[[OFF1:.+]] = affine.apply #[[$MAP1]]()[%[[I1]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF0]], %[[OFF1]]]
// CHECK: %[[OFF2:.+]] = affine.apply #[[$MAP2]]()[%[[I0]], %[[IDX]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF2]], %[[I1]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF2]], %[[OFF1]]]

// -----

#layout_col_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [1, 2],
  outer_tile = [1, 1],
  thread_tile = [4, 8],
  element_tile = [4, 1],

  subgroup_strides = [1, 1],
  thread_strides = [8, 1]
>

func.func @distribute_transfer_read_col_major_with_broadcast(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (0, 0)>}
    : memref<32x32x32x32xf16>, vector<16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16>
  func.return %rootl : vector<16x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (0, 0)>

// CHECK-LABEL: @distribute_transfer_read_col_major_with_broadcast
// CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index

// CHECK: %[[BROADCAST_READ:.+]] = vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[I1]]], %{{.*}} permutation_map = #[[$MAP]]
// CHECK: vector.insert_strided_slice %[[BROADCAST_READ]], %{{.*}} {offsets = [0, 0, 0, 0, 0, 0]
// CHECK: vector.insert_strided_slice %[[BROADCAST_READ]], %{{.*}} {offsets = [0, 1, 0, 0, 0, 0]

// -----

#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)>
// CHECK-DAG: #[[$PERM:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>

func.func @distribute_transfer_read_row_major_transpose(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>}
    : memref<32x32x32x32xf16>, vector<16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
  func.return %rootl : vector<16x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: @distribute_transfer_read_row_major_transpose
// CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LIN_ID0:.+]] = affine.apply #[[$MAP]]()[%[[I1]], %[[IDX]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]], {{.*}} permutation_map = #[[$PERM]]
// CHECK: %[[I0_PLUS_8:.+]] = affine.apply #[[$MAP1]]()[%[[I0]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0_PLUS_8]], %[[LIN_ID0]]], {{.*}} permutation_map = #[[$PERM]]
// CHECK: %[[LIN_ID1:.+]] = affine.apply #[[$MAP2]]()[%[[I1]], %[[IDX]]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID1]]], {{.*}} permutation_map = #[[$PERM]]
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0_PLUS_8]], %[[LIN_ID1]]], %cst_0 {in_bounds = [true, true], permutation_map = #[[$PERM]]} : memref<32x32x32x32xf16>, vector<1x8xf16>

// -----

#layout_col_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [1, 2],
  outer_tile = [1, 1],
  thread_tile = [4, 8],
  element_tile = [4, 1],

  subgroup_strides = [1, 1],
  thread_strides = [8, 1]
>

// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>

// CHECK-LABEL: @distribute_transfer_read_col_major_transpose
func.func @distribute_transfer_read_col_major_transpose(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<16x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>}
    : memref<32x32x32x32xf16>, vector<16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16>
  func.return %rootl : vector<16x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: vector.transfer_read {{.*}} permutation_map = #[[$MAP2]]
// CHECK: vector.transfer_read {{.*}} permutation_map = #[[$MAP2]]

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [7, 3, 1, 1],
  batch_tile = [3, 5, 2, 1],
  outer_tile = [1, 1, 2, 4],
  thread_tile = [1, 1, 2, 2],
  element_tile = [1, 1, 1, 2],

  subgroup_strides = [3, 1, 1, 1],
  thread_strides = [1, 1, 1, 2]
>
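// 21x15x8x16 factors per dimension as subgroup * batch * outer * thread * element:
//   dim 0: 7 * 3 * 1 * 1 * 1 = 21, dim 1: 3 * 5 * 1 * 1 * 1 = 15,
//   dim 2: 1 * 2 * 2 * 2 * 1 = 8,  dim 3: 1 * 1 * 4 * 2 * 2 = 16.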

func.func @distribute_transfer_read_row_major_with_permutations(%a: index, %b: index, %arg0: memref<32x32x32x32xf16>) -> vector<21x15x8x16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true, true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d0, d3, 0, d1)>}
    : memref<32x32x32x32xf16>, vector<21x15x8x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<21x15x8x16xf16>
  func.return %rootl : vector<21x15x8x16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}
// CHECK-LABEL: @distribute_transfer_read_row_major_with_permutations

// Verify that there are (batch0: 3) * (batch1: 5) * (outer3: 4) = 60 unique
// transfer_read ops. Dimension 2 is broadcast (the permutation map sends it
// to 0), so the (batch2: 2) * (outer2: 2) = 4 copies along it are duplicate
// reads that CSE folds away.
// CHECK-COUNT-60: vector.transfer_read

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [1],
  batch_tile = [1],
  outer_tile = [1],
  thread_tile = [4],
  element_tile = [4],

  subgroup_strides = [1],
  thread_strides = [16]
>
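// 16 factors as 1 (subgroup) * 1 (batch) * 1 (outer) * 4 (threads) * 4
// (elements); the 4 owning threads are selected via thread stride 16, and
// each reads one contiguous vector<4xf16>.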

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>

// CHECK-LABEL: @distribute_transfer_read_broadcast
func.func @distribute_transfer_read_broadcast(%arg0: memref<32x32xf16>) -> vector<16xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0], %cst
    {in_bounds = [true]} : memref<32x32xf16>, vector<16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16xf16>
  func.return %rootl : vector<16xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0, %[[LANEY]]], {{.*}} : memref<32x32xf16>, vector<4xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2],
  batch_tile = [1],
  outer_tile = [1],
  thread_tile = [16],
  element_tile = [4],

  subgroup_strides = [1],
  thread_strides = [1]
>
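// 128 factors as 2 (subgroups) * 1 * 1 * 16 (threads) * 4 (elements), so each
// thread reads one contiguous vector<4xf16> and the two subgroups cover 64
// elements each.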

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 4 + (s0 floordiv 64) * 64 - ((s0 floordiv 64) floordiv 2) * 128 - (s0 floordiv 16) * 64)>

// CHECK-LABEL: @distribute_transfer_read_broadcast2
func.func @distribute_transfer_read_broadcast2(%arg0: memref<32x128xf16>) -> vector<128xf16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0], %cst
    {in_bounds = [true]} : memref<32x128xf16>, vector<128xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<128xf16>
  func.return %rootl : vector<128xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0, %[[LANEY]]], {{.*}} : memref<32x128xf16>, vector<4xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [],
  batch_tile = [],
  outer_tile = [],
  thread_tile = [],
  element_tile = [],

  subgroup_strides = [],
  thread_strides = []
>
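// A rank-0 vector has no dimensions to distribute, so every tile list is
// empty and the read stays a scalar vector<f16> transfer on each thread.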

// CHECK-LABEL: @distribute_transfer_read_0d
func.func @distribute_transfer_read_0d(%arg0: memref<128xf16>) -> vector<f16> {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0], %cst
    {in_bounds = []} : memref<128xf16>, vector<f16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<f16>
  func.return %rootl : vector<f16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0]
// CHECK-SAME: memref<128xf16>, vector<f16>
// CHECK: iree_vector_ext.to_simd %[[RD]]

// -----

#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>

// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)>

// CHECK-LABEL: @distribute_transfer_write_row_major
func.func @distribute_transfer_write_row_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) {
  %c0 = arith.constant 0 : index
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
  vector.transfer_write %rootl, %alloc[%c0, %c0]
    {in_bounds = [true, true]}
    : vector<16x16xf16>, memref<64x64xf16>
  func.return
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LANEX:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK: %[[SLICE:.+]] = vector.extract %{{.*}}[0, 0, 0, 0] : vector<1x8xf16> from vector<2x2x1x1x1x8xf16>
// CHECK: vector.transfer_write %[[SLICE]], %{{.*}}[%[[LANEX]], %c0] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16>
// CHECK: vector.extract %{{.*}}[0, 1, 0, 0]
// CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[LANEX]], %c8]
// CHECK: %[[LANEX_PLUS_VECDIMX:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
// CHECK: vector.extract %{{.*}}[1, 0, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%[[LANEX_PLUS_VECDIMX]], %c0]
// CHECK: vector.extract %{{.*}}[1, 1, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%[[LANEX_PLUS_VECDIMX]], %c8]

// -----

#layout_col_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [1, 2],
  outer_tile = [1, 1],
  thread_tile = [4, 8],
  element_tile = [4, 1],

  subgroup_strides = [1, 1],
  thread_strides = [8, 1]
>

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 8) * 4 - ((s0 floordiv 8) floordiv 4) * 16)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> (s0 mod 8)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 mod 8 + 8)>

// CHECK-LABEL: @distribute_transfer_write_col_major
func.func @distribute_transfer_write_col_major(%root: vector<16x16xf16>, %alloc: memref<64x64xf16>) {
  %c0 = arith.constant 0 : index
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_col_major) : vector<16x16xf16>
  vector.transfer_write %rootl, %alloc[%c0, %c0]
    {in_bounds = [true, true]}
    : vector<16x16xf16>, memref<64x64xf16>
  func.return
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LANEY:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK: %[[LANEY2:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
// CHECK: vector.extract %{{.*}}[0, 0, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%[[LANEY]], %[[LANEY2]]]
// CHECK: %[[LANEX:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]]
// CHECK: vector.extract %{{.*}}[0, 1, 0, 0]
// CHECK: vector.transfer_write {{.*}}[%[[LANEY]], %[[LANEX]]]

// -----

#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 + 8)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0, s1] -> (s0 + s1 - (s1 floordiv 8) * 8 + 8)>

func.func @distribute_transfer_write_row_major_with_nontrivial_index(%root: vector<16x16xf16>, %a: index, %b: index, %alloc: memref<32x32x32x32xf16>) {
  %c0 = arith.constant 0 : index
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
  vector.transfer_write %rootl, %alloc[%c0, %c0, %a, %b]
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d3, d2)>}
    : vector<16x16xf16>, memref<32x32x32x32xf16>
  func.return
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: @distribute_transfer_write_row_major_with_nontrivial_index
// CHECK-SAME: vector<16x16xf16>, %[[I0:.+]]: index, %[[I1:.+]]: index

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK: %[[LIN_ID0:.+]] = affine.apply #[[$MAP]]()[%[[I1]], %[[IDX]]]
// CHECK: vector.extract %{{.*}}[0, 0, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP1]]
// CHECK: %[[LIN_ID1:.+]] = affine.apply #[[$MAP2]]()[%[[I0]]]
// CHECK: vector.extract %{{.*}}[0, 1, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID0]]] {{.*}} permutation_map = #[[$MAP1]]
// CHECK: %[[LIN_ID2:.+]] = affine.apply #[[$MAP3]]()[%[[I1]], %[[IDX]]]
// CHECK: vector.extract %{{.*}}[1, 0, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP1]]
// CHECK: vector.extract %{{.*}}[1, 1, 0, 0]
// CHECK: vector.transfer_write %{{.*}}[%c0, %c0, %[[LIN_ID1]], %[[LIN_ID2]]] {{.*}} permutation_map = #[[$MAP1]]

// -----

#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>

func.func @distribute_transfer_read_write(%a: index, %b: index,
                                          %arg0: memref<32x32x32x32xf16>,
                                          %arg1: memref<32x32x32x32xf16>) {
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16
  %root = vector.transfer_read %arg0[%c0, %c0, %a, %b], %cst
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>}
    : memref<32x32x32x32xf16>, vector<16x16xf16>

  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>

  vector.transfer_write %rootl, %arg1[%c0, %c0, %a, %b]
    {in_bounds = [true, true],
     permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d3)>}
    : vector<16x16xf16>, memref<32x32x32x32xf16>
  return
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[B00:.+]] = vector.transfer_read %{{.*}}[%c0, %c0, %[[LANEX:[a-zA-Z0-9]+]], %[[OFFSET0:[a-zA-Z0-9]+]]]
// CHECK: %[[B10:.+]] = vector.transfer_read %{{.*}}[%c0, %c0, %[[LANEX]], %[[OFFSET1:[a-zA-Z0-9]+]]]
// CHECK: %[[B01:.+]] = vector.transfer_read %{{.*}}[%c0, %c0, %[[LANEX_PLUS_BATCH:[a-zA-Z0-9]+]], %[[OFFSET0]]]
// CHECK: %[[B11:.+]] = vector.transfer_read %{{.*}}[%c0, %c0, %[[LANEX_PLUS_BATCH]], %[[OFFSET1]]]
// CHECK: vector.transfer_write %[[B00]], %{{.*}}[%c0, %c0, %[[LANEX]], %[[OFFSET0]]]
// CHECK: vector.transfer_write %[[B10]], %{{.*}}[%c0, %c0, %[[LANEX]], %[[OFFSET1]]]
// CHECK: vector.transfer_write %[[B01]], %{{.*}}[%c0, %c0, %[[LANEX_PLUS_BATCH]], %[[OFFSET0]]]
// CHECK: vector.transfer_write %[[B11]], %{{.*}}[%c0, %c0, %[[LANEX_PLUS_BATCH]], %[[OFFSET1]]]

// -----

// A: shape = 128x8, layout = #layout_a
#layout_a = #iree_vector_ext.nested_layout<
  subgroup_tile = [4, 1],
  batch_tile = [1, 1],
  outer_tile = [1, 1],
  thread_tile = [32, 2],
  element_tile = [1, 4],

  subgroup_strides = [2, 1],
  thread_strides = [1, 32]
>

// B: shape = 8x64, layout = #layout_b
#layout_b = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 2],
  batch_tile = [1, 1],
  outer_tile = [1, 1],
  thread_tile = [2, 32],
  element_tile = [4, 1],

  subgroup_strides = [1, 1],
  thread_strides = [32, 1]
>

// C: shape = 128x64, layout = #layout_c
#layout_c = #iree_vector_ext.nested_layout<
  subgroup_tile = [4, 2],
  batch_tile = [1, 1],
  outer_tile = [4, 1],
  thread_tile = [2, 32],
  element_tile = [4, 1],

  subgroup_strides = [2, 1],
  thread_strides = [32, 1]
>
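// Shape factorizations (subgroup * batch * outer * thread * element):
//   A 128x8:  dim 0: 4 * 1 * 1 * 32 * 1 = 128, dim 1: 1 * 1 * 1 * 2 * 4 = 8.
//   B 8x64:   dim 0: 1 * 1 * 1 * 2 * 4 = 8,    dim 1: 2 * 1 * 1 * 32 * 1 = 64.
//   C 128x64: dim 0: 4 * 1 * 4 * 2 * 4 = 128,  dim 1: 2 * 1 * 1 * 32 * 1 = 64.
// C's outer_tile of 4 on dim 0 is what unrolls M into ACCM0..ACCM3 below.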

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 - (s0 floordiv 32) * 32)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 64) * 32 - ((s0 floordiv 64) floordiv 2) * 64 - (s0 floordiv 32) * 32)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)>
// CHECK-DAG: #[[$MAP4:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 8)>
// CHECK-DAG: #[[$MAP5:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 16)>
// CHECK-DAG: #[[$MAP6:.+]] = affine_map<()[s0] -> ((s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 4) * 128 + (s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8 + 24)>
// CHECK-LABEL: @mfma_64x128x8_read
func.func @mfma_64x128x8_read(%mem: memref<128x8xf16>,
                              %mem1: memref<8x64xf16>,
                              %mem2: memref<128x64xf16>)
    -> (vector<128x8xf16>, vector<8x64xf16>, vector<128x64xf16>) {

  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16

  // CHECK: %[[IDX:.+]] = gpu.thread_id x

  // CHECK-DAG: %[[LHSM:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
  // The K offset is shared between LHS and RHS.
  // CHECK-DAG: %[[LHSK:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
  // The N offset is shared between ACC and RHS.
  // CHECK-DAG: %[[RHSN:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]]

  // M is unrolled 4 times.
  // CHECK-DAG: %[[ACCM0:.+]] = affine.apply #[[$MAP3]]()[%[[IDX]]]
  // CHECK-DAG: %[[ACCM1:.+]] = affine.apply #[[$MAP4]]()[%[[IDX]]]
  // CHECK-DAG: %[[ACCM2:.+]] = affine.apply #[[$MAP5]]()[%[[IDX]]]
  // CHECK-DAG: %[[ACCM3:.+]] = affine.apply #[[$MAP6]]()[%[[IDX]]]

  // M, K
  // CHECK-DAG: transfer_read %{{.*}}[%[[LHSM]], %[[LHSK]]]
  // K, N
  // CHECK-DAG: transfer_read %{{.*}}[%[[LHSK]], %[[RHSN]]]
  // M, N
  // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM0]], %[[RHSN]]]
  // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM1]], %[[RHSN]]]
  // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM2]], %[[RHSN]]]
  // CHECK-DAG: transfer_read %{{.*}}[%[[ACCM3]], %[[RHSN]]]

  %a = vector.transfer_read %mem[%c0, %c0], %cst
    {in_bounds = [true, true]}
    : memref<128x8xf16>, vector<128x8xf16>
  %b = vector.transfer_read %mem1[%c0, %c0], %cst
    {in_bounds = [true, true]}
    : memref<8x64xf16>, vector<8x64xf16>
  %c = vector.transfer_read %mem2[%c0, %c0], %cst
    {in_bounds = [true, true]}
    : memref<128x64xf16>, vector<128x64xf16>

  %A = iree_vector_ext.to_layout %a to layout(#layout_a) : vector<128x8xf16>
  %B = iree_vector_ext.to_layout %b to layout(#layout_b) : vector<8x64xf16>
  %C = iree_vector_ext.to_layout %c to layout(#layout_c) : vector<128x64xf16>

  return %A, %B, %C : vector<128x8xf16>, vector<8x64xf16>, vector<128x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 1],
  batch_tile = [1, 1],
  outer_tile = [1, 1],
  thread_tile = [32, 2],
  element_tile = [1, 4],

  subgroup_strides = [2, 1],
  thread_strides = [1, 32]
>

func.func @transposed_read_64x8(%mem: memref<8x64xf16>)
    -> (vector<64x8xf16>) {

  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.0 : f16

  %read = vector.transfer_read %mem[%c0, %c0], %cst
    {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d1, d0)>}
    : memref<8x64xf16>, vector<64x8xf16>
  %readl = iree_vector_ext.to_layout %read to layout(#layout) : vector<64x8xf16>

  return %readl : vector<64x8xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 + (s0 floordiv 128) * 32 - ((s0 floordiv 128) floordiv 2) * 64 - (s0 floordiv 32) * 32)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 32) * 4 - ((s0 floordiv 32) floordiv 2) * 8)>

// CHECK-LABEL: @transposed_read_64x8

// CHECK: %[[IDX:.+]] = gpu.thread_id x
// CHECK-DAG: %[[M:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK-DAG: %[[N:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
// CHECK: transfer_read %{{.*}}[%[[N]], %[[M]]

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2],
  batch_tile = [2, 4],
  outer_tile = [1, 1],
  thread_tile = [4, 16],
  element_tile = [4, 1],
  subgroup_strides = [2, 1],
  thread_strides = [16, 1]
>
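// 64x128 factors as 2 * 2 * 1 * 4 * 4 = 64 and 2 * 4 * 1 * 16 * 1 = 128, so
// the distributed result is vector<2x4x1x1x4x1xf16> and the broadcast source
// contributes a vector<4x1x1xf16> (batch 4, outer 1, element 1) per thread.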

func.func @broadcast(%src: vector<128xf16>) -> (vector<64x128xf16>) {
  %bcast = vector.broadcast %src
    : vector<128xf16> to vector<64x128xf16>
  %bcastl = iree_vector_ext.to_layout %bcast to layout(#layout) : vector<64x128xf16>
  return %bcastl : vector<64x128xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: vector.extract {{.*}}[0, 0] : vector<1xf16> from vector<4x1x1xf16>
// CHECK: vector.broadcast {{.*}} : vector<1xf16> to vector<4x1xf16>
// CHECK: vector.insert {{.*}} [0, 0, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.extract {{.*}}[1, 0] : vector<1xf16> from vector<4x1x1xf16>
// CHECK: vector.broadcast {{.*}} : vector<1xf16> to vector<4x1xf16>
// CHECK: vector.insert {{.*}} [0, 1, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.extract {{.*}}[2, 0] : vector<1xf16> from vector<4x1x1xf16>
// CHECK: vector.broadcast {{.*}} : vector<1xf16> to vector<4x1xf16>
// CHECK: vector.insert {{.*}} [0, 2, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.extract {{.*}}[3, 0] : vector<1xf16> from vector<4x1x1xf16>
// CHECK: vector.broadcast {{.*}} : vector<1xf16> to vector<4x1xf16>

// CHECK: vector.insert {{.*}} [1, 0, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.insert {{.*}} [1, 1, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.insert {{.*}} [1, 2, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>
// CHECK: vector.insert {{.*}} [1, 3, 0, 0] : vector<4x1xf16> into vector<2x4x1x1x4x1xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2, 2],
  batch_tile = [2, 2, 1],
  outer_tile = [2, 1, 1],
  thread_tile = [4, 16, 8],
  element_tile = [1, 4, 4],
  subgroup_strides = [4, 2, 1],
  thread_strides = [128, 8, 1]
>
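// 32x256x64 factors as 2*2*2*4*1 = 32, 2*2*1*16*4 = 256, and 2*1*1*8*4 = 64,
// giving the distributed shape vector<2x2x1x2x1x1x1x4x4xf16> (batches 2x2x1,
// outers 2x1x1, elements 1x4x4).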

func.func @broadcast(%src: vector<64xf16>) -> (vector<32x256x64xf16>) {
  %bcast = vector.broadcast %src
    : vector<64xf16> to vector<32x256x64xf16>
  %bcastl = iree_vector_ext.to_layout %bcast to layout(#layout) : vector<32x256x64xf16>
  return %bcastl : vector<32x256x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK: %[[EXTRACT:.+]] = vector.extract %{{.*}}[0, 0] : vector<4xf16> from vector<1x1x4xf16>
// CHECK: %[[BCAST:.+]] = vector.broadcast %[[EXTRACT]] : vector<4xf16> to vector<1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [0, 0, 0, 0, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [0, 0, 0, 1, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [0, 1, 0, 0, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [0, 1, 0, 1, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [1, 0, 0, 0, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [1, 0, 0, 1, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [1, 1, 0, 0, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}} [1, 1, 0, 1, 0, 0] : vector<1x4x4xf16> into vector<2x2x1x2x1x1x1x4x4xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2, 2],
  batch_tile = [2, 2, 1],
  outer_tile = [2, 1, 1],
  thread_tile = [4, 16, 8],
  element_tile = [1, 4, 4],
  subgroup_strides = [4, 2, 1],
  thread_strides = [128, 8, 1]
>

func.func @scalar_broadcast(%src: f16) -> (vector<32x256x64xf16>) {
  %bcast = vector.broadcast %src : f16 to vector<32x256x64xf16>
  %bcastl = iree_vector_ext.to_layout %bcast to layout(#layout) : vector<32x256x64xf16>
  return %bcastl : vector<32x256x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @scalar_broadcast
// CHECK-SAME: (%[[SRC:.*]]: f16)
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[SRC]] : f16 to vector<2x2x1x2x1x1x1x4x4xf16>
// CHECK: iree_vector_ext.to_simd %[[BCAST]] : vector<2x2x1x2x1x1x1x4x4xf16> -> vector<32x256x64xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2, 2],
  batch_tile = [2, 2, 1],
  outer_tile = [2, 1, 1],
  thread_tile = [4, 16, 8],
  element_tile = [1, 4, 4],
  subgroup_strides = [4, 2, 1],
  thread_strides = [128, 8, 1]
>

func.func @zero_rank_broadcast(%src: vector<f16>) -> (vector<32x256x64xf16>) {
  %bcast = vector.broadcast %src : vector<f16> to vector<32x256x64xf16>
  %bcastl = iree_vector_ext.to_layout %bcast to layout(#layout) : vector<32x256x64xf16>
  return %bcastl : vector<32x256x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @zero_rank_broadcast
// CHECK-SAME: (%[[SRC:.*]]: vector<f16>)
// CHECK: %[[SRC_SIMT:.*]] = iree_vector_ext.to_simt %[[SRC]] : vector<f16>
// CHECK: %[[EXTRACT:.*]] = vector.extract %[[SRC_SIMT]]
// CHECK: %[[BCAST:.*]] = vector.broadcast %[[EXTRACT]] : f16 to vector<1x4x4xf16>
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: vector.insert %[[BCAST]], %{{.*}}
// CHECK: %[[OUT:.*]] = vector.insert %[[BCAST]], %{{.*}}
// CHECK: iree_vector_ext.to_simd %[[OUT]] : vector<2x2x1x2x1x1x1x4x4xf16> -> vector<32x256x64xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2, 2],
  batch_tile = [2, 2, 1],
  outer_tile = [2, 1, 1],
  thread_tile = [4, 16, 8],
  element_tile = [1, 4, 4],
  subgroup_strides = [4, 2, 1],
  thread_strides = [128, 8, 1]
>

func.func @gather(%base: memref<32x256x64xf16>, %index_vec: vector<32x256x64xindex>) -> (vector<32x256x64xf16>) {
  %c0 = arith.constant 0 : index
  %mask = arith.constant dense<true> : vector<32x256x64xi1>
  %pass = arith.constant dense<0.000000e+00> : vector<32x256x64xf16>
  %0 = vector.gather %base[%c0, %c0, %c0] [%index_vec], %mask, %pass : memref<32x256x64xf16>, vector<32x256x64xindex>, vector<32x256x64xi1>, vector<32x256x64xf16> into vector<32x256x64xf16>
  %1 = iree_vector_ext.to_layout %0 to layout(#layout) : vector<32x256x64xf16>
  return %1 : vector<32x256x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @gather
// CHECK-SAME: (%[[SRC:.*]]: memref<32x256x64xf16>, %[[INDEX:.*]]: vector<32x256x64xindex>)
// CHECK: %[[DIST_INDEX:.*]] = iree_vector_ext.to_simt %[[INDEX]] : vector<32x256x64xindex> -> vector<2x2x1x2x1x1x1x4x4xindex>
// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]][%c0, %c0, %c0] [%[[DIST_INDEX]]]
// CHECK: iree_vector_ext.to_simd %[[GATHER]] : vector<2x2x1x2x1x1x1x4x4xf16> -> vector<32x256x64xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2],
  batch_tile = [2, 4],
  outer_tile = [2, 1],
  thread_tile = [4, 16],
  element_tile = [2, 2],
  subgroup_strides = [2, 1],
  thread_strides = [16, 1]
>
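// 64x256 factors as 2*2*2*4*2 = 64 and 2*4*1*16*2 = 256. Distribution only
// permutes the per-thread dims, so the transpose becomes a SIMT-level
// vector.transpose with permutation [1, 0, 3, 2, 5, 4].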

func.func @transpose(%src: vector<256x64xf16>) -> (vector<64x256xf16>) {
  %transp = vector.transpose %src, [1, 0]
    : vector<256x64xf16> to vector<64x256xf16>
  %transpl = iree_vector_ext.to_layout %transp to layout(#layout) : vector<64x256xf16>
  %sqrt = math.sqrt %transpl : vector<64x256xf16>
  return %sqrt : vector<64x256xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @transpose
// CHECK: iree_vector_ext.to_simt %{{.*}} : vector<256x64xf16> -> vector<4x2x1x2x2x2xf16>
// CHECK: vector.transpose %{{.*}}, [1, 0, 3, 2, 5, 4] : vector<4x2x1x2x2x2xf16> to vector<2x4x2x1x2x2xf16>
// CHECK: math.sqrt %{{.*}} : vector<2x4x2x1x2x2xf16>
// CHECK: iree_vector_ext.to_simd %{{.*}} : vector<2x4x2x1x2x2xf16> -> vector<64x256xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 2],
  batch_tile = [2, 4],
  outer_tile = [2, 1],
  thread_tile = [4, 16],
  element_tile = [2, 2],
  subgroup_strides = [2, 1],
  thread_strides = [16, 1]
>

func.func @transpose(%src: vector<64x256xf16>) -> (vector<256x64xf16>) {
  %srcl = iree_vector_ext.to_layout %src to layout(#layout) : vector<64x256xf16>
  %transp = vector.transpose %srcl, [1, 0]
    : vector<64x256xf16> to vector<256x64xf16>
  %sqrt = math.sqrt %transp : vector<256x64xf16>
  return %sqrt : vector<256x64xf16>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @transpose
// CHECK: iree_vector_ext.to_simt %{{.*}} : vector<64x256xf16> -> vector<2x4x2x1x2x2xf16>
// CHECK: vector.transpose %{{.*}}, [1, 0, 3, 2, 5, 4] : vector<2x4x2x1x2x2xf16> to vector<4x2x1x2x2x2xf16>
// CHECK: math.sqrt %{{.*}} : vector<4x2x1x2x2x2xf16>
// CHECK: iree_vector_ext.to_simd %{{.*}} : vector<4x2x1x2x2x2xf16> -> vector<256x64xf16>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [2, 1, 1],
  batch_tile = [1, 2, 4],
  outer_tile = [1, 1, 1],
  thread_tile = [4, 8, 2],
  element_tile = [4, 1, 2],

  subgroup_strides = [1, 1, 1],
  thread_strides = [16, 2, 1]
>
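// 32x16x16 factors as 2*1*1*4*4 = 32, 1*2*1*8*1 = 16, and 1*4*1*2*2 = 16;
// each thread reads vector<4x1x2xf16> slices before the [1, 2, 0] transpose.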

func.func @transpose_3d(%arr: memref<32x32x32xf16>) -> () {
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 0.0 : f16
  %cst0_1 = arith.constant dense<0.0> : vector<16xf16>
  %root = vector.transfer_read %arr[%c0, %c0, %c0], %cst_0 {
    in_bounds = [true, true, true]
  } : memref<32x32x32xf16>, vector<32x16x16xf16>
  %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<32x16x16xf16>
  %t = vector.transpose %rootl, [1, 2, 0] : vector<32x16x16xf16> to vector<16x16x32xf16>
  vector.transfer_write %t, %arr[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<16x16x32xf16>, memref<32x32x32xf16>
  func.return
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0] -> ((s0 floordiv 64) * 16 - ((s0 floordiv 64) floordiv 2) * 32 + (s0 floordiv 16) * 4 - ((s0 floordiv 16) floordiv 4) * 16)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0] -> ((s0 floordiv 2) mod 8)>
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4)>
// CHECK-DAG: #[[$MAP3:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 4)>
// CHECK-DAG: #[[$MAP4:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 8)>
// CHECK-DAG: #[[$MAP5:.+]] = affine_map<()[s0] -> (s0 * 2 - (s0 floordiv 2) * 4 + 12)>
// CHECK-DAG: #[[$MAP6:.+]] = affine_map<()[s0] -> ((s0 floordiv 2) mod 8 + 8)>

// CHECK-LABEL: func @transpose_3d
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x

// CHECK-DAG: %[[DIM:.+]] = affine.apply #[[$MAP]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM1:.+]] = affine.apply #[[$MAP1]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM2:.+]] = affine.apply #[[$MAP2]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM3:.+]] = affine.apply #[[$MAP3]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM4:.+]] = affine.apply #[[$MAP4]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM5:.+]] = affine.apply #[[$MAP5]]()[%[[IDX]]]
// CHECK-DAG: %[[DIM6:.+]] = affine.apply #[[$MAP6]]()[%[[IDX]]]
// CHECK-DAG: %[[RD0:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM2]]], {{.*}} : memref<32x32x32xf16>, vector<4x1x2xf16>
// CHECK-DAG: %[[RD1:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM3]]]
// CHECK-DAG: %[[RD2:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM4]]]
// CHECK-DAG: %[[RD3:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM1]], %[[DIM5]]]
// CHECK-DAG: %[[RD4:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM2]]]
// CHECK-DAG: %[[RD5:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM3]]]
// CHECK-DAG: %[[RD6:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM4]]]
// CHECK-DAG: %[[RD7:.+]] = vector.transfer_read %arg0[%[[DIM]], %[[DIM6]], %[[DIM5]]]

// CHECK: vector.transpose %{{.*}}, [1, 2, 0, 4, 5, 3, 7, 8, 6] : vector<1x2x4x1x1x1x4x1x2xf16> to vector<2x4x1x1x1x1x1x2x4xf16>

// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM2]], %[[DIM]]] {{.*}} : vector<1x2x4xf16>, memref<32x32x32xf16>
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM3]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM4]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM1]], %[[DIM5]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM2]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM3]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM4]], %[[DIM]]]
// CHECK-DAG: vector.transfer_write %{{.*}}, %arg0[%[[DIM6]], %[[DIM5]], %[[DIM]]]

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [16, 4],
  element_tile = [1, 4],

  subgroup_strides = [1, 1],
  thread_strides = [1, 16]
>
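// 32x32 factors as 1*2*1*16*1 = 32 and 1*2*1*4*4 = 32, so each thread holds
// vector<2x2x1x1x1x4xf16>. With thread_strides = [1, 16], dim 0 reduces over
// 16 adjacent lanes (cluster size 16) and dim 1 over 4 lanes at stride 16.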

func.func @distribute_scf_for(%arr: memref<32x32xf16>, %a: vector<32x32xf16>) -> vector<f32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant dense<0.000000e+00> : vector<f32>
  %cst_0 = arith.constant 0.0 : f16
  %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %cst) -> (vector<f32>) {
    %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16>
    %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<32x32xf16>
    %b = arith.addf %rootl, %a : vector<32x32xf16>
    %c = arith.extf %b : vector<32x32xf16> to vector<32x32xf32>
    %init = vector.extract %arg0[] : f32 from vector<f32>
    %root_red = vector.multi_reduction<add>, %c, %init [0, 1] : vector<32x32xf32> to f32
    %d = vector.broadcast %root_red : f32 to vector<f32>
    scf.yield %d : vector<f32>
  }
  return %out : vector<f32>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @distribute_scf_for
// CHECK: %[[ROOT:.*]] = arith.constant dense<0.000000e+00> : vector<f32>
// CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<f32>)
// CHECK: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<32x32xf16> -> vector<2x2x1x1x1x4xf16>
// CHECK: %[[B:.*]] = arith.addf %{{.*}}, %[[A]]
// CHECK: %[[C:.*]] = arith.extf %[[B]]
// CHECK-NEXT: %[[D:.*]] = vector.extract %[[ARG0]][]
// Local reduction
// CHECK: vector.multi_reduction <add>, %[[C]], %{{.*}} [0, 1, 2, 3, 4, 5] : vector<2x2x1x1x1x4xf32> to f32
// Global reduction
// CHECK: gpu.subgroup_reduce add %{{.*}} cluster(size = 16) : (f32) -> f32
// CHECK-NEXT: gpu.subgroup_reduce add %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
// Accumulator reduction
// CHECK: vector.broadcast %[[D]] : f32 to vector<1xf32>
// CHECK: arith.addf %{{.*}}, %{{.*}} : vector<1xf32>

// -----

#contraction_accesses = [
  affine_map<(m, k) -> (m, k)>,
  affine_map<(m, k) -> (m, k)>,
  affine_map<(m, k) -> ()>
]

#contraction_trait = {
  indexing_maps = #contraction_accesses,
  iterator_types = ["reduction", "reduction"],
  kind = #vector.kind<maxnumf>
}

#nested = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [16, 4],
  element_tile = [1, 4],

  subgroup_strides = [1, 1],
  thread_strides = [1, 16]
>

func.func @contraction_32x32_alldims(%arg0: vector<32x32xf32>, %arg1: f32) -> f32 {
  %arg0l = iree_vector_ext.to_layout %arg0 to layout(#nested) : vector<32x32xf32>
  %0 = vector.contract #contraction_trait %arg0l, %arg0l, %arg1 : vector<32x32xf32>, vector<32x32xf32> into f32
  return %0 : f32
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @contraction_32x32_alldims
// Local contraction
// CHECK: vector.contract {{.*}} vector<2x2x1x1x1x4xf32>, vector<2x2x1x1x1x4xf32> into f32
// Global reduction
// CHECK: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 16) : (f32) -> f32
// CHECK-NEXT: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
// Accumulator reduction
// CHECK: arith.maxnumf %{{.*}}, %{{.*}} : vector<1xf32>

// -----

#contraction_accesses = [
  affine_map<(m, k) -> (m, k)>,
  affine_map<(m, k) -> (m, k)>,
  affine_map<(m, k) -> ()>
]

#contraction_trait = {
  indexing_maps = #contraction_accesses,
  iterator_types = ["reduction", "reduction"],
  kind = #vector.kind<maxnumf>
}

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [16, 4],
  element_tile = [1, 4],

  subgroup_strides = [1, 1],
  thread_strides = [1, 16]
>

func.func @distribute_scf_for_contraction(%arr: memref<32x32xf16>, %a: vector<32x32xf16>) -> vector<f32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c128 = arith.constant 128 : index
  %cst = arith.constant dense<0.000000e+00> : vector<f32>
  %cst_0 = arith.constant 0.0 : f16
  %out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %cst) -> (vector<f32>) {
    %root = vector.transfer_read %arr[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf16>, vector<32x32xf16>
    %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<32x32xf16>
    %b = arith.addf %rootl, %a : vector<32x32xf16>
    %c = arith.extf %b : vector<32x32xf16> to vector<32x32xf32>
    %init = vector.extract %arg0[] : f32 from vector<f32>
    %root_red = vector.contract #contraction_trait %c, %c, %init : vector<32x32xf32>, vector<32x32xf32> into f32
    %d = vector.broadcast %root_red : f32 to vector<f32>
    scf.yield %d : vector<f32>
  }
  return %out : vector<f32>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @distribute_scf_for_contraction
// CHECK: %[[ROOT:.*]] = arith.constant dense<0.000000e+00> : vector<f32>
// CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<f32>)
// CHECK: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<32x32xf16> -> vector<2x2x1x1x1x4xf16>
// CHECK: %[[B:.*]] = arith.addf %{{.*}}, %[[A]]
// CHECK: %[[C:.*]] = arith.extf %[[B]]
// CHECK-NEXT: %[[D:.*]] = vector.extract %[[ARG0]][]
// Local contraction
// CHECK: vector.contract {{.*}} vector<2x2x1x1x1x4xf32>, vector<2x2x1x1x1x4xf32> into f32
// Global reduction
// CHECK: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 16) : (f32) -> f32
// CHECK-NEXT: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
// Accumulator reduction
// CHECK: arith.maxnumf %{{.*}}, %{{.*}} : vector<1xf32>

// -----

#contraction_accesses = [
  affine_map<(m, k) -> (m, k)>,
  affine_map<(m, k) -> (k)>,
  affine_map<(m, k) -> (m)>
]

#contraction_trait = {
  indexing_maps = #contraction_accesses,
  iterator_types = ["parallel", "reduction"],
  kind = #vector.kind<maxnumf>
}

#layout_a = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [16, 4],
  element_tile = [1, 4],

  subgroup_strides = [1, 1],
  thread_strides = [1, 16]
>

#layout_b = #iree_vector_ext.nested_layout<
  subgroup_tile = [1],
  batch_tile = [2],
  outer_tile = [1],
  thread_tile = [4],
  element_tile = [4],

  subgroup_strides = [1],
  thread_strides = [16]
>

#layout_c = #iree_vector_ext.nested_layout<
  subgroup_tile = [1],
  batch_tile = [2],
  outer_tile = [1],
  thread_tile = [16],
  element_tile = [1],

  subgroup_strides = [1],
  thread_strides = [1]
>
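// #layout_b matches dim 1 of #layout_a (4 threads at stride 16, 4 elements
// each) and #layout_c matches dim 0 (16 lanes at stride 1), so the parallel m
// dimension stays thread-local and only the k-partial sums need a
// gpu.subgroup_reduce across the 4 lanes at stride 16.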

func.func @contraction_dim1(%a: vector<32x32xf32>, %b: vector<32xf32>, %init: vector<32xf32>) -> vector<32xf32> {
  %al = iree_vector_ext.to_layout %a to layout(#layout_a) : vector<32x32xf32>
  %bl = iree_vector_ext.to_layout %b to layout(#layout_b) : vector<32xf32>
  %output = vector.contract #contraction_trait %al, %bl, %init : vector<32x32xf32>, vector<32xf32>, vector<32xf32> into vector<32xf32>
  %0 = iree_vector_ext.to_layout %output to layout(#layout_c) : vector<32xf32>
  return %0 : vector<32xf32>
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @contraction_dim1
// Local contraction
// CHECK: vector.contract {{.*}} vector<2x2x1x1x1x4xf32>, vector<2x1x4xf32> into vector<2x1x1xf32>
// Global reduction
// CHECK: vector.extract %{{.*}}[0, 0, 0]
// CHECK-NEXT: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
// CHECK: vector.extract %{{.*}}[1, 0, 0]
// CHECK-NEXT: gpu.subgroup_reduce maxnumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
// Accumulator reduction
// CHECK: arith.maxnumf %{{.*}}, %{{.*}} : vector<2x1x1xf32>

// -----

#layout = #iree_vector_ext.nested_layout<
  subgroup_tile = [1],
  batch_tile = [1],
  outer_tile = [1],
  thread_tile = [32],
  element_tile = [2],

  subgroup_strides = [1],
  thread_strides = [1]
>
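// 64 factors as 1 * 1 * 1 * 32 (threads) * 2 (elements), so each lane reduces
// a vector<1x1x2xf32> locally before the cross-lane subgroup_reduce.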

func.func @zero_d_vector_extract(%vec : vector<64xf32>, %acc : vector<f32>) -> f32 {
  %layouted = iree_vector_ext.to_layout %vec to layout(#layout) : vector<64xf32>
  %scalar = vector.extract %acc[] : f32 from vector<f32>
  %out = vector.multi_reduction <add>, %layouted, %scalar [0] : vector<64xf32> to f32
  return %out : f32
}

builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}

// CHECK-LABEL: func @zero_d_vector_extract
// CHECK-SAME: %[[VEC:.+]]: vector<64xf32>, %[[ACC:.+]]: vector<f32>
// CHECK-DAG: %[[SIMT_ACC:.+]] = iree_vector_ext.to_simt %[[ACC]] : vector<f32> -> vector<f32>
// CHECK-DAG: %[[SCALAR:.+]] = vector.extract %[[SIMT_ACC]][] : f32 from vector<f32>
// CHECK-DAG: %[[SIMT:.+]] = iree_vector_ext.to_simt %[[VEC]] : vector<64xf32> -> vector<1x1x2xf32>
// CHECK: %[[LOCAL:.+]] = vector.multi_reduction <add>, %[[SIMT]], %{{.*}}
// CHECK: gpu.subgroup_reduce add %[[LOCAL]]
// Accumulator addition
// CHECK: %[[BROADCASTED:.+]] = vector.broadcast %[[SCALAR]] : f32 to vector<1xf32>
// CHECK: arith.addf %{{.*}}, %[[BROADCASTED]]