Revert "Data tiling: transpose narrow-N into narrow-M" (#17503)
Reverts iree-org/iree#17446
Reason: postsubmit failures on arm64,
https://github.com/iree-org/iree/pull/17446#issuecomment-2135891362
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp
index c1948b9..8597360 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp
@@ -498,8 +498,20 @@
if (enumeratedTileMxNxK.empty()) {
return failure();
}
+ // Check if the encoding specifies static narrow sizes for the M/N dimensions.
+ // This can be used to choose a correspondingly narrow tile shape.
+ // With microkernels, we keep this logic in sync with the set of actual
+ // optimized microkernel tile functions to avoid a tile shape specialization
+ // causing a fallback to a slow generic tile function. At the moment,
+ // microkernel tile functions are only specialized for narrow M, not for narrow
+ // N. Accordingly, we leave matmulNarrowN as 0 (default) when microkernels are
+ // used. Generally it would be best to deal with narrow-N cases by transposing
+ // the whole matmul and swapping LHS<->RHS, reducing the narrow-N case to
+ // narrow-M.
int64_t matmulNarrowM = getIntOrZero(encoding.getMatmulNarrow_M());
- int64_t matmulNarrowN = getIntOrZero(encoding.getMatmulNarrow_N());
+ int64_t matmulNarrowN = hasUkernel(targetAttr, "mmt4d")
+ ? 0
+ : getIntOrZero(encoding.getMatmulNarrow_N());
// Choose a final matmul TileMxNxK from the above-enumerated tile shapes,
// taking narrow dimensions into account.
TileMxNxK chosenTileMxNxK =
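The hunk above restores the pre-#17446 behavior: when an mmt4d microkernel is available, the narrow-N hint from the encoding is dropped so tile selection never specializes on N. A minimal standalone C++ sketch of that guard (illustrative names, not the IREE API):

#include <cstdint>

// Mirrors the restored logic: hasUkernel(targetAttr, "mmt4d") ? 0 : narrowN.
// With a microkernel present, narrow-N is forced to 0 (ignored) so that tile
// selection cannot pick an N-specialized tile shape that would fall back to a
// slow generic microkernel tile function; narrow-M passes through unchanged.
static int64_t effectiveMatmulNarrowN(bool hasMmt4dUkernel,
                                      int64_t encodedNarrowN) {
  return hasMmt4dUkernel ? 0 : encodedNarrowN;
}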
diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir
index 64a87b0..eaa1661 100644
--- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir
@@ -257,45 +257,6 @@
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {
- hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}>
-} {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32>
- %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16x1xf32>
- %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16x1xf32>
- %padded = tensor.pad %0 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<16x16xf32> to tensor<?x?xf32>
- %3 = iree_encoding.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<16x16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<16x1xf32> to tensor<?x?xf32>
- %4 = iree_encoding.set_encoding %padded_0 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<16x1xf32> to tensor<?x?xf32>
- %5 = iree_encoding.set_encoding %padded_1 : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %6 = linalg.matmul ins(%3, %4 : tensor<?x?xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<16x16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor<?x?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %7 = iree_encoding.unset_encoding %6 : tensor<?x?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16x1xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xf32>
- %extracted_slice = tensor.extract_slice %7[0, 0] [16, 1] [1, 1] : tensor<?x?xf32> to tensor<16x1xf32>
- %8 = hal.tensor.export %extracted_slice "output0" : tensor<16x1xf32> -> !hal.buffer_view
- func.return %8 : !hal.buffer_view
-}
-// CHECK-LABEL: func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(
-// CHECK: %[[MMT4D:.+]] = linalg.mmt4d
-// CHECK-SAME: ins({{.*}} : tensor<1x16x1x1xf32>, tensor<2x16x8x1xf32>)
-// CHECK-SAME: outs({{.*}} : tensor<1x2x1x8xf32>) -> tensor<1x2x1x8xf32>
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @matmul_lowering_f32f32f32_aarch64() attributes {
hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz", ukernels = "all"}>
} {
@@ -359,45 +320,6 @@
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @matvec_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {
- hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}>
-} {
- %c0 = arith.constant 0 : index
- %cst = arith.constant 0.000000e+00 : f32
- %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32>
- %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16xf32>
- %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16xf32>
- %padded = tensor.pad %0 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %cst : f32
- } : tensor<16x16xf32> to tensor<?x?xf32>
- %3 = iree_encoding.set_encoding %padded : tensor<?x?xf32> -> tensor<?x?xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<16x16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>
- %padded_0 = tensor.pad %1 low[0] high[%c0] {
- ^bb0(%arg3: index):
- tensor.yield %cst : f32
- } : tensor<16xf32> to tensor<?xf32>
- %4 = iree_encoding.set_encoding %padded_0 : tensor<?xf32> -> tensor<?xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>
- %padded_1 = tensor.pad %2 low[0] high[%c0] {
- ^bb0(%arg3: index):
- tensor.yield %cst : f32
- } : tensor<16xf32> to tensor<?xf32>
- %5 = iree_encoding.set_encoding %padded_1 : tensor<?xf32> -> tensor<?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>
- %6 = linalg.matvec ins(%3, %4 : tensor<?x?xf32, #iree_encoding.encoding<role = LHS, element_types = [f32, f32, f32], original_type = tensor<16x16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>, tensor<?xf32, #iree_encoding.encoding<role = RHS, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) outs(%5 : tensor<?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) -> tensor<?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>
- %7 = iree_encoding.unset_encoding %6 : tensor<?xf32, #iree_encoding.encoding<role = RESULT, element_types = [f32, f32, f32], original_type = tensor<16xf32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> -> tensor<?xf32>
- %extracted_slice = tensor.extract_slice %7[0] [16] [1] : tensor<?xf32> to tensor<16xf32>
- %8 = hal.tensor.export %extracted_slice "output0" : tensor<16xf32> -> !hal.buffer_view
- func.return %8 : !hal.buffer_view
-}
-// CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64(
-// CHECK: %[[MMT4D:.+]] = linalg.mmt4d
-// CHECK-SAME: ins({{.*}} : tensor<1x16x1x1xf32>, tensor<2x16x8x1xf32>)
-// CHECK-SAME: outs({{.*}} : tensor<1x2x1x8xf32>) -> tensor<1x2x1x8xf32>
-
-// -----
-
-#map = affine_map<(d0, d1, d2) -> (d0, d2)>
-#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
-#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
func.func @matvec_lowering_f32f32f32_aarch64() attributes {
hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}>
} {
@@ -434,18 +356,18 @@
// CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1)
// CHECK-SAME: !flow.dispatch.tensor<readonly:tensor<1x16x1x1xf32>>
// CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2)
-// CHECK-SAME: !flow.dispatch.tensor<readwrite:tensor<1x2x1x8xf32>>
+// CHECK-SAME: !flow.dispatch.tensor<readwrite:tensor<2x1x8x1xf32>>
// CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]]
// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 1], strides = [1, 1, 1, 1]
// CHECK: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]]
// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 16, 1, 1], strides = [1, 1, 1, 1]
// CHECK: %[[OUTS:.+]] = flow.dispatch.tensor.load %[[OUTS_BINDING]]
-// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 2, 1, 8], strides = [1, 1, 1, 1]
+// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 1, 8, 1], strides = [1, 1, 1, 1]
// CHECK: %[[MMT4D:.+]] = linalg.mmt4d
-// CHECK-SAME: ins(%[[RHS]], %[[LHS]] :
+// CHECK-SAME: ins(%[[LHS]], %[[RHS]] :
// CHECK-SAME: outs(%[[OUTS]] :
// CHECK: flow.dispatch.tensor.store %[[MMT4D]], %[[OUTS_BINDING]]
-// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 2, 1, 8], strides = [1, 1, 1, 1]
+// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 1, 8, 1], strides = [1, 1, 1, 1]
// -----
@@ -2246,10 +2168,10 @@
// CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32
// CHECK: %[[INIT_FILL:.+]] = tensor.empty() : tensor<688x16xi32>
// CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0, 1], [2, 3]] output_shape [1, 64, 1, 2] : tensor<64x2xi32> into tensor<1x64x1x2xi32>
-// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 688, 1, 16] : tensor<688x16xi32> into tensor<1x688x1x16xi32>
-// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x688x1x16xi32>) -> tensor<1x688x1x16xi32>
-// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[EXPAND_RHS]], %[[LHS_EXT]] : tensor<1x64x1x2xi32>, tensor<688x64x16x2xi32>) outs(%[[FILL]] : tensor<1x688x1x16xi32>) -> tensor<1x688x1x16xi32>
-// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x688x1x16xi32> into tensor<688x16xi32>
+// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [688, 1, 16, 1] : tensor<688x16xi32> into tensor<688x1x16x1xi32>
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<688x1x16x1xi32>) -> tensor<688x1x16x1xi32>
+// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<688x64x16x2xi32>, tensor<1x64x1x2xi32>) outs(%[[FILL]] : tensor<688x1x16x1xi32>) -> tensor<688x1x16x1xi32>
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<688x1x16x1xi32> into tensor<688x16xi32>
// CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<11008xi32>
// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<688x16xi32> -> tensor<11008xi32>
// CHECK: return %[[UNPACK]]
@@ -2320,10 +2242,10 @@
// CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32
// CHECK: %[[INIT_FILL:.+]] = tensor.empty() : tensor<1x16xi32>
// CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0, 1], [2, 3]] output_shape [1, 64, 1, 2] : tensor<64x2xi32> into tensor<1x64x1x2xi32>
-// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 1, 1, 16] : tensor<1x16xi32> into tensor<1x1x1x16xi32>
-// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x1x1x16xi32>) -> tensor<1x1x1x16xi32>
-// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[EXPAND_RHS]], %[[LHS_EXT]] : tensor<1x64x1x2xi32>, tensor<1x64x16x2xi32>) outs(%[[FILL]] : tensor<1x1x1x16xi32>) -> tensor<1x1x1x16xi32>
-// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x1x1x16xi32> into tensor<1x16xi32>
+// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 1, 16, 1] : tensor<1x16xi32> into tensor<1x1x16x1xi32>
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x1x16x1xi32>) -> tensor<1x1x16x1xi32>
+// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<1x64x16x2xi32>, tensor<1x64x1x2xi32>) outs(%[[FILL]] : tensor<1x1x16x1xi32>) -> tensor<1x1x16x1xi32>
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x1x16x1xi32> into tensor<1x16xi32>
// CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<15xi32>
// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<1x16xi32> -> tensor<15xi32>
// CHECK: return %[[UNPACK]]
@@ -2404,38 +2326,77 @@
// -----
-func.func @batch_matvec(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+func.func @batch_matvec(%arg0: tensor<32x11008x128xi8>, %arg1: tensor<32x128xi8>) -> tensor<32x11008xi32> attributes {
hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}>
} {
+ %c32 = arith.constant 32 : index
%c0 = arith.constant 0 : index
- %c0_i32 = arith.constant 0 : i32
+ %c1 = arith.constant 1 : index
+ %c128 = arith.constant 128 : index
+ %c11008 = arith.constant 11008 : index
%c0_i8 = arith.constant 0 : i8
- %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x11008x128xi8>
- %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32x128xi8>
- %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<32x11008xi32>
- %padded = tensor.pad %0 low[0, 0, 0] high[%c0, %c0, %c0] {
- ^bb0(%arg3: index, %arg4: index, %arg5: index):
+ %c0_i32 = arith.constant 0 : i32
+ %padded = tensor.pad %arg0 low[0, 0, 0] high[%c0, %c0, %c0] {
+ ^bb0(%arg2: index, %arg3: index, %arg4: index):
tensor.yield %c0_i8 : i8
} : tensor<32x11008x128xi8> to tensor<?x?x?xi8>
- %3 = iree_encoding.set_encoding %padded : tensor<?x?x?xi8> -> tensor<?x?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<32x11008x128xi8>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
+ %4 = iree_encoding.set_encoding %padded : tensor<?x?x?xi8> -> tensor<?x?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %5 = tensor.empty(%c32, %c11008, %c128) : tensor<?x?x?xi32, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<?x?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor<?x?x?xi32, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>) {
+ ^bb0(%in: i8, %out: i32):
+ %17 = arith.extsi %in : i8 to i32
+ linalg.yield %17 : i32
+ } -> tensor<?x?x?xi32, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %padded_0 = tensor.pad %arg1 low[0, 0] high[%c0, %c0] {
+ ^bb0(%arg2: index, %arg3: index):
tensor.yield %c0_i8 : i8
} : tensor<32x128xi8> to tensor<?x?xi8>
- %4 = iree_encoding.set_encoding %padded_0 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<32x128xi8>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] {
- ^bb0(%arg3: index, %arg4: index):
- tensor.yield %c0_i32 : i32
- } : tensor<32x11008xi32> to tensor<?x?xi32>
- %5 = iree_encoding.set_encoding %padded_1 : tensor<?x?xi32> -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<32x11008xi32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %6 = linalg.batch_matvec ins(%3, %4 : tensor<?x?x?xi8, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], original_type = tensor<32x11008x128xi8>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], original_type = tensor<32x128xi8>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<32x11008xi32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<32x11008xi32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>
- %7 = iree_encoding.unset_encoding %6 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], original_type = tensor<32x11008xi32>, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor<?x?xi32>
- %extracted_slice = tensor.extract_slice %7[0, 0] [32, 11008] [1, 1] : tensor<?x?xi32> to tensor<32x11008xi32>
- %8 = hal.tensor.export %extracted_slice "output0" : tensor<32x11008xi32> -> !hal.buffer_view
- func.return %8 : !hal.buffer_view
+ %7 = iree_encoding.set_encoding %padded_0 : tensor<?x?xi8> -> tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %8 = tensor.empty(%c32, %c128) : tensor<?x?xi32, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor<?x?xi8, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor<?x?xi32, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>) {
+ ^bb0(%in: i8, %out: i32):
+ %17 = arith.extsi %in : i8 to i32
+ linalg.yield %17 : i32
+ } -> tensor<?x?xi32, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>
+ %10 = tensor.empty(%c32, %c11008) : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>>
+ %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>>
+ %12 = linalg.batch_matvec ins(%6, %9 : tensor<?x?x?xi32, #iree_encoding.encoding<role = LHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>, tensor<?x?xi32, #iree_encoding.encoding<role = RHS, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x128xi8>, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>>
+ %13 = iree_encoding.unset_encoding %12 : tensor<?x?xi32, #iree_encoding.encoding<role = RESULT, element_types = [i8, i8, i32], matmul_narrow_N = 1 : index, original_type = tensor<32x11008xi32>, user_indexing_maps = [#map, #map1, #map2]>> -> tensor<?x?xi32>
+ %extracted_slice = tensor.extract_slice %13[0, 0] [32, 11008] [1, 1] : tensor<?x?xi32> to tensor<32x11008xi32>
+ return %extracted_slice : tensor<32x11008xi32>
}
+// CHECK-DAG: #[[$MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-LABEL: func.func @batch_matvec(
+// CHECK-SAME: %[[LHS:.+]]: tensor<32x11008x128xi8>, %[[RHS:.+]]: tensor<32x128xi8>) -> tensor<32x11008xi32>
+// CHECK-DAG: %[[C0_I32:.+]] = arith.constant 0 : i32
+// CHECK-DAG: %[[INIT_LHS_PACK:.+]] = tensor.empty() : tensor<32x688x64x16x2xi8>
+// CHECK-DAG: %[[LHS_PACK:.+]] = tensor.pack %[[LHS]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 2] into %[[INIT_LHS_PACK]] : tensor<32x11008x128xi8> -> tensor<32x688x64x16x2xi8>
+// CHECK-DAG: %[[INIT_LHS_EXT:.+]] = tensor.empty() : tensor<32x688x64x16x2xi32>
+// CHECK: %[[LHS_EXT:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%[[LHS_PACK]] : tensor<32x688x64x16x2xi8>) outs(%[[INIT_LHS_EXT]] : tensor<32x688x64x16x2xi32>) {
+// CHECK-NEXT: ^bb0(%[[LHS_EXT_ARG_IN:.+]]: i8, %[[LHS_EXT_ARG_OUT:.+]]: i32):
+// CHECK-NEXT: %[[LHS_EXT_OP:.+]] = arith.extsi %[[LHS_EXT_ARG_IN]] : i8 to i32
+// CHECK-NEXT: linalg.yield %[[LHS_EXT_OP]] : i32
+// CHECK-DAG: %[[INIT_RHS_PACK:.+]] = tensor.empty() : tensor<32x64x2xi8>
+// CHECK-DAG: %[[RHS_PACK:.+]] = tensor.pack %[[RHS]] outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [2] into %[[INIT_RHS_PACK]] : tensor<32x128xi8> -> tensor<32x64x2xi8>
+// CHECK-DAG: %[[INIT_RHS_EXT:.+]] = tensor.empty() : tensor<32x64x2xi32>
+// CHECK: %[[RHS_EXT:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[RHS_PACK]] : tensor<32x64x2xi8>) outs(%[[INIT_RHS_EXT]] : tensor<32x64x2xi32>) {
+// CHECK-NEXT: ^bb0(%[[RHS_EXT_ARG_IN:.+]]: i8, %[[RHS_EXT_ARG_OUT:.+]]: i32):
+// CHECK-NEXT: %[[RHS_EXT_OP:.+]] = arith.extsi %[[RHS_EXT_ARG_IN]] : i8 to i32
+// CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32
+// CHECK: %[[INIT_FILL:.+]] = tensor.empty() : tensor<32x688x16xi32>
+// CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0], [1, 2], [3, 4]] output_shape [32, 1, 64, 1, 2] : tensor<32x64x2xi32> into tensor<32x1x64x1x2xi32>
+// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0], [1, 2], [3, 4]] output_shape [32, 688, 1, 16, 1] : tensor<32x688x16xi32> into tensor<32x688x1x16x1xi32>
+// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<32x688x1x16x1xi32>) -> tensor<32x688x1x16x1xi32>
+// CHECK: %[[MMT4D:.+]] = linalg.batch_mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<32x688x64x16x2xi32>, tensor<32x1x64x1x2xi32>) outs(%[[FILL]] : tensor<32x688x1x16x1xi32>) -> tensor<32x688x1x16x1xi32>
+// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0], [1, 2], [3, 4]] : tensor<32x688x1x16x1xi32> into tensor<32x688x16xi32>
+// CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<32x11008xi32>
+// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<32x688x16xi32> -> tensor<32x11008xi32>
+// CHECK: return %[[UNPACK]]
// -----
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
index 8387431..6cea37a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp
@@ -9,8 +9,6 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
-#include <numeric>
-
namespace mlir::iree_compiler {
using IREE::Encoding::EncodingAttr;
@@ -18,98 +16,23 @@
using IREE::Encoding::getEncodingAttr;
using IREE::Encoding::getEncodingContractionDims;
-// If tensorType has the encoding of a matmul RESULT with narrow N, returns
-// the transposed type. Otherwise, just returns tensorType.
-static RankedTensorType transposeIfNarrowNResult(RankedTensorType tensorType) {
- auto encoding =
- llvm::dyn_cast_or_null<EncodingAttr>(tensorType.getEncoding());
- if (!encoding) {
- return tensorType;
- }
- if (!isNarrowNResult(encoding)) {
- return tensorType;
- }
- auto newRole = encoding.getRole().getValue();
- TypeAttr originalTypeAttr = encoding.getOriginalType();
- RankedTensorType originalType = tensorType;
- if (originalTypeAttr) {
- originalType =
- llvm::dyn_cast<RankedTensorType>(originalTypeAttr.getValue());
- }
- SmallVector<int64_t> newOriginalShape(originalType.getShape());
- auto userIndexingMaps = encoding.getUserIndexingMaps();
- SmallVector<AffineMap> maps;
- for (auto a : userIndexingMaps) {
- maps.push_back(cast<AffineMapAttr>(a).getAffineMap());
- }
- auto cDims = linalg::inferContractionDims(maps);
- SmallVector<int64_t> newShape(tensorType.getShape());
- SmallVector<int64_t> permIndices(maps[0].getNumDims());
- std::iota(std::begin(permIndices), std::end(permIndices), 0);
- // Matrix case: there are both M and N dimensions. Transposing means swapping
- // them.
- if (cDims->m.size() == 1 && cDims->n.size() == 1) {
- int m = cDims->m[0];
- int n = cDims->n[0];
- std::swap(permIndices[m], permIndices[n]);
- int mDim = encoding.mapDimToRoleIndex(m);
- int nDim = encoding.mapDimToRoleIndex(n);
- std::swap(newShape[mDim], newShape[nDim]);
- std::swap(newOriginalShape[mDim], newOriginalShape[nDim]);
- }
- // Vector case: there is no N dimension to swap the M dimension with. We
- // swap the maps themselves.
- if (cDims->n.empty()) {
- std::swap(maps[0], maps[1]);
- }
-
- // auto newRoundDimsTo = encoding.getRoundDimsToArray();
- SmallVector<int64_t> newRoundDimsTo(encoding.getRoundDimsToArray());
- assert(newRoundDimsTo.size() == 0 || newRoundDimsTo.size() == 3);
- if (newRoundDimsTo.size() != 0)
- std::swap(newRoundDimsTo[0], newRoundDimsTo[1]);
-
- auto context = tensorType.getContext();
- AffineMap permutation = AffineMap::getPermutationMap(permIndices, context);
- for (auto &map : maps) {
- map = map.compose(permutation);
- }
- SmallVector<Attribute> newMaps;
- for (auto map : maps) {
- newMaps.push_back(AffineMapAttr::get(map));
- }
- ArrayAttr newIndexingMaps = ArrayAttr::get(context, newMaps);
- auto elemType = tensorType.getElementType();
- OpBuilder builder(context);
-
- auto newEncoding = IREE::Encoding::EncodingAttr::get(
- context, IREE::Encoding::EncodingRoleAttr::get(context, newRole),
- encoding.getElementTypes(),
- TypeAttr::get(RankedTensorType::get(newOriginalShape, elemType)),
- encoding.getMatmulNarrow_N(), encoding.getMatmulNarrow_M(),
- newIndexingMaps, DenseI64ArrayAttr::get(context, newRoundDimsTo));
- return RankedTensorType::get(newShape, elemType, newEncoding);
-}
-
/// For a given tensor type with an encoding, return the materialized
/// type to use for it. If no encoding is set, then return the tensor type
/// itself.
static RankedTensorType
getMaterializedType(RankedTensorType tensorType,
MaterializeEncodingFn materializeEncodingFn) {
- RankedTensorType maybeTransposedTensorType =
- transposeIfNarrowNResult(tensorType);
FailureOr<MaterializeEncodingInfo> materializeEncodingInfo =
- materializeEncodingFn(maybeTransposedTensorType);
+ materializeEncodingFn(tensorType);
if (failed(materializeEncodingInfo)) {
return dropEncoding(tensorType);
}
- return cast<RankedTensorType>(tensor::PackOp::inferPackedType(
- getOriginalTypeWithEncoding(maybeTransposedTensorType)
- .clone(tensorType.getElementType()),
- materializeEncodingInfo->innerTileSizes,
- materializeEncodingInfo->innerDimsPos,
- materializeEncodingInfo->outerDimsPerm));
+ return cast<RankedTensorType>(
+ tensor::PackOp::inferPackedType(getOriginalTypeWithEncoding(tensorType)
+ .clone(tensorType.getElementType()),
+ materializeEncodingInfo->innerTileSizes,
+ materializeEncodingInfo->innerDimsPos,
+ materializeEncodingInfo->outerDimsPerm));
}
MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter(
@@ -119,9 +42,10 @@
addConversion([](IndexType indexType) { return indexType; });
addConversion([](FloatType floatType) { return floatType; });
addConversion([](MemRefType memrefType) { return memrefType; });
- addConversion([=](RankedTensorType t) -> RankedTensorType {
- return getMaterializedType(t, materializeEncodingFn);
- });
+ addConversion(
+ [materializeEncodingFn](RankedTensorType t) -> RankedTensorType {
+ return getMaterializedType(t, materializeEncodingFn);
+ });
}
MaterializeEncodingConversionTarget::MaterializeEncodingConversionTarget(
@@ -203,13 +127,4 @@
return encodingInfo;
}
-bool isNarrowNResult(EncodingAttr encoding) {
- if (encoding.getRole().getValue() != EncodingRole::RESULT) {
- return false;
- }
- IntegerAttr narrowM = encoding.getMatmulNarrow_M();
- IntegerAttr narrowN = encoding.getMatmulNarrow_N();
- return narrowN && (!narrowM || narrowM.getInt() > narrowN.getInt());
-}
-
} // namespace mlir::iree_compiler
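The `isNarrowNResult` helper deleted above reduces to a small predicate. Restated standalone for reference (an illustrative sketch with the attribute plumbing elided, not the IREE signature):

#include <cstdint>
#include <optional>

// An encoding is a narrow-N RESULT iff its role is RESULT, matmul_narrow_N is
// set, and matmul_narrow_M is either unset or strictly larger than narrow_N.
static bool isNarrowNResultSketch(bool isResultRole,
                                  std::optional<int64_t> narrowM,
                                  std::optional<int64_t> narrowN) {
  if (!isResultRole)
    return false;
  return narrowN && (!narrowM || *narrowM > *narrowN);
}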
diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
index 42b4438..cf9e9a6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h
@@ -99,10 +99,6 @@
void populateMaterializeUpperBoundTileSizePatterns(
RewritePatternSet &patterns, MaterializeEncodingFn materializeEncodingFn);
-// Returns true if `encoding` represents a narrow-N matmul RESULT, e.g. the
-// result of a matvec.
-bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding);
-
} // namespace mlir::iree_compiler
#endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
index 689e88c..b6443d3 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/SmallVectorExtras.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -142,12 +141,11 @@
/// the canonical mmt4d input shape. If the input element type is unsigned,
/// create a producer Linalg::GenericOp on the input that unsigned extends the
/// input to the output element type. This extension is required to keep the
-/// unsignedness information on the input for ukernels. If `transpose` is true,
-/// the `linalgOp`'s indexing maps are transposed.
-static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp,
- bool transpose, RewriterBase &rewriter,
- SmallVectorImpl<ReassociationIndices> &ri,
- ArrayRef<Type> elemTypes, int operandIdx) {
+/// unsignedness information on the input for ukernels.
+Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp,
+ RewriterBase &rewriter,
+ SmallVectorImpl<ReassociationIndices> &ri,
+ ArrayRef<Type> elemTypes, int operandIdx) {
assert(linalgOp.getNumDpsInputs() == 2);
assert(linalgOp.getNumDpsInits() == 1);
auto cDims = linalg::inferContractionDims(linalgOp);
@@ -160,7 +158,7 @@
auto type = cast<RankedTensorType>(value.getType());
RankedTensorType newType = getExpandedType(
type, /*isBatched=*/!cDims->batch.empty(),
- /*isTransposed=*/operandIdx == 2 && (transpose ^ cDims->n.empty()), ri);
+ /*isTransposed=*/operandIdx == 2 && cDims->n.empty(), ri);
expandedValue =
rewriter.create<tensor::ExpandShapeOp>(loc, newType, value, ri);
}
@@ -171,22 +169,6 @@
return expandedValue;
}
-static void transposeInPlace(MaterializeEncodingInfo &info) {
- // Vector cases: nothing to do.
- if (info.innerTileSizes.size() < 2) {
- return;
- }
- // Not a vector case, so all three arrays in `info` have size at least 2,
- // outerDimsPerm may have size 3 if there is a batch dimension, but in all
- // cases, the last 2 entries of each array are M and N, not batch.
- auto transpose = [](SmallVector<int64_t> &a) {
- std::swap(a[a.size() - 2], a[a.size() - 1]);
- };
- transpose(info.innerDimsPos);
- transpose(info.innerTileSizes);
- transpose(info.outerDimsPerm);
-}
-
//===---------------------------------------------------------------------===//
// Methods to convert `set_encoding` and `unset_encoding` operations
// to `pack` and `unpack` operations respectively.
@@ -218,18 +200,11 @@
MaterializeEncodingFn materializeEncodingFn,
MaterializeEncodingValueFn materializeEncodingValueFn) {
RankedTensorType resultType = encodingOp.getResultType();
- auto encoding = getEncodingAttr(resultType);
- if (!encoding) {
- return failure();
- }
FailureOr<MaterializeEncodingInfo> materializeEncodingInfo =
materializeEncodingFn(resultType);
if (failed(materializeEncodingInfo)) {
return rewriter.notifyMatchFailure(encodingOp, "unhandled result encoding");
}
- if (isNarrowNResult(encoding)) {
- transposeInPlace(*materializeEncodingInfo);
- }
// Create `tensor.empty` operation for the result of the pack operation.
Location loc = encodingOp.getLoc();
FailureOr<SmallVector<OpFoldResult>> innerTileSizesOfr =
@@ -239,6 +214,7 @@
return rewriter.notifyMatchFailure(
encodingOp, "failed to generate runtime tile size query");
}
+ auto encoding = getEncodingAttr(resultType);
if (!encoding) {
return failure();
}
@@ -275,9 +251,6 @@
if (failed(materializeEncodingInfo)) {
return rewriter.notifyMatchFailure(encodingOp, "unhandled source encoding");
}
- if (isNarrowNResult(getEncodingAttr(sourceType))) {
- transposeInPlace(*materializeEncodingInfo);
- }
// Create an `tensor.empty` for the result of the unpack operation.
Location loc = encodingOp.getLoc();
SmallVector<OpFoldResult> resultDims =
@@ -366,22 +339,22 @@
operands.take_front(inputs.size()),
operands.drop_front(inputs.size()));
} else {
- bool transpose = isNarrowNResult(resultEncoding);
auto elemTypes = llvm::map_to_vector(
lhsEncoding.getElementTypes().getValue(),
[](Attribute a) { return cast<TypeAttr>(a).getValue(); });
SmallVector<ReassociationIndices> ri;
- Value newLhs = getMmt4dOperand(operands[0], linalgOp, transpose, rewriter,
- ri, elemTypes, /*operandIdx=*/0);
- Value newRhs = getMmt4dOperand(operands[1], linalgOp, transpose, rewriter,
- ri, elemTypes, /*operandIdx=*/1);
+ Value newLhs =
+ getMmt4dOperand(operands[0], linalgOp, rewriter, ri, elemTypes,
+ /*operandIdx=*/0);
+ Value newRhs =
+ getMmt4dOperand(operands[1], linalgOp, rewriter, ri, elemTypes,
+ /*operandIdx=*/1);
Value newResult =
- getMmt4dOperand(operands[2], linalgOp, transpose, rewriter, ri,
- elemTypes, /*operandIdx=*/2);
- if (transpose) {
- std::swap(newLhs, newRhs);
- }
+ getMmt4dOperand(operands[2], linalgOp, rewriter, ri, elemTypes,
+ /*operandIdx=*/2);
+
Type newResultType = newResult.getType();
+
auto cDims = IREE::Encoding::getEncodingContractionDims(lhsEncoding);
if (cDims->batch.empty()) {
result = rewriter.create<linalg::Mmt4DOp>(
@@ -418,9 +391,7 @@
loc, emptyOp.getMixedSizes(), resultType.getElementType());
return newEmptyOp;
}
- if (isNarrowNResult(getEncodingAttr(emptyType))) {
- transposeInPlace(*materializeEncodingInfo);
- }
+
FailureOr<SmallVector<OpFoldResult>> innerTileSizesOfr =
getInnerTileSizesOfr(rewriter, loc, resultType, *materializeEncodingInfo,
materializeEncodingValueFn);
@@ -436,6 +407,7 @@
materializeEncodingInfo->outerDimsPerm);
Operation *newEmptyOp = rewriter.create<tensor::EmptyOp>(
loc, newShape, resultType.getElementType());
+
return newEmptyOp;
}
@@ -527,9 +499,6 @@
if (failed(encodingInfo)) {
return failure();
}
- if (isNarrowNResult(getEncodingAttr(boundTensorType))) {
- transposeInPlace(*encodingInfo);
- }
SmallVector<OpFoldResult> targetShape =
getMixedValues(originalTensorType.getShape(), dynamicDims, builder);
@@ -741,10 +710,10 @@
LogicalResult
matchAndRewrite(SetEncodingOp encodingOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto converter = static_cast<const MaterializeEncodingTypeConverter *>(
- getTypeConverter());
MaterializeEncodingFn materializeEncodingFn =
- converter->getMaterializeEncodingFn();
+ static_cast<const MaterializeEncodingTypeConverter *>(
+ getTypeConverter())
+ ->getMaterializeEncodingFn();
auto packOp = lowerSetEncodingOpToPackOp(
rewriter, encodingOp, adaptor.getSource(), materializeEncodingFn,
this->materializeEncodingValueFn);
@@ -773,10 +742,10 @@
LogicalResult
matchAndRewrite(UnsetEncodingOp encodingOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto converter = static_cast<const MaterializeEncodingTypeConverter *>(
- this->getTypeConverter());
MaterializeEncodingFn materializeEncodingFn =
- converter->getMaterializeEncodingFn();
+ static_cast<const MaterializeEncodingTypeConverter *>(
+ this->getTypeConverter())
+ ->getMaterializeEncodingFn();
auto unpackOp = lowerUnsetEncodingToUnpackOp(
rewriter, encodingOp, adaptor.getSource(), materializeEncodingFn,
this->materializeEncodingValueFn);
@@ -833,10 +802,10 @@
LogicalResult
matchAndRewrite(OpTy dpsOp, typename OpTy::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto converter = static_cast<const MaterializeEncodingTypeConverter *>(
- this->getTypeConverter());
MaterializeEncodingFn materializeEncodingFn =
- converter->getMaterializeEncodingFn();
+ static_cast<const MaterializeEncodingTypeConverter *>(
+ this->getTypeConverter())
+ ->getMaterializeEncodingFn();
FailureOr<Operation *> convertedOp = lowerOpWithEncoding(
rewriter, dpsOp, adaptor.getInputs(), adaptor.getOutputs(),
materializeEncodingFn, this->materializeEncodingValueFn);
@@ -856,10 +825,10 @@
LogicalResult
matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto converter = static_cast<const MaterializeEncodingTypeConverter *>(
- this->getTypeConverter());
MaterializeEncodingFn materializeEncodingFn =
- converter->getMaterializeEncodingFn();
+ static_cast<const MaterializeEncodingTypeConverter *>(
+ this->getTypeConverter())
+ ->getMaterializeEncodingFn();
FailureOr<Operation *> convertedOp = lowerOpWithEncoding(
rewriter, op, adaptor.getOperands(), materializeEncodingFn,
this->materializeEncodingValueFn);
@@ -899,10 +868,10 @@
matchAndRewrite(mlir::linalg::ContractionOpInterface op,
ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
- auto converter = static_cast<const MaterializeEncodingTypeConverter *>(
- this->getTypeConverter());
MaterializeEncodingFn materializeEncodingFn =
- converter->getMaterializeEncodingFn();
+ static_cast<const MaterializeEncodingTypeConverter *>(
+ this->getTypeConverter())
+ ->getMaterializeEncodingFn();
auto linalgOp = dyn_cast<linalg::LinalgOp>(op.getOperation());
if (!linalgOp || operands.size() != 3) {
return failure();
diff --git a/tests/e2e/linalg/BUILD.bazel b/tests/e2e/linalg/BUILD.bazel
index 791fd28..73645d2 100644
--- a/tests/e2e/linalg/BUILD.bazel
+++ b/tests/e2e/linalg/BUILD.bazel
@@ -23,7 +23,6 @@
[
"conv2d.mlir",
"fp_to_subbyte.mlir",
- "narrow_n_matmuls.mlir",
"subbyte_to_fp.mlir",
],
include = ["*.mlir"],
@@ -46,7 +45,6 @@
VMVX_SRCS = enforce_glob(
[
"conv2d.mlir",
- "narrow_n_matmuls.mlir",
],
include = ["*.mlir"],
exclude = [
@@ -67,7 +65,6 @@
[
"conv2d.mlir",
"subbyte_to_fp.mlir",
- "narrow_n_matmuls.mlir",
],
include = ["*.mlir"],
exclude = [
@@ -115,7 +112,6 @@
"subbyte_to_fp.mlir",
# currently only enabled on cuda as it can be slow on other backends.
"large_linalg_matmul.mlir",
- "narrow_n_matmuls.mlir",
],
include = ["*.mlir"],
exclude = [
diff --git a/tests/e2e/linalg/CMakeLists.txt b/tests/e2e/linalg/CMakeLists.txt
index 9794387..fdd9c04 100644
--- a/tests/e2e/linalg/CMakeLists.txt
+++ b/tests/e2e/linalg/CMakeLists.txt
@@ -16,7 +16,6 @@
SRCS
"conv2d.mlir"
"fp_to_subbyte.mlir"
- "narrow_n_matmuls.mlir"
"subbyte_to_fp.mlir"
TARGET_BACKEND
"llvm-cpu"
@@ -31,7 +30,6 @@
check_vmvx_local-task
SRCS
"conv2d.mlir"
- "narrow_n_matmuls.mlir"
TARGET_BACKEND
"vmvx"
DRIVER
@@ -43,7 +41,6 @@
check_vulkan-spirv_vulkan
SRCS
"conv2d.mlir"
- "narrow_n_matmuls.mlir"
"subbyte_to_fp.mlir"
TARGET_BACKEND
"vulkan-spirv"
@@ -84,7 +81,6 @@
"conv2d.mlir"
"fp_to_subbyte.mlir"
"large_linalg_matmul.mlir"
- "narrow_n_matmuls.mlir"
"subbyte_to_fp.mlir"
TARGET_BACKEND
"cuda"
diff --git a/tests/e2e/linalg/narrow_n_matmuls.mlir b/tests/e2e/linalg/narrow_n_matmuls.mlir
deleted file mode 100644
index 578d7f7..0000000
--- a/tests/e2e/linalg/narrow_n_matmuls.mlir
+++ /dev/null
@@ -1,126 +0,0 @@
-// Test various forms of matmuls with narrow N, in particular matvec/batch_matvec
-// (implicitly N=1) and matmuls with N=1 and N=2.
-//
-// The reason why this needs extensive e2e testing is the transposition of
-// narrow N to narrow M in data tiling (around CPUMaterializeEncodingPass).
-// It doesn't hurt to enable this case on all backends though.
-
-func.func @matvec() {
- %lhs = util.unfoldable_constant dense<[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ]> : tensor<3x4xi8>
- %rhs = util.unfoldable_constant dense<[-2, 3, 4, -1]> : tensor<4xi8>
- %acc = util.unfoldable_constant dense<[1, 2, 3]> : tensor<3xi32>
- %result = linalg.matvec ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4xi8>) outs(%acc : tensor<3xi32>) -> tensor<3xi32>
- check.expect_eq_const(%result, dense<
- [0, 7, -17]
- > : tensor<3xi32>) : tensor<3xi32>
- return
-}
-
-func.func @batch_matvec() {
- %lhs = util.unfoldable_constant dense<[[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ], [
- [-3, 1, 4, 2],
- [-1, 0, 6, -1],
- [1, -2, 3, -4]
- ]]> : tensor<2x3x4xi8>
- %rhs = util.unfoldable_constant dense<[
- [-2, 3, 4, -1],
- [1, 2, -5, 3]
- ]> : tensor<2x4xi8>
- %acc = util.unfoldable_constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32>
- %result = linalg.batch_matvec ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4xi8>) outs(%acc : tensor<2x3xi32>) -> tensor<2x3xi32>
- check.expect_eq_const(%result, dense<[
- [0, 7, -17],
- [-11, -29, -24]
- ]> : tensor<2x3xi32>) : tensor<2x3xi32>
- return
-}
-
-func.func @matmul_narrow_n_1() {
- %lhs = util.unfoldable_constant dense<[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ]> : tensor<3x4xi8>
- %rhs = util.unfoldable_constant dense<[[-2], [3], [4], [-1]]> : tensor<4x1xi8>
- %acc = util.unfoldable_constant dense<[[1], [2], [3]]> : tensor<3x1xi32>
- %result = linalg.matmul ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4x1xi8>) outs(%acc : tensor<3x1xi32>) -> tensor<3x1xi32>
- check.expect_eq_const(%result, dense<
- [[0], [7], [-17]]
- > : tensor<3x1xi32>) : tensor<3x1xi32>
- return
-}
-
-func.func @batch_matmul_narrow_n_1() {
- %lhs = util.unfoldable_constant dense<[[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ], [
- [-3, 1, 4, 2],
- [-1, 0, 6, -1],
- [1, -2, 3, -4]
- ]]> : tensor<2x3x4xi8>
- %rhs = util.unfoldable_constant dense<[
- [[-2], [3], [4], [-1]],
- [[1], [2], [-5], [3]]
- ]> : tensor<2x4x1xi8>
- %acc = util.unfoldable_constant dense<[
- [[1], [2], [3]],
- [[4], [5], [6]]
- ]> : tensor<2x3x1xi32>
- %result = linalg.batch_matmul ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4x1xi8>) outs(%acc : tensor<2x3x1xi32>) -> tensor<2x3x1xi32>
- check.expect_eq_const(%result, dense<[
- [[0], [7], [-17]],
- [[-11], [-29], [-24]]
- ]> : tensor<2x3x1xi32>) : tensor<2x3x1xi32>
- return
-}
-
-func.func @matmul_narrow_n_2() {
- %lhs = util.unfoldable_constant dense<[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ]> : tensor<3x4xi8>
- %rhs = util.unfoldable_constant dense<[[-2, 1], [3, -1], [4, 0], [-1, 2]]> : tensor<4x2xi8>
- %acc = util.unfoldable_constant dense<[[1, -1], [2, 0], [3, 1]]> : tensor<3x2xi32>
- %result = linalg.matmul ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4x2xi8>) outs(%acc : tensor<3x2xi32>) -> tensor<3x2xi32>
- check.expect_eq_const(%result, dense<
- [[0, 8], [7, -7], [-17, 0]]
- > : tensor<3x2xi32>) : tensor<3x2xi32>
- return
-}
-
-func.func @batch_matmul_narrow_n_2() {
- %lhs = util.unfoldable_constant dense<[[
- [1, 2, 0, 5],
- [3, 4, -1, -3],
- [5, 6, -7, 0]
- ], [
- [-3, 1, 4, 2],
- [-1, 0, 6, -1],
- [1, -2, 3, -4]
- ]]> : tensor<2x3x4xi8>
- %rhs = util.unfoldable_constant dense<[
- [[-2, 0], [3, 1], [4, -1], [-1, 2]],
- [[1, -2], [2, 3], [-5, -3], [3, 0]]
- ]> : tensor<2x4x2xi8>
- %acc = util.unfoldable_constant dense<[
- [[1, -1], [2, 0], [3, 1]],
- [[4, 2], [5, 1], [6, -1]]
- ]> : tensor<2x3x2xi32>
- %result = linalg.batch_matmul ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4x2xi8>) outs(%acc : tensor<2x3x2xi32>) -> tensor<2x3x2xi32>
- check.expect_eq_const(%result, dense<[
- [[0, 11], [7, -1], [-17, 14]],
- [[-11, -1], [-29, -15], [-24, -18]]
- ]> : tensor<2x3x2xi32>) : tensor<2x3x2xi32>
- return
-}
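The deleted tests hard-code their expected results, which can be re-derived by hand. For example, the @matvec case above checks out as follows (a standalone C++ sanity check, not part of the revert):

#include <cassert>
#include <cstdint>

int main() {
  const int8_t lhs[3][4] = {{1, 2, 0, 5}, {3, 4, -1, -3}, {5, 6, -7, 0}};
  const int8_t rhs[4] = {-2, 3, 4, -1};
  int32_t acc[3] = {1, 2, 3};  // linalg.matvec accumulates into the init tensor
  for (int i = 0; i < 3; ++i)
    for (int k = 0; k < 4; ++k)
      acc[i] += static_cast<int32_t>(lhs[i][k]) * rhs[k];
  // Matches check.expect_eq_const(%result, dense<[0, 7, -17]> : tensor<3xi32>).
  assert(acc[0] == 0 && acc[1] == 7 && acc[2] == -17);
  return 0;
}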