Switch all x86 and RISC-V matmul codegen to use DoubleTilingExpert. (#8539)

The RISC-V targets were previously treated as ARM configurations. This PR
makes them go through the sandbox-based approach instead.

The commit also moves quantized matmul to use DoubleTilingExpert. It
previously used the old pipeline because of long compilation times: many
instructions were generated during lowering. This is addressed in https://github.com/llvm/llvm-project/commit/1538bd518cd236f4321695e9c5f0dd24601db366

It is a step toward https://github.com/google/iree/issues/8431
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 7dcf9d3..3ca6445 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -370,37 +370,6 @@
       DispatchLoweringPassPipeline::CPUDoubleTilingExpert);
 }
 
-static LogicalResult setX86TileFuseAndVectorizeRootConfig(
-    FuncOp entryPointFn, linalg::ContractionOpInterface op,
-    ArrayRef<int64_t> flowTileSizes, int vectorSize) {
-  // Hardcoded tile sizes, where v is the native vector size.
-  // L1 tile sizes are {1, 1, ..., 8, 2v, 2v}.
-  // Vector tile sizes are {1, ..., 1, v, v}
-  SmallVector<int64_t> l1TileSizes, vectorTileSizes;
-  int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
-  l1TileSizes.append(nLoops - 3, 1);
-  l1TileSizes.push_back(
-      getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, vectorSize));
-  l1TileSizes.push_back(
-      getMaxTileSize(0, flowTileSizes[nLoops - 2], 2 * vectorSize, vectorSize));
-  vectorTileSizes.append(nLoops - 2, 1);
-  vectorTileSizes.push_back(vectorSize);
-
-  // L1/vector tile size for k dimensions.
-  auto lhsShapedType = op.lhs().getType().cast<ShapedType>();
-  int64_t K = lhsShapedType.getShape().back();
-  l1TileSizes.push_back(getMaxTileSize(0, K, 2 * vectorSize, vectorSize));
-  vectorTileSizes.push_back(vectorSize);
-  TileSizesListType tileSizes;
-  tileSizes.emplace_back(flowTileSizes.begin(), flowTileSizes.end());
-  tileSizes.push_back(l1TileSizes);
-  tileSizes.push_back(vectorTileSizes);
-
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPointFn, op, tileSizes,
-      DispatchLoweringPassPipeline::CPUTileFuseAndVectorize);
-}
-
 static LogicalResult setARMRootConfig(FuncOp entryPointFn,
                                       linalg::ContractionOpInterface op,
                                       ArrayRef<int64_t> flowTileSizes,
@@ -461,23 +430,15 @@
           contractionOp.getOperation()),
       minTileSizes, maxTileSizes);
 
-  if (isX86(entryPointFn)) {
+  // TODO(dcaballe): Find better configurations for RISC-V backends.
+  if (isX86(entryPointFn) || isRISCV(entryPointFn)) {
     // There is a tileInterchange option. If it needs to be configured, we can
     // only apply the pipeline to linalg.matmul. Because we don't know the
     // number of loops when adding the pass to pass manager.
     // TODO(hanchung): Embed options into attributes, so we can control options
     // more heuristically.
-    Type lhsElemType = getElementTypeOrSelf(contractionOp.lhs().getType());
-    Type rhsElemType = getElementTypeOrSelf(contractionOp.rhs().getType());
-    Type resElemType =
-        getElementTypeOrSelf(contractionOp->getResult(0).getType());
-    if (lhsElemType == rhsElemType && rhsElemType == resElemType) {
-      return setX86SandboxRootConfig(entryPointFn, contractionOp, flowTileSizes,
-                                     vectorSize);
-    } else {
-      return setX86TileFuseAndVectorizeRootConfig(entryPointFn, contractionOp,
-                                                  flowTileSizes, vectorSize);
-    }
+    return setX86SandboxRootConfig(entryPointFn, contractionOp, flowTileSizes,
+                                   vectorSize);
   }
 
   // Fall back to ARM configurations.
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index f2150cf..dfa935a 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -823,6 +823,54 @@
 
 // -----
 
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_riscv  {
+  hal.executable.variant public @embedded_elf_riscv_32, target = #hal.executable.target<
+    "llvm",
+    "embedded-elf-riscv_32", {
+      cpu_features = "+m,+f",
+      data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
+      native_vector_size = 0 : index,
+      target_triple = "riscv32-unknown-unknown-eabi-elf"
+    }> {
+    hal.executable.entry_point public @matmul_riscv layout(#executable_layout)
+    builtin.module {
+      func @matmul_riscv() {
+        %cst = arith.constant 0.0 : f32
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:384x512xf32>
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:512x128xf32>
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:384x128xf32>
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<384x512xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
+        %init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
+        %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+        %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
+            outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
+        flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
+            : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
+        return
+      }
+    }
+  }
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.entry_point public @matmul_riscv
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK: linalg.matmul
+// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
 #executable_layout = #hal.executable.layout<push_constants = 4, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -924,8 +972,8 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 8, 8], [1, 4, 4]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUTileFuseAndVectorize>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.entry_point public @matmul_i8_i8_i32
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK:   linalg.matmul
diff --git a/iree/compiler/Codegen/Utils/Utils.cpp b/iree/compiler/Codegen/Utils/Utils.cpp
index dd4ff29..0dec12c 100644
--- a/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/iree/compiler/Codegen/Utils/Utils.cpp
@@ -69,6 +69,11 @@
   return triple && triple.getValue().isX86();
 }
 
+bool isRISCV(IREE::HAL::ExecutableVariantOp variantOp) {
+  Optional<llvm::Triple> triple = getTargetTriple(variantOp);
+  return triple && triple.getValue().isRISCV();
+}
+
 //===----------------------------------------------------------------------===//
 // Utility functions to set configurations
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Codegen/Utils/Utils.h b/iree/compiler/Codegen/Utils/Utils.h
index 6b2489d..5110269 100644
--- a/iree/compiler/Codegen/Utils/Utils.h
+++ b/iree/compiler/Codegen/Utils/Utils.h
@@ -41,6 +41,12 @@
       entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
   return isX86(variantOp);
 }
+bool isRISCV(IREE::HAL::ExecutableVariantOp variantOp);
+inline bool isRISCV(FuncOp entryPointFn) {
+  auto variantOp =
+      entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+  return isRISCV(variantOp);
+}
 inline bool isVMVXBackend(IREE::HAL::ExecutableVariantOp variantOp) {
   return variantOp.target().getBackend().getValue() == "vmvx";
 }