Switch all x86 and RISC-V matmul codegen to use DoubleTilingExpert. (#8539)

The RISC-V targets were previously treated as ARM configurations. This PR
makes them go through the sandbox-based approach instead.

The commit also moves quantized matmul to use DoubleTilingExpert. It
previously used the old pipeline because of long compilation times: many
instructions were generated during lowering. This is addressed in https://github.com/llvm/llvm-project/commit/1538bd518cd236f4321695e9c5f0dd24601db366

It is a step toward https://github.com/google/iree/issues/8431
diff --git a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
index 7dcf9d3..3ca6445 100644
--- a/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
+++ b/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -370,37 +370,6 @@
       DispatchLoweringPassPipeline::CPUDoubleTilingExpert);
 }
 
-static LogicalResult setX86TileFuseAndVectorizeRootConfig(
-    FuncOp entryPointFn, linalg::ContractionOpInterface op,
-    ArrayRef<int64_t> flowTileSizes, int vectorSize) {
-  // Hardcoded tile sizes, where v is the native vector size.
-  // L1 tile sizes are {1, 1, ..., 8, 2v, 2v}.
-  // Vector tile sizes are {1, ..., 1, v, v}
-  SmallVector<int64_t> l1TileSizes, vectorTileSizes;
-  int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
-  l1TileSizes.append(nLoops - 3, 1);
-  l1TileSizes.push_back(
-      getMaxTileSize(0, flowTileSizes[nLoops - 3], 8, vectorSize));
-  l1TileSizes.push_back(
-      getMaxTileSize(0, flowTileSizes[nLoops - 2], 2 * vectorSize, vectorSize));
-  vectorTileSizes.append(nLoops - 2, 1);
-  vectorTileSizes.push_back(vectorSize);
-
-  // L1/vector tile size for k dimensions.
-  auto lhsShapedType = op.lhs().getType().cast<ShapedType>();
-  int64_t K = lhsShapedType.getShape().back();
-  l1TileSizes.push_back(getMaxTileSize(0, K, 2 * vectorSize, vectorSize));
-  vectorTileSizes.push_back(vectorSize);
-  TileSizesListType tileSizes;
-  tileSizes.emplace_back(flowTileSizes.begin(), flowTileSizes.end());
-  tileSizes.push_back(l1TileSizes);
-  tileSizes.push_back(vectorTileSizes);
-
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPointFn, op, tileSizes,
-      DispatchLoweringPassPipeline::CPUTileFuseAndVectorize);
-}
-
 static LogicalResult setARMRootConfig(FuncOp entryPointFn,
                                       linalg::ContractionOpInterface op,
                                       ArrayRef<int64_t> flowTileSizes,
@@ -461,23 +430,15 @@
           contractionOp.getOperation()),
       minTileSizes, maxTileSizes);
 
-  if (isX86(entryPointFn)) {
+  // TODO(dcaballe): Find better configurations for RISC-V backends.
+  if (isX86(entryPointFn) || isRISCV(entryPointFn)) {
     // There is a tileInterchange option. If it needs to be configured, we can
     // only apply the pipeline to linalg.matmul. Because we don't know the
     // number of loops when adding the pass to pass manager.
     // TODO(hanchung): Embed options into attributes, so we can control options
     // more heuristically.
-    Type lhsElemType = getElementTypeOrSelf(contractionOp.lhs().getType());
-    Type rhsElemType = getElementTypeOrSelf(contractionOp.rhs().getType());
-    Type resElemType =
-        getElementTypeOrSelf(contractionOp->getResult(0).getType());
-    if (lhsElemType == rhsElemType && rhsElemType == resElemType) {
-      return setX86SandboxRootConfig(entryPointFn, contractionOp, flowTileSizes,
-                                     vectorSize);
-    } else {
-      return setX86TileFuseAndVectorizeRootConfig(entryPointFn, contractionOp,
-                                                  flowTileSizes, vectorSize);
-    }
+    return setX86SandboxRootConfig(entryPointFn, contractionOp, flowTileSizes,
+                                   vectorSize);
   }
 
   // Fall back to ARM configurations.
diff --git a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
index f2150cf..dfa935a 100644
--- a/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
+++ b/iree/compiler/Codegen/LLVMCPU/test/materialize_launch_configuration.mlir
@@ -823,6 +823,54 @@
 
 // -----
 
+#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>
+  ]>
+]>
+hal.executable private @matmul_riscv  {
+  hal.executable.variant public @embedded_elf_riscv_32, target = #hal.executable.target<
+    "llvm",
+    "embedded-elf-riscv_32", {
+      cpu_features = "+m,+f",
+      data_layout = "e-m:e-p:32:32-i64:64-n32-S128",
+      native_vector_size = 0 : index,
+      target_triple = "riscv32-unknown-unknown-eabi-elf"
+    }> {
+    hal.executable.entry_point public @matmul_riscv layout(#executable_layout)
+    builtin.module {
+      func @matmul_riscv() {
+        %cst = arith.constant 0.0 : f32
+        %lhs_binding = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:384x512xf32>
+        %rhs_binding = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:512x128xf32>
+        %result_binding = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:384x128xf32>
+        %lhs = flow.dispatch.tensor.load %lhs_binding, offsets = [0, 0], sizes = [384, 512], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<384x512xf32>
+        %rhs = flow.dispatch.tensor.load %rhs_binding, offsets = [0, 0], sizes = [512, 128], strides = [1, 1]
+            : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x128xf32>
+        %init = linalg.init_tensor [384, 128] : tensor<384x128xf32>
+        %fill = linalg.fill(%cst, %init) : f32, tensor<384x128xf32> -> tensor<384x128xf32>
+        %gemm = linalg.matmul ins(%lhs, %rhs : tensor<384x512xf32>, tensor<512x128xf32>)
+            outs(%fill : tensor<384x128xf32>) -> tensor<384x128xf32>
+        flow.dispatch.tensor.store %gemm, %result_binding, offsets = [0, 0], sizes = [384, 128], strides = [1, 1]
+            : tensor<384x128xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
+        return
+      }
+    }
+  }
+}
+
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
+//      CHECK: hal.executable.entry_point public @matmul_riscv
+// CHECK-SAME:     translation_info = #[[TRANSLATION]]
+//      CHECK: linalg.matmul
+// CHECK-SAME:     lowering_config = #[[CONFIG]]
+
+// -----
+
 #executable_layout = #hal.executable.layout<push_constants = 4, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -924,8 +972,8 @@
   }
 }
 
-//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 8, 8], [1, 4, 4]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUTileFuseAndVectorize>
+//  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, 32, 0], [0, 0, 16]{{\]}}>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
 //      CHECK: hal.executable.entry_point public @matmul_i8_i8_i32
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 //      CHECK:   linalg.matmul
diff --git a/iree/compiler/Codegen/Utils/Utils.cpp b/iree/compiler/Codegen/Utils/Utils.cpp
index dd4ff29..0dec12c 100644
--- a/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/iree/compiler/Codegen/Utils/Utils.cpp
@@ -69,6 +69,11 @@
   return triple && triple.getValue().isX86();
 }
 
+bool isRISCV(IREE::HAL::ExecutableVariantOp variantOp) {
+  Optional<llvm::Triple> triple = getTargetTriple(variantOp);
+  return triple && triple.getValue().isRISCV();
+}
+
 //===----------------------------------------------------------------------===//
 // Utility functions to set configurations
 //===----------------------------------------------------------------------===//
diff --git a/iree/compiler/Codegen/Utils/Utils.h b/iree/compiler/Codegen/Utils/Utils.h
index 6b2489d..5110269 100644
--- a/iree/compiler/Codegen/Utils/Utils.h
+++ b/iree/compiler/Codegen/Utils/Utils.h
@@ -41,6 +41,12 @@
       entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
   return isX86(variantOp);
 }
+bool isRISCV(IREE::HAL::ExecutableVariantOp variantOp);
+inline bool isRISCV(FuncOp entryPointFn) {
+  auto variantOp =
+      entryPointFn->getParentOfType<IREE::HAL::ExecutableVariantOp>();
+  return isRISCV(variantOp);
+}
 inline bool isVMVXBackend(IREE::HAL::ExecutableVariantOp variantOp) {
   return variantOp.target().getBackend().getValue() == "vmvx";
 }