[spirv] Add some tests for matvec pipeline configuration (#14746)

These tests were missing in #14714.
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
index 7654675..b69b185 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/BUILD.bazel
@@ -29,6 +29,7 @@
             "config_default_linalg_ext_ops.mlir",
             "config_default_linalg_ops.mlir",
             "config_default_matmul.mlir",
+            "config_default_matvec.mlir",
             "config_default_reduction.mlir",
             "config_default_sub_byte_types.mlir",
             "config_mali_conv.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
index d8b6981..2d496bb 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/CMakeLists.txt
@@ -25,6 +25,7 @@
     "config_default_linalg_ext_ops.mlir"
     "config_default_linalg_ops.mlir"
     "config_default_matmul.mlir"
+    "config_default_matvec.mlir"
     "config_default_reduction.mlir"
     "config_default_sub_byte_types.mlir"
     "config_mali_conv.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir
new file mode 100644
index 0000000..7f81fa8
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_default_matvec.mlir
@@ -0,0 +1,146 @@
+// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(iree-spirv-lower-executable-target-pass{test-lowering-configuration=true})))' %s | FileCheck %s
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
+
+hal.executable @i4_dequant_matvec {
+  hal.executable.variant @vulkan_spirv_fb, target = <"vulkan-spirv", "vulkan-spirv-fb", {
+      spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
+        max_compute_shared_memory_size = 32768,
+        max_compute_workgroup_invocations = 512,
+        max_compute_workgroup_size = [512, 512, 512],
+        subgroup_size = 64>>
+    }> {
+    hal.executable.export @i4_dequant_matvec layout(#pipeline_layout)
+    builtin.module {
+      func.func @i4_dequant_matvec() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %10 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>>
+        %11 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+        %12 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>>
+        %13 = hal.interface.binding.subspan set(0) binding(3) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<86x128xf32>>
+        %14 = hal.interface.binding.subspan set(0) binding(4) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+        %15 = flow.dispatch.tensor.load %10, offsets = [0, 0, 0], sizes = [4096, 86, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86x128xi4>> -> tensor<4096x86x128xi4>
+        %16 = flow.dispatch.tensor.load %11, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+        %17 = flow.dispatch.tensor.load %12, offsets = [0, 0], sizes = [4096, 86], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x86xf32>> -> tensor<4096x86xf32>
+        %18 = flow.dispatch.tensor.load %13, offsets = [0, 0], sizes = [86, 128], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<86x128xf32>> -> tensor<86x128xf32>
+        %19 = tensor.empty() : tensor<4096xf32>
+        %20 = tensor.empty() : tensor<4096x86x128xf32>
+        %21 = linalg.fill ins(%cst : f32) outs(%19 : tensor<4096xf32>) -> tensor<4096xf32>
+        %22 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15, %16, %17 : tensor<4096x86x128xi4>, tensor<4096x86xf32>, tensor<4096x86xf32>) outs(%20 : tensor<4096x86x128xf32>) {
+        ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+          %24 = arith.extui %in : i4 to i32
+          %25 = arith.uitofp %24 : i32 to f32
+          %26 = arith.subf %25, %in_1 : f32
+          %27 = arith.mulf %26, %in_0 : f32
+          linalg.yield %27 : f32
+        } -> tensor<4096x86x128xf32>
+        %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0)>], iterator_types = ["parallel", "reduction", "reduction"]} ins(%18, %22 : tensor<86x128xf32>, tensor<4096x86x128xf32>) outs(%21 : tensor<4096xf32>) {
+        ^bb0(%in: f32, %in_0: f32, %out: f32):
+          %24 = arith.mulf %in, %in_0 : f32
+          %25 = arith.addf %24, %out : f32
+          linalg.yield %25 : f32
+        } -> tensor<4096xf32>
+        flow.dispatch.tensor.store %23, %14, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<4096xf32>>
+        return
+      }
+    }
+  }
+}
+
+//   CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1], [0, 2, 128]{{\]}}>
+//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
+// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
+//  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
+//  CHECK-SAME:   workgroup_size = [64 : index, 1 : index, 1 : index]
+//       CHECK: func.func @i4_dequant_matvec()
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     lowering_config = #[[$CONFIG]]
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
+  #hal.descriptor_set.layout<0, bindings = [
+    #hal.descriptor_set.binding<0, storage_buffer>,
+    #hal.descriptor_set.binding<1, storage_buffer>,
+    #hal.descriptor_set.binding<2, storage_buffer>,
+    #hal.descriptor_set.binding<3, storage_buffer>,
+    #hal.descriptor_set.binding<4, storage_buffer>
+  ]>
+]>
+
+hal.executable @i4_dequant_matvec {
+  hal.executable.variant @vulkan_spirv_fb, target = <"vulkan-spirv", "vulkan-spirv-fb", {
+      spirv.target_env = #spirv.target_env<#spirv.vce<v1.4, [Shader, GroupNonUniform, GroupNonUniformShuffle], []>, Unknown:IntegratedGPU, #spirv.resource_limits<
+        max_compute_shared_memory_size = 32768,
+        max_compute_workgroup_invocations = 1024,
+        max_compute_workgroup_size = [1024, 1024, 1024],
+        subgroup_size = 64>>
+    }> {
+    hal.executable.export @i4_dequant_matvec layout(#pipeline_layout)
+    builtin.module {
+      func.func @i4_dequant_matvec() {
+        %c32_i64 = arith.constant 32 : i64
+        %cst = arith.constant 0.000000e+00 : f32
+        %c4294967296_i64 = arith.constant 4294967296 : i64
+        %22 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>>
+        %23 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
+        %24 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>>
+        %25 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>>
+        %26 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
+        %27 = flow.dispatch.tensor.load %22, offsets = [0, 0, 0], sizes = [4096, 32, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x128xi4>> -> tensor<4096x32x128xi4>
+        %28 = flow.dispatch.tensor.load %23, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
+        %29 = flow.dispatch.tensor.load %24, offsets = [0, 0, 0], sizes = [4096, 32, 1], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<4096x32x1xf32>> -> tensor<4096x32x1xf32>
+        %30 = flow.dispatch.tensor.load %25, offsets = [0, 0, 0, 0], sizes = [1, 1, 32, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x32x128xf32>> -> tensor<1x1x32x128xf32>
+        %31 = tensor.empty() : tensor<1x1x4096xf32>
+        %32 = tensor.empty() : tensor<4096x32x128xf32>
+        %33 = linalg.fill ins(%cst : f32) outs(%31 : tensor<1x1x4096xf32>) -> tensor<1x1x4096xf32>
+        %34 = linalg.generic {
+            indexing_maps = [
+              affine_map<(d0, d1, d2) -> (d0, d1, d2)>,
+              affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
+              affine_map<(d0, d1, d2) -> (d0, d1, 0)>,
+              affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+            iterator_types = ["parallel", "parallel", "parallel"]}
+        ins(%27, %28, %29 : tensor<4096x32x128xi4>, tensor<4096x32x1xf32>, tensor<4096x32x1xf32>) outs(%32 : tensor<4096x32x128xf32>) {
+        ^bb0(%in: i4, %in_0: f32, %in_1: f32, %out: f32):
+          %36 = arith.extui %in : i4 to i32
+          %37 = arith.uitofp %36 : i32 to f32
+          %38 = arith.subf %37, %in_1 : f32
+          %39 = arith.mulf %38, %in_0 : f32
+          linalg.yield %39 : f32
+        } -> tensor<4096x32x128xf32>
+        %35 = linalg.generic {
+            indexing_maps = [
+              affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3, d4)>,
+              affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d4)>,
+              affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2)>],
+            iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]}
+        ins(%30, %34 : tensor<1x1x32x128xf32>, tensor<4096x32x128xf32>) outs(%33 : tensor<1x1x4096xf32>) {
+        ^bb0(%in: f32, %in_0: f32, %out: f32):
+          %36 = arith.mulf %in, %in_0 : f32
+          %37 = arith.addf %36, %out : f32
+          linalg.yield %37 : f32
+        } -> tensor<1x1x4096xf32>
+        flow.dispatch.tensor.store %35, %26, offsets = [0, 0, 0], sizes = [1, 1, 4096], strides = [1, 1, 1] : tensor<1x1x4096xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x1x4096xf32>>
+        return
+      }
+    }
+  }
+}
+
+//   CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 1], [0, 0, 0, 32, 128]{{\]}}>
+//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVSubgroupReduce>
+// CHECK-LABEL: hal.executable.export public @i4_dequant_matvec
+//  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
+//  CHECK-SAME:   workgroup_size = [1024 : index, 1 : index, 1 : index]
+//       CHECK: func.func @i4_dequant_matvec()
+//       CHECK:   linalg.generic
+//  CHECK-SAME:     lowering_config = #[[$CONFIG]]