[Codegen] Add a configuration attribute dict to translation info (#16224)

The attribute as it is today is rather unwieldy, especially when trying
to add new fields to translation info. In general, translation info
needs to provide all of the configuration details for the particular
pass pipeline it specifies. This change moves the software pipelining
configuration attributes into a configuration dictionary, so the
codegen dialect no longer has to care about the details of any
particular pass pipeline.
diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
index 1166e64..038d3fa 100644
--- a/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeUserConfigs.cpp
@@ -89,10 +89,9 @@
               : clCodegenTransformDialectStrategyName;
       clTranslationInfo = IREE::Codegen::TranslationInfoAttr::get(
           context, tdPipeline,
-          /*softwarePipelineDepth=*/0,
-          /*softwarePipelineStoreStage=*/1,
           /*codegenSpec=*/
-          SymbolRefAttr::get(context, llvm::StringRef(strategyName)));
+          SymbolRefAttr::get(context, llvm::StringRef(strategyName)),
+          /*configuration=*/DictionaryAttr());
       LDBG("--clTranslationInfo: " << clTranslationInfo);
     }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
index ec17eb3..fec7bf7 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.cpp
@@ -59,12 +59,10 @@
 
 TranslationInfoAttr TranslationInfoAttr::get(
     MLIRContext *context, DispatchLoweringPassPipeline passPipeline,
-    unsigned softwarePipelineDepth, unsigned softwarePipelineStoreStage,
-    SymbolRefAttr codegenSpec) {
+    SymbolRefAttr codegenSpec, DictionaryAttr configuration) {
   auto pipelineAttr =
       DispatchLoweringPassPipelineAttr::get(context, passPipeline);
-  return get(context, pipelineAttr, softwarePipelineDepth,
-             softwarePipelineStoreStage, codegenSpec);
+  return get(context, pipelineAttr, codegenSpec, configuration);
 }
 
 DispatchLoweringPassPipeline
@@ -75,8 +73,7 @@
 LogicalResult TranslationInfoAttr::verify(
     function_ref<InFlightDiagnostic()> emitError,
     IREE::Codegen::DispatchLoweringPassPipelineAttr passPipeline,
-    unsigned softwarePipelineDepth, unsigned softwarePipelineStoreStage,
-    SymbolRefAttr codegenSpec) {
+    SymbolRefAttr codegenSpec, DictionaryAttr configuration) {
   if (!passPipeline) {
     return emitError() << "missing pass pipeline specification";
   }
@@ -297,11 +294,10 @@
   if (!translationInfo) {
     return emitError() << "missing translation info";
   }
-  if (failed(TranslationInfoAttr::verify(
-          emitError, translationInfo.getPassPipeline(),
-          translationInfo.getSoftwarePipelineDepth(),
-          translationInfo.getSoftwarePipelineStoreStage(),
-          translationInfo.getCodegenSpec()))) {
+  if (failed(TranslationInfoAttr::verify(emitError,
+                                         translationInfo.getPassPipeline(),
+                                         translationInfo.getCodegenSpec(),
+                                         translationInfo.getConfiguration()))) {
     return failure();
   }
   return success();
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
index f04744c..f23073f 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h
@@ -139,8 +139,7 @@
     IREE::Codegen::DispatchLoweringPassPipeline passPipeline,
     ArrayRef<int64_t> workgroupSize = {},
     std::optional<int64_t> subgroupSize = {},
-    unsigned softwarePipelineDepth = 0,
-    unsigned softwarePipelineStoreStage = 1) {
+    DictionaryAttr pipelineConfig = DictionaryAttr()) {
   MLIRContext *context = entryPointFn.getContext();
   auto config = IREE::Codegen::LoweringConfigAttr::get(context, tileSizes,
                                                        scalableTileFlags);
@@ -148,8 +147,7 @@
   if (failed(setDispatchConfig(entryPointFn, workgroupSize, subgroupSize)))
     return failure();
   auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-      entryPointFn.getContext(), passPipeline, softwarePipelineDepth,
-      softwarePipelineStoreStage);
+      entryPointFn.getContext(), passPipeline, SymbolRefAttr(), pipelineConfig);
   return setTranslationInfo(entryPointFn, translationInfo);
 }
 
@@ -161,11 +159,10 @@
     IREE::Codegen::DispatchLoweringPassPipeline passPipeline,
     ArrayRef<int64_t> workgroupSize = {},
     std::optional<int64_t> subgroupSize = {},
-    unsigned softwarePipelineDepth = 0,
-    unsigned softwarePipelineStoreStage = 1) {
-  return setOpConfigAndEntryPointFnTranslation(
-      entryPointFn, op, tileSizes, {}, passPipeline, workgroupSize,
-      subgroupSize, softwarePipelineDepth, softwarePipelineStoreStage);
+    DictionaryAttr pipelineConfig = DictionaryAttr()) {
+  return setOpConfigAndEntryPointFnTranslation(entryPointFn, op, tileSizes, {},
+                                               passPipeline, workgroupSize,
+                                               subgroupSize, pipelineConfig);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
index 898e05b..8b28660 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td
@@ -129,26 +129,22 @@
 
   let assemblyFormat = [{
     `<` `` $passPipeline
-    (`pipeline_depth` `=` $softwarePipelineDepth^)?
-    (`store_stage` `=` $softwarePipelineStoreStage^)?
-    (`codegen_spec` `=` $codegenSpec^)? `>`
+    (`codegen_spec` `=` $codegenSpec^)?
+    (`,` $configuration^)? `>`
   }];
 
   let parameters = (ins
     AttrParameter<"IREE::Codegen::DispatchLoweringPassPipelineAttr",
         "Name of the pipeline to be invoked on the translation unit.">:$passPipeline,
-    OptionalParameter<"unsigned",
-        "The software pipeline depth to be used">:$softwarePipelineDepth,
-    DefaultValuedParameter<"unsigned", "1",
-        "The software pipeline stage to place stores">:$softwarePipelineStoreStage,
     OptionalParameter<"SymbolRefAttr",
-        "The symbol pointing to the transform dialect codegen spec to be used">:$codegenSpec
+        "The symbol pointing to the transform dialect codegen spec to be used">:$codegenSpec,
+    OptionalParameter<"DictionaryAttr",
+        "Pipeline specific configuration">:$configuration
   );
   let builders = [
     AttrBuilder<(ins "DispatchLoweringPassPipeline":$passPipeline,
-        CArg<"unsigned", "0">:$softwarePipelineDepth,
-        CArg<"unsigned", "1">:$softwarePipelineStoreStage,
-        CArg<"SymbolRefAttr", "{}">:$codegenSpec)>
+        CArg<"SymbolRefAttr", "{}">:$codegenSpec,
+        CArg<"DictionaryAttr", "{}">:$configuration)>
   ];
   let extraClassDeclaration = [{
     // Returns the lowering pass pipeline set.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 6c88afa..7545aa1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -348,8 +348,9 @@
 
         return setOpConfigAndEntryPointFnTranslation(
             entryPoint, op, tileSizes, pipeline, workgroupSize, subgroupSize,
-            softwarePipelineDepth,
-            /*softwarePipelineStoreStage=*/1);
+            getSoftwarePipeliningAttrDict(op->getContext(),
+                                          softwarePipelineDepth,
+                                          /*softwarePipelineStoreStage=*/1));
       };
   // Infer the MxN size of the matmul based on operands and indexing maps.
   auto lhsShape =
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index 350a8fe..c4ecaf6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -90,15 +90,29 @@
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt:
     addGPUMatmulSimtPassPipeline(pipeline);
     break;
-  case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulTensorCore:
-    addGPUMatmulTensorCorePassPipeline(
-        pipeline, translationInfo.value().getSoftwarePipelineDepth());
+  case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulTensorCore: {
+    FailureOr<int64_t> maybeDepth =
+        getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+    if (failed(maybeDepth)) {
+      variantOp.emitOpError(
+          "invalid matmul configuration without software pipelining config");
+      return signalPassFailure();
+    }
+    addGPUMatmulTensorCorePassPipeline(pipeline, *maybeDepth);
     break;
+  }
   case IREE::Codegen::DispatchLoweringPassPipeline::
-      LLVMGPUMatmulTensorCoreMmaSync:
-    addGPUMatmulTensorCoreMmaSyncPassPipeline(
-        pipeline, translationInfo.value().getSoftwarePipelineDepth());
+      LLVMGPUMatmulTensorCoreMmaSync: {
+    FailureOr<int64_t> maybeDepth =
+        getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+    if (failed(maybeDepth)) {
+      variantOp.emitOpError(
+          "invalid matmul configuration without software pipelining config");
+      return signalPassFailure();
+    }
+    addGPUMatmulTensorCoreMmaSyncPassPipeline(pipeline, *maybeDepth);
     break;
+  }
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUTransposeSharedMem:
     addGPUTransposePassPipeline(pipeline);
     break;
@@ -116,7 +130,7 @@
   case IREE::Codegen::DispatchLoweringPassPipeline::None:
     return;
   default:
-    variantOp.emitOpError("Unsupported pipeline on GPU target.");
+    variantOp.emitOpError("unsupported pipeline on GPU target.");
     return signalPassFailure();
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
index 02b49fd..3f669d6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp
@@ -96,17 +96,24 @@
     return op->emitOpError("expected workgroup size for GPU pipelines");
   }
 
-  assert(translationInfo.getSoftwarePipelineStoreStage() == 1 &&
-         "Store to workgroup memory currently expected to happen in stage 1 of "
-         "software pipeline.");
+  FailureOr<int64_t> maybeDepth =
+      getSoftwarePipelineDepth(translationInfo.getConfiguration());
+  FailureOr<int64_t> maybeStage =
+      getSoftwarePipelineStoreStage(translationInfo.getConfiguration());
+  if (failed(maybeDepth) || failed(maybeStage)) {
+    return op->emitOpError(
+        "invalid matmul configuration without pipelining config");
+  }
+
+  if (*maybeStage != 1) {
+    return op->emitError(
+        "store to workgroup memory currently expected to happen in stage 1 of "
+        "software pipeline.");
+  }
 
   // Get compilation pipeline.
   StringRef pipelineName = stringifyEnum(pipeline);
 
-  assert(translationInfo.getSoftwarePipelineStoreStage() == 1 &&
-         "Store to workgroup memory currently expected to happen in stage 1 of "
-         "software pipeline.");
-
   // Get Operand/Result types.
   mlir::Type lhsType = op->getOperand(0).getType();
   mlir::Type rhsType = op->getOperand(1).getType();
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
index c1dfa97..0175d50 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir
@@ -241,7 +241,7 @@
   }
 }
 
-//       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 // CHECK-LABEL: hal.executable.export public @not_vmt
 //  CHECK-SAME:   subgroup_size = 64 : index
 //  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
index 0b7ba91..ce5e33a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir
@@ -1,7 +1,7 @@
 // RUN: iree-opt --split-input-file --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-tile-and-distribute)))))" %s | FileCheck %s
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[2, 256, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -89,7 +89,7 @@
 
 // -----
 
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -169,7 +169,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[2, 32, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -378,7 +378,7 @@
 // Check contract-4d, we currently emit suboptimal code as we don't distribute
 // more than 3 dimensions but make sure we emit correct code.
 #config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index 01b2923..d7d54c2 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -72,7 +72,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 2, 4]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @dot_dispatch_1
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK-SAME:     workgroup_size = [2 : index, 4 : index, 1 : index]
@@ -112,7 +112,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 2]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @unaligned_k
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK-SAME:     workgroup_size = [32 : index, 8 : index, 1 : index]
@@ -327,7 +327,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 128, 64]]>,
-    translation_info = <LLVMGPUMatmulSimt>,
+    translation_info = <LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>,
     workgroup_size = [16, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -364,7 +364,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 64]{{\]}}
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @_lowering_config_test_dispatch_1
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK-SAME:     workgroup_size = [16 : index, 8 : index, 1 : index]
@@ -455,7 +455,7 @@
 }
 }
 
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @matmul_config_sm35
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 
@@ -669,7 +669,7 @@
 
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 256, 32]{{\]}}
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync pipeline_depth = 3>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 3 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @large_matmul_f16
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK-SAME:     workgroup_size = [128 : index, 2 : index, 1 : index]
@@ -716,7 +716,7 @@
 
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[128, 256, 16]{{\]}}
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync pipeline_depth = 4>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @large_matmul_f32
 // CHECK-SAME:     translation_info = #[[TRANSLATION]]
 // CHECK-SAME:     workgroup_size = [128 : index, 2 : index, 1 : index]
@@ -878,7 +878,7 @@
 }
 
 //   CHECK-DAG:  #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 1, 32, 128, 32]{{\]}}
-//   CHECK-DAG:  #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+//   CHECK-DAG:  #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //       CHECK:  hal.executable.export public @_main_dispatch_15_generic_512x4x42x42x64_f32
 //  CHECK-SAME:    translation_info = #[[TRANSLATION]]
 //       CHECK:  linalg.fill
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
index fa59d7c..011994c 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/illegal_configuration.mlir
@@ -1,7 +1,7 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-llvmgpu-select-lowering-strategy)))" --verify-diagnostics --split-input-file %s
 
 #config = #iree_codegen.lowering_config<tile_sizes = []>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -33,7 +33,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = []>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -65,7 +65,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -97,7 +97,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -129,7 +129,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -161,7 +161,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 20]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -193,7 +193,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -225,7 +225,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -257,7 +257,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -289,7 +289,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[2, 32, 32, 16]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb">
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -348,7 +348,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 48]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -381,7 +381,7 @@
 
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[64, 32, 4]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -413,7 +413,7 @@
 // -----
 
 #config = #iree_codegen.lowering_config<tile_sizes = [[64, 64, 64]]>
-#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync>
+#translation = #iree_codegen.translation_info<LLVMGPUMatmulTensorCoreMmaSync, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
index 19d4772..c8aa582 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/set_transform_strategy_matmul.mlir
@@ -462,7 +462,7 @@
 }
 }
 
-// CHECK:       iree_codegen.translation_info<LLVMGPUMatmulSimt>
+// CHECK:       iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 // CHECK-LABEL: func @matmul_5_small
 
 // This matmul is considered "too small"/"degenerate" for a tensor core strategy,
@@ -505,7 +505,7 @@
 }
 }
 
-// CHECK:       iree_codegen.translation_info<LLVMGPUMatmulSimt>
+// CHECK:       iree_codegen.translation_info<LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 // CHECK-LABEL: func @f16_matmul
 // CHECK-NOT: transform.sequence
 // CHECK-NOT: transform.named_sequence
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
index f4f84b3..2ca6c05 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp
@@ -776,7 +776,9 @@
     return setOpConfigAndEntryPointFnTranslation(
         op->getParentOfType<mlir::FunctionOpInterface>(), op, tileSizes,
         CodeGenPipeline::SPIRVMatmulPromoteVectorize, workgroupSize,
-        /*subgroupSize=*/std::nullopt, pipelineDepth, storeStage);
+        /*subgroupSize=*/std::nullopt,
+        getSoftwarePipeliningAttrDict(op->getContext(), pipelineDepth,
+                                      storeStage));
   }
 
   SmallVector<int64_t> threadTileSizes(numLoops, 0);
@@ -1102,7 +1104,9 @@
 
   return setOpConfigAndEntryPointFnTranslation(
       op->getParentOfType<mlir::FunctionOpInterface>(), op, tileSizes, pipeline,
-      workgroupSize, subgroupSize, pipelineDepth, storeStage);
+      workgroupSize, subgroupSize,
+      getSoftwarePipeliningAttrDict(op->getContext(), pipelineDepth,
+                                    storeStage));
 }
 
 } // namespace detail
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
index 3943e99..721d902 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/SPIRVLowerExecutableTargetPass.cpp
@@ -82,16 +82,34 @@
   case CodeGenPipeline::SPIRVSubgroupReduce:
     addSPIRVSubgroupReducePassPipeline(pipeline);
     break;
-  case CodeGenPipeline::SPIRVCooperativeMatrixVectorize:
-    addSPIRVCooperativeMatrixVectorizePassPipeline(
-        pipeline, translationInfo.value().getSoftwarePipelineDepth(),
-        translationInfo.value().getSoftwarePipelineStoreStage());
+  case CodeGenPipeline::SPIRVCooperativeMatrixVectorize: {
+    FailureOr<int64_t> maybeDepth =
+        getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+    FailureOr<int64_t> maybeStage = getSoftwarePipelineStoreStage(
+        translationInfo.value().getConfiguration());
+    if (failed(maybeDepth) || failed(maybeStage)) {
+      variantOp.emitOpError("invalid cooperative matrix pipeline without "
+                            "software pipelining configuration.");
+      return signalPassFailure();
+    }
+    addSPIRVCooperativeMatrixVectorizePassPipeline(pipeline, *maybeDepth,
+                                                   *maybeStage);
     break;
-  case CodeGenPipeline::SPIRVMatmulPromoteVectorize:
-    addSPIRVMatmulPromoteVectorizePassPipeline(
-        pipeline, translationInfo.value().getSoftwarePipelineDepth(),
-        translationInfo.value().getSoftwarePipelineStoreStage());
+  }
+  case CodeGenPipeline::SPIRVMatmulPromoteVectorize: {
+    FailureOr<int64_t> maybeDepth =
+        getSoftwarePipelineDepth(translationInfo.value().getConfiguration());
+    FailureOr<int64_t> maybeStage = getSoftwarePipelineStoreStage(
+        translationInfo.value().getConfiguration());
+    if (failed(maybeDepth) || failed(maybeStage)) {
+      variantOp.emitOpError(
+          "invalid matmul pipeline without software pipelining configuration.");
+      return signalPassFailure();
+    }
+    addSPIRVMatmulPromoteVectorizePassPipeline(pipeline, *maybeDepth,
+                                               *maybeStage);
     break;
+  }
   case CodeGenPipeline::SPIRVWinogradVectorize:
     addSPIRVWinogradVectorizePassPipeline(pipeline);
     break;
@@ -102,7 +120,7 @@
   case CodeGenPipeline::None:
     return;
   default:
-    variantOp.emitOpError("Unsupported pipeline on GPU target.");
+    variantOp.emitOpError("unsupported pipeline on GPU target.");
     return signalPassFailure();
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
index 6795dc5..2caf9b0 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/Verifiers.cpp
@@ -44,6 +44,15 @@
     llvm::dbgs() << "]\n";
   });
 
+  FailureOr<int64_t> maybeDepth =
+      getSoftwarePipelineDepth(translationInfo.getConfiguration());
+  FailureOr<int64_t> maybeStage =
+      getSoftwarePipelineStoreStage(translationInfo.getConfiguration());
+  if (failed(maybeDepth) || failed(maybeStage)) {
+    return op->emitOpError(
+        "invalid matmul configuration without pipelining config");
+  }
+
   // Get spirv.target_env attributes
   const spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(op);
   const spirv::TargetEnv targetEnv(targetEnvAttr);
@@ -122,9 +131,6 @@
     return op->emitOpError("RHS shape is indivisible by first level tile size");
   }
 
-  auto pipelineDepth = translationInfo.getSoftwarePipelineDepth();
-  pipelineDepth = pipelineDepth ? pipelineDepth : 1;
-
   return success();
 }
 
@@ -149,6 +155,15 @@
     llvm::dbgs() << "]\n";
   });
 
+  FailureOr<int64_t> maybeDepth =
+      getSoftwarePipelineDepth(translationInfo.getConfiguration());
+  FailureOr<int64_t> maybeStage =
+      getSoftwarePipelineStoreStage(translationInfo.getConfiguration());
+  if (failed(maybeDepth) || failed(maybeStage)) {
+    return op->emitOpError(
+        "invalid cooperative matrix configuration without pipelining config");
+  }
+
   // Get spirv.target_env attributes
   const spirv::TargetEnvAttr targetEnvAttr = getSPIRVTargetEnvAttr(op);
   const spirv::TargetEnv targetEnv(targetEnvAttr);
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
index a0290a2..a2641fe 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul.mlir
@@ -34,7 +34,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 512, 8, 16]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @batch_matmul_f32_16x4096x40x4096
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [2 : index, 64 : index, 1 : index]
@@ -80,7 +80,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 32]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 2 store_stage = 0>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 2 : i64, store_stage = 0 : i64}>
 //      CHECK: hal.executable.export public @matmul_f16_64x640x320
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [16 : index, 16 : index, 1 : index]
@@ -126,7 +126,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 256, 16, 32]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @batch_matmul_f32_16x4096x40x4096
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [4 : index, 32 : index, 1 : index]
@@ -178,7 +178,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 256, 32]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @batch_matmul_f16_1x4096x4096x512
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [32 : index, 8 : index, 1 : index]
@@ -252,7 +252,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128, 1, 16]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @matmul_multi_reduce_i4xf32xf32
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [32 : index, 8 : index, 1 : index]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
index 40684d2..27b9758 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_amd_matmul_cooperative_ops.mlir
@@ -71,7 +71,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 128], [32, 64], [0, 0, 32], [16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @matmul_256x1024x128_div_add
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -140,7 +140,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @batch_matmul_16x128x256x512_div
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -207,7 +207,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @generic_batch_matmul_32x8x512x64
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -268,7 +268,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 128], [1, 32, 64], [0, 0, 0, 16], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @batch_matmul_16x1024x1024x80
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -329,7 +329,7 @@
   }
 }
 
-//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 // CHECK-LABEL: hal.executable.export public @matmul_256x1024x8
 //   CHECK-NOT:   subgroup_size =
 //  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
index 972de3d..12cb667 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul.mlir
@@ -39,7 +39,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[4, 128, 32]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @matmul_4x4096x9216
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [32 : index, 4 : index, 1 : index]
@@ -146,7 +146,7 @@
 }
 
 //  CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[32, 128, 1, 32]{{\]}}>
-//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//  CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 //      CHECK: hal.executable.export public @multi_reduction_transposed_b_matmul
 // CHECK-SAME:   translation_info = #[[TRANSLATION]]
 // CHECK-SAME:   workgroup_size = [32 : index, 8 : index, 1 : index]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
index a041fc2..955e525 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/config_nvidia_matmul_cooperative_ops.mlir
@@ -77,7 +77,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [32, 32], [0, 0, 32], [16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @matmul_256x1024x128_div_add
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -152,7 +152,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @batch_matmul_16x128x256x512_div
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -225,7 +225,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 32], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize pipeline_depth = 1 store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @generic_batch_matmul_32x8x512x64
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -292,7 +292,7 @@
 }
 
 //  CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 64, 64], [1, 32, 32], [0, 0, 0, 16], [1, 16, 16, 16]{{\]}}>
-//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize store_stage = 0>
+//  CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0 : i64, store_stage = 0 : i64}>
 //CHECK-LABEL: hal.executable.export public @batch_matmul_16x1024x1024x80
 // CHECK-SAME:   subgroup_size = 32 : index
 // CHECK-SAME:   translation_info = #[[$TRANSLATION]]
@@ -359,7 +359,7 @@
   }
 }
 
-//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 1>
+//   CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize, {pipeline_depth = 1 : i64, store_stage = 1 : i64}>
 // CHECK-LABEL: hal.executable.export public @matmul_256x1024x8
 //   CHECK-NOT:   subgroup_size =
 //  CHECK-SAME:   translation_info = #[[$TRANSLATION]]
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
index 1ca14b5..5aa906f 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/illegal_configuration.mlir
@@ -4,7 +4,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = []>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [16, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -41,7 +41,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>>
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
@@ -77,7 +77,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64], [4, 4], [0, 0, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [16, 8, 128]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -114,7 +114,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64], [4, 2], [0, 0, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [32, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -151,7 +151,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64], [16, 8], [0, 0, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [8, 2, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -188,7 +188,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 60], [4, 4], [0, 0, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [15, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -225,7 +225,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [16, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -262,7 +262,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 64, 4]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [16, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -299,7 +299,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [128, 2, 1], subgroup_size = 64>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -352,7 +352,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [8, 8, 8]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [128, 2, 1], subgroup_size = 64>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -405,7 +405,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 32], [8, 8], [0, 0, 4], [16, 16, 16]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [256, 4, 1], subgroup_size = 64>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -458,7 +458,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [64, 2, 1], subgroup_size = 64>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -511,7 +511,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 64], [32, 32], [0, 0, 16], [16, 16, 16]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 0, store_stage = 1}>,
     workgroup_size = [128, 4, 1], subgroup_size = 64>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
index 9a8834e..0a88a26 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_fusion.mlir
@@ -2,7 +2,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[32, 128, 1, 32]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize pipeline_depth = 1>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 1, store_stage = 1}>,
     workgroup_size = [32, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
index f6180fe..0b0fb7d 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/lowering_matmul_promotion.mlir
@@ -4,7 +4,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 64, 16]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize pipeline_depth = 2>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 2, store_stage = 1}>,
     workgroup_size = [16, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
@@ -57,7 +57,7 @@
 }
 
 //       CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 2)>
-//       CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 2>
+//       CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize
 //           CHECK: hal.executable.export public @matmul_f32_128x256x64
 //      CHECK-SAME:   translation_info = #[[TRANSLATION]]
 //      CHECK-SAME:   workgroup_size = [16 : index, 8 : index, 1 : index]
@@ -149,7 +149,7 @@
 }
 
 //       CHECK-DAG: #[[MAP:.+]] = affine_map<(d0) -> ((d0 floordiv 16) mod 3)>
-//       CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize pipeline_depth = 2 store_stage = 0>
+//       CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<SPIRVMatmulPromoteVectorize
 //           CHECK: hal.executable.export public @matmul_f32_128x256x64
 //      CHECK-SAME:   translation_info = #[[TRANSLATION]]
 //      CHECK-SAME:   workgroup_size = [16 : index, 8 : index, 1 : index]
@@ -206,7 +206,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[64, 256, 32]]>,
-    translation_info = <SPIRVMatmulPromoteVectorize pipeline_depth = 1>,
+    translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 1, store_stage = 1}>,
     workgroup_size = [32, 8, 1]>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
index fcfffc7..dcc9406 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_cooperative_ops.mlir
@@ -866,7 +866,7 @@
 
 #compilation = #iree_codegen.compilation_info<
     lowering_config  = <tile_sizes = [[1, 64, 64], [1, 16, 64], [0, 0, 0, 16], [1, 16, 16, 16]]>,
-    translation_info = <SPIRVCooperativeMatrixVectorize>,
+    translation_info = <SPIRVCooperativeMatrixVectorize, {pipeline_depth = 1, store_stage = 1}>,
     workgroup_size = [32, 4, 1], subgroup_size = 32>
 
 hal.executable public @batch_matmul_f16_16x4096x4096x64_truncf_mulf {
diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
index 690e2b9..d4e11c3 100644
--- a/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
+++ b/compiler/src/iree/compiler/Codegen/SPIRV/test/pipeline_matmul_promotion.mlir
@@ -174,7 +174,7 @@
 
 #user_config = #iree_codegen.compilation_info<
   lowering_config = <tile_sizes = [[16, 128, 16]]>,
-  translation_info = <SPIRVMatmulPromoteVectorize>,
+  translation_info = <SPIRVMatmulPromoteVectorize, {pipeline_depth = 0, store_stage = 1}>,
   workgroup_size = [16, 8, 1]>
 
 hal.executable @matmul_f16_32x1280x1280 {
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
index 043cd89..c40ae2b 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.cpp
@@ -777,6 +777,45 @@
   }};
 }
 
+static constexpr char pipeliningDepthName[] = "pipeline_depth";
+static constexpr char pipeliningStageName[] = "store_stage";
+
+DictionaryAttr
+getSoftwarePipeliningAttrDict(MLIRContext *context,
+                              unsigned softwarePipelineDepth,
+                              unsigned softwarePipelineStoreStage) {
+  SmallVector<NamedAttribute> attrs;
+  attrs.push_back(
+      {StringAttr::get(context, pipeliningDepthName),
+       IntegerAttr::get(IntegerType::get(context, 64), softwarePipelineDepth)});
+  attrs.push_back({StringAttr::get(context, pipeliningStageName),
+                   IntegerAttr::get(IntegerType::get(context, 64),
+                                    softwarePipelineStoreStage)});
+  return DictionaryAttr::get(context, attrs);
+}
+
+FailureOr<int64_t> getSoftwarePipelineDepth(DictionaryAttr config) {
+  if (!config) {
+    return failure();
+  }
+  Attribute depth = config.get(pipeliningDepthName);
+  if (!depth) {
+    return failure();
+  }
+  return llvm::cast<IntegerAttr>(depth).getInt();
+}
+
+FailureOr<int64_t> getSoftwarePipelineStoreStage(DictionaryAttr config) {
+  if (!config) {
+    return failure();
+  }
+  Attribute stage = config.get(pipeliningStageName);
+  if (!stage) {
+    return failure();
+  }
+  return llvm::cast<IntegerAttr>(stage).getInt();
+}
+
 //===---------------------------------------------------------------------===//
 // Misc. utility functions
 //===---------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Utils/Utils.h b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
index 566c8db..a9f7393 100644
--- a/compiler/src/iree/compiler/Codegen/Utils/Utils.h
+++ b/compiler/src/iree/compiler/Codegen/Utils/Utils.h
@@ -197,6 +197,17 @@
     linalg::DistributionMethod distributionMethod,
     int32_t maxWorkgroupParallelDims = kNumMaxParallelDims);
 
+// Helper to construct the software pipelining configuration dictionary for a
+// pass pipeline that performs software pipelining.
+DictionaryAttr
+getSoftwarePipeliningAttrDict(MLIRContext *context,
+                              unsigned softwarePipelineDepth = 0,
+                              unsigned softwarePipelineStoreStage = 1);
+
+// Helpers to extract the pipelining configuration from the config dictionary.
+FailureOr<int64_t> getSoftwarePipelineDepth(DictionaryAttr);
+FailureOr<int64_t> getSoftwarePipelineStoreStage(DictionaryAttr);
+
 //===---------------------------------------------------------------------===//
 // Misc. utility functions.
 //===---------------------------------------------------------------------===//
diff --git a/tests/e2e/matmul/generate_e2e_matmul_tests.py b/tests/e2e/matmul/generate_e2e_matmul_tests.py
index 453e1c5..ca07e0c 100644
--- a/tests/e2e/matmul/generate_e2e_matmul_tests.py
+++ b/tests/e2e/matmul/generate_e2e_matmul_tests.py
@@ -450,8 +450,9 @@
         compilation_info_string = (
             f"#compilation{generate_function.compilation_index} = #iree_codegen.compilation_info<\n"
             f"  lowering_config = <tile_sizes = {compilation_info.tile_sizes}>,\n"
-            f"  translation_info = <{dispatch_lowering_pass_pipeline}\n"
-            f"  pipeline_depth = {compilation_info.software_pipeline_depth}>,\n"
+            f"  translation_info = <{dispatch_lowering_pass_pipeline},\n"
+            f"  {{ pipeline_depth = {compilation_info.software_pipeline_depth}, "
+            f"store_stage = 1 }}>,\n"
             f"  workgroup_size = {compilation_info.workgroup_size_str()}>\n"
         )
         compilation_info_attr = (