[NFC][GPU] Move LLVMGPUPipelineOptions to iree_gpu dialect (#18458)

This moves `LLVMGPUPipelineOptions` to `Codegen/Dialect/GPU` so that
pipeline options can be set by iree_gpu lowering configuration logic
(like `setMatmulLoweringConfig` in `ConfigUtils.cpp`).

A new attribute `GPUPipelineOptionsAttr` is added, with optional
parameters defined for each existing pipeline option. The assembly of
the pipeline attributes has changed, since the options are now part of
the `iree_gpu` dialect. For the purposes of setting user configurations
with transform dialect this PR changes the following:

All pipeline option attributes should now be contained in a single
`#iree_gpu.pipeline_options<>`
- `reorder_workgroups = "none"/"swizzle"/"transpose"` becomes one of the
pipeline options `reorder_workgroups_strategy = None/Swizzle/Transpose`
- `prefetch_shared_memory` becomes the pipeline option
`prefetch_shared_memory = true/false`
- `no_reduce_shared_memory_bank_conflicts` becomes the pipeline option
`no_reduce_shared_memory_bank_conflicts = true/false`

Example:

After this change, the translation_info config dict changes from
```
{reorder_workgroups = "swizzle", prefetch_shared_memory}
```
to
```
{gpu_pipeline_options =
    #iree_gpu.pipeline_options<
        prefetch_shared_memory = true, reorder_workgroups_strategy = Swizzle
    >}
```

---------

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
diff --git a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
index 1bdd146..c1ddf72 100644
--- a/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
+++ b/build_tools/pkgci/external_test_suite/attention_and_matmul_spec.mlir
@@ -636,8 +636,8 @@
          workgroup_size = [320, 1, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-              subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+              subgroup_m_count = 1, subgroup_n_count = 5>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -656,8 +656,9 @@
          workgroup_size = [256, 1, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
-              subgroup_m_count = 1, subgroup_n_count = 4>
-           , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+              subgroup_m_count = 1, subgroup_n_count = 4>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>,
+           llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -676,8 +677,8 @@
          workgroup_size = [320, 1, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-              subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+              subgroup_m_count = 1, subgroup_n_count = 5>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -696,8 +697,8 @@
          workgroup_size = [320, 1, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-              subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+              subgroup_m_count = 1, subgroup_n_count = 5>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -716,8 +717,9 @@
          workgroup_size = [128, 4, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-              subgroup_m_count = 4, subgroup_n_count = 2>
-           , reorder_workgroups = "transpose", llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
+              subgroup_m_count = 4, subgroup_n_count = 2>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>,
+           llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
@@ -736,8 +738,8 @@
          workgroup_size = [320, 1, 1] subgroup_size = 64,
           {mma_schedule = #iree_gpu.mma_schedule<
               intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-              subgroup_m_count = 1, subgroup_n_count = 5>
-           , reorder_workgroups = "transpose"}>
+              subgroup_m_count = 1, subgroup_n_count = 5>,
+           gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>}>
       > -> !transform.any_param
     transform.yield %conv, %config : !transform.any_op, !transform.any_param
   }
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
index 3ec74b1..02721d6 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/Passes.h
@@ -101,7 +101,7 @@
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
 createConvertVectorReductionToGPUPass(bool expandSubgroupReduction = true);
 
-enum class ReorderWorkgroupsStrategy { None, Swizzle, Transpose };
+using IREE::GPU::ReorderWorkgroupsStrategy;
 
 /// Reorders workgroup IDs.
 std::unique_ptr<InterfacePass<mlir::FunctionOpInterface>>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 900aeee..bb7612e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -1353,6 +1353,25 @@
 int64_t LaneIdAttr::getRelativeIndex() const { return getDim(); }
 
 //===----------------------------------------------------------------------===//
+// GPU Pipeline Options
+//===----------------------------------------------------------------------===//
+
+GPUPipelineOptionsAttr GPUPipelineOptionsAttr::get(
+    MLIRContext *context, bool prefetchSharedMemory,
+    bool noReduceSharedMemoryBankConflicts,
+    std::optional<ReorderWorkgroupsStrategy> reorderWorkgroupsStrategy) {
+  auto strategyAttr = ReorderWorkgroupsStrategyAttr();
+  if (reorderWorkgroupsStrategy) {
+    strategyAttr =
+        ReorderWorkgroupsStrategyAttr::get(context, *reorderWorkgroupsStrategy);
+  }
+  Builder b(context);
+  return Base::get(context, b.getBoolAttr(prefetchSharedMemory),
+                   b.getBoolAttr(noReduceSharedMemoryBankConflicts),
+                   strategyAttr);
+}
+
+//===----------------------------------------------------------------------===//
 // Attribute Registration
 //===----------------------------------------------------------------------===//
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index cdb7402..968fa73 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -430,4 +430,54 @@
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// GPU Pipeline Options
+//===----------------------------------------------------------------------===//
+
+def IREEGPU_ReorderWorkgroupsStrategyAttr :
+    EnumAttr<IREEGPU_Dialect, IREEGPU_ReorderWorkgroupsStrategy, ""> {
+  let assemblyFormat = "``$value";
+  let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+}
+
+def IREEGPU_GPUPipelineOptionsAttr : AttrDef<IREEGPU_Dialect, "GPUPipelineOptions"> {
+  let summary = "GPU pipeline options attribute.";
+  let description = [{
+    This attribute describes lowering pipeline specific configuration options:
+    * prefetch_shared_memory: Boolean option indicating whether or not to run
+      the loop prefetching pass in the lowering pipeline.
+    * no_reduce_shared_memory_bank_conflicts: Boolean option indicating whether
+      or not to skip the bank conflict reduction pass in the lowering pipeline.
+    * reorder_workgroups_strategy: Enum attribute indicating which strategy to
+      choose for the workgroup reordering pass. Options are `None`, `Swizzle`,
+      and `Transpose`.
+  }];
+
+  let mnemonic = "pipeline_options";
+  let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+
+  let parameters = (ins
+    OptionalParameter<"BoolAttr">:$prefetch_shared_memory,
+    OptionalParameter<"BoolAttr">:$no_reduce_shared_memory_bank_conflicts,
+    OptionalParameter<"ReorderWorkgroupsStrategyAttr">:$reorder_workgroups_strategy
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+        CArg<"bool", "false">:$prefetch_shared_memory,
+        CArg<"bool", "false">:$no_reduce_shared_memory_bank_conflicts,
+        CArg<"std::optional<ReorderWorkgroupsStrategy>", "{}">:$reorder_workgroups_strategy)>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+
+  let extraClassDeclaration = [{
+    // Returns the key name for GPUPipelineOptionsAttr in the translation info
+    // config dictionary.
+    static StringRef getDictKeyName() {
+      return "gpu_pipeline_options";
+    }
+  }];
+}
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUATTRS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
index bdc590a..c147d88 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.td
@@ -168,4 +168,28 @@
       Lane
     ]>;
 
+//===----------------------------------------------------------------------===//
+// Pipeline options
+//===----------------------------------------------------------------------===//
+
+class IREEGPU_I32PipelineEnumAttr<string name, string summary, list<I32EnumAttrCase> cases>
+    : I32EnumAttr<name, summary, cases> {
+  let cppNamespace = "::mlir::iree_compiler::IREE::GPU";
+  let genSpecializedAttr = 0;
+}
+
+// ReorderWorkgroups EnumAttrCases.
+def ReorderWorkgroupsNone : I32EnumAttrCase<"None", 0>;
+def ReorderWorkgroupsSwizzle : I32EnumAttrCase<"Swizzle", 1>;
+def ReorderWorkgroupsTranspose : I32EnumAttrCase<"Transpose", 2>;
+
+// EnumAttr for workgroup reordering strategy enums.
+def IREEGPU_ReorderWorkgroupsStrategy : IREEGPU_I32PipelineEnumAttr<"ReorderWorkgroupsStrategy",
+    "Strategy for workgroup reordering", [
+      ReorderWorkgroupsNone,
+      ReorderWorkgroupsSwizzle,
+      ReorderWorkgroupsTranspose
+    ]> {
+}
+
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_IREEGPUENUMS
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index cfe13f6..c9bb86c 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -473,4 +473,73 @@
       workgroupSize, subgroupSize, DictionaryAttr());
 }
 
+//===----------------------------------------------------------------------===//
+// Lowering Config Attributes
+//===----------------------------------------------------------------------===//
+
+GPUPipelineOptions
+getPipelineOptions(FunctionOpInterface funcOp,
+                   IREE::Codegen::TranslationInfoAttr translationInfo) {
+  GPUPipelineOptions pipelineOptions = {};
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
+
+  if (DictionaryAttr config = translationInfo.getConfiguration()) {
+    std::optional<NamedAttribute> maybePipelineOptionsAttr =
+        config.getNamed(GPUPipelineOptionsAttr::getDictKeyName());
+    if (!maybePipelineOptionsAttr.has_value()) {
+      return pipelineOptions;
+    }
+    auto pipelineOptionsAttr =
+        cast<GPUPipelineOptionsAttr>(maybePipelineOptionsAttr->getValue());
+    BoolAttr prefetchSharedMemory =
+        pipelineOptionsAttr.getPrefetchSharedMemory();
+    if (prefetchSharedMemory) {
+      pipelineOptions.prefetchSharedMemory = prefetchSharedMemory.getValue();
+    }
+    BoolAttr noReduceBankConflicts =
+        pipelineOptionsAttr.getNoReduceSharedMemoryBankConflicts();
+    if (noReduceBankConflicts) {
+      pipelineOptions.enableReduceSharedMemoryBankConflicts =
+          !noReduceBankConflicts.getValue();
+    }
+    ReorderWorkgroupsStrategyAttr reorderWorkgroupsStrategy =
+        pipelineOptionsAttr.getReorderWorkgroupsStrategy();
+    if (reorderWorkgroupsStrategy) {
+      pipelineOptions.reorderStrategy = reorderWorkgroupsStrategy.getValue();
+    }
+  }
+
+  pipelineOptions.enableUkernels = targetAttr && hasUkernel(targetAttr);
+
+  LLVM_DEBUG(llvm::dbgs() << "GPU Pipeline Options: " << pipelineOptions
+                          << "\n");
+  return pipelineOptions;
+}
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                              const GPUPipelineOptions &options) {
+  StringRef reorderStr = "<not set>";
+  if (options.reorderStrategy) {
+    switch (options.reorderStrategy.value()) {
+    case ReorderWorkgroupsStrategy::Transpose:
+      reorderStr = "transpose";
+      break;
+    case ReorderWorkgroupsStrategy::Swizzle:
+      reorderStr = "swizzle";
+      break;
+    case ReorderWorkgroupsStrategy::None:
+      reorderStr = "none";
+      break;
+    default:
+      assert(false && "Unhandled reorder option");
+    }
+  }
+
+  return os << "{" << "enableReduceSharedMemoryBankConflicts = "
+            << options.enableReduceSharedMemoryBankConflicts
+            << ", prefetchSharedMemory = " << options.prefetchSharedMemory
+            << ", reorderWorkgroupsStrategy = " << reorderStr
+            << ", enableUkernels = " << options.enableUkernels << "}";
+}
+
 } // namespace mlir::iree_compiler::IREE::GPU
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
index f87fab9..f9f3d37 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -7,6 +7,7 @@
 #ifndef IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
 #define IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
 
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
@@ -26,6 +27,26 @@
                                            mlir::FunctionOpInterface entryPoint,
                                            Operation *op);
 
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Options
+//===----------------------------------------------------------------------===//
+
+using IREE::GPU::ReorderWorkgroupsStrategy;
+
+struct GPUPipelineOptions {
+  bool enableReduceSharedMemoryBankConflicts = true;
+  bool prefetchSharedMemory = false;
+  bool enableUkernels = false;
+  std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
+                              const GPUPipelineOptions &options);
+
+GPUPipelineOptions
+getPipelineOptions(FunctionOpInterface funcOp,
+                   IREE::Codegen::TranslationInfoAttr translationInfo);
+
 } // namespace mlir::iree_compiler::IREE::GPU
 
 #endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TARGETUTILS_CONFIGUTILS_H_
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index 118ce2b..ccf61a5 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -13,6 +13,7 @@
 #include "iree/compiler/Codegen/Common/GPU/GPUHeuristics.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 #include "iree/compiler/Codegen/Interfaces/PartitionableLoopsInterface.h"
 #include "iree/compiler/Codegen/Interfaces/UKernelOpInterface.h"
@@ -380,9 +381,14 @@
 
   // Prefetch shared memory if requested.
   if (clLLVMGPUEnablePrefetch) {
+    auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
+        context, /*prefetchSharedMemory=*/true,
+        /*noReduceSharedMemoryBankConflicts=*/false,
+        /*reorderWorkgroupsStrategy=*/std::nullopt);
     attrs.emplace_back(
-        StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
-        UnitAttr::get(context));
+        StringAttr::get(context,
+                        IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
+        pipelineOptions);
   }
 
   auto configDict = DictionaryAttr::get(context, attrs);
@@ -610,9 +616,14 @@
 
   // Prefetch shared memory if requested.
   if (clLLVMGPUEnablePrefetch) {
+    auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
+        context, /*prefetchSharedMemory=*/true,
+        /*noReduceSharedMemoryBankConflicts=*/false,
+        /*reorderWorkgroupsStrategy=*/std::nullopt);
     attrs.emplace_back(
-        StringAttr::get(context, LLVMGPUAttrNames::kPrefetchSharedMemory),
-        UnitAttr::get(context));
+        StringAttr::get(context,
+                        IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
+        pipelineOptions);
   }
 
   auto configDict = DictionaryAttr::get(context, attrs);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
index dc96c92..640f1c3 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp
@@ -8,6 +8,7 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/KernelConfig.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
@@ -76,50 +77,6 @@
 
   void runOnOperation() override;
 };
-
-static LLVMGPUPipelineOptions
-getPipelineOptions(FunctionOpInterface funcOp,
-                   IREE::Codegen::TranslationInfoAttr translationInfo) {
-  LLVMGPUPipelineOptions pipelineOptions = {};
-  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
-
-  LLVM_DEBUG(llvm::dbgs() << "Translation Info: " << translationInfo << "\n");
-  LLVM_DEBUG(llvm::dbgs() << "Target Attr: " << targetAttr << "\n");
-
-  if (DictionaryAttr config = translationInfo.getConfiguration()) {
-    if (config.contains(LLVMGPUAttrNames::kNoReduceSharedMemoryBankConflicts))
-      pipelineOptions.enableReduceSharedMemoryBankConflicts = false;
-    if (config.contains(LLVMGPUAttrNames::kPrefetchSharedMemory))
-      pipelineOptions.prefetchSharedMemory = true;
-    if (config.contains(LLVMGPUAttrNames::kReorderWorkgroups)) {
-      // Get the workgroups reorder config and enable the workgroup reordering.
-      Attribute reorderWorkgroupOption =
-          config.get(LLVMGPUAttrNames::kReorderWorkgroups);
-      if (!isa<StringAttr>(reorderWorkgroupOption))
-        funcOp.emitOpError() << "'" << LLVMGPUAttrNames::kReorderWorkgroups
-                             << "' is expected to be a string attribute";
-      StringRef reorderStr = llvm::cast<StringAttr>(reorderWorkgroupOption);
-      if (reorderStr == "transpose") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Transpose;
-      } else if (reorderStr == "swizzle") {
-        pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::Swizzle;
-      } else {
-        if (reorderStr != "none")
-          funcOp.emitOpError()
-              << "Unknown " << LLVMGPUAttrNames::kReorderWorkgroups
-              << "value: " << reorderWorkgroupOption;
-        else
-          pipelineOptions.reorderStrategy = ReorderWorkgroupsStrategy::None;
-      }
-    }
-  }
-
-  pipelineOptions.enableUkernels = targetAttr && hasUkernel(targetAttr);
-
-  LLVM_DEBUG(llvm::dbgs() << "LLVMGPU Pipeline Options: " << pipelineOptions
-                          << "\n");
-  return pipelineOptions;
-}
 } // namespace
 
 void LLVMGPULowerExecutableTargetPass::runOnOperation() {
@@ -138,8 +95,8 @@
   }
   OpPassManager &pipeline = maybePipeline.value();
 
-  LLVMGPUPipelineOptions pipelineOptions =
-      getPipelineOptions(funcOp, translationInfo);
+  IREE::GPU::GPUPipelineOptions pipelineOptions =
+      IREE::GPU::getPipelineOptions(funcOp, translationInfo);
 
   switch (translationInfo.getDispatchLoweringPassPipeline()) {
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUDefault:
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index c95c484..20471fd 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -75,28 +75,6 @@
                       llvm::cl::desc("Enable implicit gemm for convolutions."),
                       llvm::cl::init(false));
 
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
-                              const LLVMGPUPipelineOptions &options) {
-  StringRef reorderStr = "<not set>";
-  if (options.reorderStrategy) {
-    if (options.reorderStrategy == ReorderWorkgroupsStrategy::Transpose) {
-      reorderStr = "transpose";
-    } else if (options.reorderStrategy == ReorderWorkgroupsStrategy::Swizzle) {
-      reorderStr = "swizzle";
-    } else {
-      assert(options.reorderStrategy == ReorderWorkgroupsStrategy::None &&
-             "Unhandled reorder option");
-      reorderStr = "none";
-    }
-  }
-
-  return os << "{" << "enableReduceSharedMemoryBankConflicts = "
-            << options.enableReduceSharedMemoryBankConflicts << ", "
-            << ", prefetchSharedMemory = " << options.prefetchSharedMemory
-            << ", reorderWorkgroupsStrategy = " << reorderStr
-            << ", enableUkernels = " << options.enableUkernels << "}";
-}
-
 //===----------------------------------------------------------------------===//
 // Bufferization Configuration
 //===----------------------------------------------------------------------===//
@@ -488,7 +466,7 @@
 //===---------------------------------------------------------------------===//
 
 void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
-                                  const LLVMGPUPipelineOptions &options) {
+                                  const GPUPipelineOptions &options) {
   tileAndDistributeToWorkgroup(funcPassManager);
 
   funcPassManager.addPass(createCanonicalizerPass());
@@ -549,7 +527,7 @@
 //===---------------------------------------------------------------------===//
 
 void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
-                                        const LLVMGPUPipelineOptions &options,
+                                        const GPUPipelineOptions &options,
                                         unsigned pipelineDepth) {
   tileAndBufferize(funcPassManager);
 
@@ -619,7 +597,7 @@
 //===---------------------------------------------------------------------===//
 
 void addGPUMatmulTensorCoreMmaSyncPassPipeline(
-    OpPassManager &funcPassManager, const LLVMGPUPipelineOptions &options,
+    OpPassManager &funcPassManager, const GPUPipelineOptions &options,
     unsigned pipelineDepth) {
   tileAndBufferize(funcPassManager);
 
@@ -686,7 +664,7 @@
 //===---------------------------------------------------------------------===//
 
 void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
-                                 const LLVMGPUPipelineOptions &options) {
+                                 const GPUPipelineOptions &options) {
   tileAndDistributeToWorkgroup(funcPassManager);
 
   funcPassManager.addPass(createCanonicalizerPass());
@@ -790,7 +768,7 @@
 }
 
 void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
-                                        const LLVMGPUPipelineOptions &options,
+                                        const GPUPipelineOptions &options,
                                         bool usePadToModelSharedMemcpy) {
   tileAndDistributeToWorkgroup(funcPassManager);
 
@@ -965,7 +943,7 @@
 }
 
 void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
-                               const LLVMGPUPipelineOptions &options) {
+                               const GPUPipelineOptions &options) {
   ConvertToDestinationPassingStylePassOptions dpsOptions;
   dpsOptions.useWARForCooperativeMatrixCodegen = true;
   tileAndDistributeToWorkgroup(funcPassManager, dpsOptions);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
index 2804a32..614d026 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h
@@ -14,33 +14,12 @@
 
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
 #include "mlir/Pass/Pass.h"
 
 namespace mlir::iree_compiler {
 
-//===----------------------------------------------------------------------===//
-// Pass Pipeline Options
-//===----------------------------------------------------------------------===//
-
-/// Named attributes used in the `translation_info`'s config dictionary
-/// attribute. These are used to override default pass heuristics at the
-/// function granularity.
-namespace LLVMGPUAttrNames {
-inline constexpr StringLiteral kReorderWorkgroups = "reorder_workgroups";
-inline constexpr StringLiteral kNoReduceSharedMemoryBankConflicts =
-    "no_reduce_shared_memory_bank_conflicts";
-inline constexpr StringLiteral kPrefetchSharedMemory = "prefetch_shared_memory";
-} //  namespace LLVMGPUAttrNames
-
-struct LLVMGPUPipelineOptions {
-  bool enableReduceSharedMemoryBankConflicts = true;
-  bool prefetchSharedMemory = false;
-  bool enableUkernels = false;
-  std::optional<ReorderWorkgroupsStrategy> reorderStrategy;
-};
-
-llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
-                              const LLVMGPUPipelineOptions &options);
+using IREE::GPU::GPUPipelineOptions;
 
 //----------------------------------------------------------------------------//
 // LLVMGPU backend Pass Pipelines.
@@ -48,16 +27,16 @@
 
 /// Lowering using SIMT CUDA core operations.
 void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
-                                  const LLVMGPUPipelineOptions &options);
+                                  const GPUPipelineOptions &options);
 
 /// Lowering using mma.sync Tensor Core operations.
 void addGPUMatmulTensorCoreMmaSyncPassPipeline(
-    OpPassManager &funcPassManager, const LLVMGPUPipelineOptions &options,
+    OpPassManager &funcPassManager, const GPUPipelineOptions &options,
     unsigned pipelineDepth);
 
 /// Lowering using wmma Tensor Core operations.
 void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
-                                        const LLVMGPUPipelineOptions &options,
+                                        const GPUPipelineOptions &options,
                                         unsigned pipelineDepth);
 
 void addGPUPackUnPackPasses(OpPassManager &funcPassManager);
@@ -77,7 +56,7 @@
 
 /// Lowering transpose using shared memory.
 void addGPUTransposePassPipeline(OpPassManager &funcPassManager,
-                                 const LLVMGPUPipelineOptions &options);
+                                 const GPUPipelineOptions &options);
 
 /// Lowering calling vectorization patterns. Expects pass manager to be a
 /// module-level pass manager.
@@ -89,7 +68,7 @@
 
 /// Lowering based on vector distribution patterns.
 void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
-                                        const LLVMGPUPipelineOptions &options,
+                                        const GPUPipelineOptions &options,
                                         bool usePadToModelSharedMemcpy);
 
 /// Lowering reductions to warp reductions.
@@ -97,7 +76,7 @@
 
 /// Default pass pipeline on GPU, currently used only for the ukernel path.
 void addGPUDefaultPassPipeline(OpPassManager &funcPassManager,
-                               const LLVMGPUPipelineOptions &options);
+                               const GPUPipelineOptions &options);
 
 /// Pass pipeline to lower IREE HAL executables without tiling and distribution.
 void addGPUBaseLoweringPassPipeline(OpPassManager &pm);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
index 8cb1b2e..e5a7bf1 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_user_vector_distribute.mlir
@@ -2,20 +2,20 @@
 // RUN:   --iree-codegen-reorder-workgroups-strategy=transpose \
 // RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-OUT
 
-// Check that applying `reorder_workgroups` enables or disables workgroup reordering.
+// Check that applying `ReorderWorkgroups*` enables or disables workgroup reordering.
 
 // RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
 // RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target)))))" %s | FileCheck %s --check-prefix=OPT-IN
 
-// Check that applying the `no_reduce_shared_memory_bank_conflicts` unit attribute disables shared memory padding.
+// Check that applying the `no_reduce_shared_memory_bank_conflicts` pipeline option attribute disables shared memory padding.
 
 // OPT-OUT:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
 // OPT-OUT-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME:    no_reduce_shared_memory_bank_conflicts
 
 // OPT-IN:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-IN-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>
 // OPT-IN-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-IN-SAME:    no_reduce_shared_memory_bank_conflicts
 #pipeline_layout = #hal.pipeline.layout<bindings = [
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>,
@@ -48,7 +48,7 @@
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
-          no_reduce_shared_memory_bank_conflicts  // Disable the 'reduceSharedMemoryBankConflicts' pass.
+          gpu_pipeline_options = #iree_gpu.pipeline_options<no_reduce_shared_memory_bank_conflicts = true>  // Disable the 'reduceSharedMemoryBankConflicts' pass.
         }>} {
         %cst = arith.constant 0.000000e+00 : f16
         %c0 = arith.constant 0 : index
@@ -81,15 +81,15 @@
 
 // -----
 
-// Check that applying the `reorder_workgroups = transpose` unit attribute enables workgroup reordering.
+// Check that applying the `reorder_workgroups_strategy = Transpose` pipeline option attribute enables workgroup reordering.
 
 // OPT-OUT:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
 // OPT-OUT-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME:    reorder_workgroups = "transpose"
 
 // OPT-IN:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-IN-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>
 // OPT-IN-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-IN-SAME:    reorder_workgroups = "transpose"
 #pipeline_layout = #hal.pipeline.layout<bindings = [
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>,
@@ -123,7 +123,7 @@
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
-          reorder_workgroups = "transpose"  // enable the 'reorderWorkgroups' pass.
+          gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = Transpose>  // enable the 'reorderWorkgroups' pass.
         }>} {
         %cst = arith.constant 0.000000e+00 : f16
         %c0 = arith.constant 0 : index
@@ -155,11 +155,11 @@
 }
 
 // -----
-// Check that applying the `reorder_workgroups = none` unit attribute disables workgroup reordering.
+// Check that applying the `reorder_workgroups_strategy = None` pipeline option disables workgroup reordering.
 
 // OPT-OUT:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+// OPT-OUT-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>
 // OPT-OUT-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-// OPT-OUT-SAME:    reorder_workgroups = "none"
 #pipeline_layout = #hal.pipeline.layout<bindings = [
   #hal.pipeline.binding<storage_buffer>,
   #hal.pipeline.binding<storage_buffer>,
@@ -182,7 +182,7 @@
       func.func @main_0_dispatch_0_matmul_transpose_b_2048x10240x1280_f16xf16xf32()
         attributes {translation_info = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64, {
           mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>, subgroup_m_count = 2, subgroup_n_count = 2>,
-          reorder_workgroups = "none"  // Disable the 'reorderWorkgroups' pass.
+          gpu_pipeline_options = #iree_gpu.pipeline_options<reorder_workgroups_strategy = None>  // Disable the 'reorderWorkgroups' pass.
         }>} {
         %cst = arith.constant 0.000000e+00 : f16
         %c0 = arith.constant 0 : index
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
index 03a48cc..d232c9a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute.mlir
@@ -345,7 +345,7 @@
 // CHECK:       #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes =  {{\[}}[1, 32, 0, 64, 64]{{\]}}
 // CHECK:       #iree_codegen.translation_info<LLVMGPUVectorDistribute
 // CHECK-SAME:  subgroup_m_count = 1, subgroup_n_count = 1
-// CHECK-NOT:   prefetch_shared_memory
+// CHECK-NOT:   prefetch_shared_memory = true
 
 // CHECK-LABEL: func.func @attention_20x4096x64x4096x64()
 
@@ -380,7 +380,7 @@
 // CHECK:       #[[$TILE_SIZES:.+]] = #iree_codegen.lowering_config<tile_sizes =  {{\[}}[16, 0, 32, 16]{{\]}}
 // CHECK:       #iree_codegen.translation_info<LLVMGPUVectorDistribute
 // CHECK-SAME:  subgroup_m_count = 1, subgroup_n_count = 1
-// CHECK-NOT:   prefetch_shared_memory
+// CHECK-NOT:   prefetch_shared_memory = true
 
 // CHECK-LABEL: func.func @attention_large_head_dim_shared_mem()
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
index dfcf4d9..7aa6e83 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute.mlir
@@ -46,9 +46,9 @@
 // Basic pipeline test to make sure it generates the instructions we expect.
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_f16_f32()
 //     CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -93,9 +93,9 @@
 }
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
 //     CHECK-SAME:     translation_info = #[[$TRANSLATION]]
@@ -160,7 +160,7 @@
 }
 
 //          CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
-//     CHECK-SAME: prefetch_shared_memory
+//     CHECK-SAME: gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 
 //          CHECK: func @expanded_matmul_transpose_b
 //     CHECK-SAME:     translation_info = #[[TRANSLATION]]
@@ -212,9 +212,9 @@
 // Make sure it generates the mfma instructions we expect for f8 inputs.
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F8E4M3FNUZ>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_f8_f32()
 //     CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -261,9 +261,9 @@
 // Make sure it generates the mfma instructions we expect for integer inputs.
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_256x256x256_i8_i32()
 //     CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -310,9 +310,9 @@
 // Make sure it generates the mfma instructions we expect for integer inputs.
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME:   prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @matmul_transpose_b_256x256x256_i8_i32()
 //     CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -413,9 +413,9 @@
 
 
 //       CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [128, 2, 1] subgroup_size = 64
+//  CHECK-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  CHECK-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 //  CHECK-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  CHECK-SAME: prefetch_shared_memory
 
 //    CHECK-LABEL: func.func @generic_2x1024x20x64x1280_f16
 // This has more than 2 iterations. So we have prefetching enabled for this case. Due to
@@ -462,9 +462,9 @@
 }
 
 //       RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
+//  RDNA3-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  RDNA3-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F32_16x16x16_F16>,
 //  RDNA3-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  RDNA3-SAME:   prefetch_shared_memory
 
 //    RDNA3-LABEL: func.func @matmul_256x256x256_f16_f32
 //     RDNA3-SAME:    translation_info = #[[$TRANSLATION]]
@@ -512,9 +512,9 @@
 }
 
 //       RDNA3: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [64, 2, 1] subgroup_size = 32
+//  RDNA3-SAME:   gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 //  RDNA3-SAME:   mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<WMMA_F16_16x16x16_F16>,
 //  RDNA3-SAME:     subgroup_m_count = 2, subgroup_n_count = 2>
-//  RDNA3-SAME:   prefetch_shared_memory
 
 //    RDNA3-LABEL: func.func @matmul_256x256x256_f16_f16
 //     RDNA3-SAME:    translation_info = #[[$TRANSLATION]]
@@ -563,9 +563,9 @@
 // Basic pipeline test to make sure it generates the instructions we expect.
 
 // CHECK:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUPadAndVectorDistribute workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 // CHECK-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 // CHECK-SAME:    subgroup_m_count = 1, subgroup_n_count = 1>
-// CHECK-SAME:    prefetch_shared_memory
 
 // CHECK-LABEL: func.func @unaligned_nk_batch_matmul()
 // CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -649,9 +649,9 @@
 // Basic pipeline test to make sure it generates the instructions we expect.
 
 // CHECK:       #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK-SAME:    gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 // CHECK-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 // CHECK-SAME:    subgroup_m_count = 1, subgroup_n_count = 4>
-// CHECK-SAME:    prefetch_shared_memory
 
 // CHECK-LABEL: func.func @contract_schedule_considering_read_layout()
 // CHECK-SAME:    translation_info = #[[$TRANSLATION]]
@@ -709,7 +709,7 @@
 // CHECK-SAME:    mma_schedule = #iree_gpu.mma_schedule<intrinsic = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
 // CHECK-SAME:    subgroup_m_count = 1, subgroup_n_count = 1>
 // Prefetching is disabled for attention for now
-// CHECK-NOT:     prefetch_shared_memory
+// CHECK-NOT:     gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true
 
 // CHECK-LABEL: func.func @attention_20x4096x64x4096x64()
 // CHECK-SAME:    translation_info = #[[$TRANSLATION]]