[Codegen] Make amdgpu_distribute_vectors return a handle (#17239)

Now that the op consumes its target handle, make it return a handle to
the target function for reuse. This makes it possible to use this op
when the target passed to a named sequence is itself a function.
diff --git a/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml b/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
index 1fa6e09..87719c5 100644
--- a/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
+++ b/.github/workflows/pkgci_regression_test_amdgpu_rocm.yml
@@ -60,7 +60,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
       - name: Installing external TestSuite Python requirements
@@ -123,7 +123,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 320edabdfba7f5cf46e5cd88569fca52308d8988
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
           lfs: true
diff --git a/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml b/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
index 9e8ca70..f0dbfd8 100644
--- a/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
+++ b/.github/workflows/pkgci_regression_test_amdgpu_vulkan.yml
@@ -57,7 +57,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
       - name: Installing external TestSuite Python requirements
@@ -106,7 +106,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
           lfs: true
diff --git a/.github/workflows/pkgci_regression_test_cpu.yml b/.github/workflows/pkgci_regression_test_cpu.yml
index e71d277..95daebc 100644
--- a/.github/workflows/pkgci_regression_test_cpu.yml
+++ b/.github/workflows/pkgci_regression_test_cpu.yml
@@ -57,7 +57,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
           lfs: true
@@ -122,7 +122,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
           lfs: true
diff --git a/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml b/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
index 92a0b7b..2eeed0f 100644
--- a/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
+++ b/.github/workflows/pkgci_regression_test_nvidiagpu_cuda.yml
@@ -57,7 +57,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
       - name: Installing external TestSuite Python requirements
diff --git a/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml b/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
index 4bbe3d3..aba5e91 100644
--- a/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
+++ b/.github/workflows/pkgci_regression_test_nvidiagpu_vulkan.yml
@@ -57,7 +57,7 @@
         uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
         with:
           repository: nod-ai/SHARK-TestSuite
-          ref: 816a8af832bd8518cf966e92cf5c2929d5c11a0f
+          ref: 072e8b7f3140b31669257e6042dc1f02f2a4e2cc
           path: SHARK-TestSuite
           submodules: false
       - name: Installing external TestSuite Python requirements
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
index e73d935..54ef39a 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp
@@ -1486,12 +1486,17 @@
   if (failed(distributeVectorOps(target, patterns, options))) {
     return emitDefaultSilenceableFailure(target);
   }
+  // TODO: The consumption of the target handle is only required because the
+  // transform dialect interpreter will crash without it. This op should not
+  // need to invalidate the handle.
+  results.push_back(target);
   return DiagnosedSilenceableFailure::success();
 }
 
 void transform_dialect::AMDGPUDistributeVectorsOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   transform::consumesHandle(getTarget(), effects);
+  transform::producesHandle(getResult(), effects);
   transform::modifiesPayload(effects);
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
index c0fc457..7cd1454 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td
@@ -741,11 +741,11 @@
 
     let arguments = (ins TransformHandleTypeInterface:$target,
                          UnitAttr:$test_conversion);
-    let results = (outs);
+    let results = (outs TransformHandleTypeInterface:$result);
 
     let assemblyFormat = [{
       $target (`test_conversion` $test_conversion^)?
-      attr-dict `:` type($target)
+      attr-dict `:` functional-type(operands, results)
     }];
     let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
index 8710f76..b46a8d0 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/amdgpu_contraction_distribution.mlir
@@ -8,7 +8,7 @@
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map3 = affine_map<(d0, d1, d2) -> (d1, d0)>
 
-// A: vector<16x16>, layout = layoutA 
+// A: vector<16x16>, layout = layoutA
 #row_layout = #iree_vector_ext.per_dim_layout<[BATCHX, LANEX], [1, 16]>
 #col_layout = #iree_vector_ext.per_dim_layout<[BATCHY, LANEY, VECTORX], [1, 4, 4]>
 #layout_a = #iree_vector_ext.layout<#row_layout, #col_layout>
@@ -43,7 +43,7 @@
   }
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : !transform.any_op
+    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
@@ -84,7 +84,7 @@
   }
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : !transform.any_op
+    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
@@ -132,7 +132,7 @@
   }
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : !transform.any_op
+    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
@@ -179,7 +179,7 @@
   }
   transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
     %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : !transform.any_op
+    transform.iree.amdgpu_distribute_vectors %top_level_func test_conversion : (!transform.any_op) -> !transform.any_op
     transform.yield
   }
 }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
index bb0ce64..ec99893 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/attention_mfma_transform_spec.mlir
@@ -170,9 +170,8 @@
     transform.iree.set_contraction_layout_attributes %contract2, %layout16x16x16 : !transform.any_op, !transform.any_param
 
     %distribute_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
-    transform.iree.amdgpu_distribute_vectors %distribute_func test_conversion : !transform.any_op
+    %distribute_func_2 = transform.iree.amdgpu_distribute_vectors %distribute_func test_conversion : (!transform.any_op) -> !transform.any_op
 
-    %distribute_func_2 = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
     transform.apply_patterns to %distribute_func_2 {
       transform.apply_patterns.canonicalization
     } : !transform.any_op