Move ROCM tests to fix dialect not registered error (#21811)

Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
index 7998b76..e00154a 100644
--- a/compiler/plugins/target/ROCM/test/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -20,6 +20,7 @@
         "default_tuning_specs_amdgpu.mlir",
         "enable_tensor_ukernels.mlir",
         "gpu_encoding_attrs.mlir",
+        "lower_rocm_ukernel_descriptor.mlir",
         "lowering_strategy_from_tuning_spec.mlir",
         "ukernel_pipeline_transform.mlir",
     ],
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
index 54cb73d..cfe872b 100644
--- a/compiler/plugins/target/ROCM/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -20,6 +20,7 @@
     "default_tuning_specs_amdgpu.mlir"
     "enable_tensor_ukernels.mlir"
     "gpu_encoding_attrs.mlir"
+    "lower_rocm_ukernel_descriptor.mlir"
     "lowering_strategy_from_tuning_spec.mlir"
     "ukernel_pipeline_transform.mlir"
   TOOLS
diff --git a/compiler/plugins/target/ROCM/test/lower_rocm_ukernel_descriptor.mlir b/compiler/plugins/target/ROCM/test/lower_rocm_ukernel_descriptor.mlir
new file mode 100644
index 0000000..ba1c76b
--- /dev/null
+++ b/compiler/plugins/target/ROCM/test/lower_rocm_ukernel_descriptor.mlir
@@ -0,0 +1,69 @@
+// RUN: iree-opt --iree-codegen-lower-bitcode-ukernels --split-input-file %s | FileCheck %s
+
+// CHECK-LABEL:       @pure_argmax_ukernel_test_with_provider
+// CHECK-SAME:          %[[ARG0:[a-zA-Z0-9]+]]: tensor<?xf32>
+// CHECK-SAME:          %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
+// CHECK-SAME:          %[[ARG2:[a-zA-Z0-9]+]]: tensor<i64>
+// CHECK:               %[[C0:.*]] = arith.constant 0 : index
+// CHECK:               %[[DIM:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?xf32>
+// CHECK:               %[[FALSE:.*]] = arith.constant false
+// CHECK:               %[[MICRO_KERNEL:.+]]:2 = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64"
+// CHECK-SAME:            ins(%[[ARG0]] : tensor<?xf32>)
+// CHECK-SAME:            outs(%[[ARG1]], %[[ARG2]] : tensor<f32>, tensor<i64>)
+// CHECK-SAME:            (%[[DIM]], %[[FALSE]] : index, i1)
+// CHECK-SAME:            fn_def_attrs {vm.import.module = "rocm"}
+// CHECK-SAME{LITERAL}:   strided_dims([[], [], []])
+// CHECK:               return %[[MICRO_KERNEL]]#1
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.ukernel_provider = #rocm.ukernel_provider}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  func.func @pure_argmax_ukernel_test_with_provider(%arg0: tensor<?xf32>, %arg1: tensor<f32>, %arg2: tensor<i64>) -> tensor<i64> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%arg0 : tensor<?xf32>) outs(%arg1, %arg2 : tensor<f32>, tensor<i64>) attrs =  {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"iree_uk_amdgpu_argmax_f32i64", bitcode>} {
+    ^bb0(%in: f32, %out: f32, %out_0: i64):
+      %1 = linalg.index 0 : index
+      %2 = arith.index_cast %1 : index to i64
+      %3 = arith.maximumf %in, %out : f32
+      %4 = arith.cmpf ogt, %in, %out : f32
+      %5 = arith.select %4, %2, %out_0 : i64
+      linalg.yield %3, %5 : f32, i64
+    } -> (tensor<f32>, tensor<i64>)
+    return %0#1 : tensor<i64>
+  }
+}
+
+// -----
+
+// CHECK-LABEL:       @argmax_ukernel_test_with_provider
+// CHECK-SAME:          %[[ARG0:[a-zA-Z0-9]+]]: tensor<?xf32>
+// CHECK-SAME:          %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
+// CHECK-SAME:          %[[ARG2:[a-zA-Z0-9]+]]: tensor<i64>
+// CHECK:               %[[C0:.*]] = arith.constant 0 : index
+// CHECK:               %[[DIM:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?xf32>
+// CHECK:               %[[TRUE:.*]] = arith.constant true
+// CHECK:               %[[MICRO_KERNEL:.+]]:2 = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64"
+// CHECK-SAME:            ins(%[[ARG0]] : tensor<?xf32>)
+// CHECK-SAME:            outs(%[[ARG1]], %[[ARG2]] : tensor<f32>, tensor<i64>)
+// CHECK-SAME:            (%[[DIM]], %[[TRUE]] : index, i1)
+// CHECK-SAME:            fn_def_attrs {vm.import.module = "rocm"}
+// CHECK-SAME{LITERAL}:   strided_dims([[], [], []])
+// CHECK:               return %[[MICRO_KERNEL]]#0, %[[MICRO_KERNEL]]#1
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.ukernel_provider = #rocm.ukernel_provider}>
+#map = affine_map<(d0) -> (d0)>
+#map1 = affine_map<(d0) -> ()>
+module attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  func.func @argmax_ukernel_test_with_provider(%arg0: tensor<?xf32>, %arg1: tensor<f32>, %arg2: tensor<i64>) -> (tensor<f32>, tensor<i64>) {
+    %cst = arith.constant 0.000000e+00 : f32
+    %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%arg0 : tensor<?xf32>) outs(%arg1, %arg2 : tensor<f32>, tensor<i64>) attrs =  {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"iree_uk_amdgpu_argmax_f32i64", bitcode>} {
+    ^bb0(%in: f32, %out: f32, %out_0: i64):
+      %1 = linalg.index 0 : index
+      %2 = arith.index_cast %1 : index to i64
+      %3 = arith.maximumf %in, %out : f32
+      %4 = arith.cmpf ogt, %in, %out : f32
+      %5 = arith.select %4, %2, %out_0 : i64
+      linalg.yield %3, %5 : f32, i64
+    } -> (tensor<f32>, tensor<i64>)
+    return %0#0, %0#1 : tensor<f32>, tensor<i64>
+  }
+}
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/lower_ukernel_bitcode_descriptor.mlir b/compiler/src/iree/compiler/Codegen/Common/test/lower_ukernel_bitcode_descriptor.mlir
index d276001..38e0590 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/lower_ukernel_bitcode_descriptor.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/lower_ukernel_bitcode_descriptor.mlir
@@ -27,73 +27,3 @@
     return %2 : tensor<16x16xf32>
   }
 }
-
-// -----
-
-// CHECK-LABEL:       @pure_argmax_ukernel_test_with_provider
-// CHECK-SAME:          %[[ARG0:[a-zA-Z0-9]+]]: tensor<?xf32>
-// CHECK-SAME:          %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
-// CHECK-SAME:          %[[ARG2:[a-zA-Z0-9]+]]: tensor<i64>
-// CHECK:               %[[C0:.*]] = arith.constant 0 : index
-// CHECK:               %[[DIM:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?xf32>
-// CHECK:               %[[FALSE:.*]] = arith.constant false
-// CHECK:               %[[MICRO_KERNEL:.+]]:2 = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64"
-// CHECK-SAME:            ins(%[[ARG0]] : tensor<?xf32>)
-// CHECK-SAME:            outs(%[[ARG1]], %[[ARG2]] : tensor<f32>, tensor<i64>)
-// CHECK-SAME:            (%[[DIM]], %[[FALSE]] : index, i1)
-// CHECK-SAME:            fn_def_attrs {vm.import.module = "rocm"}
-// CHECK-SAME{LITERAL}:   strided_dims([[], [], []])
-// CHECK:               return %[[MICRO_KERNEL]]#1
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.ukernel_provider = #rocm.ukernel_provider}>
-#map = affine_map<(d0) -> (d0)>
-#map1 = affine_map<(d0) -> ()>
-module attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
-  func.func @pure_argmax_ukernel_test_with_provider(%arg0: tensor<?xf32>, %arg1: tensor<f32>, %arg2: tensor<i64>) -> tensor<i64> {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%arg0 : tensor<?xf32>) outs(%arg1, %arg2 : tensor<f32>, tensor<i64>) attrs =  {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"iree_uk_amdgpu_argmax_f32i64", bitcode>} {
-    ^bb0(%in: f32, %out: f32, %out_0: i64):
-      %1 = linalg.index 0 : index
-      %2 = arith.index_cast %1 : index to i64
-      %3 = arith.maximumf %in, %out : f32
-      %4 = arith.cmpf ogt, %in, %out : f32
-      %5 = arith.select %4, %2, %out_0 : i64
-      linalg.yield %3, %5 : f32, i64
-    } -> (tensor<f32>, tensor<i64>)
-    return %0#1 : tensor<i64>
-  }
-}
-
-// -----
-
-// CHECK-LABEL:       @argmax_ukernel_test_with_provider
-// CHECK-SAME:          %[[ARG0:[a-zA-Z0-9]+]]: tensor<?xf32>
-// CHECK-SAME:          %[[ARG1:[a-zA-Z0-9]+]]: tensor<f32>
-// CHECK-SAME:          %[[ARG2:[a-zA-Z0-9]+]]: tensor<i64>
-// CHECK:               %[[C0:.*]] = arith.constant 0 : index
-// CHECK:               %[[DIM:.*]] = tensor.dim %[[ARG0]], %[[C0]] : tensor<?xf32>
-// CHECK:               %[[TRUE:.*]] = arith.constant true
-// CHECK:               %[[MICRO_KERNEL:.+]]:2 = iree_codegen.ukernel.generic "iree_uk_amdgpu_argmax_f32i64"
-// CHECK-SAME:            ins(%[[ARG0]] : tensor<?xf32>)
-// CHECK-SAME:            outs(%[[ARG1]], %[[ARG2]] : tensor<f32>, tensor<i64>)
-// CHECK-SAME:            (%[[DIM]], %[[TRUE]] : index, i1)
-// CHECK-SAME:            fn_def_attrs {vm.import.module = "rocm"}
-// CHECK-SAME{LITERAL}:   strided_dims([[], [], []])
-// CHECK:               return %[[MICRO_KERNEL]]#0, %[[MICRO_KERNEL]]#1
-#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.ukernel_provider = #rocm.ukernel_provider}>
-#map = affine_map<(d0) -> (d0)>
-#map1 = affine_map<(d0) -> ()>
-module attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
-  func.func @argmax_ukernel_test_with_provider(%arg0: tensor<?xf32>, %arg1: tensor<f32>, %arg2: tensor<i64>) -> (tensor<f32>, tensor<i64>) {
-    %cst = arith.constant 0.000000e+00 : f32
-    %0:2 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["reduction"]} ins(%arg0 : tensor<?xf32>) outs(%arg1, %arg2 : tensor<f32>, tensor<i64>) attrs =  {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"iree_uk_amdgpu_argmax_f32i64", bitcode>} {
-    ^bb0(%in: f32, %out: f32, %out_0: i64):
-      %1 = linalg.index 0 : index
-      %2 = arith.index_cast %1 : index to i64
-      %3 = arith.maximumf %in, %out : f32
-      %4 = arith.cmpf ogt, %in, %out : f32
-      %5 = arith.select %4, %2, %out_0 : i64
-      linalg.yield %3, %5 : f32, i64
-    } -> (tensor<f32>, tensor<i64>)
-    return %0#0, %0#1 : tensor<f32>, tensor<i64>
-  }
-}