Adding Transpose Microbenchmarks (#10973)

A set of microbenchmarks for validating the shared memory transpose path.

These are split out from the existing `linalg_transpose.mlir` benchmarks because they focus on very large shapes targeting the GPU.

Part of https://github.com/iree-org/iree/issues/10005
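
For a quick local run outside the CMake suite, the file can also be compiled and timed by hand with the standard IREE tools. The invocation below is only a sketch, not part of this change; flag spellings (e.g. `--module=` vs. `--module_file=`, `--function=` vs. `--entry_function=`) vary across IREE versions:

```
# Assumed invocation; adjust flag names to your IREE version.
iree-compile --iree-hal-target-backends=cuda --iree-input-type=mhlo \
  tests/microbenchmarks/shared_mem_transpose.mlir -o /tmp/shared_mem_transpose.vmfb
iree-benchmark-module --device=cuda --module=/tmp/shared_mem_transpose.vmfb \
  --function=transpose_4096_4096
```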
diff --git a/tests/microbenchmarks/CMakeLists.txt b/tests/microbenchmarks/CMakeLists.txt
index eb37b11..76d7cce 100644
--- a/tests/microbenchmarks/CMakeLists.txt
+++ b/tests/microbenchmarks/CMakeLists.txt
@@ -4,7 +4,7 @@
 
 iree_microbenchmark_suite(
   NAME
-   "microbenchmark"
+   "microbenchmark_llvm-cpu"
   SRCS
     "dynamic_shape_vectorization.mlir"
     "linalg_mmt4d.mlir"
@@ -18,3 +18,13 @@
     "--iree-input-type=mhlo"
     "--iree-llvm-target-cpu-features=host"
 )
+
+iree_microbenchmark_suite(
+  NAME
+   "microbenchmark_cuda"
+  SRCS
+    "linalg_transpose.mlir"
+  FLAGS
+    "--iree-hal-target-backends=cuda"
+    "--iree-input-type=mhlo"
+)
diff --git a/tests/microbenchmarks/shared_mem_transpose.mlir b/tests/microbenchmarks/shared_mem_transpose.mlir
new file mode 100644
index 0000000..869055a
--- /dev/null
+++ b/tests/microbenchmarks/shared_mem_transpose.mlir
@@ -0,0 +1,83 @@
+// RUN: iree-run-mlir --iree-hal-target-backends=cuda %s
+
+//===----------------------------------------------------------------------===//
+// Transpose ops.
+// Naming convention: '_'.join(
+//   [transpose,
+//    {output-shape}])
+//
+//===----------------------------------------------------------------------===//
+
+util.global private @"__transpose_4096_4096_input" {noinline} = dense<1.0> : tensor<4096x4096xf32>
+
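+// 2-D transpose of a large square matrix: result(d0, d1) = input(d1, d0).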
+func.func @transpose_4096_4096() -> tensor<4096x4096xf32> {
+  %input_ptr = util.global.address @"__transpose_4096_4096_input" : !util.ptr<tensor<4096x4096xf32>>
+  %input = util.global.load.indirect %input_ptr : !util.ptr<tensor<4096x4096xf32>> -> tensor<4096x4096xf32>
+  %output = tensor.empty() : tensor<4096x4096xf32>
+  %result = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]}
+    ins(%input : tensor<4096x4096xf32>) outs(%output : tensor<4096x4096xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+        linalg.yield %arg1 : f32
+    } -> tensor<4096x4096xf32>
+  return %result : tensor<4096x4096xf32>
+}
+
+util.global private @"__transpose_10_2048_1024_input" {noinline} = dense<1.0> : tensor<10x2048x1024xf32>
+
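+// Batched transpose of the two innermost dims: result(d0, d1, d2) = input(d0, d2, d1).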
+func.func @transpose_10_1024_2048() -> tensor<10x1024x2048xf32> {
+  %input_ptr = util.global.address @"__transpose_10_2048_1024_input" : !util.ptr<tensor<10x2048x1024xf32>>
+  %input = util.global.load.indirect %input_ptr : !util.ptr<tensor<10x2048x1024xf32>> -> tensor<10x2048x1024xf32>
+  %output = tensor.empty() : tensor<10x1024x2048xf32>
+  %result = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+    iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%input : tensor<10x2048x1024xf32>) outs(%output : tensor<10x1024x2048xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+        linalg.yield %arg1 : f32
+    } -> tensor<10x1024x2048xf32>
+  return %result : tensor<10x1024x2048xf32>
+}
+
+util.global private @"__transpose_10_2048_1024_lhs" {noinline} = dense<1.0> : tensor<10x2048x1024xf32>
+util.global private @"__transpose_10_2048_1024_rhs" {noinline} = dense<1.0> : tensor<10x2048x1024xf32>
+
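+// Transpose fused with an elementwise add: both operands are read through the
+// permuted map (d0, d2, d1) and their sum is written to the transposed output.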
+func.func @transpose_10_1024_2048_fusion() -> tensor<10x1024x2048xf32> {
+  %lhs_ptr = util.global.address @"__transpose_10_2048_1024_lhs" : !util.ptr<tensor<10x2048x1024xf32>>
+  %lhs = util.global.load.indirect %lhs_ptr : !util.ptr<tensor<10x2048x1024xf32>> -> tensor<10x2048x1024xf32>
+  %rhs_ptr = util.global.address @"__transpose_10_2048_1024_rhs" : !util.ptr<tensor<10x2048x1024xf32>>
+  %rhs = util.global.load.indirect %rhs_ptr : !util.ptr<tensor<10x2048x1024xf32>> -> tensor<10x2048x1024xf32>
+  %output = tensor.empty() : tensor<10x1024x2048xf32>
+  %result = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+    iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%lhs, %rhs : tensor<10x2048x1024xf32>, tensor<10x2048x1024xf32>) outs(%output : tensor<10x1024x2048xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32, %arg3: f32):
+        %0 = arith.addf %arg1, %arg2 : f32
+        linalg.yield %0 : f32
+    } -> tensor<10x1024x2048xf32>
+  return %result : tensor<10x1024x2048xf32>
+}
+
+util.global private @"__transpose_10_2064_1024_input" {noinline} = dense<1.0> : tensor<10x2064x1024xf32>
+
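+// Batched transpose whose inner size (2064) is not a multiple of 32, so
+// tile-based lowerings must handle the unaligned boundary.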
+func.func @transpose_10_1024_2064_unaligned() -> tensor<10x1024x2064xf32> {
+  %input_ptr = util.global.address @"__transpose_10_2064_1024_input" : !util.ptr<tensor<10x2064x1024xf32>>
+  %input = util.global.load.indirect %input_ptr : !util.ptr<tensor<10x2064x1024xf32>> -> tensor<10x2064x1024xf32>
+  %output = tensor.empty() : tensor<10x1024x2064xf32>
+  %result = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>],
+    iterator_types = ["parallel", "parallel", "parallel"]}
+    ins(%input : tensor<10x2064x1024xf32>) outs(%output : tensor<10x1024x2064xf32>) {
+    ^bb0(%arg1: f32, %arg2: f32):
+        linalg.yield %arg1 : f32
+    } -> tensor<10x1024x2064xf32>
+  return %result : tensor<10x1024x2064xf32>
+}