[VectorDistribution] Remove signatures after distribution (#19319)
This patch removes the discardable attributes set for vector distribution
once the pass completes. These attributes otherwise keep propagating
through later passes and make the output IR harder to read.
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp
index a883180..53b8676 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.cpp
@@ -314,6 +314,11 @@
return failure();
}
+ // Remove signature after distribution.
+ root->walk([](Operation *op) {
+ op->removeDiscardableAttr(kVectorLayoutFetcherStorageAttrName);
+ });
+
if (options.verifyConversion()) {
WalkResult hasConversionOp = root->walk([](Operation *op) {
if (isa<IREE::VectorExt::ToSIMDOp, IREE::VectorExt::ToSIMTOp>(op)) {
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
index eecd2f0..7872b51 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_contract_amdgpu.mlir
@@ -83,7 +83,7 @@
// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<16xf32> to vector<4x1x4x1xf32>
// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
-// CHECK: return {{.*}} %[[R_SIMD]]
+// CHECK: return %[[R_SIMD]]
// -----
@@ -161,7 +161,7 @@
// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA]] : vector<4xf32> to vector<1x1x4x1xf32>
// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<1x1x4x1xf32> to vector<1x1x1x1x4x1xf32>
// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x1x1x4x1xf32> -> vector<16x16xf32>
-// CHECK: return {{.*}} %[[R_SIMD]]
+// CHECK: return %[[R_SIMD]]
// -----
@@ -250,7 +250,7 @@
// CHECK: %[[R1_CAST:.+]] = vector.shape_cast %[[MFMA1]] : vector<16xf32> to vector<4x1x4x1xf32>
// CHECK: %[[C1_INS:.+]] = vector.insert %[[R1_CAST]], %[[C0_INS]] [1, 0] : vector<4x1x4x1xf32> into vector<2x1x4x1x4x1xf32>
// CHECK: %[[R:.+]] = iree_vector_ext.to_simd %[[C1_INS]] : vector<2x1x4x1x4x1xf32> -> vector<64x32xf32>
-// CHECK: return {{.*}}} %[[R]]
+// CHECK: return %[[R]]
// -----
@@ -589,7 +589,7 @@
// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[WMMA]] : vector<8xf32> to vector<8x1x1x1xf32>
// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<8x1x1x1xf32> to vector<1x1x8x1x1x1xf32>
// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x8x1x1x1xf32> -> vector<16x16xf32>
-// CHECK: return {{.*}} %[[R_SIMD]]
+// CHECK: return %[[R_SIMD]]
// -----
@@ -682,4 +682,4 @@
// CHECK: %[[R_CAST:.+]] = vector.shape_cast %[[MFMA_1]] : vector<16xf32> to vector<4x1x4x1xf32>
// CHECK: %[[B_OUT:.*]] = vector.broadcast %[[R_CAST]] : vector<4x1x4x1xf32> to vector<1x1x4x1x4x1xf32>
// CHECK: %[[R_SIMD:.+]] = iree_vector_ext.to_simd %[[B_OUT]] : vector<1x1x4x1x4x1xf32> -> vector<32x32xf32>
-// CHECK: return {{.*}} %[[R_SIMD]]
+// CHECK: return %[[R_SIMD]]
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
index f33c91f..b982f63 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir
@@ -955,21 +955,11 @@
}
}
-// CHECK: #[[$LAYOUT:.+]] = #iree_vector_ext.nested_layout
-// CHECK-SAME: subgroup_tile = [2, 2],
-// CHECK-SAME: batch_tile = [4, 2]
-// CHECK-SAME: outer_tile = [1, 2]
-// CHECK-SAME: thread_tile = [16, 4]
-// CHECK-SAME: element_tile = [2, 2]
-// CHECK-SAME: subgroup_strides = [1, 2],
-// CHECK-SAME: thread_strides = [1, 16]
-
// CHECK-LABEL: func @transpose
// CHECK: iree_vector_ext.to_simt %{{.*}} : vector<64x256xf16> -> vector<2x4x2x1x2x2xf16>
// CHECK: vector.transpose %{{.*}}, [1, 0, 3, 2, 5, 4] : vector<2x4x2x1x2x2xf16> to vector<4x2x1x2x2x2xf16>
// CHECK: math.sqrt %{{.*}} : vector<4x2x1x2x2x2xf16>
// CHECK: iree_vector_ext.to_simd %{{.*}} : vector<4x2x1x2x2x2xf16> -> vector<256x64xf16>
-// CHECK: return {{.*}}#[[$LAYOUT]]
// -----
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
index a503664..343366e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_vector_distribution.mlir
@@ -19,10 +19,10 @@
%root = arith.constant dense<0.0> : vector<128x128x128xf16>
%rootl = iree_vector_ext.to_layout %root to layout(#nested) : vector<128x128x128xf16>
// CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<128x128x128xf16> -> vector<8x2x4x1x4x4x1x8x2xf16>
- // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] {{.*}} : vector<8x2x4x1x4x4x1x8x2xf16>
+ // CHECK-DAG: %[[C:.*]] = arith.mulf %[[B]], %[[ROOT]] : vector<8x2x4x1x4x4x1x8x2xf16>
%c = arith.mulf %rootl, %b : vector<128x128x128xf16>
// CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<128x128x128xf16> -> vector<8x2x4x1x4x4x1x8x2xf16>
- // CHECK-DAG: %[[D:.*]] = arith.addf %[[C]], %[[A]] fastmath<reassoc,nnan> {{.*}} : vector<8x2x4x1x4x4x1x8x2xf16>
+ // CHECK-DAG: %[[D:.*]] = arith.addf %[[C]], %[[A]] fastmath<reassoc,nnan> : vector<8x2x4x1x4x4x1x8x2xf16>
%d = arith.addf %c, %a fastmath<reassoc,nnan> : vector<128x128x128xf16>
// CHECK: iree_vector_ext.to_simd %[[D]] : vector<8x2x4x1x4x4x1x8x2xf16> -> vector<128x128x128xf16>
return %d : vector<128x128x128xf16>
@@ -51,10 +51,10 @@
// CHECK: iter_args(%[[ARG0:.*]] = %[[ROOT]]) -> (vector<1x1x1x1x16x16xi32>)
%out = scf.for %i = %c0 to %c128 step %c1 iter_args(%arg0 = %rootl) -> (vector<16x16xi32>) {
// CHECK-DAG: %[[B:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32>
- // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] {{.*}} : vector<1x1x1x1x16x16xi32>
+ // CHECK-DAG: %[[C:.*]] = arith.muli %[[ARG0]], %[[B]] : vector<1x1x1x1x16x16xi32>
%c = arith.muli %arg0, %b : vector<16x16xi32>
// CHECK-DAG: %[[A:.*]] = iree_vector_ext.to_simt %{{.*}} : vector<16x16xi32> -> vector<1x1x1x1x16x16xi32>
- // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] {{.*}} : vector<1x1x1x1x16x16xi32>
+ // CHECK-DAG: %[[D:.*]] = arith.addi %[[C]], %[[A]] : vector<1x1x1x1x16x16xi32>
%d = arith.addi %c, %a : vector<16x16xi32>
// CHECK: scf.yield %[[D]] : vector<1x1x1x1x16x16xi32>
scf.yield %d : vector<16x16xi32>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
index 389339c..184d497 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_vector_distribute_gfx942.mlir
@@ -90,7 +90,7 @@
// CHECK-LABEL: func.func @matmul_256x256x256_f16_f16()
// CHECK: scf.for {{.*}} = %c0 to %c256 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<2x2x1x1x4x1xf16>)
-// CHECK: arith.extf %[[ARG]] {{.*}} : vector<2x2x1x1x4x1xf16> to vector<2x2x1x1x4x1xf32>
+// CHECK: arith.extf %[[ARG]] : vector<2x2x1x1x4x1xf16> to vector<2x2x1x1x4x1xf32>
// CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: %[[TRUNC:.+]] = arith.truncf %{{.*}} : vector<2x2x1x1x4x1xf32> to vector<2x2x1x1x4x1xf16>
// CHECK: scf.yield %[[TRUNC]] : vector<2x2x1x1x4x1xf16>
@@ -157,7 +157,7 @@
// This has more than 2 iterations, so prefetching is enabled for this case. Due to
// prefetching, one iteration is peeled off, so the upper bound is 2048 - 128 = 1920.
// CHECK: scf.for {{.*}} = %c0 to %c1920 step %c128 iter_args(%[[ARG:.+]] = {{.*}}) -> (vector<4x1x1x1x4x1xf16>)
-// CHECK: arith.extf %[[ARG]] {{.*}} : vector<4x1x1x1x4x1xf16> to vector<4x1x1x1x4x1xf32>
+// CHECK: arith.extf %[[ARG]] : vector<4x1x1x1x4x1xf16> to vector<4x1x1x1x4x1xf32>
// CHECK-COUNT-32: amdgpu.mfma {{.*}} {blocks = 1 : i32, k = 16 : i32, m = 16 : i32, n = 16 : i32} blgp = none : vector<4xf16>, vector<4xf16>, vector<4xf32>
// CHECK: %[[TRUNC:.+]] = arith.truncf %{{.*}} : vector<4x1x1x1x4x1xf32> to vector<4x1x1x1x4x1xf16>
// CHECK: scf.yield %[[TRUNC]] : vector<4x1x1x1x4x1xf16>